def test_custom_metrics_multiple_results(aggregator, check):
    con = mock.MagicMock()
    cursor = mock.MagicMock()
    data = [["tag_value1", "1"], ["tag_value2", "2"]]
    cursor.fetchall.side_effect = lambda: iter(data)
    con.cursor.return_value = cursor

    custom_queries = [
        {
            "metric_prefix": "oracle.test1",
            "query": "mocked",
            "columns": [{"name": "tag_name", "type": "tag"}, {"name": "metric", "type": "gauge"}],
            "tags": ["query_tags1"],
        }
    ]

    check.instance['custom_queries'] = custom_queries
    check._fix_custom_queries()
    check._connection = con

    query_manager = QueryManager(check, check.execute_query_raw, tags=['custom_tag'])
    query_manager.compile_queries()
    query_manager.execute()

    aggregator.assert_metric(
        "oracle.test1.metric", value=1, count=1, tags=["tag_name:tag_value1", "query_tags1", "custom_tag"]
    )
    aggregator.assert_metric(
        "oracle.test1.metric", value=2, count=1, tags=["tag_name:tag_value2", "query_tags1", "custom_tag"]
    )
class TeradataCheck(AgentCheck, ConfigMixin):
    __NAMESPACE__ = 'teradata'

    def __init__(self, name, init_config, instances):
        super(TeradataCheck, self).__init__(name, init_config, instances)
        self._connect_params = None
        self._connection = None
        self._tags = []
        self._query_errors = 0
        self._tables_filter = None

        manager_queries = deepcopy(DEFAULT_QUERIES)
        if is_affirmative(self.instance.get('collect_res_usage_metrics', False)):
            manager_queries.extend(COLLECT_RES_USAGE)
        if is_affirmative(self.instance.get('collect_table_disk_metrics', False)):
            manager_queries.extend(COLLECT_ALL_SPACE)

        self._query_manager = QueryManager(
            self,
            self._execute_query_raw,
            queries=manager_queries,
            tags=self._tags,
            error_handler=self._executor_error_handler,
        )
        self.check_initializations.append(self.initialize_config)
        self.check_initializations.append(self._query_manager.compile_queries)

    def check(self, _):
        # type: (Any) -> None
        self._query_errors = 0

        try:
            with self.connect() as conn:
                if conn:
                    self._connection = conn
                    self._query_manager.execute()

            self.submit_health_checks()
        except Exception as e:
            self.service_check(SERVICE_CHECK_CONNECT, ServiceCheck.CRITICAL, tags=self._tags)
            raise e

    def initialize_config(self):
        # type: (Any) -> None
        self._connect_params = json.dumps(
            {
                'host': self.config.server,
                'account': self.config.account,
                'database': self.config.database,
                'dbs_port': str(self.config.port),
                'logmech': self.config.auth_mechanism,
                'logdata': self.config.auth_data,
                'user': self.config.username,
                'password': self.config.password,
                'https_port': str(self.config.https_port),
                'sslmode': self.config.ssl_mode,
                'sslprotocol': self.config.ssl_protocol,
            }
        )

        global_tags = [
            'teradata_server:{}'.format(self.instance.get('server')),
            'teradata_port:{}'.format(self.instance.get('port', 1025)),
        ]
        self._tags = list(self.config.tags)
        self._tags.extend(global_tags)
        self._query_manager.tags = self._tags

        self._tables_filter = create_tables_filter(self.config.tables)

    def _execute_query_raw(self, query):
        # type: (AnyStr) -> Iterable[Sequence]
        with closing(self._connection.cursor()) as cursor:
            query = query.format(self.config.database)
            cursor.execute(query)
            if cursor.rowcount < 1:
                self._query_errors += 1
                self.log.warning('Failed to fetch records from query: `%s`.', query)
                return None
            for row in cursor.fetchall():
                query_name = re.search(r'(DBC.[^\s]+)', query).group(1)
                try:
                    yield self._queries_processor(row, query_name)
                except Exception as e:
                    self.log.debug(
                        'Unable to process row returned from query "%s", skipping row %s. %s', query_name, row, e
                    )
                    yield row

    def _executor_error_handler(self, error):
        # type: (AnyStr) -> AnyStr
        self._query_errors += 1
        return error

    @contextmanager
    def connect(self):
        # type: () -> Iterator[teradatasql.connection]
        conn = None
        if TERADATASQL_IMPORT_ERROR:
            self.log.error(
                'Teradata SQL Driver module is unavailable. Please double check your installation and refer to the '
                'Datadog documentation for more information. %s',
                TERADATASQL_IMPORT_ERROR,
            )
            raise TERADATASQL_IMPORT_ERROR
        self.log.info('Connecting to Teradata database %s on server %s.', self.config.database, self.config.server)
        try:
            conn = teradatasql.connect(self._connect_params)
            self.log.info('Connected to Teradata.')
            yield conn
        except Exception as e:
            self.log.error('Unable to connect to Teradata. %s.', e)
            raise e
        finally:
            if conn:
                conn.close()

    def submit_health_checks(self):
        # type: () -> None
        connect_status = ServiceCheck.OK
        query_status = ServiceCheck.CRITICAL if self._query_errors else ServiceCheck.OK

        self.service_check(SERVICE_CHECK_QUERY, query_status, tags=self._tags)
        self.service_check(SERVICE_CHECK_CONNECT, connect_status, tags=self._tags)

    def _queries_processor(self, row, query_name):
        # type: (Sequence, AnyStr) -> Sequence
        """
        Validate timestamps, filter tables, and normalize empty tags.
        """
        unprocessed_row = row

        # Return database version immediately
        if query_name == 'DBC.DBCInfoV':
            submit_version(self, row)
            return unprocessed_row

        # Only Resource Usage rows include timestamps and also do not include tags.
        if query_name == 'DBC.ResSpmaView':
            processed_row = timestamp_validator(self, unprocessed_row)
            return processed_row

        # Only AllSpaceV rows include table tags
        if (
            query_name == 'DBC.AllSpaceV'
            and is_affirmative(self.config.collect_table_disk_metrics)
            and self._tables_filter
        ):
            tables_filtered_row = filter_tables(self._tables_filter, unprocessed_row)
            if tables_filtered_row:
                processed_row = tags_normalizer(tables_filtered_row, query_name)
                return processed_row
            # Discard row if empty (table is filtered out)
            return tables_filtered_row

        processed_row = tags_normalizer(unprocessed_row, query_name)
        self.log.trace('Row processor returned: %s. \nFrom query: "%s"', processed_row, query_name)
        return processed_row
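# Illustrative sketch, not part of the Teradata check above: `_execute_query_raw` names each row's
# originating query by grabbing the first `DBC.*` identifier from the SQL text, and that name drives
# `_queries_processor`. The SQL string below is a made-up example for demonstration only.
import re

sql = 'SELECT AmpNo, CPUIdle FROM DBC.ResSpmaView WHERE TheTimestamp >= {}'
print(re.search(r'(DBC.[^\s]+)', sql).group(1))  # -> 'DBC.ResSpmaView'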
class RethinkDBCheck(AgentCheck):
    """
    Collect metrics from a RethinkDB cluster.
    """

    __NAMESPACE__ = 'rethinkdb'

    SERVICE_CHECK_CONNECT = 'can_connect'

    def __init__(self, *args, **kwargs):
        # type: (*Any, **Any) -> None
        super(RethinkDBCheck, self).__init__(*args, **kwargs)

        self._config = Config(cast(Instance, self.instance))

        if self._config.password:
            self.register_secret(self._config.password)

        self._conn = None  # type: Optional[rethinkdb.net.Connection]

        manager_queries = [
            queries.ClusterMetrics,
            queries.ServerMetrics,
            queries.DatabaseConfigMetrics,
            queries.DatabaseTableMetrics,
            queries.TableConfigMetrics,
            queries.ReplicaMetrics,
            queries.ShardMetrics,
            queries.JobMetrics,
            queries.CurrentIssuesMetrics,
        ]  # type: list

        if self.is_metadata_collection_enabled:
            manager_queries.append(queries.VersionMetadata)

        self._query_manager = QueryManager(
            self,
            executor=self._execute_raw_query,
            queries=manager_queries,
            tags=self._config.tags,
        )
        self._query_funcs = {}  # type: Dict[str, Callable]

        self.check_initializations.append(self._query_manager.compile_queries)

    def _execute_raw_query(self, query):
        # type: (str) -> List[tuple]
        query_func = self._query_funcs.get(query)

        if query_func is None:
            # QueryManager only supports `str` queries.
            # So here's the workaround: we make `query` refer to the import paths of query functions, then import here.
            # Cache the results so imports only happen on the first check run.
            module_name, _, func_name = query.partition(':')
            module = importlib.import_module(module_name, package='datadog_checks.rethinkdb')
            query_func = getattr(module, func_name)
            self._query_funcs[query] = query_func

        return query_func(self._conn)

    @contextmanager
    def connect_submitting_service_checks(self):
        # type: () -> Iterator[None]
        config = self._config
        tags = config.service_check_tags

        try:
            with rethinkdb.r.connect(
                host=config.host,
                port=config.port,
                user=config.user,
                password=config.password,
                ssl={'ca_certs': config.tls_ca_cert} if config.tls_ca_cert is not None else {},
            ) as conn:
                self._conn = conn
                yield
        except rethinkdb.errors.ReqlDriverError as exc:
            message = 'Could not connect to RethinkDB server: {!r}'.format(exc)
            self.log.error(message)
            self.service_check(self.SERVICE_CHECK_CONNECT, self.CRITICAL, tags=tags, message=message)
            raise
        except Exception as exc:
            message = 'Unexpected error while executing RethinkDB check: {!r}'.format(exc)
            self.log.error(message)
            self.service_check(self.SERVICE_CHECK_CONNECT, self.CRITICAL, tags=tags, message=message)
            raise
        else:
            self.service_check(self.SERVICE_CHECK_CONNECT, self.OK, tags=tags)
        finally:
            self._conn = None

    def collect_metrics(self):
        # type: () -> None
        # Exposed for mocking purposes.
        self._query_manager.execute()

    def check(self, instance):
        # type: (Any) -> None
        with self.connect_submitting_service_checks():
            self.collect_metrics()
class SQLServer(AgentCheck):
    __NAMESPACE__ = 'sqlserver'

    def __init__(self, name, init_config, instances):
        super(SQLServer, self).__init__(name, init_config, instances)

        self.connection = None
        self.failed_connections = {}
        self.instance_metrics = []
        self.instance_per_type_metrics = defaultdict(list)
        self.do_check = True

        self.autodiscovery = is_affirmative(self.instance.get('database_autodiscovery'))
        self.autodiscovery_include = self.instance.get('autodiscovery_include', ['.*'])
        self.autodiscovery_exclude = self.instance.get('autodiscovery_exclude', [])
        self._compile_patterns()
        self.autodiscovery_interval = self.instance.get('autodiscovery_interval', DEFAULT_AUTODISCOVERY_INTERVAL)
        self.databases = set()
        self.ad_last_check = 0

        self.proc = self.instance.get('stored_procedure')
        self.proc_type_mapping = {'gauge': self.gauge, 'rate': self.rate, 'histogram': self.histogram}
        self.custom_metrics = init_config.get('custom_metrics', [])

        # use QueryManager to process custom queries
        self._query_manager = QueryManager(
            self, self.execute_query_raw, queries=[], tags=self.instance.get("tags", [])
        )

        self.check_initializations.append(self.config_checks)
        self.check_initializations.append(self._query_manager.compile_queries)
        self.check_initializations.append(self.initialize_connection)

    def config_checks(self):
        if self.autodiscovery and self.instance.get('database'):
            self.log.warning(
                'sqlserver `database_autodiscovery` and `database` options defined in same instance - '
                'autodiscovery will take precedence.'
            )
        if not self.autodiscovery and (self.autodiscovery_include or self.autodiscovery_exclude):
            self.log.warning(
                "Autodiscovery is disabled, autodiscovery_include and autodiscovery_exclude will be ignored"
            )

    def initialize_connection(self):
        self.connection = Connection(self.init_config, self.instance, self.handle_service_check)

        # Pre-process the list of metrics to collect
        try:
            # check to see if the database exists before we try any connections to it
            db_exists, context = self.connection.check_database()
            if db_exists:
                if self.instance.get('stored_procedure') is None:
                    with self.connection.open_managed_default_connection():
                        with self.connection.get_managed_cursor() as cursor:
                            self.autodiscover_databases(cursor)
                        self._make_metric_list_to_collect(self.custom_metrics)
            else:
                # How much do we care that the DB doesn't exist?
                ignore = is_affirmative(self.instance.get("ignore_missing_database", False))
                if ignore is not None and ignore:
                    # not much : we expect it. leave checks disabled
                    self.do_check = False
                    self.log.warning("Database %s does not exist. Disabling checks for this instance.", context)
                else:
                    # yes we do. Keep trying
                    msg = "Database {} does not exist. Please resolve invalid database and restart agent".format(
                        context
                    )
                    raise ConfigurationError(msg)

        except SQLConnectionError as e:
            self.log.exception("Error connecting to database: %s", e)
        except ConfigurationError:
            raise
        except Exception as e:
            self.log.exception("Initialization exception %s", e)

    def handle_service_check(self, status, host, database, message=None):
        custom_tags = self.instance.get("tags", [])
        if custom_tags is None:
            custom_tags = []
        service_check_tags = ['host:{}'.format(host), 'db:{}'.format(database)]
        service_check_tags.extend(custom_tags)
        service_check_tags = list(set(service_check_tags))

        self.service_check(SERVICE_CHECK_NAME, status, tags=service_check_tags, message=message, raw=True)

    def _compile_patterns(self):
        self._include_patterns = self._compile_valid_patterns(self.autodiscovery_include)
        self._exclude_patterns = self._compile_valid_patterns(self.autodiscovery_exclude)

    def _compile_valid_patterns(self, patterns):
        valid_patterns = []

        for pattern in patterns:
            # Ignore empty patterns as they match everything
            if not pattern:
                continue

            try:
                re.compile(pattern, re.IGNORECASE)
            except Exception:
                self.log.warning('%s is not a valid regular expression and will be ignored', pattern)
            else:
                valid_patterns.append(pattern)

        if valid_patterns:
            return re.compile('|'.join(valid_patterns), re.IGNORECASE)
        else:
            # create unmatchable regex - https://stackoverflow.com/a/1845097/2157429
            return re.compile(r'(?!x)x')

    def autodiscover_databases(self, cursor):
        if not self.autodiscovery:
            return False

        now = time.time()
        if now - self.ad_last_check > self.autodiscovery_interval:
            self.log.info('Performing database autodiscovery')
            cursor.execute(AUTODISCOVERY_QUERY)
            all_dbs = set(row.name for row in cursor.fetchall())
            excluded_dbs = set([d for d in all_dbs if self._exclude_patterns.match(d)])
            included_dbs = set([d for d in all_dbs if self._include_patterns.match(d)])

            self.log.debug(
                'Autodiscovered databases: %s, excluding: %s, including: %s', all_dbs, excluded_dbs, included_dbs
            )

            # keep included dbs but remove any that were explicitly excluded
            filtered_dbs = all_dbs.intersection(included_dbs) - excluded_dbs

            self.log.debug('Resulting filtered databases: %s', filtered_dbs)
            self.ad_last_check = now

            if filtered_dbs != self.databases:
                self.log.debug('Databases updated from previous autodiscovery check.')
                self.databases = filtered_dbs
                return True
        return False

    def _make_metric_list_to_collect(self, custom_metrics):
        """
        Store the list of metrics to collect by instance_key.
        Will also create and cache cursors to query the db.
        """
        metrics_to_collect = []
        tags = self.instance.get('tags', [])

        # Load instance-level (previously Performance) metrics
        # If several check instances are querying the same server host, it can be wise to turn these off
        # to avoid sending duplicate metrics
        if is_affirmative(self.instance.get('include_instance_metrics', True)):
            self._add_performance_counters(
                chain(INSTANCE_METRICS, INSTANCE_METRICS_TOTAL), metrics_to_collect, tags, db=None
            )

        # populated through autodiscovery
        if self.databases:
            for db in self.databases:
                self._add_performance_counters(INSTANCE_METRICS_TOTAL, metrics_to_collect, tags, db=db)

        # Load database statistics
        for name, table, column in DATABASE_METRICS:
            # include database as a filter option
            db_names = self.databases or [self.instance.get('database', self.connection.DEFAULT_DATABASE)]
            for db_name in db_names:
                cfg = {'name': name, 'table': table, 'column': column, 'instance_name': db_name, 'tags': tags}
                metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load AlwaysOn metrics
        if is_affirmative(self.instance.get('include_ao_metrics', False)):
            for name, table, column in AO_METRICS + AO_METRICS_PRIMARY + AO_METRICS_SECONDARY:
                db_name = 'master'
                cfg = {
                    'name': name,
                    'table': table,
                    'column': column,
                    'instance_name': db_name,
                    'tags': tags,
                    'ao_database': self.instance.get('ao_database', None),
                    'availability_group': self.instance.get('availability_group', None),
                    'only_emit_local': is_affirmative(self.instance.get('only_emit_local', False)),
                }
                metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load FCI metrics
        if is_affirmative(self.instance.get('include_fci_metrics', False)):
            for name, table, column in FCI_METRICS:
                cfg = {
                    'name': name,
                    'table': table,
                    'column': column,
                    'tags': tags,
                }
                metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load metrics from scheduler and task tables, if enabled
        if is_affirmative(self.instance.get('include_task_scheduler_metrics', False)):
            for name, table, column in TASK_SCHEDULER_METRICS:
                cfg = {'name': name, 'table': table, 'column': column, 'tags': tags}
                metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load DB Fragmentation metrics
        if is_affirmative(self.instance.get('include_db_fragmentation_metrics', False)):
            db_fragmentation_object_names = self.instance.get('db_fragmentation_object_names', [])
            db_names = self.databases or [self.instance.get('database', self.connection.DEFAULT_DATABASE)]

            if not db_fragmentation_object_names:
                self.log.debug(
                    "No fragmentation object names specified, will return fragmentation metrics for all "
                    "object_ids of current database(s): %s",
                    db_names,
                )

            for db_name in db_names:
                for name, table, column in DATABASE_FRAGMENTATION_METRICS:
                    cfg = {
                        'name': name,
                        'table': table,
                        'column': column,
                        'instance_name': db_name,
                        'tags': tags,
                        'db_fragmentation_object_names': db_fragmentation_object_names,
                    }
                    metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load any custom metrics from conf.d/sqlserver.yaml
        for cfg in custom_metrics:
            sql_type = None
            base_name = None

            custom_tags = tags + cfg.get('tags', [])
            cfg['tags'] = custom_tags

            db_table = cfg.get('table', DEFAULT_PERFORMANCE_TABLE)
            if db_table not in VALID_TABLES:
                self.log.error('%s has an invalid table name: %s', cfg['name'], db_table)
                continue

            if cfg.get('database', None) and cfg.get('database') != self.instance.get('database'):
                self.log.debug(
                    'Skipping custom metric %s for database %s, check instance configured for database %s',
                    cfg['name'],
                    cfg.get('database'),
                    self.instance.get('database'),
                )
                continue

            if db_table == DEFAULT_PERFORMANCE_TABLE:
                user_type = cfg.get('type')
                if user_type is not None and user_type not in VALID_METRIC_TYPES:
                    self.log.error('%s has an invalid metric type: %s', cfg['name'], user_type)
                sql_type = None
                try:
                    if user_type is None:
                        sql_type, base_name = self.get_sql_type(cfg['counter_name'])
                except Exception:
                    self.log.warning("Can't load the metric %s, ignoring", cfg['name'], exc_info=True)
                    continue

                metrics_to_collect.append(
                    self.typed_metric(
                        cfg_inst=cfg, table=db_table, base_name=base_name, user_type=user_type, sql_type=sql_type
                    )
                )
            else:
                for column in cfg['columns']:
                    metrics_to_collect.append(
                        self.typed_metric(
                            cfg_inst=cfg, table=db_table, base_name=base_name, sql_type=sql_type, column=column
                        )
                    )

        self.instance_metrics = metrics_to_collect
        self.log.debug("metrics to collect %s", metrics_to_collect)

        # create an organized grouping of metric names to their metric classes
        for m in metrics_to_collect:
            cls = m.__class__.__name__
            name = m.sql_name or m.column
            self.log.debug("Adding metric class %s named %s", cls, name)

            self.instance_per_type_metrics[cls].append(name)
            if m.base_name:
                self.instance_per_type_metrics[cls].append(m.base_name)

    def _add_performance_counters(self, metrics, metrics_to_collect, tags, db=None):
        for name, counter_name, instance_name in metrics:
            try:
                sql_type, base_name = self.get_sql_type(counter_name)
                cfg = {
                    'name': name,
                    'counter_name': counter_name,
                    'instance_name': db or instance_name,
                    'tags': tags,
                }

                metrics_to_collect.append(
                    self.typed_metric(
                        cfg_inst=cfg, table=DEFAULT_PERFORMANCE_TABLE, base_name=base_name, sql_type=sql_type
                    )
                )
            except SQLConnectionError:
                raise
            except Exception:
                self.log.warning("Can't load the metric %s, ignoring", name, exc_info=True)
                continue

    def get_sql_type(self, counter_name):
        """
        Return the type of the performance counter so that we can report it to
        Datadog correctly
        If the sql_type is one that needs a base (PERF_RAW_LARGE_FRACTION and
        PERF_AVERAGE_BULK), the name of the base counter will also be returned
        """
        with self.connection.get_managed_cursor() as cursor:
            cursor.execute(COUNTER_TYPE_QUERY, (counter_name,))
            (sql_type,) = cursor.fetchone()
            if sql_type == PERF_LARGE_RAW_BASE:
                self.log.warning("Metric %s is of type Base and shouldn't be reported this way", counter_name)
            base_name = None
            if sql_type in [PERF_AVERAGE_BULK, PERF_RAW_LARGE_FRACTION]:
                # This is an ugly hack. For certain types of metric (PERF_RAW_LARGE_FRACTION
                # and PERF_AVERAGE_BULK), we need two metrics: the metrics specified and
                # a base metrics to get the ratio. There is no unique schema so we generate
                # the possible candidates and we look at which ones exist in the db.
                candidates = (
                    counter_name + " base",
                    counter_name.replace("(ms)", "base"),
                    counter_name.replace("Avg ", "") + " base",
                )
                try:
                    cursor.execute(BASE_NAME_QUERY, candidates)
                    base_name = cursor.fetchone().counter_name.strip()
                    self.log.debug("Got base metric: %s for metric: %s", base_name, counter_name)
                except Exception as e:
                    self.log.warning("Could not get counter_name of base for metric: %s", e)

        return sql_type, base_name

    def typed_metric(self, cfg_inst, table, base_name=None, user_type=None, sql_type=None, column=None):
        """
        Create the appropriate BaseSqlServerMetric object, each implementing its method to
        fetch the metrics properly.
        If a `type` was specified in the config, it is used to report the value
        directly fetched from SQLServer. Otherwise, it is decided based on the
        sql_type, according to microsoft's documentation.
        """
        if table == DEFAULT_PERFORMANCE_TABLE:
            metric_type_mapping = {
                PERF_COUNTER_BULK_COUNT: (self.rate, metrics.SqlSimpleMetric),
                PERF_COUNTER_LARGE_RAWCOUNT: (self.gauge, metrics.SqlSimpleMetric),
                PERF_LARGE_RAW_BASE: (self.gauge, metrics.SqlSimpleMetric),
                PERF_RAW_LARGE_FRACTION: (self.gauge, metrics.SqlFractionMetric),
                PERF_AVERAGE_BULK: (self.gauge, metrics.SqlIncrFractionMetric),
            }
            if user_type is not None:
                # user type overrides any other value
                metric_type = getattr(self, user_type)
                cls = metrics.SqlSimpleMetric
            else:
                metric_type, cls = metric_type_mapping[sql_type]
        else:
            # Lookup metrics classes by their associated table
            metric_type_str, cls = metrics.TABLE_MAPPING[table]
            metric_type = getattr(self, metric_type_str)

        return cls(cfg_inst, base_name, metric_type, column, self.log)

    def check(self, _):
        if self.do_check:
            if self.proc:
                self.do_stored_procedure_check()
            else:
                self.collect_metrics()
        else:
            self.log.debug("Skipping check")

    def collect_metrics(self):
        """Fetch the metrics from all of the associated database tables."""

        with self.connection.open_managed_default_connection():
            with self.connection.get_managed_cursor() as cursor:
                # initiate autodiscovery or if the server was down at check __init__ key could be missing.
                if self.autodiscover_databases(cursor) or not self.instance_metrics:
                    self._make_metric_list_to_collect(self.custom_metrics)

                instance_results = {}

                # Execute the `fetch_all` operations first to minimize the database calls
                for cls, metric_names in six.iteritems(self.instance_per_type_metrics):
                    if not metric_names:
                        instance_results[cls] = None, None
                    else:
                        try:
                            rows, cols = getattr(metrics, cls).fetch_all_values(cursor, metric_names, self.log)
                        except Exception as e:
                            self.log.error("Error running `fetch_all` for metrics %s - skipping. Error: %s", cls, e)
                            rows, cols = None, None

                        instance_results[cls] = rows, cols

                # Using the cached data, extract and report individual metrics
                for metric in self.instance_metrics:
                    if type(metric) is metrics.SqlIncrFractionMetric:
                        # special case, since it uses the same results as SqlFractionMetric
                        rows, cols = instance_results['SqlFractionMetric']
                        if rows is not None:
                            metric.fetch_metric(rows, cols)
                    else:
                        rows, cols = instance_results[metric.__class__.__name__]
                        if rows is not None:
                            metric.fetch_metric(rows, cols)

            # reuse connection for any custom queries
            self._query_manager.execute()

    def execute_query_raw(self, query):
        with self.connection.get_managed_cursor() as cursor:
            cursor.execute(query)
            return cursor.fetchall()

    def do_stored_procedure_check(self):
        """
        Fetch the metrics from the stored proc
        """

        proc = self.proc
        guardSql = self.instance.get('proc_only_if')
        custom_tags = self.instance.get("tags", [])

        if (guardSql and self.proc_check_guard(guardSql)) or not guardSql:
            self.connection.open_db_connections(self.connection.DEFAULT_DB_KEY)
            cursor = self.connection.get_cursor(self.connection.DEFAULT_DB_KEY)

            try:
                self.log.debug("Calling Stored Procedure : %s", proc)
                if self.connection.get_connector() == 'adodbapi':
                    cursor.callproc(proc)
                else:
                    # pyodbc does not support callproc; use execute instead.
                    # Reference: https://github.com/mkleehammer/pyodbc/wiki/Calling-Stored-Procedures
                    call_proc = '{{CALL {}}}'.format(proc)
                    cursor.execute(call_proc)

                rows = cursor.fetchall()
                self.log.debug("Row count (%s) : %s", proc, cursor.rowcount)

                for row in rows:
                    tags = [] if row.tags is None or row.tags == '' else row.tags.split(',')
                    tags.extend(custom_tags)

                    if row.type.lower() in self.proc_type_mapping:
                        self.proc_type_mapping[row.type](row.metric, row.value, tags, raw=True)
                    else:
                        self.log.warning(
                            '%s is not a recognised type from procedure %s, metric %s', row.type, proc, row.metric
                        )

            except Exception as e:
                self.log.warning("Could not call procedure %s: %s", proc, e)
                raise e

            self.connection.close_cursor(cursor)
            self.connection.close_db_connections(self.connection.DEFAULT_DB_KEY)
        else:
            self.log.info("Skipping call to %s due to only_if", proc)

    def proc_check_guard(self, sql):
        """
        check to see if the guard SQL returns a single column containing 0 or 1
        We return true if 1, else False
        """
        self.connection.open_db_connections(self.connection.PROC_GUARD_DB_KEY)
        cursor = self.connection.get_cursor(self.connection.PROC_GUARD_DB_KEY)

        should_run = False
        try:
            cursor.execute(sql, ())
            result = cursor.fetchone()
            should_run = result[0] == 1
        except Exception as e:
            self.log.error("Failed to run proc_only_if sql %s : %s", sql, e)

        self.connection.close_cursor(cursor)
        self.connection.close_db_connections(self.connection.PROC_GUARD_DB_KEY)
        return should_run
class Oracle(AgentCheck):
    __NAMESPACE__ = 'oracle'

    ORACLE_DRIVER_CLASS = "oracle.jdbc.OracleDriver"
    JDBC_CONNECT_STRING = "jdbc:oracle:thin:@//{}/{}"
    CX_CONNECT_STRING = "{}/{}@//{}/{}"

    SERVICE_CHECK_NAME = 'can_connect'

    def __init__(self, name, init_config, instances):
        super(Oracle, self).__init__(name, init_config, instances)
        (
            self._server,
            self._user,
            self._password,
            self._service,
            self._jdbc_driver,
            self._tags,
            only_custom_queries,
        ) = self._get_config(self.instance)

        self.check_initializations.append(self.validate_config)

        self._connection = None

        manager_queries = []
        if not only_custom_queries:
            manager_queries.extend([queries.ProcessMetrics, queries.SystemMetrics, queries.TableSpaceMetrics])

        self._fix_custom_queries()

        self._query_manager = QueryManager(
            self,
            self.execute_query_raw,
            queries=manager_queries,
            tags=self._tags,
        )
        self.check_initializations.append(self._query_manager.compile_queries)

    def _fix_custom_queries(self):
        """
        For backward compatibility reasons, if a custom query specifies a `metric_prefix`,
        change the submission name to contain it.
        """
        custom_queries = self.instance.get('custom_queries', [])
        global_custom_queries = self.init_config.get('global_custom_queries', [])
        for query in itertools.chain(custom_queries, global_custom_queries):
            prefix = query.get('metric_prefix')
            if prefix and prefix != self.__NAMESPACE__:
                if prefix.startswith(self.__NAMESPACE__ + '.'):
                    prefix = prefix[len(self.__NAMESPACE__) + 1:]

                for column in query.get('columns', []):
                    if column.get('type') != 'tag':
                        column['name'] = '{}.{}'.format(prefix, column['name'])

    def validate_config(self):
        if not self._server or not self._user:
            raise ConfigurationError("Oracle host and user are needed")

    def execute_query_raw(self, query):
        with closing(self._connection.cursor()) as cursor:
            cursor.execute(query)
            # JDBC doesn't support iter protocol
            return cursor.fetchall()

    def check(self, _):
        self.create_connection()
        with closing(self._connection):
            self._query_manager.execute()

        self._connection = None

    def _get_config(self, instance):
        server = instance.get('server')
        user = instance.get('user')
        password = instance.get('password')
        service = instance.get('service_name')
        jdbc_driver = instance.get('jdbc_driver_path')
        tags = instance.get('tags') or []
        only_custom_queries = instance.get('only_custom_queries', False)
        return server, user, password, service, jdbc_driver, tags, only_custom_queries

    def create_connection(self):
        service_check_tags = ['server:%s' % self._server]
        service_check_tags.extend(self._tags)

        try:
            # Check if the instantclient is available
            cx_Oracle.clientversion()
        except cx_Oracle.DatabaseError as e:
            # Fallback to JDBC
            use_oracle_client = False
            self.log.debug('Oracle instant client unavailable, falling back to JDBC: %s', e)
            connect_string = self.JDBC_CONNECT_STRING.format(self._server, self._service)
        else:
            use_oracle_client = True
            self.log.debug('Running cx_Oracle version %s', cx_Oracle.version)
            connect_string = self.CX_CONNECT_STRING.format(self._user, self._password, self._server, self._service)

        try:
            if use_oracle_client:
                connection = cx_Oracle.connect(connect_string)
            elif JDBC_IMPORT_ERROR:
                self.log.error(
                    "Oracle client is unavailable and the integration is unable to import JDBC libraries. You may not "
                    "have the Microsoft Visual C++ Runtime 2015 installed on your system. Please double check your "
                    "installation and refer to the Datadog documentation for more information."
                )
                raise JDBC_IMPORT_ERROR
            else:
                try:
                    if jpype.isJVMStarted() and not jpype.isThreadAttachedToJVM():
                        jpype.attachThreadToJVM()
                        jpype.java.lang.Thread.currentThread().setContextClassLoader(
                            jpype.java.lang.ClassLoader.getSystemClassLoader()
                        )
                    connection = jdb.connect(
                        self.ORACLE_DRIVER_CLASS, connect_string, [self._user, self._password], self._jdbc_driver
                    )
                except Exception as e:
                    if "Class {} not found".format(self.ORACLE_DRIVER_CLASS) in str(e):
                        msg = """Cannot run the Oracle check until either the Oracle instant client or the JDBC Driver
                        is available.
                        For the Oracle instant client, see:
                        http://www.oracle.com/technetwork/database/features/instant-client/index.html
                        You will also need to ensure the `LD_LIBRARY_PATH` is also updated so the libs are reachable.

                        For the JDBC Driver, see:
                        http://www.oracle.com/technetwork/database/application-development/jdbc/downloads/index.html
                        You will also need to ensure the jar is either listed in your $CLASSPATH or in the yaml
                        configuration file of the check.
                        """
                        self.log.error(msg)
                    raise

            self.log.debug("Connected to Oracle DB")
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags)
        except Exception as e:
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=service_check_tags)
            self.log.error(e)
            raise
        self._connection = connection
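# Illustrative sketch, not part of the Oracle check above: what `_fix_custom_queries` does to a
# hypothetical custom query. A `metric_prefix` of "oracle.test1" is reduced to "test1" (the
# "oracle" namespace is re-added at submission time) and folded into every non-tag column name.
# All names and values below are made up for demonstration only.
example_query = {
    'metric_prefix': 'oracle.test1',
    'query': 'SELECT ...',  # placeholder SQL
    'columns': [{'name': 'tag_name', 'type': 'tag'}, {'name': 'metric', 'type': 'gauge'}],
}

namespace = 'oracle'
prefix = example_query['metric_prefix']
if prefix.startswith(namespace + '.'):
    prefix = prefix[len(namespace) + 1:]
for column in example_query['columns']:
    if column.get('type') != 'tag':
        column['name'] = '{}.{}'.format(prefix, column['name'])

# The gauge column is now named 'test1.metric', so the check submits `oracle.test1.metric`;
# the tag column is left untouched.
print(example_query['columns'])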
class ClickhouseCheck(AgentCheck):
    __NAMESPACE__ = 'clickhouse'
    SERVICE_CHECK_CONNECT = 'can_connect'

    def __init__(self, name, init_config, instances):
        super(ClickhouseCheck, self).__init__(name, init_config, instances)

        self._server = self.instance.get('server', '')
        self._port = self.instance.get('port')
        self._db = self.instance.get('db', 'default')
        self._user = self.instance.get('user', 'default')
        self._password = self.instance.get('password', '')
        self._connect_timeout = float(self.instance.get('connect_timeout', 10))
        self._read_timeout = float(self.instance.get('read_timeout', 10))
        self._compression = self.instance.get('compression', False)
        self._tls_verify = is_affirmative(self.instance.get('tls_verify', False))
        self._tags = self.instance.get('tags', [])

        # Add global tags
        self._tags.append('server:{}'.format(self._server))
        self._tags.append('port:{}'.format(self._port))
        self._tags.append('db:{}'.format(self._db))

        self._error_sanitizer = ErrorSanitizer(self._password)
        self.check_initializations.append(self.validate_config)

        # We'll connect on the first check run
        self._client = None
        self.check_initializations.append(self.create_connection)

        self._query_manager = QueryManager(
            self,
            self.execute_query_raw,
            queries=[
                queries.SystemMetrics,
                queries.SystemEvents,
                queries.SystemAsynchronousMetrics,
                queries.SystemParts,
                queries.SystemReplicas,
                queries.SystemDictionaries,
            ],
            tags=self._tags,
            error_handler=self._error_sanitizer.clean,
        )
        self.check_initializations.append(self._query_manager.compile_queries)

    def check(self, _):
        self._query_manager.execute()
        self.collect_version()

    def collect_version(self):
        version = list(self.execute_query_raw('SELECT version()'))[0][0]

        # The version comes in like `19.15.2.2` though sometimes there is no patch part
        version_parts = {name: part for name, part in zip(('year', 'major', 'minor', 'patch'), version.split('.'))}

        self.set_metadata('version', version, scheme='parts', final_scheme='calver', part_map=version_parts)

    def execute_query_raw(self, query):
        return self._client.execute_iter(query)

    def validate_config(self):
        if not self._server:
            raise ConfigurationError('the `server` setting is required')

    def create_connection(self):
        try:
            client = clickhouse_driver.Client(
                host=self._server,
                port=self._port,
                user=self._user,
                password=self._password,
                database=self._db,
                connect_timeout=self._connect_timeout,
                send_receive_timeout=self._read_timeout,
                sync_request_timeout=self._connect_timeout,
                compression=self._compression,
                secure=self._tls_verify,
                # Don't pollute the Agent logs
                settings={'calculate_text_stack_trace': False},
                # Make every client unique for server logs
                client_name='datadog-{}'.format(self.check_id),
            )
            client.connection.connect()
        except Exception as e:
            error = 'Unable to connect to ClickHouse: {}'.format(
                self._error_sanitizer.clean(self._error_sanitizer.scrub(str(e)))
            )
            self.service_check(self.SERVICE_CHECK_CONNECT, self.CRITICAL, message=error, tags=self._tags)

            # When an exception is raised in the context of another one, both will be printed. To avoid
            # this we set the context to None. https://www.python.org/dev/peps/pep-0409/
            raise_from(type(e)(error), None)
        else:
            self.service_check(self.SERVICE_CHECK_CONNECT, self.OK, tags=self._tags)
            self._client = client
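# Illustrative sketch, not part of the ClickHouse check above: how `collect_version` splits a raw
# version string into the calver parts handed to `set_metadata`. The sample value is made up; a
# real server may omit the trailing patch component, in which case 'patch' is simply absent.
version = '19.15.2.2'
version_parts = {name: part for name, part in zip(('year', 'major', 'minor', 'patch'), version.split('.'))}
assert version_parts == {'year': '19', 'major': '15', 'minor': '2', 'patch': '2'}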
class Oracle(AgentCheck):
    __NAMESPACE__ = 'oracle'

    ORACLE_DRIVER_CLASS = "oracle.jdbc.OracleDriver"
    JDBC_CONNECTION_STRING = "jdbc:oracle:thin:@//{}/{}"
    JDBC_CONNECTION_STRING_TCPS = "jdbc:oracle:thin:@{}"

    SERVICE_CHECK_NAME = 'can_connect'
    SERVICE_CHECK_CAN_QUERY = "can_query"

    def __init__(self, name, init_config, instances):
        super(Oracle, self).__init__(name, init_config, instances)
        self._server = self.instance.get('server')
        self._user = self.instance.get('username') or self.instance.get('user')
        self._password = self.instance.get('password')
        self._service = self.instance.get('service_name')
        self._protocol = self.instance.get("protocol", PROTOCOL_TCP)
        self._jdbc_driver = self.instance.get('jdbc_driver_path')
        self._jdbc_truststore_path = self.instance.get('jdbc_truststore_path')
        self._jdbc_truststore_type = self.instance.get('jdbc_truststore_type')
        self._jdbc_truststore_password = self.instance.get('jdbc_truststore_password', '')
        self._tags = self.instance.get('tags') or []
        self._service_check_tags = ['server:{}'.format(self._server)]
        self._service_check_tags.extend(self._tags)

        self._cached_connection = None

        manager_queries = []
        if not self.instance.get('only_custom_queries', False):
            manager_queries.extend([queries.ProcessMetrics, queries.SystemMetrics, queries.TableSpaceMetrics])

        self._fix_custom_queries()

        self._query_manager = QueryManager(
            self,
            self.execute_query_raw,
            queries=manager_queries,
            error_handler=self.handle_query_error,
            tags=self._tags,
        )

        # Runtime validations are only py3, so this is for manually validating config on py2
        if PY2:
            self.check_initializations.append(self.validate_config)

        self.check_initializations.append(self._query_manager.compile_queries)

        self._query_errors = 0
        self._connection_errors = 0

    def _fix_custom_queries(self):
        """
        For backward compatibility reasons, if a custom query specifies a `metric_prefix`,
        change the submission name to contain it.
        """
        custom_queries = self.instance.get('custom_queries', [])
        global_custom_queries = self.init_config.get('global_custom_queries', [])
        for query in itertools.chain(custom_queries, global_custom_queries):
            prefix = query.get('metric_prefix')
            if prefix and prefix != self.__NAMESPACE__:
                if prefix.startswith(self.__NAMESPACE__ + '.'):
                    prefix = prefix[len(self.__NAMESPACE__) + 1:]

                for column in query.get('columns', []):
                    if column.get('type') != 'tag':
                        column['name'] = '{}.{}'.format(prefix, column['name'])

    def validate_config(self):
        if not self._server or not self._user:
            raise ConfigurationError("Oracle host and user are needed")

        if not self._protocol or self._protocol.upper() not in VALID_PROTOCOLS:
            raise ConfigurationError("Protocol %s is not valid, must either be TCP or TCPS" % self._protocol)

        if self._jdbc_driver and self._protocol.upper() == PROTOCOL_TCPS:
            if not (self._jdbc_truststore_type and self._jdbc_truststore_path):
                raise ConfigurationError(
                    "TCPS connections to Oracle via JDBC requires both `jdbc_truststore_type` and "
                    "`jdbc_truststore_path` configuration options "
                )
            if self._jdbc_truststore_type and self._jdbc_truststore_type.upper() not in VALID_TRUSTSTORE_TYPES:
                raise ConfigurationError(
                    "Truststore type %s is not valid, must be one of %s"
                    % (self._jdbc_truststore_type, VALID_TRUSTSTORE_TYPES)
                )

    def execute_query_raw(self, query):
        with closing(self._connection.cursor()) as cursor:
            cursor.execute(query)
            # JDBC doesn't support iter protocol
            return cursor.fetchall()

    def handle_query_error(self, error):
        self._query_errors += 1
        if self._cached_connection is None:
            self.log.debug("Couldn't close the connection after a query failure because there was no connection")
            return error

        try:
            self._cached_connection.close()
        except Exception as e:
            self.log.warning("Couldn't close the connection after a query failure: %s", str(e))
        self._cached_connection = None

        return error

    def check(self, _):
        if self.instance.get('user'):
            self._log_deprecation('_config_renamed', 'user', 'username')

        self._query_errors = 0
        self._connection_errors = 0

        self._query_manager.execute()

        if self._query_errors:
            self.service_check(self.SERVICE_CHECK_CAN_QUERY, self.CRITICAL, tags=self._service_check_tags)
        else:
            self.service_check(self.SERVICE_CHECK_CAN_QUERY, self.OK, tags=self._service_check_tags)

        if self._connection_errors:
            self.service_check(self.SERVICE_CHECK_NAME, self.CRITICAL, tags=self._service_check_tags)
        else:
            self.service_check(self.SERVICE_CHECK_NAME, self.OK, tags=self._service_check_tags)

    @property
    def _connection(self):
        """Creates a connection or raises an exception"""
        if self._cached_connection is None:
            if self.can_use_oracle_client():
                self._cached_connection = self._oracle_client_connect()
            elif JDBC_IMPORT_ERROR:
                self._connection_errors += 1
                self.log.error(
                    "Oracle client is unavailable and the integration is unable to import JDBC libraries. You may not "
                    "have the Microsoft Visual C++ Runtime 2015 installed on your system. Please double check your "
                    "installation and refer to the Datadog documentation for more information."
                )
                raise JDBC_IMPORT_ERROR
            else:
                self._cached_connection = self._jdbc_connect()
        return self._cached_connection

    def can_use_oracle_client(self):
        try:
            # Check if the instantclient is available
            cx_Oracle.clientversion()
        except cx_Oracle.DatabaseError as e:
            # Fallback to JDBC
            self.log.debug('Oracle instant client unavailable, falling back to JDBC: %s', e)
            return False
        else:
            self.log.debug('Running cx_Oracle version %s', cx_Oracle.version)
            return True

    def _oracle_client_connect(self):
        dsn = self._get_dsn()
        self.log.debug("Connecting via Oracle Instant Client with DSN: %s", dsn)
        try:
            connection = cx_Oracle.connect(user=self._user, password=self._password, dsn=dsn)
            self.log.debug("Connected to Oracle DB using Oracle Instant Client")
            return connection
        except cx_Oracle.DatabaseError as e:
            self._connection_errors += 1
            self.log.error("Failed to connect to Oracle DB using Oracle Instant Client, error: %s", str(e))
            raise

    def _get_dsn(self):
        host = self._server
        port = 1521
        try:
            if ':' in self._server:
                host, port = self._server.split(':')
                port = int(port)
        except Exception:
            self._connection_errors += 1
            raise ConfigurationError('server needs to be in the <HOST>:<PORT> format, "%s"" provided' % self._server)

        if self._protocol == PROTOCOL_TCPS:
            dsn = '(DESCRIPTION=(ADDRESS=(PROTOCOL={})(HOST={})(PORT={}))(CONNECT_DATA=(SERVICE_NAME={})))'.format(
                self._protocol, host, port, self._service
            )
            return dsn
        else:
            return cx_Oracle.makedsn(host, port, service_name=self._service)

    def _jdbc_connect(self):
        jdbc_connect_properties = {'user': self._user, 'password': self._password}
        if self._protocol == PROTOCOL_TCPS:
            connect_string = self.JDBC_CONNECTION_STRING_TCPS.format(self._get_dsn())
            jdbc_connect_properties['javax.net.ssl.trustStoreType'] = self._jdbc_truststore_type
            jdbc_connect_properties['javax.net.ssl.trustStorePassword'] = self._jdbc_truststore_password
            jdbc_connect_properties['javax.net.ssl.trustStore'] = self._jdbc_truststore_path
        else:
            connect_string = self.JDBC_CONNECTION_STRING.format(self._server, self._service)

        self.log.debug("Connecting via JDBC with connection string: %s", connect_string)
        try:
            with jdbc_lock:
                if jpype.isJVMStarted() and not jpype.isThreadAttachedToJVM():
                    jpype.attachThreadToJVM()
                    jpype.java.lang.Thread.currentThread().setContextClassLoader(
                        jpype.java.lang.ClassLoader.getSystemClassLoader()
                    )
                connection = jdb.connect(
                    self.ORACLE_DRIVER_CLASS, connect_string, jdbc_connect_properties, self._jdbc_driver
                )
            self.log.debug("Connected to Oracle DB using JDBC connector")
            return connection
        except Exception as e:
            self._connection_errors += 1
            if "Class {} not found".format(self.ORACLE_DRIVER_CLASS) in str(e):
                msg = """Cannot run the Oracle check until either the Oracle instant client or the JDBC Driver
                is available.
                For the Oracle instant client, see:
                http://www.oracle.com/technetwork/database/features/instant-client/index.html
                You will also need to ensure the `LD_LIBRARY_PATH` is also updated so the libs are reachable.

                For the JDBC Driver, see:
                http://www.oracle.com/technetwork/database/application-development/jdbc/downloads/index.html
                You will also need to ensure the jar is either listed in your $CLASSPATH or in the yaml configuration
                file of the check.
                """
                self.log.error(msg)
            raise
class SnowflakeCheck(AgentCheck):
    """
    Collect Snowflake account usage metrics
    """

    __NAMESPACE__ = 'snowflake'

    SERVICE_CHECK_CONNECT = 'can_connect'

    def __init__(self, *args, **kwargs):
        super(SnowflakeCheck, self).__init__(*args, **kwargs)
        self._config = Config(self.instance)
        self._conn = None

        self.proxy_host = self.init_config.get('proxy_host', None)
        self.proxy_port = self.init_config.get('proxy_port', None)
        self.proxy_user = self.init_config.get('proxy_user', None)
        self.proxy_password = self.init_config.get('proxy_password', None)

        # Add default tags like account to all metrics
        self._tags = self._config.tags + ['account:{}'.format(self._config.account)]

        if self._config.password:
            self.register_secret(self._config.password)

        if self._config.private_key_password:
            self.register_secret(self._config.private_key_password)

        if self._config.role == 'ACCOUNTADMIN':
            self.log.info(
                'Snowflake `role` is set as `ACCOUNTADMIN` which should be used cautiously, '
                'refer to docs about custom roles.'
            )

        self.metric_queries = []
        self.errors = []
        for mgroup in self._config.metric_groups:
            try:
                if not self._config.aggregate_last_24_hours:
                    for query in range(len(METRIC_GROUPS[mgroup])):
                        METRIC_GROUPS[mgroup][query]['query'] = METRIC_GROUPS[mgroup][query]['query'].replace(
                            'DATEADD(hour, -24, current_timestamp())', 'date_trunc(day, current_date)'
                        )
                self.metric_queries.extend(METRIC_GROUPS[mgroup])
            except KeyError:
                self.errors.append(mgroup)

        if self.errors:
            self.log.warning('Invalid metric_groups found in snowflake conf.yaml: %s', (', '.join(self.errors)))
        if not self.metric_queries and not self._config.custom_queries_defined:
            raise ConfigurationError('No valid metric_groups or custom query configured, please list at least one.')

        self._query_manager = QueryManager(self, self.execute_query_raw, queries=self.metric_queries, tags=self._tags)
        self.check_initializations.append(self._query_manager.compile_queries)

    def read_token(self):
        if self._config.token_path:
            self.log.debug("Renewing Snowflake client token")
            with open(self._config.token_path, 'r', encoding="UTF-8") as f:
                self._config.token = f.read()

        return self._config.token

    def read_key(self):
        if self._config.private_key_path:
            self.log.debug("Reading Snowflake client key for key pair authentication")
            # https://docs.snowflake.com/en/user-guide/python-connector-example.html#using-key-pair-authentication-key-pair-rotation
            with open(self._config.private_key_path, "rb") as key:
                p_key = serialization.load_pem_private_key(
                    key.read(), password=ensure_bytes(self._config.private_key_password), backend=default_backend()
                )

            pkb = p_key.private_bytes(
                encoding=serialization.Encoding.DER,
                format=serialization.PrivateFormat.PKCS8,
                encryption_algorithm=serialization.NoEncryption(),
            )

            return pkb

        return None

    def check(self, _):
        if self.instance.get('user'):
            self._log_deprecation('_config_renamed', 'user', 'username')

        self.connect()

        if self._conn is not None:
            # Execute queries
            self._query_manager.execute()

            self._collect_version()

            self.log.debug("Closing connection to Snowflake...")
            self._conn.close()

    def execute_query_raw(self, query):
        """
        Executes query with timestamp from parts if comparing start_time field.
        """
        with closing(self._conn.cursor()) as cursor:
            cursor.execute(query)

            if cursor.rowcount is None or cursor.rowcount < 1:
                self.log.debug("Failed to fetch records from query: `%s`", query)
                return []
            return cursor.fetchall()

    def connect(self):
        self.log.debug(
            "Establishing a new connection to Snowflake: account=%s, user=%s, database=%s, schema=%s, warehouse=%s, "
            "role=%s, timeout=%s, authenticator=%s, ocsp_response_cache_filename=%s, proxy_host=%s, proxy_port=%s",
            self._config.account,
            self._config.user,
            self._config.database,
            self._config.schema,
            self._config.warehouse,
            self._config.role,
            self._config.login_timeout,
            self._config.authenticator,
            self._config.ocsp_response_cache_filename,
            self.proxy_host,
            self.proxy_port,
        )

        try:
            conn = sf.connect(
                user=self._config.user,
                password=self._config.password,
                account=self._config.account,
                database=self._config.database,
                schema=self._config.schema,
                warehouse=self._config.warehouse,
                role=self._config.role,
                passcode_in_password=self._config.passcode_in_password,
                passcode=self._config.passcode,
                client_prefetch_threads=self._config.client_prefetch_threads,
                login_timeout=self._config.login_timeout,
                ocsp_response_cache_filename=self._config.ocsp_response_cache_filename,
                authenticator=self._config.authenticator,
                token=self.read_token(),
                private_key=self.read_key(),
                client_session_keep_alive=self._config.client_keep_alive,
                proxy_host=self.proxy_host,
                proxy_port=self.proxy_port,
                proxy_user=self.proxy_user,
                proxy_password=self.proxy_password,
            )
        except Exception as e:
            msg = "Unable to connect to Snowflake: {}".format(e)
            self.service_check(self.SERVICE_CHECK_CONNECT, self.CRITICAL, message=msg, tags=self._tags)
            self.warning(msg)
        else:
            self.service_check(self.SERVICE_CHECK_CONNECT, self.OK, tags=self._tags)
            self._conn = conn

    @AgentCheck.metadata_entrypoint
    def _collect_version(self):
        try:
            raw_version = self.execute_query_raw("select current_version();")
            version = raw_version[0][0]
        except Exception as e:
            self.log.error("Error collecting version for Snowflake: %s", e)
        else:
            if version:
                self.set_metadata('version', version)

    # override
    def _normalize_tags_type(self, tags, device_name=None, metric_name=None):
        if self.disable_generic_tags:
            return super(SnowflakeCheck, self)._normalize_tags_type(tags, device_name, metric_name)

        # If disable_generic_tags is not enabled, for each generic tag we emit both the generic and the non generic
        # version to ease transition.
        normalized_tags = []
        for tag in tags:
            if tag is not None:
                try:
                    tag = to_native_string(tag)
                except UnicodeError:
                    self.log.warning('Encoding error with tag `%s` for metric `%s`, ignoring tag', tag, metric_name)
                    continue
                normalized_tags.extend(list({tag, self.degeneralise_tag(tag)}))
        return normalized_tags
class VoltDBCheck(AgentCheck):
    __NAMESPACE__ = 'voltdb'

    def __init__(self, name, init_config, instances):
        # type: (str, dict, list) -> None
        super(VoltDBCheck, self).__init__(name, init_config, instances)
        self._config = Config(cast(Instance, self.instance), debug=self.log.debug)

        self.register_secret(self._config.password)

        self._client = Client(
            url=self._config.url,
            http_get=self.http.get,
            username=self._config.username,
            password=self._config.password,
            password_hashed=self._config.password_hashed,
        )

        manager_queries = [
            queries.CPUMetrics,
            queries.MemoryMetrics,
            queries.SnapshotStatusMetrics,
            queries.CommandLogMetrics,
            queries.ProcedureMetrics,
            queries.LatencyMetrics,
            queries.GCMetrics,
            queries.IOStatsMetrics,
            queries.TableMetrics,
            queries.IndexMetrics,
        ]

        if BASE_PARSED_VERSION < pkg_resources.parse_version('15.0.0'):
            # On Agent < 7.24.0 we must pass `Query` objects instead of dicts.
            manager_queries = [Query(query) for query in manager_queries]  # type: ignore

        self._query_manager = QueryManager(
            self,
            self._execute_query_raw,
            queries=manager_queries,
            tags=self._config.tags,
        )
        self.check_initializations.append(self._query_manager.compile_queries)

    def _raise_for_status_with_details(self, response):
        # type: (requests.Response) -> None
        try:
            response.raise_for_status()
        except Exception as exc:
            message = 'Error response from VoltDB: {}'.format(exc)
            try:
                # Try including detailed error message from response.
                details = response.json()['statusstring']
            except Exception:
                pass
            else:
                message += ' (details: {})'.format(details)
            raise_from(Exception(message), exc)

    def _fetch_version(self):
        # type: () -> Optional[str]
        # See: https://docs.voltdb.com/UsingVoltDB/sysprocsysteminfo.php#sysprocsysinforetvalovervw
        response = self._client.request('@SystemInformation', parameters=['OVERVIEW'])
        self._raise_for_status_with_details(response)

        data = response.json()
        rows = data['results'][0]['data']  # type: List[tuple]

        # NOTE: there will be one VERSION row per server in the cluster.
        # Arbitrarily use the first one we see.
        for _, column, value in rows:
            if column == 'VERSION':
                return self._transform_version(value)

        self.log.debug('VERSION column not found: %s', [column for _, column, _ in rows])
        return None

    def _transform_version(self, raw):
        # type: (str) -> Optional[str]
        # VoltDB does not include .0 patch numbers (eg 10.0, not 10.0.0).
        # Need to ensure they're present so the version is always in 3 parts: major.minor.patch.
        try:
            major, rest = raw.split('.', 1)
        except ValueError:
            self.log.debug('Malformed version string: %s', raw)
            return None
        minor, found, patch = rest.partition('.')
        if not found:
            patch = '0'

        return '{}.{}.{}'.format(major, minor, patch)

    @AgentCheck.metadata_entrypoint
    def _submit_version(self, version):
        # type: (str) -> None
        self.set_metadata('version', version)

    def _check_can_connect_and_submit_version(self):
        # type: () -> None
        host, port = self._config.netloc
        tags = ['host:{}'.format(host), 'port:{}'.format(port)] + self._config.tags

        try:
            version = self._fetch_version()
        except Exception as exc:
            message = 'Unable to connect to VoltDB: {}'.format(exc)
            self.service_check('can_connect', self.CRITICAL, message=message, tags=tags)
            raise

        self.service_check('can_connect', self.OK, tags=tags)

        if version is not None:
            self._submit_version(version)

    def _execute_query_raw(self, query):
        # type: (str) -> List[tuple]
        # Ad-hoc format, close to the HTTP API format.
        # Eg 'A:[B, C]' -> '?Procedure=A&Parameters=[B, C]'
        procedure, _, parameters = query.partition(":")
        response = self._client.request(procedure, parameters=parameters)
        self._raise_for_status_with_details(response)

        data = response.json()
        return data['results'][0]['data']

    def check(self, _):
        # type: (Any) -> None
        self._check_can_connect_and_submit_version()
        self._query_manager.execute()
class ProxysqlCheck(AgentCheck):
    SERVICE_CHECK_NAME = "can_connect"
    __NAMESPACE__ = "proxysql"

    def __init__(self, name, init_config, instances):
        super(ProxysqlCheck, self).__init__(name, init_config, instances)
        self.host = self.instance.get("host", "")
        self.port = int(self.instance.get("port", 0))
        self.user = self.instance.get("username", "")
        self.password = str(self.instance.get("password", ""))

        if not all((self.host, self.port, self.user, self.password)):
            raise ConfigurationError("ProxySQL host, port, username and password are needed")

        self.tls_verify = self.instance.get("tls_verify", False)
        self.validate_hostname = self.instance.get("validate_hostname", True)
        self.tls_ca_cert = self.instance.get("tls_ca_cert")
        self.connect_timeout = self.instance.get("connect_timeout", 10)
        self.read_timeout = self.instance.get("read_timeout")

        self.tags = self.instance.get("tags", [])
        self.tags.append("proxysql_server:{}".format(self.host))
        self.tags.append("proxysql_port:{}".format(self.port))

        manager_queries = [STATS_MYSQL_GLOBAL]
        if self.is_metadata_collection_enabled():
            # Add the query to collect the ProxySQL version
            manager_queries.append(VERSION_METADATA)

        additional_metrics = self.instance.get("additional_metrics", [])
        for additional_group in additional_metrics:
            if additional_group not in ADDITIONAL_METRICS_MAPPING:
                raise ConfigurationError(
                    "There is no additional metric group called '{}' for the ProxySQL integration, it should be one "
                    "of ({})".format(additional_group, ", ".join(ADDITIONAL_METRICS_MAPPING))
                )
            manager_queries.append(ADDITIONAL_METRICS_MAPPING[additional_group])

        self._connection = None
        self._query_manager = QueryManager(self, self.execute_query_raw, queries=manager_queries, tags=self.tags)
        self.check_initializations.append(self._query_manager.compile_queries)

    def check(self, _):
        with self.connect() as conn:
            self._connection = conn
            self._query_manager.execute()

    def execute_query_raw(self, query):
        with closing(self._connection.cursor()) as cursor:
            cursor.execute(query)
            if cursor.rowcount < 1:
                self.log.warning("Failed to fetch records from query: `%s`.", query)
                return []
            return cursor.fetchall()

    @contextmanager
    def connect(self):
        if self.tls_verify:
            # If ca_cert is None, will load the default certificates
            ssl_context = make_secure_ssl_client_context(
                ca_cert=self.tls_ca_cert, check_hostname=self.validate_hostname
            )
        else:
            ssl_context = make_insecure_ssl_client_context()

        db = None
        try:
            db = pymysql.connect(
                host=self.host,
                user=self.user,
                port=self.port,
                passwd=self.password,
                connect_timeout=self.connect_timeout,
                read_timeout=self.read_timeout,
                ssl=ssl_context,
            )
            self.log.debug("Connected to ProxySQL")
            yield db
        except Exception:
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=self.tags)
            self.log.exception("Can't connect to ProxySQL")
            raise
        else:
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=self.tags)
        finally:
            if db:
                db.close()
class SnowflakeCheck(AgentCheck): """ Collect Snowflake account usage metrics """ __NAMESPACE__ = 'snowflake' SERVICE_CHECK_CONNECT = 'snowflake.can_connect' MONKEY_PATCH_LOCK = threading.Lock() def __init__(self, *args, **kwargs): super(SnowflakeCheck, self).__init__(*args, **kwargs) self.config = Config(self.instance) self._conn = None # Add default tags like account to all metrics self._tags = self.config.tags + [ 'account:{}'.format(self.config.account) ] if self.config.password: self.register_secret(self.config.password) if self.config.role == 'ACCOUNTADMIN': self.log.info( 'Snowflake `role` is set as `ACCOUNTADMIN` which should be used cautiously, ' 'refer to docs about custom roles.') self.metric_queries = [] self.errors = [] for mgroup in self.config.metric_groups: try: self.metric_queries.extend(METRIC_GROUPS[mgroup]) except KeyError: self.errors.append(mgroup) if self.errors: self.log.warning( 'Invalid metric_groups found in snowflake conf.yaml: %s', (', '.join(self.errors))) if not self.metric_queries: raise ConfigurationError( 'No valid metric_groups configured, please list at least one.') self._proxies = self.http.options['proxies'] # SKIP_HTTP_VALIDATION self._query_manager = QueryManager(self, self.execute_query_raw, queries=self.metric_queries, tags=self._tags) self.check_initializations.append(self._query_manager.compile_queries) def check(self, _): self.connect() if self._conn is not None: # Execute queries self._query_manager.execute() self._collect_version() self.log.debug("Closing connection to Snowflake...") self._conn.close() def execute_query_raw(self, query): """ Executes query with timestamp from parts if comparing start_time field. """ with closing(self._conn.cursor()) as cursor: cursor.execute(query) if cursor.rowcount is None or cursor.rowcount < 1: self.log.debug("Failed to fetch records from query: `%s`", query) return [] return cursor.fetchall() def connect(self): self.log.debug( "Establishing a new connection to Snowflake: account=%s, user=%s, database=%s, schema=%s, warehouse=%s, " "role=%s, login_timeout=%s, authenticator=%s, ocsp_response_cache_filename=%s", self.config.account, self.config.user, self.config.database, self.config.schema, self.config.warehouse, self.config.role, self.config.login_timeout, self.config.authenticator, self.config.ocsp_response_cache_filename, ) try: with self.MONKEY_PATCH_LOCK: # Monkey patch proxies to request_exec SnowflakeRestful._request_exec = self._make_snowflake_request_func( self._proxies, SnowflakeRestful._request_exec) conn = sf.connect( user=self.config.user, password=self.config.password, account=self.config.account, database=self.config.database, schema=self.config.schema, warehouse=self.config.warehouse, role=self.config.role, passcode_in_password=self.config.passcode_in_password, passcode=self.config.passcode, client_prefetch_threads=self.config. client_prefetch_threads, login_timeout=self.config.login_timeout, ocsp_response_cache_filename=self.config. 
ocsp_response_cache_filename, authenticator=self.config.authenticator, token=self.config.token, client_session_keep_alive=self.config.client_keep_alive, ) except Exception as e: msg = "Unable to connect to Snowflake: {}".format(e) self.service_check(self.SERVICE_CHECK_CONNECT, self.CRITICAL, message=msg, tags=self._tags) self.warning(msg) else: self.service_check(self.SERVICE_CHECK_CONNECT, self.OK, tags=self._tags) self._conn = conn def _make_snowflake_request_func(self, proxies, method): """ This is a workaround to include proxy config in the Snowflake connection. The current Snowflake logic applies global proxy configs via env vars. TODO: Remove when https://github.com/snowflakedb/snowflake-connector-python/pull/352 gets merged """ def _request_exec(*args, **kwargs): session = kwargs.get('session') or args[1] session.proxies = proxies try: return method(*args, **kwargs) except Exception as e: msg = "Encountered error while attempting to connect to Snowflake " if proxies: self.log.error("%s via proxy settings: %s", msg, str(e)) else: self.log.error("%s: %s", msg, str(e)) return return _request_exec @AgentCheck.metadata_entrypoint def _collect_version(self): try: raw_version = self.execute_query_raw("select current_version();") version = raw_version[0][0] except Exception as e: self.log.error("Error collecting version for Snowflake: %s", e) else: if version: self.set_metadata('version', version)
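# Standalone sketch of the proxy-injection wrapper built by
# _make_snowflake_request_func above. The real check swaps the wrapper in for
# SnowflakeRestful._request_exec under MONKEY_PATCH_LOCK; the names here are
# generic. The idea is to mutate the per-request `session` so proxy settings
# apply to each request without relying on global environment variables.
def make_proxied_request_exec(method, proxies):
    def _request_exec(*args, **kwargs):
        session = kwargs.get('session') or args[1]
        session.proxies = proxies      # inject proxy config for this request only
        return method(*args, **kwargs)
    return _request_exec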
class SQLServer(AgentCheck): __NAMESPACE__ = 'sqlserver' SERVICE_CHECK_NAME = 'sqlserver.can_connect' # Default performance table metrics - Database Instance level # datadog metric name, counter name, instance name INSTANCE_METRICS = [ # SQLServer:General Statistics ('sqlserver.stats.connections', 'User Connections', '' ), # LARGE_RAWCOUNT ('sqlserver.stats.procs_blocked', 'Processes blocked', ''), # LARGE_RAWCOUNT # SQLServer:Access Methods ('sqlserver.access.page_splits', 'Page Splits/sec', ''), # BULK_COUNT # SQLServer:Memory Manager ('sqlserver.memory.memory_grants_pending', 'Memory Grants Pending', '' ), ('sqlserver.memory.total_server_memory', 'Total Server Memory (KB)', ''), # SQLServer:Buffer Manager ('sqlserver.buffer.cache_hit_ratio', 'Buffer cache hit ratio', '' ), # RAW_LARGE_FRACTION ('sqlserver.buffer.page_life_expectancy', 'Page life expectancy', ''), # LARGE_RAWCOUNT ('sqlserver.buffer.page_reads', 'Page reads/sec', ''), # LARGE_RAWCOUNT ('sqlserver.buffer.page_writes', 'Page writes/sec', ''), # LARGE_RAWCOUNT ('sqlserver.buffer.checkpoint_pages', 'Checkpoint pages/sec', ''), # BULK_COUNT # SQLServer:SQL Statistics ('sqlserver.stats.auto_param_attempts', 'Auto-Param Attempts/sec', ''), ('sqlserver.stats.failed_auto_param_attempts', 'Failed Auto-Params/sec', ''), ('sqlserver.stats.safe_auto_param_attempts', 'Safe Auto-Params/sec', ''), ('sqlserver.stats.batch_requests', 'Batch Requests/sec', ''), # BULK_COUNT ('sqlserver.stats.sql_compilations', 'SQL Compilations/sec', ''), # BULK_COUNT ('sqlserver.stats.sql_recompilations', 'SQL Re-Compilations/sec', ''), # BULK_COUNT ] # Performance table metrics, initially configured to track at instance-level only # With auto-discovery enabled, these metrics will be extended accordingly # datadog metric name, counter name, instance name INSTANCE_METRICS_TOTAL = [ # SQLServer:Locks ('sqlserver.stats.lock_waits', 'Lock Waits/sec', '_Total' ), # BULK_COUNT # SQLServer:Plan Cache ('sqlserver.cache.object_counts', 'Cache Object Counts', '_Total'), ('sqlserver.cache.pages', 'Cache Pages', '_Total'), # SQLServer:Databases ('sqlserver.database.backup_restore_throughput', 'Backup/Restore Throughput/sec', '_Total'), ('sqlserver.database.log_bytes_flushed', 'Log Bytes Flushed/sec', '_Total'), ('sqlserver.database.log_flushes', 'Log Flushes/sec', '_Total'), ('sqlserver.database.log_flush_wait', 'Log Flush Wait Time', '_Total'), ('sqlserver.database.transactions', 'Transactions/sec', '_Total'), # BULK_COUNT ('sqlserver.database.write_transactions', 'Write Transactions/sec', '_Total'), # BULK_COUNT ('sqlserver.database.active_transactions', 'Active Transactions', '_Total'), # BULK_COUNT ] # AlwaysOn metrics # datadog metric name, sql table, column name, tag AO_METRICS = [ ('sqlserver.ao.ag_sync_health', 'sys.dm_hadr_availability_group_states', 'synchronization_health'), ('sqlserver.ao.replica_sync_state', 'sys.dm_hadr_database_replica_states', 'synchronization_state'), ('sqlserver.ao.replica_failover_mode', 'sys.availability_replicas', 'failover_mode'), ('sqlserver.ao.replica_failover_readiness', 'sys.availability_replicas', 'is_failover_ready'), ] AO_METRICS_PRIMARY = [ ('sqlserver.ao.primary_replica_health', 'sys.dm_hadr_availability_group_states', 'primary_recovery_health'), ] AO_METRICS_SECONDARY = [ ('sqlserver.ao.secondary_replica_health', 'sys.dm_hadr_availability_group_states', 'secondary_recovery_health'), ] # AlwaysOn metrics for Failover Cluster Instances (FCI). 
# This is in a separate category than other AlwaysOn metrics # because FCI specifies a different SQLServer setup # compared to Availability Groups (AG). # datadog metric name, sql table, column name # FCI status enum: # 0 = Up, 1 = Down, 2 = Paused, 3 = Joining, -1 = Unknown FCI_METRICS = [ ('sqlserver.fci.status', 'sys.dm_os_cluster_nodes', 'status'), ('sqlserver.fci.is_current_owner', 'sys.dm_os_cluster_nodes', 'is_current_owner'), ] # Non-performance table metrics - can be database specific # datadog metric name, sql table, column name TASK_SCHEDULER_METRICS = [ ('sqlserver.scheduler.current_tasks_count', 'sys.dm_os_schedulers', 'current_tasks_count'), ('sqlserver.scheduler.current_workers_count', 'sys.dm_os_schedulers', 'current_workers_count'), ('sqlserver.scheduler.active_workers_count', 'sys.dm_os_schedulers', 'active_workers_count'), ('sqlserver.scheduler.runnable_tasks_count', 'sys.dm_os_schedulers', 'runnable_tasks_count'), ('sqlserver.scheduler.work_queue_count', 'sys.dm_os_schedulers', 'work_queue_count'), ('sqlserver.task.context_switches_count', 'sys.dm_os_tasks', 'context_switches_count'), ('sqlserver.task.pending_io_count', 'sys.dm_os_tasks', 'pending_io_count'), ('sqlserver.task.pending_io_byte_count', 'sys.dm_os_tasks', 'pending_io_byte_count'), ('sqlserver.task.pending_io_byte_average', 'sys.dm_os_tasks', 'pending_io_byte_average'), ] # Non-performance table metrics # datadog metric name, sql table, column name # Files State enum: # 0 = Online, 1 = Restoring, 2 = Recovering, 3 = Recovery_Pending, # 4 = Suspect, 5 = Unknown, 6 = Offline, 7 = Defunct # Database State enum: # 0 = Online, 1 = Restoring, 2 = Recovering, 3 = Recovery_Pending, # 4 = Suspect, 5 = Emergency, 6 = Offline, 7 = Copying, 10 = Offline_Secondary # Is Sync with Backup enum: # 0 = False, 1 = True DATABASE_METRICS = [ ('sqlserver.database.files.size', 'sys.database_files', 'size'), ('sqlserver.database.files.state', 'sys.database_files', 'state'), ('sqlserver.database.state', 'sys.databases', 'state'), ('sqlserver.database.is_sync_with_backup', 'sys.databases', 'is_sync_with_backup'), ('sqlserver.database.backup_count', 'msdb.dbo.backupset', 'backup_set_id_count'), ] DATABASE_FRAGMENTATION_METRICS = [ ( 'sqlserver.database.avg_fragmentation_in_percent', 'sys.dm_db_index_physical_stats', 'avg_fragmentation_in_percent', ), ('sqlserver.database.fragment_count', 'sys.dm_db_index_physical_stats', 'fragment_count'), ( 'sqlserver.database.avg_fragment_size_in_pages', 'sys.dm_db_index_physical_stats', 'avg_fragment_size_in_pages', ), ] def __init__(self, name, init_config, instances): super(SQLServer, self).__init__(name, init_config, instances) self.connection = None self.failed_connections = {} self.instance_metrics = [] self.instance_per_type_metrics = defaultdict(list) self.do_check = True self.autodiscovery = is_affirmative( self.instance.get('database_autodiscovery')) if self.autodiscovery and self.instance.get('database'): self.log.warning( 'sqlserver `database_autodiscovery` and `database` options defined in same instance - ' 'autodiscovery will take precedence.') self.autodiscovery_include = self.instance.get('autodiscovery_include', ['.*']) self.autodiscovery_exclude = self.instance.get('autodiscovery_exclude', []) self._compile_patterns() self.autodiscovery_interval = self.instance.get( 'autodiscovery_interval', DEFAULT_AUTODISCOVERY_INTERVAL) self.databases = set() self.ad_last_check = 0 self.proc = self.instance.get('stored_procedure') self.proc_type_mapping = { 'gauge': self.gauge, 'rate': 
self.rate, 'histogram': self.histogram } self.custom_metrics = init_config.get('custom_metrics', []) # use QueryManager to process custom queries self._query_manager = QueryManager(self, self.execute_query_raw, queries=[], tags=self.instance.get("tags", [])) self.check_initializations.append(self._query_manager.compile_queries) self.check_initializations.append(self.initialize_connection) def initialize_connection(self): self.connection = Connection(self.init_config, self.instance, self.handle_service_check, self.log) # Pre-process the list of metrics to collect try: # check to see if the database exists before we try any connections to it db_exists, context = self.connection.check_database() if db_exists: if self.instance.get('stored_procedure') is None: with self.connection.open_managed_default_connection(): with self.connection.get_managed_cursor() as cursor: self.autodiscover_databases(cursor) self._make_metric_list_to_collect(self.custom_metrics) else: # How much do we care that the DB doesn't exist? ignore = is_affirmative( self.instance.get("ignore_missing_database", False)) if ignore is not None and ignore: # not much : we expect it. leave checks disabled self.do_check = False self.log.warning( "Database %s does not exist. Disabling checks for this instance.", context) else: # yes we do. Keep trying msg = "Database {} does not exist. Please resolve invalid database and restart agent".format( context) raise ConfigurationError(msg) except SQLConnectionError as e: self.log.exception("Error connecting to database: %s", e) except ConfigurationError: raise except Exception as e: self.log.exception("Initialization exception %s", e) def handle_service_check(self, status, host, database, message=None): custom_tags = self.instance.get("tags", []) if custom_tags is None: custom_tags = [] service_check_tags = ['host:{}'.format(host), 'db:{}'.format(database)] service_check_tags.extend(custom_tags) service_check_tags = list(set(service_check_tags)) self.service_check(self.SERVICE_CHECK_NAME, status, tags=service_check_tags, message=message, raw=True) def _compile_patterns(self): self._include_patterns = self._compile_valid_patterns( self.autodiscovery_include) self._exclude_patterns = self._compile_valid_patterns( self.autodiscovery_exclude) def _compile_valid_patterns(self, patterns): valid_patterns = [] for pattern in patterns: # Ignore empty patterns as they match everything if not pattern: continue try: re.compile(pattern, re.IGNORECASE) except Exception: self.log.warning( '%s is not a valid regular expression and will be ignored', pattern) else: valid_patterns.append(pattern) if valid_patterns: return re.compile('|'.join(valid_patterns), re.IGNORECASE) else: # create unmatchable regex - https://stackoverflow.com/a/1845097/2157429 return re.compile(r'(?!x)x') def autodiscover_databases(self, cursor): if not self.autodiscovery: return False now = time.time() if now - self.ad_last_check > self.autodiscovery_interval: self.log.info('Performing database autodiscovery') cursor.execute(AUTODISCOVERY_QUERY) all_dbs = set(row.name for row in cursor.fetchall()) excluded_dbs = set( [d for d in all_dbs if self._exclude_patterns.match(d)]) included_dbs = set( [d for d in all_dbs if self._include_patterns.match(d)]) self.log.debug( 'Autodiscovered databases: %s, excluding: %s, including: %s', all_dbs, excluded_dbs, included_dbs) # keep included dbs but remove any that were explicitly excluded filtered_dbs = all_dbs.intersection(included_dbs) - excluded_dbs self.log.debug('Resulting filtered databases: 
%s', filtered_dbs) self.ad_last_check = now if filtered_dbs != self.databases: self.log.debug( 'Databases updated from previous autodiscovery check.') self.databases = filtered_dbs return True return False def _make_metric_list_to_collect(self, custom_metrics): """ Store the list of metrics to collect by instance_key. Will also create and cache cursors to query the db. """ metrics_to_collect = [] tags = self.instance.get('tags', []) # Load instance-level (previously Performance) metrics) # If several check instances are querying the same server host, it can be wise to turn these off # to avoid sending duplicate metrics if is_affirmative(self.instance.get('include_instance_metrics', True)): self._add_performance_counters(chain(self.INSTANCE_METRICS, self.INSTANCE_METRICS_TOTAL), metrics_to_collect, tags, db=None) # populated through autodiscovery if self.databases: for db in self.databases: self._add_performance_counters(self.INSTANCE_METRICS_TOTAL, metrics_to_collect, tags, db=db) # Load database statistics for name, table, column in self.DATABASE_METRICS: # include database as a filter option db_names = self.databases or [ self.instance.get('database', self.connection.DEFAULT_DATABASE) ] for db_name in db_names: cfg = { 'name': name, 'table': table, 'column': column, 'instance_name': db_name, 'tags': tags } metrics_to_collect.append( self.typed_metric(cfg_inst=cfg, table=table, column=column)) # Load AlwaysOn metrics if is_affirmative(self.instance.get('include_ao_metrics', False)): for name, table, column in self.AO_METRICS + self.AO_METRICS_PRIMARY + self.AO_METRICS_SECONDARY: db_name = 'master' cfg = { 'name': name, 'table': table, 'column': column, 'instance_name': db_name, 'tags': tags, 'ao_database': self.instance.get('ao_database', None), 'availability_group': self.instance.get('availability_group', None), 'only_emit_local': is_affirmative(self.instance.get('only_emit_local', False)), } metrics_to_collect.append( self.typed_metric(cfg_inst=cfg, table=table, column=column)) # Load FCI metrics if is_affirmative(self.instance.get('include_fci_metrics', False)): for name, table, column in self.FCI_METRICS: cfg = { 'name': name, 'table': table, 'column': column, 'tags': tags, } metrics_to_collect.append( self.typed_metric(cfg_inst=cfg, table=table, column=column)) # Load metrics from scheduler and task tables, if enabled if is_affirmative( self.instance.get('include_task_scheduler_metrics', False)): for name, table, column in self.TASK_SCHEDULER_METRICS: cfg = { 'name': name, 'table': table, 'column': column, 'tags': tags } metrics_to_collect.append( self.typed_metric(cfg_inst=cfg, table=table, column=column)) # Load DB Fragmentation metrics if is_affirmative( self.instance.get('include_db_fragmentation_metrics', False)): db_fragmentation_object_names = self.instance.get( 'db_fragmentation_object_names', []) db_names = self.databases or [ self.instance.get('database', self.connection.DEFAULT_DATABASE) ] if not db_fragmentation_object_names: self.log.debug( "No fragmentation object names specified, will return fragmentation metrics for all " "object_ids of current database(s): %s", db_names, ) for db_name in db_names: for name, table, column in self.DATABASE_FRAGMENTATION_METRICS: cfg = { 'name': name, 'table': table, 'column': column, 'instance_name': db_name, 'tags': tags, 'db_fragmentation_object_names': db_fragmentation_object_names, } metrics_to_collect.append( self.typed_metric(cfg_inst=cfg, table=table, column=column)) # Load any custom metrics from conf.d/sqlserver.yaml for cfg 
in custom_metrics: sql_type = None base_name = None custom_tags = tags + cfg.get('tags', []) cfg['tags'] = custom_tags db_table = cfg.get('table', DEFAULT_PERFORMANCE_TABLE) if db_table not in VALID_TABLES: self.log.error('%s has an invalid table name: %s', cfg['name'], db_table) continue if cfg.get('database', None) and cfg.get( 'database') != self.instance.get('database'): self.log.debug( 'Skipping custom metric %s for database %s, check instance configured for database %s', cfg['name'], cfg.get('database'), self.instance.get('database'), ) continue if db_table == DEFAULT_PERFORMANCE_TABLE: user_type = cfg.get('type') if user_type is not None and user_type not in VALID_METRIC_TYPES: self.log.error('%s has an invalid metric type: %s', cfg['name'], user_type) sql_type = None try: if user_type is None: sql_type, base_name = self.get_sql_type( cfg['counter_name']) except Exception: self.log.warning("Can't load the metric %s, ignoring", cfg['name'], exc_info=True) continue metrics_to_collect.append( self.typed_metric(cfg_inst=cfg, table=db_table, base_name=base_name, user_type=user_type, sql_type=sql_type)) else: for column in cfg['columns']: metrics_to_collect.append( self.typed_metric(cfg_inst=cfg, table=db_table, base_name=base_name, sql_type=sql_type, column=column)) self.instance_metrics = metrics_to_collect self.log.debug("metrics to collect %s", metrics_to_collect) # create an organized grouping of metric names to their metric classes for m in metrics_to_collect: cls = m.__class__.__name__ name = m.sql_name or m.column self.log.debug("Adding metric class %s named %s", cls, name) self.instance_per_type_metrics[cls].append(name) if m.base_name: self.instance_per_type_metrics[cls].append(m.base_name) def _add_performance_counters(self, metrics, metrics_to_collect, tags, db=None): for name, counter_name, instance_name in metrics: try: sql_type, base_name = self.get_sql_type(counter_name) cfg = { 'name': name, 'counter_name': counter_name, 'instance_name': db or instance_name, 'tags': tags, } metrics_to_collect.append( self.typed_metric(cfg_inst=cfg, table=DEFAULT_PERFORMANCE_TABLE, base_name=base_name, sql_type=sql_type)) except SQLConnectionError: raise except Exception: self.log.warning("Can't load the metric %s, ignoring", name, exc_info=True) continue def get_sql_type(self, counter_name): """ Return the type of the performance counter so that we can report it to Datadog correctly If the sql_type is one that needs a base (PERF_RAW_LARGE_FRACTION and PERF_AVERAGE_BULK), the name of the base counter will also be returned """ with self.connection.get_managed_cursor() as cursor: cursor.execute(COUNTER_TYPE_QUERY, (counter_name, )) (sql_type, ) = cursor.fetchone() if sql_type == PERF_LARGE_RAW_BASE: self.log.warning( "Metric %s is of type Base and shouldn't be reported this way", counter_name) base_name = None if sql_type in [PERF_AVERAGE_BULK, PERF_RAW_LARGE_FRACTION]: # This is an ugly hack. For certains type of metric (PERF_RAW_LARGE_FRACTION # and PERF_AVERAGE_BULK), we need two metrics: the metrics specified and # a base metrics to get the ratio. There is no unique schema so we generate # the possible candidates and we look at which ones exist in the db. 
candidates = ( counter_name + " base", counter_name.replace("(ms)", "base"), counter_name.replace("Avg ", "") + " base", ) try: cursor.execute(BASE_NAME_QUERY, candidates) base_name = cursor.fetchone().counter_name.strip() self.log.debug("Got base metric: %s for metric: %s", base_name, counter_name) except Exception as e: self.log.warning( "Could not get counter_name of base for metric: %s", e) return sql_type, base_name def typed_metric(self, cfg_inst, table, base_name=None, user_type=None, sql_type=None, column=None): """ Create the appropriate BaseSqlServerMetric object, each implementing its method to fetch the metrics properly. If a `type` was specified in the config, it is used to report the value directly fetched from SQLServer. Otherwise, it is decided based on the sql_type, according to microsoft's documentation. """ if table == DEFAULT_PERFORMANCE_TABLE: metric_type_mapping = { PERF_COUNTER_BULK_COUNT: (self.rate, metrics.SqlSimpleMetric), PERF_COUNTER_LARGE_RAWCOUNT: (self.gauge, metrics.SqlSimpleMetric), PERF_LARGE_RAW_BASE: (self.gauge, metrics.SqlSimpleMetric), PERF_RAW_LARGE_FRACTION: (self.gauge, metrics.SqlFractionMetric), PERF_AVERAGE_BULK: (self.gauge, metrics.SqlIncrFractionMetric), } if user_type is not None: # user type overrides any other value metric_type = getattr(self, user_type) cls = metrics.SqlSimpleMetric else: metric_type, cls = metric_type_mapping[sql_type] else: # Lookup metrics classes by their associated table metric_type_str, cls = metrics.TABLE_MAPPING[table] metric_type = getattr(self, metric_type_str) return cls(cfg_inst, base_name, metric_type, column, self.log) def check(self, _): if self.do_check: if self.proc: self.do_stored_procedure_check() else: self.collect_metrics() else: self.log.debug("Skipping check") def collect_metrics(self): """Fetch the metrics from all of the associated database tables.""" with self.connection.open_managed_default_connection(): with self.connection.get_managed_cursor() as cursor: # initiate autodiscovery or if the server was down at check __init__ key could be missing. 
if self.autodiscover_databases( cursor) or not self.instance_metrics: self._make_metric_list_to_collect(self.custom_metrics) instance_results = {} # Execute the `fetch_all` operations first to minimize the database calls for cls, metric_names in six.iteritems( self.instance_per_type_metrics): if not metric_names: instance_results[cls] = None, None else: rows, cols = getattr(metrics, cls).fetch_all_values( cursor, metric_names, self.log) instance_results[cls] = rows, cols # Using the cached data, extract and report individual metrics for metric in self.instance_metrics: if type(metric) is metrics.SqlIncrFractionMetric: # special case, since it uses the same results as SqlFractionMetric rows, cols = instance_results['SqlFractionMetric'] metric.fetch_metric(rows, cols) else: rows, cols = instance_results[ metric.__class__.__name__] metric.fetch_metric(rows, cols) # reuse connection for any custom queries self._query_manager.execute() def execute_query_raw(self, query): with self.connection.get_managed_cursor() as cursor: cursor.execute(query) return cursor.fetchall() def do_stored_procedure_check(self): """ Fetch the metrics from the stored proc """ proc = self.proc guardSql = self.instance.get('proc_only_if') custom_tags = self.instance.get("tags", []) if (guardSql and self.proc_check_guard(guardSql)) or not guardSql: self.connection.open_db_connections(self.connection.DEFAULT_DB_KEY) cursor = self.connection.get_cursor(self.connection.DEFAULT_DB_KEY) try: self.log.debug("Calling Stored Procedure : %s", proc) if self.connection.get_connector() == 'adodbapi': cursor.callproc(proc) else: # pyodbc does not support callproc; use execute instead. # Reference: https://github.com/mkleehammer/pyodbc/wiki/Calling-Stored-Procedures call_proc = '{{CALL {}}}'.format(proc) cursor.execute(call_proc) rows = cursor.fetchall() self.log.debug("Row count (%s) : %s", proc, cursor.rowcount) for row in rows: tags = [] if row.tags is None or row.tags == '' else row.tags.split( ',') tags.extend(custom_tags) if row.type.lower() in self.proc_type_mapping: self.proc_type_mapping[row.type](row.metric, row.value, tags, raw=True) else: self.log.warning( '%s is not a recognised type from procedure %s, metric %s', row.type, proc, row.metric) except Exception as e: self.log.warning("Could not call procedure %s: %s", proc, e) raise e self.connection.close_cursor(cursor) self.connection.close_db_connections( self.connection.DEFAULT_DB_KEY) else: self.log.info("Skipping call to %s due to only_if", proc) def proc_check_guard(self, sql): """ check to see if the guard SQL returns a single column containing 0 or 1 We return true if 1, else False """ self.connection.open_db_connections(self.connection.PROC_GUARD_DB_KEY) cursor = self.connection.get_cursor(self.connection.PROC_GUARD_DB_KEY) should_run = False try: cursor.execute(sql, ()) result = cursor.fetchone() should_run = result[0] == 1 except Exception as e: self.log.error("Failed to run proc_only_if sql %s : %s", sql, e) self.connection.close_cursor(cursor) self.connection.close_db_connections(self.connection.PROC_GUARD_DB_KEY) return should_run
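# Standalone sketch of the autodiscovery filtering implemented by
# _compile_valid_patterns / autodiscover_databases above, with assumed inputs
# (the real check reads database names via AUTODISCOVERY_QUERY and re-checks
# only every autodiscovery_interval seconds). Patterns are OR-joined into one
# case-insensitive regex; an empty list compiles to the unmatchable r'(?!x)x',
# so an empty exclude list excludes nothing.
import re

def filter_databases(all_dbs, include=('.*',), exclude=()):
    def compile_patterns(patterns):
        valid = [p for p in patterns if p]
        return re.compile('|'.join(valid), re.IGNORECASE) if valid else re.compile(r'(?!x)x')

    include_re = compile_patterns(include)
    exclude_re = compile_patterns(exclude)
    included = {d for d in all_dbs if include_re.match(d)}
    excluded = {d for d in all_dbs if exclude_re.match(d)}
    return (set(all_dbs) & included) - excluded

# filter_databases({'master', 'msdb', 'sales'}, include=['sales|msdb'], exclude=['msdb'])
# -> {'sales'}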
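# Sketch of the base-counter candidate generation in get_sql_type above.
# Fraction-type counters (PERF_RAW_LARGE_FRACTION, PERF_AVERAGE_BULK) need a
# companion "base" counter to compute a ratio, and there is no single naming
# scheme, so the check probes a few likely variants with BASE_NAME_QUERY and
# keeps whichever one exists in the database. The example value is illustrative.
def base_name_candidates(counter_name):
    return (
        counter_name + " base",
        counter_name.replace("(ms)", "base"),
        counter_name.replace("Avg ", "") + " base",
    )

# base_name_candidates("Buffer cache hit ratio")
# -> ("Buffer cache hit ratio base", "Buffer cache hit ratio", "Buffer cache hit ratio base")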
class SQLServer(AgentCheck): __NAMESPACE__ = 'sqlserver' def __init__(self, name, init_config, instances): super(SQLServer, self).__init__(name, init_config, instances) self._resolved_hostname = None self._agent_hostname = None self.connection = None self.failed_connections = {} self.instance_metrics = [] self.instance_per_type_metrics = defaultdict(set) self.do_check = True self.tags = self.instance.get("tags", []) self.reported_hostname = self.instance.get('reported_hostname') self.autodiscovery = is_affirmative(self.instance.get('database_autodiscovery')) self.autodiscovery_include = self.instance.get('autodiscovery_include', ['.*']) self.autodiscovery_exclude = self.instance.get('autodiscovery_exclude', []) self.autodiscovery_db_service_check = is_affirmative(self.instance.get('autodiscovery_db_service_check', True)) self.min_collection_interval = self.instance.get('min_collection_interval', 15) self._compile_patterns() self.autodiscovery_interval = self.instance.get('autodiscovery_interval', DEFAULT_AUTODISCOVERY_INTERVAL) self.databases = set() self.ad_last_check = 0 self.proc = self.instance.get('stored_procedure') self.proc_type_mapping = {'gauge': self.gauge, 'rate': self.rate, 'histogram': self.histogram} self.custom_metrics = init_config.get('custom_metrics', []) # DBM self.dbm_enabled = self.instance.get('dbm', False) self.statement_metrics_config = self.instance.get('query_metrics', {}) or {} self.statement_metrics = SqlserverStatementMetrics(self) self.activity_config = self.instance.get('query_activity', {}) or {} self.activity = SqlserverActivity(self) self.cloud_metadata = {} aws = self.instance.get('aws', {}) gcp = self.instance.get('gcp', {}) azure = self.instance.get('azure', {}) if aws: self.cloud_metadata.update({'aws': aws}) if gcp: self.cloud_metadata.update({'gcp': gcp}) if azure: self.cloud_metadata.update({'azure': azure}) obfuscator_options_config = self.instance.get('obfuscator_options', {}) or {} self.obfuscator_options = to_native_string( json.dumps( { # Valid values for this can be found at # https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/semantic_conventions/database.md#connection-level-attributes 'dbms': 'mssql', 'replace_digits': is_affirmative( obfuscator_options_config.get( 'replace_digits', obfuscator_options_config.get('quantize_sql_tables', False), ) ), 'keep_sql_alias': is_affirmative(obfuscator_options_config.get('keep_sql_alias', True)), 'return_json_metadata': is_affirmative(obfuscator_options_config.get('collect_metadata', True)), 'table_names': is_affirmative(obfuscator_options_config.get('collect_tables', True)), 'collect_commands': is_affirmative(obfuscator_options_config.get('collect_commands', True)), 'collect_comments': is_affirmative(obfuscator_options_config.get('collect_comments', True)), } ) ) self.static_info_cache = TTLCache( maxsize=100, # cache these for a full day ttl=60 * 60 * 24, ) # Query declarations check_queries = [] if is_affirmative(self.instance.get('include_ao_metrics', False)): check_queries.extend( [ QUERY_AO_AVAILABILITY_GROUPS, QUERY_AO_FAILOVER_CLUSTER, QUERY_AO_FAILOVER_CLUSTER_MEMBER, ] ) if is_affirmative(self.instance.get('include_fci_metrics', False)): check_queries.extend([QUERY_FAILOVER_CLUSTER_INSTANCE]) self._check_queries = self._new_query_executor(check_queries) self.check_initializations.append(self._check_queries.compile_queries) self.server_state_queries = self._new_query_executor([QUERY_SERVER_STATIC_INFO]) 
self.check_initializations.append(self.server_state_queries.compile_queries) # use QueryManager to process custom queries self._query_manager = QueryManager( self, self.execute_query_raw, tags=self.tags, hostname=self.resolved_hostname ) self._dynamic_queries = None self.check_initializations.append(self.config_checks) self.check_initializations.append(self._query_manager.compile_queries) self.check_initializations.append(self.initialize_connection) def cancel(self): self.statement_metrics.cancel() self.activity.cancel() def config_checks(self): if self.autodiscovery and self.instance.get('database'): self.log.warning( 'sqlserver `database_autodiscovery` and `database` options defined in same instance - ' 'autodiscovery will take precedence.' ) if not self.autodiscovery and (self.autodiscovery_include or self.autodiscovery_exclude): self.log.warning( "Autodiscovery is disabled, autodiscovery_include and autodiscovery_exclude will be ignored" ) def split_sqlserver_host_port(self, host): """ Splits the host & port out of the provided SQL Server host connection string, returning (host, port). """ if not host: return host, None host_split = [s.strip() for s in host.split(',')] if len(host_split) == 1: return host_split[0], None if len(host_split) == 2: return host_split # else len > 2 s_host, s_port = host_split[0:2] self.log.warning( "invalid sqlserver host string has more than one comma: %s. using only 1st two items: host:%s, port:%s", host, s_host, s_port, ) return s_host, s_port def _new_query_executor(self, queries): return QueryExecutor( self.execute_query_raw, self, queries=queries, tags=self.tags, hostname=self.resolved_hostname, ) @property def resolved_hostname(self): if self._resolved_hostname is None: if self.reported_hostname: self._resolved_hostname = self.reported_hostname elif self.dbm_enabled: host, port = self.split_sqlserver_host_port(self.instance.get('host')) self._resolved_hostname = resolve_db_host(host) else: self._resolved_hostname = self.agent_hostname return self._resolved_hostname def load_static_information(self): expected_keys = {STATIC_INFO_VERSION, STATIC_INFO_MAJOR_VERSION, STATIC_INFO_ENGINE_EDITION} missing_keys = expected_keys - set(self.static_info_cache.keys()) if missing_keys: with self.connection.open_managed_default_connection(): with self.connection.get_managed_cursor() as cursor: if STATIC_INFO_VERSION not in self.static_info_cache: cursor.execute("select @@version") results = cursor.fetchall() if results and len(results) > 0 and len(results[0]) > 0 and results[0][0]: version = results[0][0] self.static_info_cache[STATIC_INFO_VERSION] = version self.static_info_cache[STATIC_INFO_MAJOR_VERSION] = parse_sqlserver_major_version(version) if not self.static_info_cache[STATIC_INFO_MAJOR_VERSION]: self.log.warning("failed to parse SQL Server major version from version: %s", version) else: self.log.warning("failed to load version static information due to empty results") if STATIC_INFO_ENGINE_EDITION not in self.static_info_cache: cursor.execute("SELECT CAST(ServerProperty('EngineEdition') AS INT) AS Edition") result = cursor.fetchone() if result: self.static_info_cache[STATIC_INFO_ENGINE_EDITION] = result else: self.log.warning("failed to load version static information due to empty results") def debug_tags(self): return self.tags + ['agent_hostname:{}'.format(self.agent_hostname)] def debug_stats_kwargs(self, tags=None): tags = tags if tags else [] return { "tags": self.debug_tags() + tags, "hostname": self.resolved_hostname, "raw": True, } @property def 
agent_hostname(self): # type: () -> str if self._agent_hostname is None: self._agent_hostname = datadog_agent.get_hostname() return self._agent_hostname def initialize_connection(self): self.connection = Connection(self.init_config, self.instance, self.handle_service_check) # Pre-process the list of metrics to collect try: # check to see if the database exists before we try any connections to it db_exists, context = self.connection.check_database() if db_exists: if self.instance.get('stored_procedure') is None: with self.connection.open_managed_default_connection(): with self.connection.get_managed_cursor() as cursor: self.autodiscover_databases(cursor) self._make_metric_list_to_collect(self.custom_metrics) else: # How much do we care that the DB doesn't exist? ignore = is_affirmative(self.instance.get("ignore_missing_database", False)) if ignore is not None and ignore: # not much : we expect it. leave checks disabled self.do_check = False self.log.warning("Database %s does not exist. Disabling checks for this instance.", context) else: # yes we do. Keep trying msg = "Database {} does not exist. Please resolve invalid database and restart agent".format( context ) raise ConfigurationError(msg) except SQLConnectionError as e: self.log.exception("Error connecting to database: %s", e) except ConfigurationError: raise except Exception as e: self.log.exception("Initialization exception %s", e) def handle_service_check(self, status, host, database, message=None, is_default=True): custom_tags = self.instance.get("tags", []) disable_generic_tags = self.instance.get('disable_generic_tags', False) if custom_tags is None: custom_tags = [] if disable_generic_tags: service_check_tags = ['sqlserver_host:{}'.format(host), 'db:{}'.format(database)] else: service_check_tags = ['host:{}'.format(host), 'sqlserver_host:{}'.format(host), 'db:{}'.format(database)] service_check_tags.extend(custom_tags) service_check_tags = list(set(service_check_tags)) if status is AgentCheck.OK: message = None if is_default: self.service_check(SERVICE_CHECK_NAME, status, tags=service_check_tags, message=message, raw=True) if self.autodiscovery and self.autodiscovery_db_service_check: self.service_check(DATABASE_SERVICE_CHECK_NAME, status, tags=service_check_tags, message=message, raw=True) def _compile_patterns(self): self._include_patterns = self._compile_valid_patterns(self.autodiscovery_include) self._exclude_patterns = self._compile_valid_patterns(self.autodiscovery_exclude) def _compile_valid_patterns(self, patterns): valid_patterns = [] for pattern in patterns: # Ignore empty patterns as they match everything if not pattern: continue try: re.compile(pattern, re.IGNORECASE) except Exception: self.log.warning('%s is not a valid regular expression and will be ignored', pattern) else: valid_patterns.append(pattern) if valid_patterns: return re.compile('|'.join(valid_patterns), re.IGNORECASE) else: # create unmatchable regex - https://stackoverflow.com/a/1845097/2157429 return re.compile(r'(?!x)x') def autodiscover_databases(self, cursor): if not self.autodiscovery: return False now = time.time() if now - self.ad_last_check > self.autodiscovery_interval: self.log.info('Performing database autodiscovery') cursor.execute(AUTODISCOVERY_QUERY) all_dbs = set(row.name for row in cursor.fetchall()) excluded_dbs = set([d for d in all_dbs if self._exclude_patterns.match(d)]) included_dbs = set([d for d in all_dbs if self._include_patterns.match(d)]) self.log.debug( 'Autodiscovered databases: %s, excluding: %s, including: %s', all_dbs, 
excluded_dbs, included_dbs ) # keep included dbs but remove any that were explicitly excluded filtered_dbs = all_dbs.intersection(included_dbs) - excluded_dbs self.log.debug('Resulting filtered databases: %s', filtered_dbs) self.ad_last_check = now if filtered_dbs != self.databases: self.log.debug('Databases updated from previous autodiscovery check.') self.databases = filtered_dbs return True return False def _make_metric_list_to_collect(self, custom_metrics): """ Store the list of metrics to collect by instance_key. Will also create and cache cursors to query the db. """ metrics_to_collect = [] tags = self.instance.get('tags', []) # Load instance-level (previously Performance) metrics) # If several check instances are querying the same server host, it can be wise to turn these off # to avoid sending duplicate metrics if is_affirmative(self.instance.get('include_instance_metrics', True)): common_metrics = INSTANCE_METRICS if not self.dbm_enabled: common_metrics = common_metrics + DBM_MIGRATED_METRICS self._add_performance_counters( chain(common_metrics, INSTANCE_METRICS_TOTAL), metrics_to_collect, tags, db=None ) # populated through autodiscovery if self.databases: for db in self.databases: self._add_performance_counters(INSTANCE_METRICS_TOTAL, metrics_to_collect, tags, db=db) # Load database statistics for name, table, column in DATABASE_METRICS: # include database as a filter option db_names = self.databases or [self.instance.get('database', self.connection.DEFAULT_DATABASE)] for db_name in db_names: cfg = {'name': name, 'table': table, 'column': column, 'instance_name': db_name, 'tags': tags} metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column)) # Load AlwaysOn metrics if is_affirmative(self.instance.get('include_ao_metrics', False)): for name, table, column in AO_METRICS + AO_METRICS_PRIMARY + AO_METRICS_SECONDARY: db_name = 'master' cfg = { 'name': name, 'table': table, 'column': column, 'instance_name': db_name, 'tags': tags, 'ao_database': self.instance.get('ao_database', None), 'availability_group': self.instance.get('availability_group', None), 'only_emit_local': is_affirmative(self.instance.get('only_emit_local', False)), } metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column)) # Load metrics from scheduler and task tables, if enabled if is_affirmative(self.instance.get('include_task_scheduler_metrics', False)): for name, table, column in TASK_SCHEDULER_METRICS: cfg = {'name': name, 'table': table, 'column': column, 'tags': tags} metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column)) # Load sys.master_files metrics if is_affirmative(self.instance.get('include_master_files_metrics', False)): for name, table, column in DATABASE_MASTER_FILES: cfg = {'name': name, 'table': table, 'column': column, 'tags': tags} metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column)) # Load DB Fragmentation metrics if is_affirmative(self.instance.get('include_db_fragmentation_metrics', False)): db_fragmentation_object_names = self.instance.get('db_fragmentation_object_names', []) db_names = self.databases or [self.instance.get('database', self.connection.DEFAULT_DATABASE)] if not db_fragmentation_object_names: self.log.debug( "No fragmentation object names specified, will return fragmentation metrics for all " "object_ids of current database(s): %s", db_names, ) for db_name in db_names: for name, table, column in DATABASE_FRAGMENTATION_METRICS: cfg = { 'name': name, 'table': 
table, 'column': column, 'instance_name': db_name, 'tags': tags, 'db_fragmentation_object_names': db_fragmentation_object_names, } metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column)) # Load any custom metrics from conf.d/sqlserver.yaml for cfg in custom_metrics: sql_type = None base_name = None custom_tags = tags + cfg.get('tags', []) cfg['tags'] = custom_tags db_table = cfg.get('table', DEFAULT_PERFORMANCE_TABLE) if db_table not in VALID_TABLES: self.log.error('%s has an invalid table name: %s', cfg['name'], db_table) continue if cfg.get('database', None) and cfg.get('database') != self.instance.get('database'): self.log.debug( 'Skipping custom metric %s for database %s, check instance configured for database %s', cfg['name'], cfg.get('database'), self.instance.get('database'), ) continue if db_table == DEFAULT_PERFORMANCE_TABLE: user_type = cfg.get('type') if user_type is not None and user_type not in VALID_METRIC_TYPES: self.log.error('%s has an invalid metric type: %s', cfg['name'], user_type) sql_type = None try: if user_type is None: sql_type, base_name = self.get_sql_type(cfg['counter_name']) except Exception: self.log.warning("Can't load the metric %s, ignoring", cfg['name'], exc_info=True) continue metrics_to_collect.append( self.typed_metric( cfg_inst=cfg, table=db_table, base_name=base_name, user_type=user_type, sql_type=sql_type ) ) else: for column in cfg['columns']: metrics_to_collect.append( self.typed_metric( cfg_inst=cfg, table=db_table, base_name=base_name, sql_type=sql_type, column=column ) ) self.instance_metrics = metrics_to_collect self.log.debug("metrics to collect %s", metrics_to_collect) # create an organized grouping of metric names to their metric classes for m in metrics_to_collect: cls = m.__class__.__name__ name = m.sql_name or m.column self.log.debug("Adding metric class %s named %s", cls, name) self.instance_per_type_metrics[cls].add(name) if m.base_name: self.instance_per_type_metrics[cls].add(m.base_name) def _add_performance_counters(self, metrics, metrics_to_collect, tags, db=None): if db is not None: tags = tags + ['database:{}'.format(db)] for name, counter_name, instance_name in metrics: try: sql_type, base_name = self.get_sql_type(counter_name) cfg = { 'name': name, 'counter_name': counter_name, 'instance_name': db or instance_name, 'tags': tags, } metrics_to_collect.append( self.typed_metric( cfg_inst=cfg, table=DEFAULT_PERFORMANCE_TABLE, base_name=base_name, sql_type=sql_type ) ) except SQLConnectionError: raise except Exception: self.log.warning("Can't load the metric %s, ignoring", name, exc_info=True) continue def get_sql_type(self, counter_name): """ Return the type of the performance counter so that we can report it to Datadog correctly If the sql_type is one that needs a base (PERF_RAW_LARGE_FRACTION and PERF_AVERAGE_BULK), the name of the base counter will also be returned """ with self.connection.get_managed_cursor() as cursor: cursor.execute(COUNTER_TYPE_QUERY, (counter_name,)) (sql_type,) = cursor.fetchone() if sql_type == PERF_LARGE_RAW_BASE: self.log.warning("Metric %s is of type Base and shouldn't be reported this way", counter_name) base_name = None if sql_type in [PERF_AVERAGE_BULK, PERF_RAW_LARGE_FRACTION]: # This is an ugly hack. For certains type of metric (PERF_RAW_LARGE_FRACTION # and PERF_AVERAGE_BULK), we need two metrics: the metrics specified and # a base metrics to get the ratio. There is no unique schema so we generate # the possible candidates and we look at which ones exist in the db. 
candidates = ( counter_name + " base", counter_name.replace("(ms)", "base"), counter_name.replace("Avg ", "") + " base", ) try: cursor.execute(BASE_NAME_QUERY, candidates) base_name = cursor.fetchone().counter_name.strip() self.log.debug("Got base metric: %s for metric: %s", base_name, counter_name) except Exception as e: self.log.warning("Could not get counter_name of base for metric: %s", e) return sql_type, base_name def typed_metric(self, cfg_inst, table, base_name=None, user_type=None, sql_type=None, column=None): """ Create the appropriate BaseSqlServerMetric object, each implementing its method to fetch the metrics properly. If a `type` was specified in the config, it is used to report the value directly fetched from SQLServer. Otherwise, it is decided based on the sql_type, according to microsoft's documentation. """ if table == DEFAULT_PERFORMANCE_TABLE: metric_type_mapping = { PERF_COUNTER_BULK_COUNT: (self.rate, metrics.SqlSimpleMetric), PERF_COUNTER_LARGE_RAWCOUNT: (self.gauge, metrics.SqlSimpleMetric), PERF_LARGE_RAW_BASE: (self.gauge, metrics.SqlSimpleMetric), PERF_RAW_LARGE_FRACTION: (self.gauge, metrics.SqlFractionMetric), PERF_AVERAGE_BULK: (self.gauge, metrics.SqlIncrFractionMetric), } if user_type is not None: # user type overrides any other value metric_type = getattr(self, user_type) cls = metrics.SqlSimpleMetric else: metric_type, cls = metric_type_mapping[sql_type] else: # Lookup metrics classes by their associated table metric_type_str, cls = metrics.TABLE_MAPPING[table] metric_type = getattr(self, metric_type_str) cfg_inst['hostname'] = self.resolved_hostname return cls(cfg_inst, base_name, metric_type, column, self.log) def check(self, _): if self.do_check: self.load_static_information() if self.proc: self.do_stored_procedure_check() else: self.collect_metrics() if self.autodiscovery and self.autodiscovery_db_service_check: for db_name in self.databases: if db_name != self.connection.DEFAULT_DATABASE: try: self.connection.check_database_conns(db_name) except Exception as e: # service_check errors on auto discovered databases should not abort the check self.log.warning("failed service check for auto discovered database: %s", e) if self.dbm_enabled: self.statement_metrics.run_job_loop(self.tags) self.activity.run_job_loop(self.tags) else: self.log.debug("Skipping check") @property def dynamic_queries(self): """ Initializes dynamic queries which depend on static information loaded from the database """ if self._dynamic_queries: return self._dynamic_queries major_version = self.static_info_cache.get(STATIC_INFO_MAJOR_VERSION) if not major_version: self.log.warning("missing major_version, cannot initialize dynamic queries") return None queries = [get_query_file_stats(major_version)] self._dynamic_queries = self._new_query_executor(queries) self._dynamic_queries.compile_queries() self.log.debug("initialized dynamic queries") return self._dynamic_queries def collect_metrics(self): """Fetch the metrics from all of the associated database tables.""" with self.connection.open_managed_default_connection(): with self.connection.get_managed_cursor() as cursor: # initiate autodiscovery or if the server was down at check __init__ key could be missing. 
if self.autodiscover_databases(cursor) or not self.instance_metrics: self._make_metric_list_to_collect(self.custom_metrics) instance_results = {} # Execute the `fetch_all` operations first to minimize the database calls for cls, metric_names in six.iteritems(self.instance_per_type_metrics): if not metric_names: instance_results[cls] = None, None else: try: db_names = self.databases or [ self.instance.get('database', self.connection.DEFAULT_DATABASE) ] rows, cols = getattr(metrics, cls).fetch_all_values( cursor, list(metric_names), self.log, databases=db_names ) except Exception as e: self.log.error("Error running `fetch_all` for metrics %s - skipping. Error: %s", cls, e) rows, cols = None, None instance_results[cls] = rows, cols # Using the cached data, extract and report individual metrics for metric in self.instance_metrics: if type(metric) is metrics.SqlIncrFractionMetric: # special case, since it uses the same results as SqlFractionMetric key = 'SqlFractionMetric' else: key = metric.__class__.__name__ if key not in instance_results: self.log.warning("No %s metrics found, skipping", str(key)) else: rows, cols = instance_results[key] if rows is not None: metric.fetch_metric(rows, cols) # Neither pyodbc nor adodbapi are able to read results of a query if the number of rows affected # statement are returned as part of the result set, so we disable for the entire connection # this is important mostly for custom_queries or the stored_procedure feature # https://docs.microsoft.com/en-us/sql/t-sql/statements/set-nocount-transact-sql with self.connection.get_managed_cursor() as cursor: cursor.execute("SET NOCOUNT ON") try: # Server state queries require VIEW SERVER STATE permissions, which some managed database # versions do not support. if self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) not in [ ENGINE_EDITION_SQL_DATABASE, ]: self.server_state_queries.execute() self._check_queries.execute() if self.dynamic_queries: self.dynamic_queries.execute() # reuse connection for any custom queries self._query_manager.execute() finally: with self.connection.get_managed_cursor() as cursor: cursor.execute("SET NOCOUNT OFF") def execute_query_raw(self, query): with self.connection.get_managed_cursor() as cursor: cursor.execute(query) return cursor.fetchall() def do_stored_procedure_check(self): """ Fetch the metrics from the stored proc """ proc = self.proc guardSql = self.instance.get('proc_only_if') custom_tags = self.instance.get("tags", []) if (guardSql and self.proc_check_guard(guardSql)) or not guardSql: self.connection.open_db_connections(self.connection.DEFAULT_DB_KEY) cursor = self.connection.get_cursor(self.connection.DEFAULT_DB_KEY) try: self.log.debug("Calling Stored Procedure : %s", proc) if self.connection.get_connector() == 'adodbapi': cursor.callproc(proc) else: # pyodbc does not support callproc; use execute instead. 
# Reference: https://github.com/mkleehammer/pyodbc/wiki/Calling-Stored-Procedures call_proc = '{{CALL {}}}'.format(proc) cursor.execute(call_proc) rows = cursor.fetchall() self.log.debug("Row count (%s) : %s", proc, cursor.rowcount) for row in rows: tags = [] if row.tags is None or row.tags == '' else row.tags.split(',') tags.extend(custom_tags) if row.type.lower() in self.proc_type_mapping: self.proc_type_mapping[row.type](row.metric, row.value, tags, raw=True) else: self.log.warning( '%s is not a recognised type from procedure %s, metric %s', row.type, proc, row.metric ) except Exception as e: self.log.warning("Could not call procedure %s: %s", proc, e) raise e self.connection.close_cursor(cursor) self.connection.close_db_connections(self.connection.DEFAULT_DB_KEY) else: self.log.info("Skipping call to %s due to only_if", proc) def proc_check_guard(self, sql): """ check to see if the guard SQL returns a single column containing 0 or 1 We return true if 1, else False """ self.connection.open_db_connections(self.connection.PROC_GUARD_DB_KEY) cursor = self.connection.get_cursor(self.connection.PROC_GUARD_DB_KEY) should_run = False try: cursor.execute(sql, ()) result = cursor.fetchone() should_run = result[0] == 1 except Exception as e: self.log.error("Failed to run proc_only_if sql %s : %s", sql, e) self.connection.close_cursor(cursor) self.connection.close_db_connections(self.connection.PROC_GUARD_DB_KEY) return should_run
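# Sketch of the SET NOCOUNT bracketing used around the query executors in
# collect_metrics above. "N rows affected" messages interleave with result
# sets, and neither pyodbc nor adodbapi can read results past them, so the
# check disables them on the connection before running the server state,
# AO/FCI, dynamic and custom queries, then restores the default in a finally
# block. `cursor` and `run_queries` are placeholders for the managed cursor
# and the executor calls, not names from this file.
def run_with_nocount(cursor, run_queries):
    cursor.execute("SET NOCOUNT ON")
    try:
        run_queries()              # e.g. the QueryExecutor / QueryManager .execute() calls
    finally:
        cursor.execute("SET NOCOUNT OFF")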
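# Sketch of split_sqlserver_host_port above, with assumed example values:
# SQL Server connection strings commonly encode the port as "host,port", and
# the check drops (with a warning) anything past the first two comma-separated
# items before resolving the host for DBM.
def split_host_port(host):
    if not host:
        return host, None
    parts = [s.strip() for s in host.split(',')]
    if len(parts) == 1:
        return parts[0], None
    return parts[0], parts[1]   # extra items beyond host and port are ignored

# split_host_port("myserver.database.windows.net,1433")
# -> ("myserver.database.windows.net", "1433")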
class MySql(AgentCheck): SERVICE_CHECK_NAME = 'mysql.can_connect' SLAVE_SERVICE_CHECK_NAME = 'mysql.replication.slave_running' DEFAULT_MAX_CUSTOM_QUERIES = 20 def __init__(self, name, init_config, instances): super(MySql, self).__init__(name, init_config, instances) self.qcache_stats = {} self.version = None self.config = MySQLConfig(self.instance) # Create a new connection on every check run self._conn = None self._query_manager = QueryManager(self, self.execute_query_raw, queries=[], tags=self.config.tags) self._statement_metrics = MySQLStatementMetrics(self.config) self.check_initializations.append(self._query_manager.compile_queries) self.innodb_stats = InnoDBMetrics() self.check_initializations.append(self.config.configuration_checks) def execute_query_raw(self, query): with closing(self._conn.cursor(pymysql.cursors.SSCursor)) as cursor: cursor.execute(query) for row in cursor.fetchall_unbuffered(): yield row @AgentCheck.metadata_entrypoint def _send_metadata(self): self.set_metadata('version', self.version.version + '+' + self.version.build) self.set_metadata('flavor', self.version.flavor) @classmethod def get_library_versions(cls): return {'pymysql': pymysql.__version__} def check(self, _): self._set_qcache_stats() with self._connect() as db: try: self._conn = db # version collection self.version = get_version(db) self._send_metadata() # Metric collection self._collect_metrics(db) self._collect_system_metrics(self.config.host, db, self.config.tags) if self.config.deep_database_monitoring: self._collect_statement_metrics(db, self.config.tags) # keeping track of these: self._put_qcache_stats() # Custom queries self._query_manager.execute() except Exception as e: self.log.exception("error!") raise e finally: self._conn = None def _set_qcache_stats(self): host_key = self._get_host_key() qcache_st = self.qcache_stats.get(host_key, (None, None, None)) self._qcache_hits = qcache_st[0] self._qcache_inserts = qcache_st[1] self._qcache_not_cached = qcache_st[2] def _put_qcache_stats(self): host_key = self._get_host_key() self.qcache_stats[host_key] = (self._qcache_hits, self._qcache_inserts, self._qcache_not_cached) def _get_host_key(self): if self.config.defaults_file: return self.config.defaults_file hostkey = self.config.host if self.config.mysql_sock: hostkey = "{0}:{1}".format(hostkey, self.config.mysql_sock) elif self.config.port: hostkey = "{0}:{1}".format(hostkey, self.config.port) return hostkey def _get_connection_args(self): ssl = dict(self.config.ssl) if self.config.ssl else None connection_args = { 'ssl': ssl, 'connect_timeout': self.config.connect_timeout, } if self.config.charset: connection_args['charset'] = self.config.charset if self.config.defaults_file != '': connection_args['read_default_file'] = self.config.defaults_file return connection_args connection_args.update({ 'user': self.config.user, 'passwd': self.config.password }) if self.config.mysql_sock != '': self.service_check_tags = [ 'server:{0}'.format(self.config.mysql_sock), 'port:unix_socket', ] + self.config.tags connection_args.update({'unix_socket': self.config.mysql_sock}) else: connection_args.update({'host': self.config.host}) if self.config.port: connection_args.update({'port': self.config.port}) return connection_args @contextmanager def _connect(self): service_check_tags = [ 'server:{0}'.format( (self.config.mysql_sock if self.config.mysql_sock != '' else self.config.host)), 'port:{}'.format( self.config.port if self.config.port else 'unix_socket'), ] + self.config.tags db = None try: connect_args = 
self._get_connection_args() db = pymysql.connect(**connect_args) self.log.debug("Connected to MySQL") self.service_check_tags = list(set(service_check_tags)) self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags) yield db except Exception: self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=service_check_tags) raise finally: if db: db.close() def _collect_metrics(self, db): # Get aggregate of all VARS we want to collect metrics = STATUS_VARS # collect results from db results = self._get_stats_from_status(db) results.update(self._get_stats_from_variables(db)) if not is_affirmative( self.config.options.get( 'disable_innodb_metrics', False)) and self._is_innodb_engine_enabled(db): results.update(self.innodb_stats.get_stats_from_innodb_status(db)) self.innodb_stats.process_innodb_stats(results, self.config.options, metrics) # Binary log statistics if self._get_variable_enabled(results, 'log_bin'): results['Binlog_space_usage_bytes'] = self._get_binary_log_stats( db) # Compute key cache utilization metric key_blocks_unused = collect_scalar('Key_blocks_unused', results) key_cache_block_size = collect_scalar('key_cache_block_size', results) key_buffer_size = collect_scalar('key_buffer_size', results) results['Key_buffer_size'] = key_buffer_size try: # can be null if the unit is missing in the user config (4 instead of 4G for eg.) if key_buffer_size != 0: key_cache_utilization = 1 - ( (key_blocks_unused * key_cache_block_size) / key_buffer_size) results['Key_cache_utilization'] = key_cache_utilization results['Key_buffer_bytes_used'] = collect_scalar( 'Key_blocks_used', results) * key_cache_block_size results['Key_buffer_bytes_unflushed'] = ( collect_scalar('Key_blocks_not_flushed', results) * key_cache_block_size) except TypeError as e: self.log.error( "Not all Key metrics are available, unable to compute: %s", e) metrics.update(VARIABLES_VARS) metrics.update(INNODB_VARS) metrics.update(BINLOG_VARS) if is_affirmative( self.config.options.get('extra_status_metrics', False)): self.log.debug("Collecting Extra Status Metrics") metrics.update(OPTIONAL_STATUS_VARS) if self.version.version_compatible((5, 6, 6)): metrics.update(OPTIONAL_STATUS_VARS_5_6_6) if is_affirmative(self.config.options.get('galera_cluster', False)): # already in result-set after 'SHOW STATUS' just add vars to collect self.log.debug("Collecting Galera Metrics.") metrics.update(GALERA_VARS) performance_schema_enabled = self._get_variable_enabled( results, 'performance_schema') above_560 = self.version.version_compatible((5, 6, 0)) if (is_affirmative( self.config.options.get('extra_performance_metrics', False)) and above_560 and performance_schema_enabled): # report avg query response time per schema to Datadog results[ 'perf_digest_95th_percentile_avg_us'] = self._get_query_exec_time_95th_us( db) results['query_run_time_avg'] = self._query_exec_time_per_schema( db) metrics.update(PERFORMANCE_VARS) if is_affirmative(self.config.options.get('schema_size_metrics', False)): # report avg query response time per schema to Datadog results['information_schema_size'] = self._query_size_per_schema( db) metrics.update(SCHEMA_VARS) if is_affirmative(self.config.options.get('replication', False)): replication_metrics = self._collect_replication_metrics( db, results, above_560) metrics.update(replication_metrics) self._check_replication_status(results) # "synthetic" metrics metrics.update(SYNTHETIC_VARS) self._compute_synthetic_results(results) # remove uncomputed metrics for k in SYNTHETIC_VARS: if 
k not in results: metrics.pop(k, None) # add duped metrics - reporting some as both rate and gauge dupes = [ ('Table_locks_waited', 'Table_locks_waited_rate'), ('Table_locks_immediate', 'Table_locks_immediate_rate'), ] for src, dst in dupes: if src in results: results[dst] = results[src] self._submit_metrics(metrics, results, self.config.tags) # Collect custom query metrics # Max of 20 queries allowed if isinstance(self.config.queries, list): for check in self.config.queries[:self.config.max_custom_queries]: total_tags = self.config.tags + check.get('tags', []) self._collect_dict(check['type'], {check['field']: check['metric']}, check['query'], db, tags=total_tags) if len(self.config.queries) > self.config.max_custom_queries: self.warning( "Maximum number (%s) of custom queries reached. Skipping the rest.", self.config.max_custom_queries) def _collect_replication_metrics(self, db, results, above_560): # Get replica stats is_mariadb = self.version.flavor == "MariaDB" replication_channel = self.config.options.get('replication_channel') results.update( self._get_replica_stats(db, is_mariadb, replication_channel)) nonblocking = is_affirmative( self.config.options.get('replication_non_blocking_status', False)) results.update(self._get_slave_status(db, above_560, nonblocking)) return REPLICA_VARS def _check_replication_status(self, results): # get slave running form global status page slave_running_status = AgentCheck.UNKNOWN # Slave_IO_Running: Whether the I/O thread for reading the source's binary log is running. # You want this to be Yes unless you have not yet started replication or have explicitly stopped it. slave_io_running = collect_type('Slave_IO_Running', results, dict) # Slave_SQL_Running: Whether the SQL thread for executing events in the relay log is running. slave_sql_running = collect_type('Slave_SQL_Running', results, dict) if slave_io_running: slave_io_running = any(v.lower().strip() == 'yes' for v in itervalues(slave_io_running)) if slave_sql_running: slave_sql_running = any(v.lower().strip() == 'yes' for v in itervalues(slave_sql_running)) binlog_running = results.get('Binlog_enabled', False) # slaves will only be collected iff user has PROCESS privileges. 
slaves = collect_scalar('Slaves_connected', results) if not (slave_io_running is None and slave_sql_running is None): if not slave_io_running and not slave_sql_running: self.log.debug( "Slave_IO_Running and Slave_SQL_Running are not ok") slave_running_status = AgentCheck.CRITICAL if not slave_io_running or not slave_sql_running: self.log.debug( "Either Slave_IO_Running or Slave_SQL_Running are not ok") slave_running_status = AgentCheck.WARNING if slave_running_status == AgentCheck.UNKNOWN: if self._is_master(slaves, results): # master if slaves > 0 and binlog_running: self.log.debug( "Host is master, there are replicas and binlog is running" ) slave_running_status = AgentCheck.OK else: slave_running_status = AgentCheck.WARNING else: # replica (or standalone) if not (slave_io_running is None and slave_sql_running is None): if slave_io_running and slave_sql_running: self.log.debug( "Slave_IO_Running and Slave_SQL_Running are ok") slave_running_status = AgentCheck.OK # deprecated in favor of service_check("mysql.replication.slave_running") self.gauge(self.SLAVE_SERVICE_CHECK_NAME, 1 if slave_running_status == AgentCheck.OK else 0, tags=self.config.tags) self.service_check(self.SLAVE_SERVICE_CHECK_NAME, slave_running_status, tags=self.service_check_tags) def _collect_statement_metrics(self, db, tags): tags = self.service_check_tags + tags metrics = self._statement_metrics.collect_per_statement_metrics(db) for metric_name, metric_value, metric_tags in metrics: self.count(metric_name, metric_value, tags=list(set(tags + metric_tags))) def _is_master(self, slaves, results): # master uuid only collected in slaves master_host = collect_string('Master_Host', results) if slaves > 0 or not master_host: return True return False def _submit_metrics(self, variables, db_results, tags): for variable, metric in iteritems(variables): metric_name, metric_type = metric for tag, value in collect_all_scalars(variable, db_results): metric_tags = list(tags) if tag: metric_tags.append(tag) if value is not None: if metric_type == RATE: self.rate(metric_name, value, tags=metric_tags) elif metric_type == GAUGE: self.gauge(metric_name, value, tags=metric_tags) elif metric_type == COUNT: self.count(metric_name, value, tags=metric_tags) elif metric_type == MONOTONIC: self.monotonic_count(metric_name, value, tags=metric_tags) def _collect_dict(self, metric_type, field_metric_map, query, db, tags): """ Query status and get a dictionary back. Extract each field out of the dictionary and stuff it in the corresponding metric. query: show status... field_metric_map: {"Seconds_behind_master": "mysqlSecondsBehindMaster"} """ try: with closing(db.cursor()) as cursor: cursor.execute(query) result = cursor.fetchone() if result is not None: for field, metric in list(iteritems(field_metric_map)): # Find the column name in the cursor description to identify the column index # http://www.python.org/dev/peps/pep-0249/ # cursor.description is a tuple of (column_name, ..., ...) 
try: col_idx = [ d[0].lower() for d in cursor.description ].index(field.lower()) self.log.debug("Collecting metric: %s", metric) if result[col_idx] is not None: self.log.debug("Collecting done, value %s", result[col_idx]) if metric_type == GAUGE: self.gauge(metric, float(result[col_idx]), tags=tags) elif metric_type == RATE: self.rate(metric, float(result[col_idx]), tags=tags) else: self.gauge(metric, float(result[col_idx]), tags=tags) else: self.log.debug( "Received value is None for index %d", col_idx) except ValueError: self.log.exception( "Cannot find %s in the columns %s", field, cursor.description) except Exception: self.warning("Error while running %s\n%s", query, traceback.format_exc()) self.log.exception("Error while running %s", query) def _collect_system_metrics(self, host, db, tags): pid = None # The server needs to run locally, accessed by TCP or socket if host in ["localhost", "127.0.0.1", "0.0.0.0"] or db.port == long(0): pid = self._get_server_pid(db) if pid: self.log.debug("System metrics for mysql w/ pid: %s", pid) # At last, get mysql cpu data out of psutil or procfs try: ucpu, scpu = None, None if PSUTIL_AVAILABLE: proc = psutil.Process(pid) ucpu = proc.cpu_times()[0] scpu = proc.cpu_times()[1] if ucpu and scpu: self.rate("mysql.performance.user_time", ucpu, tags=tags) # should really be system_time self.rate("mysql.performance.kernel_time", scpu, tags=tags) self.rate("mysql.performance.cpu_time", ucpu + scpu, tags=tags) except Exception: self.warning( "Error while reading mysql (pid: %s) procfs data\n%s", pid, traceback.format_exc()) def _get_pid_file_variable(self, db): """ Get the `pid_file` variable """ pid_file = None try: with closing(db.cursor()) as cursor: cursor.execute("SHOW VARIABLES LIKE 'pid_file'") pid_file = cursor.fetchone()[1] except Exception: self.warning("Error while fetching pid_file variable of MySQL.") return pid_file def _get_server_pid(self, db): pid = None # Try to get pid from pid file, it can fail for permission reason pid_file = self._get_pid_file_variable(db) if pid_file is not None: self.log.debug("pid file: %s", str(pid_file)) try: with open(pid_file, 'rb') as f: pid = int(f.readline()) except IOError: self.log.debug("Cannot read mysql pid file %s", pid_file) # If pid has not been found, read it from ps if pid is None and PSUTIL_AVAILABLE: for proc in psutil.process_iter(): try: if proc.name() == PROC_NAME: pid = proc.pid except (psutil.AccessDenied, psutil.ZombieProcess, psutil.NoSuchProcess): continue except Exception: self.log.exception( "Error while fetching mysql pid from psutil") return pid @classmethod def _get_stats_from_status(cls, db): with closing(db.cursor()) as cursor: cursor.execute("SHOW /*!50002 GLOBAL */ STATUS;") results = dict(cursor.fetchall()) return results @classmethod def _get_stats_from_variables(cls, db): with closing(db.cursor()) as cursor: cursor.execute("SHOW GLOBAL VARIABLES;") results = dict(cursor.fetchall()) return results def _get_binary_log_stats(self, db): try: with closing(db.cursor()) as cursor: cursor.execute("SHOW BINARY LOGS;") cursor_results = cursor.fetchall() master_logs = { result[0]: result[1] for result in cursor_results } binary_log_space = 0 for value in itervalues(master_logs): binary_log_space += value return binary_log_space except (pymysql.err.InternalError, pymysql.err.OperationalError) as e: self.warning( "Privileges error accessing the BINARY LOGS (must grant REPLICATION CLIENT): %s", e) return None def _is_innodb_engine_enabled(self, db): # Whether InnoDB engine is available or not 
can be found out either # from the output of SHOW ENGINES or from information_schema.ENGINES # table. Later is choosen because that involves no string parsing. try: with closing(db.cursor()) as cursor: cursor.execute(SQL_INNODB_ENGINES) return cursor.rowcount > 0 except (pymysql.err.InternalError, pymysql.err.OperationalError, pymysql.err.NotSupportedError) as e: self.warning( "Possibly innodb stats unavailable - error querying engines table: %s", e) return False def _get_replica_stats(self, db, is_mariadb, replication_channel): replica_results = defaultdict(dict) try: with closing(db.cursor(pymysql.cursors.DictCursor)) as cursor: if is_mariadb and replication_channel: cursor.execute( "SET @@default_master_connection = '{0}';".format( replication_channel)) cursor.execute("SHOW SLAVE STATUS;") elif replication_channel: cursor.execute( "SHOW SLAVE STATUS FOR CHANNEL '{0}';".format( replication_channel)) else: cursor.execute("SHOW SLAVE STATUS;") results = cursor.fetchall() self.log.debug("Getting replication status: %s", results) for slave_result in results: # MySQL <5.7 does not have Channel_Name. # For MySQL >=5.7 'Channel_Name' is set to an empty string by default channel = replication_channel or slave_result.get( 'Channel_Name') or 'default' for key, value in iteritems(slave_result): if value is not None: replica_results[key]['channel:{0}'.format( channel)] = value except (pymysql.err.InternalError, pymysql.err.OperationalError) as e: errno, msg = e.args if errno == 1617 and msg == "There is no master connection '{0}'".format( replication_channel): # MariaDB complains when you try to get slave status with a # connection name on the master, without connection name it # responds an empty string as expected. # Mysql behaves the same with or without connection name. pass else: self.warning( "Privileges error getting replication status (must grant REPLICATION CLIENT): %s", e) try: with closing(db.cursor(pymysql.cursors.DictCursor)) as cursor: cursor.execute("SHOW MASTER STATUS;") binlog_results = cursor.fetchone() if binlog_results: replica_results.update({'Binlog_enabled': True}) except (pymysql.err.InternalError, pymysql.err.OperationalError) as e: self.warning( "Privileges error getting binlog information (must grant REPLICATION CLIENT): %s", e) return replica_results def _get_slave_status(self, db, above_560, nonblocking): """ Retrieve the slaves' statuses using: 1. The `performance_schema.threads` table. Non-blocking, requires version > 5.6.0 2. The `information_schema.processlist` table. Blocking """ try: with closing(db.cursor()) as cursor: if above_560 and nonblocking: # Query `performance_schema.threads` instead of ` # information_schema.processlist` to avoid mutex impact on performance. 
cursor.execute(SQL_WORKER_THREADS) else: cursor.execute(SQL_PROCESS_LIST) slave_results = cursor.fetchall() slaves = 0 for _ in slave_results: slaves += 1 return {'Slaves_connected': slaves} except (pymysql.err.InternalError, pymysql.err.OperationalError) as e: self.warning( "Privileges error accessing the process tables (must grant PROCESS): %s", e) return {} @classmethod def _are_values_numeric(cls, array): return all(v.isdigit() for v in array) def _get_variable_enabled(self, results, var): enabled = collect_string(var, results) return enabled and enabled.lower().strip() == 'on' def _get_query_exec_time_95th_us(self, db): # Fetches the 95th percentile query execution time and returns the value # in microseconds try: with closing(db.cursor()) as cursor: cursor.execute(SQL_95TH_PERCENTILE) if cursor.rowcount < 1: self.warning( "Failed to fetch records from the perf schema \ 'events_statements_summary_by_digest' table.") return None row = cursor.fetchone() query_exec_time_95th_per = row[0] return query_exec_time_95th_per except (pymysql.err.InternalError, pymysql.err.OperationalError) as e: self.warning( "95th percentile performance metrics unavailable at this time: %s", e) return None def _query_exec_time_per_schema(self, db): # Fetches the avg query execution time per schema and returns the # value in microseconds try: with closing(db.cursor()) as cursor: cursor.execute(SQL_AVG_QUERY_RUN_TIME) if cursor.rowcount < 1: self.warning( "Failed to fetch records from the perf schema \ 'events_statements_summary_by_digest' table.") return None schema_query_avg_run_time = {} for row in cursor.fetchall(): schema_name = str(row[0]) avg_us = long(row[1]) # set the tag as the dictionary key schema_query_avg_run_time["schema:{0}".format( schema_name)] = avg_us return schema_query_avg_run_time except (pymysql.err.InternalError, pymysql.err.OperationalError) as e: self.warning( "Avg exec time performance metrics unavailable at this time: %s", e) return None def _query_size_per_schema(self, db): # Fetches the avg query execution time per schema and returns the # value in microseconds try: with closing(db.cursor()) as cursor: cursor.execute(SQL_QUERY_SCHEMA_SIZE) if cursor.rowcount < 1: self.warning( "Failed to fetch records from the information schema 'tables' table." 
) return None schema_size = {} for row in cursor.fetchall(): schema_name = str(row[0]) size = long(row[1]) # set the tag as the dictionary key schema_size["schema:{0}".format(schema_name)] = size return schema_size except (pymysql.err.InternalError, pymysql.err.OperationalError) as e: self.warning( "Avg exec time performance metrics unavailable at this time: %s", e) return {} def _compute_synthetic_results(self, results): if ('Qcache_hits' in results) and ('Qcache_inserts' in results) and ('Qcache_not_cached' in results): if not int(results['Qcache_hits']): results['Qcache_utilization'] = 0 else: results['Qcache_utilization'] = ( float(results['Qcache_hits']) / (int(results['Qcache_inserts']) + int(results['Qcache_not_cached']) + int(results['Qcache_hits'])) * 100) if all(v is not None for v in (self._qcache_hits, self._qcache_inserts, self._qcache_not_cached)): if not (int(results['Qcache_hits']) - self._qcache_hits): results['Qcache_instant_utilization'] = 0 else: top = float(results['Qcache_hits']) - self._qcache_hits bottom = ( (int(results['Qcache_inserts']) - self._qcache_inserts) + (int(results['Qcache_not_cached']) - self._qcache_not_cached) + (int(results['Qcache_hits']) - self._qcache_hits)) results['Qcache_instant_utilization'] = (top / bottom) * 100 # update all three, or none - for consistent samples. self._qcache_hits = int(results['Qcache_hits']) self._qcache_inserts = int(results['Qcache_inserts']) self._qcache_not_cached = int(results['Qcache_not_cached'])
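The synthetic query-cache metrics computed in _compute_synthetic_results above reduce to a hit ratio taken over lifetime totals and over per-run deltas. The following is a minimal standalone sketch of that arithmetic; the function names and sample counter values are invented for illustration and are not part of the check.

# Standalone sketch of the Qcache_utilization / Qcache_instant_utilization arithmetic
# performed by _compute_synthetic_results. The sample counter values are made up.

def qcache_utilization(hits, inserts, not_cached):
    # Lifetime ratio of cache hits to all cacheable SELECTs, as a percentage.
    total = hits + inserts + not_cached
    return 0 if not hits else float(hits) / total * 100


def qcache_instant_utilization(current, previous):
    # Same ratio, computed on the delta between two consecutive check runs.
    d_hits = current['hits'] - previous['hits']
    d_total = (
        (current['inserts'] - previous['inserts'])
        + (current['not_cached'] - previous['not_cached'])
        + d_hits
    )
    return 0 if not d_hits else float(d_hits) / d_total * 100


if __name__ == '__main__':
    prev = {'hits': 1000, 'inserts': 400, 'not_cached': 100}
    curr = {'hits': 1600, 'inserts': 500, 'not_cached': 150}
    print(qcache_utilization(curr['hits'], curr['inserts'], curr['not_cached']))  # ~71.1
    print(qcache_instant_utilization(curr, prev))                                 # 80.0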
class MySql(AgentCheck): SERVICE_CHECK_NAME = 'mysql.can_connect' SLAVE_SERVICE_CHECK_NAME = 'mysql.replication.slave_running' REPLICA_SERVICE_CHECK_NAME = 'mysql.replication.replica_running' GROUP_REPLICATION_SERVICE_CHECK_NAME = 'mysql.replication.group.status' DEFAULT_MAX_CUSTOM_QUERIES = 20 def __init__(self, name, init_config, instances): super(MySql, self).__init__(name, init_config, instances) self.qcache_stats = {} self.version = None self.is_mariadb = None self._resolved_hostname = None self._agent_hostname = None self._is_aurora = None self._config = MySQLConfig(self.instance) # Create a new connection on every check run self._conn = None self._query_manager = QueryManager(self, self.execute_query_raw, queries=[]) self.check_initializations.append(self._query_manager.compile_queries) self.innodb_stats = InnoDBMetrics() self.check_initializations.append(self._config.configuration_checks) self.performance_schema_enabled = None self._warnings_by_code = {} self._statement_metrics = MySQLStatementMetrics(self, self._config, self._get_connection_args()) self._statement_samples = MySQLStatementSamples(self, self._config, self._get_connection_args()) self._query_activity = MySQLActivity(self, self._config, self._get_connection_args()) def execute_query_raw(self, query): with closing(self._conn.cursor(pymysql.cursors.SSCursor)) as cursor: cursor.execute(query) for row in cursor.fetchall_unbuffered(): yield row @AgentCheck.metadata_entrypoint def _send_metadata(self): self.set_metadata('version', self.version.version + '+' + self.version.build) self.set_metadata('flavor', self.version.flavor) @property def resolved_hostname(self): if self._resolved_hostname is None: if self._config.reported_hostname: self._resolved_hostname = self._config.reported_hostname elif self._config.dbm_enabled or self.disable_generic_tags: self._resolved_hostname = self.resolve_db_host() else: self._resolved_hostname = self.agent_hostname return self._resolved_hostname @property def agent_hostname(self): # type: () -> str if self._agent_hostname is None: self._agent_hostname = datadog_agent.get_hostname() return self._agent_hostname def check_performance_schema_enabled(self, db): if self.performance_schema_enabled is None: with closing(db.cursor()) as cursor: cursor.execute("SHOW VARIABLES LIKE 'performance_schema'") results = dict(cursor.fetchall()) self.performance_schema_enabled = self._get_variable_enabled(results, 'performance_schema') return self.performance_schema_enabled def resolve_db_host(self): return agent_host_resolver(self._config.host) def _get_debug_tags(self): return ['agent_hostname:{}'.format(datadog_agent.get_hostname())] @classmethod def get_library_versions(cls): return {'pymysql': pymysql.__version__} def check(self, _): if self.instance.get('user'): self._log_deprecation('_config_renamed', 'user', 'username') if self.instance.get('pass'): self._log_deprecation('_config_renamed', 'pass', 'password') tags = list(self._config.tags) self._set_qcache_stats() with self._connect() as db: try: self._conn = db # version collection self.version = get_version(db) self._send_metadata() self.is_mariadb = self.version.flavor == "MariaDB" if self._get_is_aurora(db): tags = tags + self._get_runtime_aurora_tags(db) self.check_performance_schema_enabled(db) # Metric collection if not self._config.only_custom_queries: self._collect_metrics(db, tags=tags) self._collect_system_metrics(self._config.host, db, tags) if self._config.dbm_enabled: dbm_tags = list(set(self.service_check_tags) | set(tags)) 
self._statement_metrics.run_job_loop(dbm_tags) self._statement_samples.run_job_loop(dbm_tags) self._query_activity.run_job_loop(dbm_tags) # keeping track of these: self._put_qcache_stats() # Custom queries self._query_manager.execute(extra_tags=tags) except Exception as e: self.log.exception("error!") raise e finally: self._conn = None self._report_warnings() def cancel(self): self._statement_samples.cancel() self._statement_metrics.cancel() self._query_activity.cancel() def _set_qcache_stats(self): host_key = self._get_host_key() qcache_st = self.qcache_stats.get(host_key, (None, None, None)) self._qcache_hits = qcache_st[0] self._qcache_inserts = qcache_st[1] self._qcache_not_cached = qcache_st[2] def _put_qcache_stats(self): host_key = self._get_host_key() self.qcache_stats[host_key] = (self._qcache_hits, self._qcache_inserts, self._qcache_not_cached) def _get_host_key(self): if self._config.defaults_file: return self._config.defaults_file hostkey = self._config.host if self._config.mysql_sock: hostkey = "{0}:{1}".format(hostkey, self._config.mysql_sock) elif self._config.port: hostkey = "{0}:{1}".format(hostkey, self._config.port) return hostkey def _get_connection_args(self): ssl = dict(self._config.ssl) if self._config.ssl else None connection_args = { 'ssl': ssl, 'connect_timeout': self._config.connect_timeout, 'autocommit': True, } if self._config.charset: connection_args['charset'] = self._config.charset if self._config.defaults_file != '': connection_args['read_default_file'] = self._config.defaults_file return connection_args connection_args.update({'user': self._config.user, 'passwd': self._config.password}) if self._config.mysql_sock != '': self.service_check_tags = self._service_check_tags(self._config.mysql_sock) connection_args.update({'unix_socket': self._config.mysql_sock}) else: connection_args.update({'host': self._config.host}) if self._config.port: connection_args.update({'port': self._config.port}) return connection_args def _service_check_tags(self, server=None): # type: (Optional[str]) -> List[str] if server is None: server = self._config.mysql_sock if self._config.mysql_sock != '' else self._config.host service_check_tags = [ 'port:{}'.format(self._config.port if self._config.port else 'unix_socket'), ] + self._config.tags if not self.disable_generic_tags: service_check_tags.append('server:{0}'.format(server)) return service_check_tags @contextmanager def _connect(self): service_check_tags = self._service_check_tags() db = None try: connect_args = self._get_connection_args() db = pymysql.connect(**connect_args) self.log.debug("Connected to MySQL") self.service_check_tags = list(set(service_check_tags)) self.service_check( self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags, hostname=self.resolved_hostname ) yield db except Exception: self.service_check( self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=service_check_tags, hostname=self.resolved_hostname ) raise finally: if db: db.close() def _collect_metrics(self, db, tags): # Get aggregate of all VARS we want to collect metrics = copy.deepcopy(STATUS_VARS) # collect results from db results = self._get_stats_from_status(db) results.update(self._get_stats_from_variables(db)) if not is_affirmative( self._config.options.get('disable_innodb_metrics', False) ) and self._is_innodb_engine_enabled(db): results.update(self.innodb_stats.get_stats_from_innodb_status(db)) self.innodb_stats.process_innodb_stats(results, self._config.options, metrics) # Binary log statistics if self._get_variable_enabled(results, 
'log_bin'): results['Binlog_space_usage_bytes'] = self._get_binary_log_stats(db) # Compute key cache utilization metric key_blocks_unused = collect_scalar('Key_blocks_unused', results) key_cache_block_size = collect_scalar('key_cache_block_size', results) key_buffer_size = collect_scalar('key_buffer_size', results) results['Key_buffer_size'] = key_buffer_size try: # can be null if the unit is missing in the user config (4 instead of 4G for eg.) if key_buffer_size != 0: key_cache_utilization = 1 - ((key_blocks_unused * key_cache_block_size) / key_buffer_size) results['Key_cache_utilization'] = key_cache_utilization results['Key_buffer_bytes_used'] = collect_scalar('Key_blocks_used', results) * key_cache_block_size results['Key_buffer_bytes_unflushed'] = ( collect_scalar('Key_blocks_not_flushed', results) * key_cache_block_size ) except TypeError as e: self.log.error("Not all Key metrics are available, unable to compute: %s", e) metrics.update(VARIABLES_VARS) metrics.update(INNODB_VARS) metrics.update(BINLOG_VARS) if is_affirmative(self._config.options.get('extra_status_metrics', self._config.dbm_enabled)): self.log.debug("Collecting Extra Status Metrics") metrics.update(OPTIONAL_STATUS_VARS) if self.version.version_compatible((5, 6, 6)): metrics.update(OPTIONAL_STATUS_VARS_5_6_6) if is_affirmative(self._config.options.get('galera_cluster', False)): # already in result-set after 'SHOW STATUS' just add vars to collect self.log.debug("Collecting Galera Metrics.") metrics.update(GALERA_VARS) above_560 = self.version.version_compatible((5, 6, 0)) if ( is_affirmative(self._config.options.get('extra_performance_metrics', False)) and above_560 and self.performance_schema_enabled ): # report size of schemas in MiB to Datadog results['perf_digest_95th_percentile_avg_us'] = self._get_query_exec_time_95th_us(db) results['query_run_time_avg'] = self._query_exec_time_per_schema(db) metrics.update(PERFORMANCE_VARS) if is_affirmative(self._config.options.get('schema_size_metrics', False)): # report avg query response time per schema to Datadog results['information_schema_size'] = self._query_size_per_schema(db) metrics.update(SCHEMA_VARS) if is_affirmative(self._config.options.get('table_size_metrics', False)): # report size of tables in MiB to Datadog (table_index_size, table_data_size) = self._query_size_per_table(db) results['information_table_index_size'] = table_index_size results['information_table_data_size'] = table_data_size metrics.update(TABLE_VARS) if is_affirmative(self._config.options.get('system_table_size_metrics', False)): # report size of tables in MiB to Datadog (table_index_size, table_data_size) = self._query_size_per_table(db, system_tables=True) results['information_table_index_size'] = table_index_size results['information_table_data_size'] = table_data_size metrics.update(TABLE_VARS) if is_affirmative(self._config.options.get('replication', self._config.dbm_enabled)): if self.performance_schema_enabled and self._is_group_replication_active(db): self.log.debug('Collecting group replication metrics.') self._collect_group_replica_metrics(db, results) else: replication_metrics = self._collect_replication_metrics(db, results, above_560) metrics.update(replication_metrics) self._check_replication_status(results) if len(self._config.additional_status) > 0: additional_status_dict = {} for status_dict in self._config.additional_status: status_name = status_dict["name"] status_metric = status_dict["metric_name"] if status_name in metrics.keys(): collected_metric = metrics.get(status_name)[0] 
self.log.debug( "Skipping status variable %s for metric %s as it is already collected by %s", status_name, status_metric, collected_metric, ) else: additional_status_dict[status_dict["name"]] = (status_dict["metric_name"], status_dict["type"]) metrics.update(additional_status_dict) if len(self._config.additional_variable) > 0: additional_variable_dict = {} for variable_dict in self._config.additional_variable: variable_name = variable_dict["name"] variable_metric = variable_dict["metric_name"] if variable_name in metrics.keys(): collected_metric = metrics.get(variable_name)[0] self.log.debug( "Skipping variable %s for metric %s as it is already collected by %s", variable_name, variable_metric, collected_metric, ) else: additional_variable_dict[variable_name] = (variable_metric, variable_dict["type"]) metrics.update(additional_variable_dict) # "synthetic" metrics metrics.update(SYNTHETIC_VARS) self._compute_synthetic_results(results) # remove uncomputed metrics for k in SYNTHETIC_VARS: if k not in results: metrics.pop(k, None) # add duped metrics - reporting some as both rate and gauge dupes = [ ('Table_locks_waited', 'Table_locks_waited_rate'), ('Table_locks_immediate', 'Table_locks_immediate_rate'), ] for src, dst in dupes: if src in results: results[dst] = results[src] self._submit_metrics(metrics, results, tags) # Collect custom query metrics # Max of 20 queries allowed if isinstance(self._config.queries, list): for check in self._config.queries[: self._config.max_custom_queries]: total_tags = tags + check.get('tags', []) self._collect_dict( check['type'], {check['field']: check['metric']}, check['query'], db, tags=total_tags ) if len(self._config.queries) > self._config.max_custom_queries: self.warning( "Maximum number (%s) of custom queries reached. 
Skipping the rest.", self._config.max_custom_queries ) def _collect_replication_metrics(self, db, results, above_560): # Get replica stats replication_channel = self._config.options.get('replication_channel') results.update(self._get_replica_stats(db, self.is_mariadb, replication_channel)) nonblocking = is_affirmative(self._config.options.get('replication_non_blocking_status', False)) results.update(self._get_replica_status(db, above_560, nonblocking)) return REPLICA_VARS def _collect_group_replica_metrics(self, db, results): try: with closing(db.cursor()) as cursor: cursor.execute(SQL_GROUP_REPLICATION_MEMBER) replica_results = cursor.fetchone() status = self.OK additional_tags = [] if replica_results is None or len(replica_results) < 3: self.log.warning( 'Unable to get group replica status, setting mysql.replication.group.status as CRITICAL' ) status = self.CRITICAL else: status = self.OK if replica_results[1] == 'ONLINE' else self.CRITICAL additional_tags = [ 'channel_name:{}'.format(replica_results[0]), 'member_state:{}'.format(replica_results[1]), 'member_role:{}'.format(replica_results[2]), ] self.gauge('mysql.replication.group.member_status', 1, tags=additional_tags + self._config.tags) self.service_check( self.GROUP_REPLICATION_SERVICE_CHECK_NAME, status=status, tags=self._service_check_tags() + additional_tags, ) cursor.execute(SQL_GROUP_REPLICATION_METRICS) r = cursor.fetchone() if r is None: self.log.warning('Unable to get group replication metrics') return {} results = { 'Transactions_count': r[1], 'Transactions_check': r[2], 'Conflict_detected': r[3], 'Transactions_row_validating': r[4], 'Transactions_remote_applier_queue': r[5], 'Transactions_remote_applied': r[6], 'Transactions_local_proposed': r[7], 'Transactions_local_rollback': r[8], } # Submit metrics now so it's possible to attach `channel_name` tag self._submit_metrics( GROUP_REPLICATION_VARS, results, self._config.tags + ['channel_name:{}'.format(r[0])] ) return GROUP_REPLICATION_VARS except Exception as e: self.warning("Internal error happened during the group replication check: %s", e) return {} def _check_replication_status(self, results): # Replica_IO_Running: Whether the I/O thread for reading the source's binary log is running. # You want this to be Yes unless you have not yet started replication or have explicitly stopped it. replica_io_running = collect_type('Slave_IO_Running', results, dict) if replica_io_running is None: replica_io_running = collect_type('Replica_IO_Running', results, dict) # Replica_SQL_Running: Whether the SQL thread for executing events in the relay log is running. replica_sql_running = collect_type('Slave_SQL_Running', results, dict) if replica_sql_running is None: replica_sql_running = collect_type('Replica_SQL_Running', results, dict) if replica_io_running: replica_io_running = any(v.lower().strip() == 'yes' for v in itervalues(replica_io_running)) if replica_sql_running: replica_sql_running = any(v.lower().strip() == 'yes' for v in itervalues(replica_sql_running)) binlog_running = results.get('Binlog_enabled', False) # replicas will only be collected if user has PROCESS privileges. 
replicas = collect_scalar('Slaves_connected', results) if replicas is None: replicas = collect_scalar('Replicas_connected', results) # If the host act as a source source_repl_running_status = AgentCheck.UNKNOWN if self._is_source_host(replicas, results): if replicas > 0 and binlog_running: self.log.debug("Host is master, there are replicas and binlog is running") source_repl_running_status = AgentCheck.OK else: source_repl_running_status = AgentCheck.WARNING self._submit_replication_status(source_repl_running_status, ['replication_mode:source']) # If the host act as a replica # A host can be both a source and a replica # See https://dev.mysql.com/doc/refman/8.0/en/replication-solutions-performance.html # get replica running form global status page replica_running_status = AgentCheck.UNKNOWN if self._is_replica_host(replicas, results): if not (replica_io_running is None and replica_sql_running is None): if not replica_io_running and not replica_sql_running: self.log.debug("Replica_IO_Running and Replica_SQL_Running are not ok") replica_running_status = AgentCheck.CRITICAL elif not replica_io_running or not replica_sql_running: self.log.debug("Either Replica_IO_Running or Replica_SQL_Running are not ok") replica_running_status = AgentCheck.WARNING else: self.log.debug("Replica_IO_Running and Replica_SQL_Running are ok") replica_running_status = AgentCheck.OK self._submit_replication_status(replica_running_status, ['replication_mode:replica']) def _submit_replication_status(self, status, additional_tags): # deprecated in favor of service_check("mysql.replication.slave_running") self.gauge( name=self.SLAVE_SERVICE_CHECK_NAME, value=1 if status == AgentCheck.OK else 0, tags=self._config.tags + additional_tags, hostname=self.resolved_hostname, ) # deprecated in favor of service_check("mysql.replication.replica_running") self.service_check( self.SLAVE_SERVICE_CHECK_NAME, status, tags=self.service_check_tags + additional_tags, hostname=self.resolved_hostname, ) self.service_check( self.REPLICA_SERVICE_CHECK_NAME, status, tags=self.service_check_tags + additional_tags, hostname=self.resolved_hostname, ) def _is_source_host(self, replicas, results): # type: (float, Dict[str, Any]) -> bool # master uuid only collected in replicas source_host = collect_string('Master_Host', results) or collect_string('Source_Host', results) if replicas > 0 or not source_host: return True return False def _is_replica_host(self, replicas, results): return collect_string('Master_Host', results) or collect_string('Source_Host', results) def _is_group_replication_active(self, db): with closing(db.cursor()) as cursor: cursor.execute(SQL_GROUP_REPLICATION_PLUGIN_STATUS) r = cursor.fetchone() # Plugin is installed if r is not None and r[0].lower() == 'active': self.log.debug('Group replication plugin is detected and active') return True self.log.debug('Group replication plugin not detected') return False def _submit_metrics(self, variables, db_results, tags): for variable, metric in iteritems(variables): if isinstance(metric, list): for m in metric: metric_name, metric_type = m self.__submit_metric(metric_name, metric_type, variable, db_results, tags) else: metric_name, metric_type = metric self.__submit_metric(metric_name, metric_type, variable, db_results, tags) def __submit_metric(self, metric_name, metric_type, variable, db_results, tags): for tag, value in collect_all_scalars(variable, db_results): metric_tags = list(tags) if tag: if "," in tag: t_split = tag.split(",") for t in t_split: metric_tags.append(t) else: 
metric_tags.append(tag) if value is not None: if metric_type == RATE: self.rate(metric_name, value, tags=metric_tags, hostname=self.resolved_hostname) elif metric_type == GAUGE: self.gauge(metric_name, value, tags=metric_tags, hostname=self.resolved_hostname) elif metric_type == COUNT: self.count(metric_name, value, tags=metric_tags, hostname=self.resolved_hostname) elif metric_type == MONOTONIC: self.monotonic_count(metric_name, value, tags=metric_tags, hostname=self.resolved_hostname) def _collect_dict(self, metric_type, field_metric_map, query, db, tags): """ Query status and get a dictionary back. Extract each field out of the dictionary and stuff it in the corresponding metric. query: show status... field_metric_map: {"Seconds_behind_master": "mysqlSecondsBehindMaster"} """ try: with closing(db.cursor()) as cursor: cursor.execute(query) result = cursor.fetchone() if result is not None: for field, metric in list(iteritems(field_metric_map)): # Find the column name in the cursor description to identify the column index # http://www.python.org/dev/peps/pep-0249/ # cursor.description is a tuple of (column_name, ..., ...) try: col_idx = [d[0].lower() for d in cursor.description].index(field.lower()) self.log.debug("Collecting metric: %s", metric) if result[col_idx] is not None: self.log.debug("Collecting done, value %s", result[col_idx]) if metric_type == GAUGE: self.gauge( metric, float(result[col_idx]), tags=tags, hostname=self.resolved_hostname ) elif metric_type == RATE: self.rate( metric, float(result[col_idx]), tags=tags, hostname=self.resolved_hostname ) else: self.gauge( metric, float(result[col_idx]), tags=tags, hostname=self.resolved_hostname ) else: self.log.debug("Received value is None for index %d", col_idx) except ValueError: self.log.exception("Cannot find %s in the columns %s", field, cursor.description) except Exception: self.warning("Error while running %s\n%s", query, traceback.format_exc()) self.log.exception("Error while running %s", query) def _get_runtime_aurora_tags(self, db): runtime_tags = [] try: with closing(db.cursor()) as cursor: cursor.execute(SQL_REPLICATION_ROLE_AWS_AURORA) replication_role = cursor.fetchone()[0] if replication_role in {'writer', 'reader'}: runtime_tags.append('replication_role:' + replication_role) except Exception: self.log.warning("Error occurred while fetching Aurora runtime tags: %s", traceback.format_exc()) return runtime_tags def _collect_system_metrics(self, host, db, tags): pid = None # The server needs to run locally, accessed by TCP or socket if host in ["localhost", "127.0.0.1", "0.0.0.0"] or db.port == long(0): pid = self._get_server_pid(db) if pid: self.log.debug("System metrics for mysql w/ pid: %s", pid) # At last, get mysql cpu data out of psutil or procfs try: if PSUTIL_AVAILABLE: self.log.debug("psutil is available, attempting to collect mysql.performance.* metrics") proc = psutil.Process(pid) ucpu = proc.cpu_times()[0] scpu = proc.cpu_times()[1] if ucpu and scpu: self.rate("mysql.performance.user_time", ucpu, tags=tags, hostname=self.resolved_hostname) # should really be system_time self.rate("mysql.performance.kernel_time", scpu, tags=tags, hostname=self.resolved_hostname) self.rate("mysql.performance.cpu_time", ucpu + scpu, tags=tags, hostname=self.resolved_hostname) else: self.log.debug("psutil is not available, will not collect mysql.performance.* metrics") except Exception: self.warning("Error while reading mysql (pid: %s) procfs data\n%s", pid, traceback.format_exc()) def _get_pid_file_variable(self, db): """ Get 
the `pid_file` variable """ pid_file = None try: with closing(db.cursor()) as cursor: cursor.execute("SHOW VARIABLES LIKE 'pid_file'") pid_file = cursor.fetchone()[1] except Exception: self.warning("Error while fetching pid_file variable of MySQL.") return pid_file def _get_server_pid(self, db): pid = None # Try to get pid from pid file, it can fail for permission reason pid_file = self._get_pid_file_variable(db) if pid_file is not None: self.log.debug("pid file: %s", str(pid_file)) try: with open(pid_file, 'rb') as f: pid = int(f.readline()) except IOError: self.log.debug("Cannot read mysql pid file %s", pid_file) process_name = [PROC_NAME] if self.is_mariadb and self.version.version_compatible((10, 5, 0)): process_name.append("mariadbd") # If pid has not been found, read it from ps if pid is None and PSUTIL_AVAILABLE: for proc in psutil.process_iter(): try: if proc.name() in process_name: pid = proc.pid except (psutil.AccessDenied, psutil.ZombieProcess, psutil.NoSuchProcess): continue except Exception: self.log.exception("Error while fetching mysql pid from psutil") return pid def _get_is_aurora(self, db): """ Tests if the instance is an AWS Aurora database and caches the result. """ if self._is_aurora is not None: return self._is_aurora try: with closing(db.cursor()) as cursor: cursor.execute(SQL_SERVER_ID_AWS_AURORA) if len(cursor.fetchall()) > 0: self._is_aurora = True else: self._is_aurora = False except Exception: self.warning( "Unable to determine if server is Aurora. If this is an Aurora database, some " "information may be unavailable: %s", traceback.format_exc(), ) return False return self._is_aurora @classmethod def _get_stats_from_status(cls, db): with closing(db.cursor()) as cursor: cursor.execute("SHOW /*!50002 GLOBAL */ STATUS;") results = dict(cursor.fetchall()) return results @classmethod def _get_stats_from_variables(cls, db): with closing(db.cursor()) as cursor: cursor.execute("SHOW GLOBAL VARIABLES;") results = dict(cursor.fetchall()) return results def _get_binary_log_stats(self, db): try: with closing(db.cursor()) as cursor: cursor.execute("SHOW BINARY LOGS;") cursor_results = cursor.fetchall() master_logs = {result[0]: result[1] for result in cursor_results} binary_log_space = 0 for value in itervalues(master_logs): binary_log_space += value return binary_log_space except (pymysql.err.InternalError, pymysql.err.OperationalError) as e: self.warning("Privileges error accessing the BINARY LOGS (must grant REPLICATION CLIENT): %s", e) return None def _is_innodb_engine_enabled(self, db): # Whether InnoDB engine is available or not can be found out either # from the output of SHOW ENGINES or from information_schema.ENGINES # table. Later is chosen because that involves no string parsing. 
try: with closing(db.cursor()) as cursor: cursor.execute(SQL_INNODB_ENGINES) return cursor.rowcount > 0 except (pymysql.err.InternalError, pymysql.err.OperationalError, pymysql.err.NotSupportedError) as e: self.warning("Possibly innodb stats unavailable - error querying engines table: %s", e) return False def _get_replica_stats(self, db, is_mariadb, replication_channel): replica_results = defaultdict(dict) try: with closing(db.cursor(pymysql.cursors.DictCursor)) as cursor: if is_mariadb and replication_channel: cursor.execute("SET @@default_master_connection = '{0}';".format(replication_channel)) cursor.execute(show_replica_status_query(self.version, is_mariadb, replication_channel)) results = cursor.fetchall() self.log.debug("Getting replication status: %s", results) for replica_result in results: # MySQL <5.7 does not have Channel_Name. # For MySQL >=5.7 'Channel_Name' is set to an empty string by default channel = replication_channel or replica_result.get('Channel_Name') or 'default' for key, value in iteritems(replica_result): if value is not None: replica_results[key]['channel:{0}'.format(channel)] = value except (pymysql.err.InternalError, pymysql.err.OperationalError) as e: errno, msg = e.args if errno == 1617 and msg == "There is no master connection '{0}'".format(replication_channel): # MariaDB complains when you try to get replica status with a # connection name on the master, without connection name it # responds an empty string as expected. # Mysql behaves the same with or without connection name. pass else: self.warning("Privileges error getting replication status (must grant REPLICATION CLIENT): %s", e) try: with closing(db.cursor(pymysql.cursors.DictCursor)) as cursor: cursor.execute("SHOW MASTER STATUS;") binlog_results = cursor.fetchone() if binlog_results: replica_results.update({'Binlog_enabled': True}) except (pymysql.err.InternalError, pymysql.err.OperationalError) as e: self.warning("Privileges error getting binlog information (must grant REPLICATION CLIENT): %s", e) return replica_results def _get_replica_status(self, db, above_560, nonblocking): """ Retrieve the replicas statuses using: 1. The `performance_schema.threads` table. Non-blocking, requires version > 5.6.0 2. The `information_schema.processlist` table. Blocking """ try: with closing(db.cursor()) as cursor: if above_560 and nonblocking: # Query `performance_schema.threads` instead of ` # information_schema.processlist` to avoid mutex impact on performance. cursor.execute(SQL_WORKER_THREADS) else: cursor.execute(SQL_PROCESS_LIST) replica_results = cursor.fetchall() replicas = 0 for _ in replica_results: replicas += 1 return {'Replicas_connected': replicas} except (pymysql.err.InternalError, pymysql.err.OperationalError) as e: self.warning("Privileges error accessing the process tables (must grant PROCESS): %s", e) return {} @classmethod def _are_values_numeric(cls, array): return all(v.isdigit() for v in array) def _get_variable_enabled(self, results, var): enabled = collect_string(var, results) return enabled and enabled.lower().strip() == 'on' def _get_query_exec_time_95th_us(self, db): # Fetches the 95th percentile query execution time and returns the value # in microseconds try: with closing(db.cursor()) as cursor: cursor.execute(SQL_95TH_PERCENTILE) if cursor.rowcount < 1: self.warning( "Failed to fetch records from the perf schema \ 'events_statements_summary_by_digest' table." 
) return None row = cursor.fetchone() query_exec_time_95th_per = row[0] return query_exec_time_95th_per except (pymysql.err.InternalError, pymysql.err.OperationalError) as e: self.warning("95th percentile performance metrics unavailable at this time: %s", e) return None def _query_exec_time_per_schema(self, db): # Fetches the avg query execution time per schema and returns the # value in microseconds try: with closing(db.cursor()) as cursor: cursor.execute(SQL_AVG_QUERY_RUN_TIME) if cursor.rowcount < 1: self.warning( "Failed to fetch records from the perf schema \ 'events_statements_summary_by_digest' table." ) return None schema_query_avg_run_time = {} for row in cursor.fetchall(): schema_name = str(row[0]) avg_us = long(row[1]) # set the tag as the dictionary key schema_query_avg_run_time["schema:{0}".format(schema_name)] = avg_us return schema_query_avg_run_time except (pymysql.err.InternalError, pymysql.err.OperationalError) as e: self.warning("Size of schemas metrics unavailable at this time: %s", e) return {} def _query_size_per_table(self, db, system_tables=False): try: with closing(db.cursor()) as cursor: if system_tables: cursor.execute(SQL_QUERY_SYSTEM_TABLE_SIZE) else: cursor.execute(SQL_QUERY_TABLE_SIZE) if cursor.rowcount < 1: self.warning("Failed to fetch records from the information schema 'tables' table.") return None table_index_size = {} table_data_size = {} for row in cursor.fetchall(): table_schema = str(row[0]) table_name = str(row[1]) index_size = float(row[2]) data_size = float(row[3]) # set the tag as the dictionary key table_index_size["schema:{},table:{}".format(table_schema, table_name)] = index_size table_data_size["schema:{},table:{}".format(table_schema, table_name)] = data_size return table_index_size, table_data_size except (pymysql.err.InternalError, pymysql.err.OperationalError) as e: self.warning("Size of tables metrics unavailable at this time: %s", e) return None def _query_size_per_schema(self, db): # Fetches the avg query execution time per schema and returns the # value in microseconds try: with closing(db.cursor()) as cursor: cursor.execute(SQL_QUERY_SCHEMA_SIZE) if cursor.rowcount < 1: self.warning("Failed to fetch records from the information schema 'tables' table.") return None schema_size = {} for row in cursor.fetchall(): schema_name = str(row[0]) size = long(row[1]) # set the tag as the dictionary key schema_size["schema:{0}".format(schema_name)] = size return schema_size except (pymysql.err.InternalError, pymysql.err.OperationalError) as e: self.warning("Avg exec time performance metrics unavailable at this time: %s", e) return {} def _compute_synthetic_results(self, results): if ('Qcache_hits' in results) and ('Qcache_inserts' in results) and ('Qcache_not_cached' in results): if not int(results['Qcache_hits']): results['Qcache_utilization'] = 0 else: results['Qcache_utilization'] = ( float(results['Qcache_hits']) / (int(results['Qcache_inserts']) + int(results['Qcache_not_cached']) + int(results['Qcache_hits'])) * 100 ) if all(v is not None for v in (self._qcache_hits, self._qcache_inserts, self._qcache_not_cached)): if not (int(results['Qcache_hits']) - self._qcache_hits): results['Qcache_instant_utilization'] = 0 else: top = float(results['Qcache_hits']) - self._qcache_hits bottom = ( (int(results['Qcache_inserts']) - self._qcache_inserts) + (int(results['Qcache_not_cached']) - self._qcache_not_cached) + (int(results['Qcache_hits']) - self._qcache_hits) ) results['Qcache_instant_utilization'] = (top / bottom) * 100 # update all three, or 
none - for consistent samples. self._qcache_hits = int(results['Qcache_hits']) self._qcache_inserts = int(results['Qcache_inserts']) self._qcache_not_cached = int(results['Qcache_not_cached']) def record_warning(self, code, message): # type: (DatabaseConfigurationError, str) -> None self._warnings_by_code[code] = message def _report_warnings(self): messages = self._warnings_by_code.values() # Reset the warnings for the next check run self._warnings_by_code = {} for warning in messages: self.warning(warning)
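_check_replication_status above folds the collected Replica_IO_Running / Replica_SQL_Running values into a single replica-side status. A simplified standalone sketch of that decision table follows; the helper name and the plain-string statuses are hypothetical stand-ins for the AgentCheck constants used by the check.

# Simplified sketch of the replica-side status decision in _check_replication_status.

def replica_status(io_running, sql_running):
    # io_running / sql_running: True, False, or None when the variable was not collected.
    if io_running is None and sql_running is None:
        return 'UNKNOWN'
    if not io_running and not sql_running:
        return 'CRITICAL'  # both replication threads stopped
    if not io_running or not sql_running:
        return 'WARNING'   # one of the two threads stopped
    return 'OK'


if __name__ == '__main__':
    assert replica_status(True, True) == 'OK'
    assert replica_status(True, False) == 'WARNING'
    assert replica_status(False, False) == 'CRITICAL'
    assert replica_status(None, None) == 'UNKNOWN'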
class SnowflakeCheck(AgentCheck):
    """
    Collect Snowflake account usage metrics
    """

    __NAMESPACE__ = 'snowflake'

    SERVICE_CHECK_CONNECT = 'snowflake.can_connect'

    def __init__(self, *args, **kwargs):
        super(SnowflakeCheck, self).__init__(*args, **kwargs)
        self._config = Config(self.instance)
        self._conn = None

        self.proxy_host = self.init_config.get('proxy_host', None)
        self.proxy_port = self.init_config.get('proxy_port', None)
        self.proxy_user = self.init_config.get('proxy_user', None)
        self.proxy_password = self.init_config.get('proxy_password', None)

        # Add default tags like account to all metrics
        self._tags = self._config.tags + ['account:{}'.format(self._config.account)]

        if self._config.password:
            self.register_secret(self._config.password)

        if self._config.role == 'ACCOUNTADMIN':
            self.log.info(
                'Snowflake `role` is set as `ACCOUNTADMIN` which should be used cautiously, '
                'refer to docs about custom roles.'
            )

        self.metric_queries = []
        self.errors = []
        for mgroup in self._config.metric_groups:
            try:
                self.metric_queries.extend(METRIC_GROUPS[mgroup])
            except KeyError:
                self.errors.append(mgroup)

        if self.errors:
            self.log.warning('Invalid metric_groups found in snowflake conf.yaml: %s', ', '.join(self.errors))
        if not self.metric_queries:
            raise ConfigurationError('No valid metric_groups configured, please list at least one.')

        self._query_manager = QueryManager(self, self.execute_query_raw, queries=self.metric_queries, tags=self._tags)
        self.check_initializations.append(self._query_manager.compile_queries)

    def check(self, _):
        self.connect()

        if self._conn is not None:
            # Execute queries
            self._query_manager.execute()
            self._collect_version()
            self.log.debug("Closing connection to Snowflake...")
            self._conn.close()

    def execute_query_raw(self, query):
        """
        Executes query with timestamp from parts if comparing start_time field.
        """
        with closing(self._conn.cursor()) as cursor:
            cursor.execute(query)
            if cursor.rowcount is None or cursor.rowcount < 1:
                self.log.debug("Failed to fetch records from query: `%s`", query)
                return []
            return cursor.fetchall()

    def connect(self):
        self.log.debug(
            "Establishing a new connection to Snowflake: account=%s, user=%s, database=%s, schema=%s, warehouse=%s, "
            "role=%s, timeout=%s, authenticator=%s, ocsp_response_cache_filename=%s, proxy_host=%s, proxy_port=%s",
            self._config.account,
            self._config.user,
            self._config.database,
            self._config.schema,
            self._config.warehouse,
            self._config.role,
            self._config.login_timeout,
            self._config.authenticator,
            self._config.ocsp_response_cache_filename,
            self.proxy_host,
            self.proxy_port,
        )

        try:
            conn = sf.connect(
                user=self._config.user,
                password=self._config.password,
                account=self._config.account,
                database=self._config.database,
                schema=self._config.schema,
                warehouse=self._config.warehouse,
                role=self._config.role,
                passcode_in_password=self._config.passcode_in_password,
                passcode=self._config.passcode,
                client_prefetch_threads=self._config.client_prefetch_threads,
                login_timeout=self._config.login_timeout,
                ocsp_response_cache_filename=self._config.ocsp_response_cache_filename,
                authenticator=self._config.authenticator,
                token=self._config.token,
                client_session_keep_alive=self._config.client_keep_alive,
                proxy_host=self.proxy_host,
                proxy_port=self.proxy_port,
                proxy_user=self.proxy_user,
                proxy_password=self.proxy_password,
            )
        except Exception as e:
            msg = "Unable to connect to Snowflake: {}".format(e)
            self.service_check(self.SERVICE_CHECK_CONNECT, self.CRITICAL, message=msg, tags=self._tags)
            self.warning(msg)
        else:
            self.service_check(self.SERVICE_CHECK_CONNECT, self.OK, tags=self._tags)
            self._conn = conn

    @AgentCheck.metadata_entrypoint
    def _collect_version(self):
        try:
            raw_version = self.execute_query_raw("select current_version();")
            version = raw_version[0][0]
        except Exception as e:
            self.log.error("Error collecting version for Snowflake: %s", e)
        else:
            if version:
                self.set_metadata('version', version)
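In SnowflakeCheck.__init__ above, the configured metric_groups are folded into a flat query list for the QueryManager, with unknown group names accumulated as errors. A standalone sketch of that selection logic follows; METRIC_GROUPS here is a stand-in dict, and the group names and SQL placeholders are invented for illustration.

# Standalone sketch of how configured metric_groups are resolved into queries.
METRIC_GROUPS = {
    'snowflake.query': ['<query history SQL>'],
    'snowflake.billing': ['<warehouse metering SQL>'],
}


def resolve_metric_groups(configured_groups):
    queries, errors = [], []
    for group in configured_groups:
        try:
            queries.extend(METRIC_GROUPS[group])
        except KeyError:
            errors.append(group)
    return queries, errors


if __name__ == '__main__':
    queries, errors = resolve_metric_groups(['snowflake.query', 'snowflake.bogus'])
    print(queries)  # ['<query history SQL>']
    print(errors)   # ['snowflake.bogus']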
class SinglestoreCheck(AgentCheck):
    SERVICE_CHECK_NAME = "can_connect"

    __NAMESPACE__ = "singlestore"

    def __init__(self, name, init_config, instances):
        # type: (AnyStr, Dict[AnyStr, Any], List[Dict[AnyStr, Any]]) -> None
        super(SinglestoreCheck, self).__init__(name, init_config, instances)
        self.config = SingleStoreConfig(self.instance)
        self._connection = cast(pymysql.Connection, None)

        manager_queries = []
        manager_queries.extend(DEFAULT_QUERIES)
        if self.config.collect_system_metrics:
            manager_queries.extend(ADDITIONAL_SYSTEM_QUERIES)

        self._query_manager = QueryManager(self, self.execute_query_raw, queries=manager_queries, tags=self.config.tags)
        self.check_initializations.append(self._query_manager.compile_queries)
        self._service_check_tags = [
            'singlestore_endpoint:{}:{}'.format(self.config.host, self.config.port)
        ] + self.config.tags

    def check(self, _):
        # type: (Any) -> None
        with self.connect() as conn:
            self._connection = conn
            self._query_manager.execute()

        self._connection = cast(pymysql.Connection, None)

    def execute_query_raw(self, query):
        # type: (AnyStr) -> Iterable[Sequence]
        with closing(self._connection.cursor()) as cursor:
            cursor.execute(query)
            if cursor.rowcount < 1:
                self.log.warning("Failed to fetch records from query: `%s`.", query)
                return
            cleaner_method = get_row_cleaner(query)
            for row in cursor.fetchall():
                try:
                    yield cleaner_method(row)
                except Exception:
                    self.log.debug("Unable to clean row %r.", row, exc_info=True)
                    yield row

    @contextmanager
    def connect(self):
        # type: () -> Iterator[pymysql.Connection]
        ssl_context = self.get_tls_context() if self.config.use_tls else None
        conn = cast(pymysql.Connection, None)
        try:
            conn = pymysql.connect(
                host=self.config.host,
                port=self.config.port,
                user=self.config.username,
                password=self.config.password,
                connect_timeout=self.config.connect_timeout,
                read_timeout=self.config.read_timeout,
                ssl=ssl_context,
            )
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=self._service_check_tags)
            self.log.debug("Connected to SingleStore")
            yield conn
        except Exception:
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=self._service_check_tags)
            self.log.exception("Cannot connect to SingleStore")
            raise
        finally:
            if conn:
                conn.close()
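execute_query_raw above streams rows from a cursor and falls back to the raw row whenever the per-query cleaner raises. A minimal standalone sketch of that pattern is shown below, using mocked connection objects and a hypothetical clean_row helper in place of get_row_cleaner.

# Standalone sketch of the execute_query_raw() pattern: iterate a cursor, try to
# clean each row, and yield the raw row when cleaning fails. All objects are mocked.

from contextlib import closing
from unittest import mock


def clean_row(row):
    # Hypothetical cleaner: coerce the second column to float.
    return (row[0], float(row[1]))


def execute_query_raw(connection, query):
    with closing(connection.cursor()) as cursor:
        cursor.execute(query)
        for row in cursor.fetchall():
            try:
                yield clean_row(row)
            except Exception:
                yield row  # fall back to the raw row


if __name__ == '__main__':
    cursor = mock.MagicMock()
    cursor.fetchall.return_value = [('rows_read', '42'), ('uptime', 'not-a-number')]
    connection = mock.MagicMock()
    connection.cursor.return_value = cursor
    print(list(execute_query_raw(connection, 'SHOW STATUS')))
    # [('rows_read', 42.0), ('uptime', 'not-a-number')]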