class MySQLStatementMetrics(DBMAsyncJob): """ MySQLStatementMetrics collects database metrics per normalized MySQL statement """ def __init__(self, check, config, connection_args): # (MySql, MySQLConfig) -> None collection_interval = float( config.statement_metrics_config.get('collection_interval', 10)) if collection_interval <= 0: collection_interval = 10 super(MySQLStatementMetrics, self).__init__( check, rate_limit=1 / float(collection_interval), run_sync=is_affirmative( config.statement_metrics_config.get('run_sync', False)), enabled=is_affirmative( config.statement_metrics_config.get('enabled', True)), expected_db_exceptions=(pymysql.err.DatabaseError, ), min_collection_interval=config.min_collection_interval, dbms="mysql", job_name="statement-metrics", shutdown_callback=self._close_db_conn, ) self._metric_collection_interval = collection_interval self._connection_args = connection_args self._db = None self._config = config self.log = get_check_logger() self._state = StatementMetrics() self._obfuscate_options = to_native_string( json.dumps(self._config.obfuscator_options)) # full_statement_text_cache: limit the ingestion rate of full statement text events per query_signature self._full_statement_text_cache = TTLCache( maxsize=self._config.full_statement_text_cache_max_size, ttl=60 * 60 / self._config.full_statement_text_samples_per_hour_per_query, ) # type: TTLCache def _get_db_connection(self): """ lazy reconnect db pymysql connections are not thread safe so we can't reuse the same connection from the main check :return: """ if not self._db: self._db = pymysql.connect(**self._connection_args) return self._db def _close_db_conn(self): if self._db: try: self._db.close() except Exception: self._log.debug("Failed to close db connection", exc_info=1) finally: self._db = None def run_job(self): self.collect_per_statement_metrics() def collect_per_statement_metrics(self): # Detect a database misconfiguration by checking if the performance schema is enabled since mysql # just returns no rows without errors if the performance schema is disabled if not self._check.performance_schema_enabled: self._check.record_warning( DatabaseConfigurationError.performance_schema_not_enabled, warning_with_tags( 'Unable to collect statement metrics because the performance schema is disabled. ' 'See https://docs.datadoghq.com/database_monitoring/setup_mysql/' 'troubleshooting#%s for more details', DatabaseConfigurationError.performance_schema_not_enabled. value, code=DatabaseConfigurationError. 
performance_schema_not_enabled.value, host=self._check.resolved_hostname, ), ) return rows = self._collect_per_statement_metrics() if not rows: return for event in self._rows_to_fqt_events(rows): self._check.database_monitoring_query_sample( json.dumps(event, default=default_json_event_encoding)) # truncate query text to the maximum length supported by metrics tags for row in rows: row['digest_text'] = row['digest_text'][0:200] if row[ 'digest_text'] is not None else None payload = { 'host': self._check.resolved_hostname, 'timestamp': time.time() * 1000, 'mysql_version': self._check.version.version + '+' + self._check.version.build, 'mysql_flavor': self._check.version.flavor, "ddagenthostname": self._check.agent_hostname, 'ddagentversion': datadog_agent.get_version(), 'min_collection_interval': self._metric_collection_interval, 'tags': self._tags, 'cloud_metadata': self._config.cloud_metadata, 'mysql_rows': rows, } self._check.database_monitoring_query_metrics( json.dumps(payload, default=default_json_event_encoding)) def _collect_per_statement_metrics(self): # type: () -> List[PyMysqlRow] monotonic_rows = self._query_summary_per_statement() monotonic_rows = self._normalize_queries(monotonic_rows) rows = self._state.compute_derivative_rows(monotonic_rows, METRICS_COLUMNS, key=_row_key) return rows def _query_summary_per_statement(self): # type: () -> List[PyMysqlRow] """ Collects per-statement metrics from performance schema. Because the statement sums are cumulative, the results of the previous run are stored and subtracted from the current values to get the counts for the elapsed period. This is similar to monotonic_count, but several fields must be further processed from the delta values. """ sql_statement_summary = """\ SELECT `schema_name`, `digest`, `digest_text`, `count_star`, `sum_timer_wait`, `sum_lock_time`, `sum_errors`, `sum_rows_affected`, `sum_rows_sent`, `sum_rows_examined`, `sum_select_scan`, `sum_select_full_join`, `sum_no_index_used`, `sum_no_good_index_used` FROM performance_schema.events_statements_summary_by_digest WHERE `digest_text` NOT LIKE 'EXPLAIN %' OR `digest_text` IS NULL ORDER BY `count_star` DESC LIMIT 10000""" with closing(self._get_db_connection().cursor( pymysql.cursors.DictCursor)) as cursor: cursor.execute(sql_statement_summary) rows = cursor.fetchall() or [] # type: ignore return rows def _normalize_queries(self, rows): normalized_rows = [] for row in rows: normalized_row = dict(copy.copy(row)) try: statement = obfuscate_sql_with_metadata( row['digest_text'], self._obfuscate_options) obfuscated_statement = statement['query'] if row[ 'digest_text'] is not None else None except Exception as e: self.log.warning("Failed to obfuscate query '%s': %s", row['digest_text'], e) continue normalized_row['digest_text'] = obfuscated_statement normalized_row['query_signature'] = compute_sql_signature( obfuscated_statement) metadata = statement['metadata'] normalized_row['dd_tables'] = metadata.get('tables', None) normalized_row['dd_commands'] = metadata.get('commands', None) normalized_rows.append(normalized_row) return normalized_rows def _rows_to_fqt_events(self, rows): for row in rows: query_cache_key = _row_key(row) if query_cache_key in self._full_statement_text_cache: continue self._full_statement_text_cache[query_cache_key] = True row_tags = self._tags + ["schema:{}".format(row['schema_name']) ] if row['schema_name'] else self._tags yield { "timestamp": time.time() * 1000, "host": self._check.resolved_hostname, "ddagentversion": datadog_agent.get_version(), 
"ddsource": "mysql", "ddtags": ",".join(row_tags), "dbm_type": "fqt", "db": { "instance": row['schema_name'], "query_signature": row['query_signature'], "statement": row['digest_text'], "metadata": { "tables": row['dd_tables'], "commands": row['dd_commands'], }, }, "mysql": { "schema": row["schema_name"] }, }
class PostgresStatementMetrics(object):
    """Collects telemetry for SQL statements"""

    def __init__(self, check, config):
        self._check = check
        self._config = config
        self._db_hostname = None
        self._log = get_check_logger()
        self._state = StatementMetrics()

    def _execute_query(self, cursor, query, params=()):
        try:
            self._log.debug("Running query [%s] %s", query, params)
            cursor.execute(query, params)
            return cursor.fetchall()
        except (psycopg2.ProgrammingError, psycopg2.errors.QueryCanceled) as e:
            self._log.warning('Statement-level metrics are unavailable: %s', e)
            return []

    def _get_pg_stat_statements_columns(self, db):
        """
        Load the list of the columns available under the `pg_stat_statements` table. This must be queried because
        version is not a reliable way to determine the available columns on `pg_stat_statements`. The database can
        be upgraded without upgrading extensions, even when the extension is included by default.
        """
        # Querying over '*' with limit 0 allows fetching only the column names from the cursor without data
        query = STATEMENTS_QUERY.format(
            cols='*', pg_stat_statements_view=self._config.pg_stat_statements_view, limit=0, filters=""
        )
        cursor = db.cursor()
        self._execute_query(cursor, query, params=(self._config.dbname,))
        colnames = [desc[0] for desc in cursor.description] if cursor.description else None
        return colnames

    def _db_hostname_cached(self):
        if self._db_hostname:
            return self._db_hostname
        self._db_hostname = resolve_db_host(self._config.host)
        return self._db_hostname

    def collect_per_statement_metrics(self, db, tags):
        try:
            rows = self._collect_metrics_rows(db)
            if not rows:
                return
            payload = {
                'host': self._db_hostname_cached(),
                'timestamp': time.time() * 1000,
                'min_collection_interval': self._config.min_collection_interval,
                'tags': tags,
                'postgres_rows': rows,
            }
            self._check.database_monitoring_query_metrics(json.dumps(payload, default=default_json_event_encoding))
        except Exception:
            db.rollback()
            self._log.exception('Unable to collect statement metrics due to an error')
            return []

    def _load_pg_stat_statements(self, db):
        available_columns = set(self._get_pg_stat_statements_columns(db))
        missing_columns = PG_STAT_STATEMENTS_REQUIRED_COLUMNS - available_columns
        if len(missing_columns) > 0:
            self._log.warning(
                'Unable to collect statement metrics because required fields are unavailable: %s',
                ', '.join(list(missing_columns)),
            )
            return []

        query_columns = sorted(list(PG_STAT_ALL_DESIRED_COLUMNS & available_columns))
        params = ()
        filters = ""
        if self._config.dbstrict:
            filters = "AND pg_database.datname = %s"
            params = (self._config.dbname,)
        return self._execute_query(
            db.cursor(cursor_factory=psycopg2.extras.DictCursor),
            STATEMENTS_QUERY.format(
                cols=', '.join(query_columns),
                pg_stat_statements_view=self._config.pg_stat_statements_view,
                filters=filters,
                limit=DEFAULT_STATEMENTS_LIMIT,
            ),
            params=params,
        )

    def _collect_metrics_rows(self, db):
        rows = self._load_pg_stat_statements(db)

        def row_keyfunc(row):
            return (row['query_signature'], row['datname'], row['rolname'])

        rows = self._normalize_queries(rows)
        rows = self._state.compute_derivative_rows(rows, PG_STAT_STATEMENTS_METRICS_COLUMNS, key=row_keyfunc)
        self._check.gauge('dd.postgres.queries.query_rows_raw', len(rows))

        return rows

    def _normalize_queries(self, rows):
        normalized_rows = []
        for row in rows:
            normalized_row = dict(copy.copy(row))
            try:
                obfuscated_statement = datadog_agent.obfuscate_sql(row['query'])
            except Exception as e:
                # obfuscation errors are relatively common so only log them during debugging
                self._log.debug("Failed to obfuscate query '%s': %s", row['query'], e)
                continue

            normalized_row['query'] = obfuscated_statement
            normalized_row['query_signature'] = compute_sql_signature(obfuscated_statement)
            normalized_rows.append(normalized_row)

        return normalized_rows
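# The `_get_pg_stat_statements_columns` helper above relies on the trick described in its
# comment: selecting '*' with LIMIT 0 transfers no rows, but the cursor still describes the
# result set, so it reveals which columns the installed extension version exposes. A minimal
# standalone sketch of that technique (placeholder connection parameters and a plain view name;
# the real check formats STATEMENTS_QUERY against its configured pg_stat_statements view):
import psycopg2


def available_columns(conn, view="pg_stat_statements"):
    with conn.cursor() as cursor:
        cursor.execute("SELECT * FROM {} LIMIT 0".format(view))
        return [desc[0] for desc in cursor.description] if cursor.description else []


# Usage sketch:
# conn = psycopg2.connect(host="localhost", dbname="postgres", user="datadog")
# print(available_columns(conn))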
class SqlserverStatementMetrics(DBMAsyncJob): """Collects query metrics and plans""" def __init__(self, check): self.check = check self.log = check.log collection_interval = float( check.statement_metrics_config.get('collection_interval', DEFAULT_COLLECTION_INTERVAL) ) if collection_interval <= 0: collection_interval = DEFAULT_COLLECTION_INTERVAL self.collection_interval = collection_interval super(SqlserverStatementMetrics, self).__init__( check, run_sync=is_affirmative(check.statement_metrics_config.get('run_sync', False)), enabled=is_affirmative(check.statement_metrics_config.get('enabled', True)), expected_db_exceptions=(), min_collection_interval=check.min_collection_interval, dbms="sqlserver", rate_limit=1 / float(collection_interval), job_name="query-metrics", shutdown_callback=self._close_db_conn, ) self.disable_secondary_tags = is_affirmative( check.statement_metrics_config.get('disable_secondary_tags', False) ) self.dm_exec_query_stats_row_limit = int( check.statement_metrics_config.get('dm_exec_query_stats_row_limit', 10000) ) self.enforce_collection_interval_deadline = is_affirmative( check.statement_metrics_config.get('enforce_collection_interval_deadline', True) ) self._state = StatementMetrics() self._init_caches() self._conn_key_prefix = "dbm-" self._statement_metrics_query = None self._last_stats_query_time = None def _init_caches(self): # full_statement_text_cache: limit the ingestion rate of full statement text events per query_signature self._full_statement_text_cache = TTLCache( maxsize=self.check.instance.get('full_statement_text_cache_max_size', 10000), ttl=60 * 60 / self.check.instance.get('full_statement_text_samples_per_hour_per_query', 1), ) # seen_plans_ratelimiter: limit the ingestion rate per unique plan. # plans, we only really need them once per hour self._seen_plans_ratelimiter = RateLimitingTTLCache( # assuming ~100 bytes per entry (query & plan signature, key hash, 4 pointers (ordered dict), expiry time) # total size: 10k * 100 = 1 Mb maxsize=int(self.check.instance.get('seen_samples_cache_maxsize', 10000)), ttl=60 * 60 / int(self.check.instance.get('samples_per_hour_per_query', 4)), ) def _close_db_conn(self): pass def _get_available_query_metrics_columns(self, cursor, all_expected_columns): cursor.execute("select top 0 * from sys.dm_exec_query_stats") all_columns = set([i[0] for i in cursor.description]) available_columns = [c for c in all_expected_columns if c in all_columns] missing_columns = set(all_expected_columns) - set(available_columns) if missing_columns: self.log.debug( "missing the following expected query metrics columns from dm_exec_query_stats: %s", missing_columns ) self.log.debug("found available sys.dm_exec_query_stats columns: %s", available_columns) return available_columns def _get_statement_metrics_query_cached(self, cursor): if self._statement_metrics_query: return self._statement_metrics_query available_columns = self._get_available_query_metrics_columns(cursor, SQL_SERVER_QUERY_METRICS_COLUMNS) statements_query = ( STATEMENT_METRICS_QUERY_NO_AGGREGATES if self.disable_secondary_tags else STATEMENT_METRICS_QUERY ) self._statement_metrics_query = statements_query.format( query_metrics_columns=', '.join(available_columns), query_metrics_column_sums=', '.join(['sum({}) as {}'.format(c, c) for c in available_columns]), collection_interval=int(math.ceil(self.collection_interval) * 2), limit=self.dm_exec_query_stats_row_limit, ) return self._statement_metrics_query @tracked_method(agent_check_getter=agent_check_getter, 
track_result_length=True) def _load_raw_query_metrics_rows(self, cursor): self.log.debug("collecting sql server statement metrics") statement_metrics_query = self._get_statement_metrics_query_cached(cursor) now = time.time() query_interval = self.collection_interval if self._last_stats_query_time: query_interval = now - self._last_stats_query_time self._last_stats_query_time = now params = (math.ceil(query_interval),) self.log.debug("Running query [%s] %s", statement_metrics_query, params) cursor.execute(statement_metrics_query, params) columns = [i[0] for i in cursor.description] # construct row dicts manually as there's no DictCursor for pyodbc rows = [dict(zip(columns, row)) for row in cursor.fetchall()] self.log.debug("loaded sql server statement metrics len(rows)=%s", len(rows)) return rows def _normalize_queries(self, rows): normalized_rows = [] for row in rows: try: statement = obfuscate_sql_with_metadata(row['text'], self.check.obfuscator_options) except Exception as e: # obfuscation errors are relatively common so only log them during debugging self.log.debug("Failed to obfuscate query: %s", e) self.check.count( "dd.sqlserver.statements.error", 1, **self.check.debug_stats_kwargs(tags=["error:obfuscate-query-{}".format(type(e))]) ) continue obfuscated_statement = statement['query'] row['text'] = obfuscated_statement row['query_signature'] = compute_sql_signature(obfuscated_statement) row['query_hash'] = _hash_to_hex(row['query_hash']) row['query_plan_hash'] = _hash_to_hex(row['query_plan_hash']) row['plan_handle'] = _hash_to_hex(row['plan_handle']) metadata = statement['metadata'] row['dd_tables'] = metadata.get('tables', None) row['dd_commands'] = metadata.get('commands', None) row['dd_comments'] = metadata.get('comments', None) normalized_rows.append(row) return normalized_rows def _collect_metrics_rows(self, cursor): rows = self._load_raw_query_metrics_rows(cursor) rows = self._normalize_queries(rows) if not rows: return [] metric_columns = [c for c in rows[0].keys() if c.startswith("total_") or c == 'execution_count'] rows = self._state.compute_derivative_rows(rows, metric_columns, key=_row_key) return rows @staticmethod def _to_metrics_payload_row(row): row = {k: v for k, v in row.items()} if 'dd_comments' in row: del row['dd_comments'] # truncate query text to the maximum length supported by metrics tags row['text'] = row['text'][0:200] return row def _to_metrics_payload(self, rows): return { 'host': self.check.resolved_hostname, 'timestamp': time.time() * 1000, 'min_collection_interval': self.collection_interval, 'tags': self.check.tags, 'cloud_metadata': self.check.cloud_metadata, 'sqlserver_rows': [self._to_metrics_payload_row(r) for r in rows], 'sqlserver_version': self.check.static_info_cache.get(STATIC_INFO_VERSION, ""), 'ddagentversion': datadog_agent.get_version(), 'ddagenthostname': self._check.agent_hostname, } @tracked_method(agent_check_getter=agent_check_getter) def collect_statement_metrics_and_plans(self): """ Collects statement metrics and plans. :return: """ plans_submitted = 0 deadline = time.time() + self.collection_interval # re-use the check's conn module, but set extra_key=dbm- to ensure we get our own # raw connection. adodbapi and pyodbc modules are thread safe, but connections are not. 
with self.check.connection.open_managed_default_connection(key_prefix=self._conn_key_prefix): with self.check.connection.get_managed_cursor(key_prefix=self._conn_key_prefix) as cursor: rows = self._collect_metrics_rows(cursor) if not rows: return for event in self._rows_to_fqt_events(rows): self.check.database_monitoring_query_sample(json.dumps(event, default=default_json_event_encoding)) payload = self._to_metrics_payload(rows) self.check.database_monitoring_query_metrics(json.dumps(payload, default=default_json_event_encoding)) for event in self._collect_plans(rows, cursor, deadline): self.check.database_monitoring_query_sample(json.dumps(event, default=default_json_event_encoding)) plans_submitted += 1 self.check.count( "dd.sqlserver.statements.plans_submitted.count", plans_submitted, **self.check.debug_stats_kwargs() ) self.check.gauge( "dd.sqlserver.statements.seen_plans_cache.len", len(self._seen_plans_ratelimiter), **self.check.debug_stats_kwargs() ) self.check.gauge( "dd.sqlserver.statements.fqt_cache.len", len(self._full_statement_text_cache), **self.check.debug_stats_kwargs() ) def _rows_to_fqt_events(self, rows): for row in rows: query_cache_key = _row_key(row) if query_cache_key in self._full_statement_text_cache: continue self._full_statement_text_cache[query_cache_key] = True tags = list(self.check.tags) if 'database_name' in row: tags += ["db:{}".format(row['database_name'])] yield { "timestamp": time.time() * 1000, "host": self.check.resolved_hostname, "ddagentversion": datadog_agent.get_version(), "ddsource": "sqlserver", "ddtags": ",".join(tags), "dbm_type": "fqt", "db": { "instance": row.get('database_name', None), "query_signature": row['query_signature'], "statement": row['text'], "metadata": { "tables": row['dd_tables'], "commands": row['dd_commands'], }, }, 'sqlserver': { 'query_hash': row['query_hash'], 'query_plan_hash': row['query_plan_hash'], }, } def run_job(self): self.collect_statement_metrics_and_plans() @tracked_method(agent_check_getter=agent_check_getter) def _load_plan(self, plan_handle, cursor): self.log.debug("collecting plan. 
plan_handle=%s", plan_handle) self.log.debug("Running query [%s] %s", PLAN_LOOKUP_QUERY, (plan_handle,)) cursor.execute(PLAN_LOOKUP_QUERY, ("0x" + plan_handle,)) result = cursor.fetchall() if not result or not result[0]: self.log.debug("failed to loan plan, it must have just been expired out of the plan cache") return None, None return result[0] @tracked_method(agent_check_getter=agent_check_getter) def _collect_plans(self, rows, cursor, deadline): for row in rows: if self.enforce_collection_interval_deadline and time.time() > deadline: self.log.debug("ending plan collection early because check deadline has been exceeded") self.check.count("dd.sqlserver.statements.deadline_exceeded", 1, **self.check.debug_stats_kwargs()) return plan_key = (row['query_signature'], row['query_hash'], row['query_plan_hash']) if self._seen_plans_ratelimiter.acquire(plan_key): raw_plan, is_plan_encrypted = self._load_plan(row['plan_handle'], cursor) obfuscated_plan, collection_errors = None, None try: if raw_plan: obfuscated_plan = obfuscate_xml_plan(raw_plan, self.check.obfuscator_options) except Exception as e: self.log.debug( ( "failed to obfuscate XML Plan query_signature=%s query_hash=%s " "query_plan_hash=%s plan_handle=%s: %s" ), row['query_signature'], row['query_hash'], row['query_plan_hash'], row['plan_handle'], e, ) collection_errors = [{'code': "obfuscate_xml_plan_error", 'message': str(e)}] self.check.count( "dd.sqlserver.statements.error", 1, **self.check.debug_stats_kwargs(tags=["error:obfuscate-xml-plan-{}".format(type(e))]) ) tags = list(self.check.tags) if 'database_name' in row: tags += ["db:{}".format(row['database_name'])] yield { "host": self.check.resolved_hostname, "ddagentversion": datadog_agent.get_version(), "ddsource": "sqlserver", "ddtags": ",".join(tags), "timestamp": time.time() * 1000, "dbm_type": "plan", "db": { "instance": row.get("database_name", None), "plan": { "definition": obfuscated_plan, "signature": row['query_plan_hash'], "collection_errors": collection_errors, }, "query_signature": row['query_signature'], "statement": row['text'], "metadata": { "tables": row['dd_tables'], "commands": row['dd_commands'], "comments": row['dd_comments'], }, }, 'sqlserver': { "is_plan_encrypted": is_plan_encrypted, "is_statement_encrypted": row['is_encrypted'], 'query_hash': row['query_hash'], 'query_plan_hash': row['query_plan_hash'], 'plan_handle': row['plan_handle'], 'execution_count': row.get('execution_count', None), 'total_elapsed_time': row.get('total_elapsed_time', None), }, }
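# The `_seen_plans_ratelimiter` used by `_collect_plans` above admits each
# (query_signature, query_hash, query_plan_hash) key only once per TTL window, since a plan
# only really needs to be re-submitted about once per hour. A minimal sketch of that
# acquire-once behavior, using a plain cachetools TTLCache rather than the agent's
# RateLimitingTTLCache:
from cachetools import TTLCache


class SeenPlansLimiter(object):
    def __init__(self, maxsize=10000, samples_per_hour_per_query=4):
        # With 4 samples per hour, a given plan key is admitted roughly once every
        # 15 minutes, once its previous cache entry has expired.
        self._cache = TTLCache(maxsize=maxsize, ttl=60 * 60 / samples_per_hour_per_query)

    def acquire(self, key):
        # True only the first time a key is seen within the TTL window.
        if key in self._cache:
            return False
        self._cache[key] = True
        return True


# Usage sketch:
# limiter = SeenPlansLimiter()
# if limiter.acquire((row['query_signature'], row['query_hash'], row['query_plan_hash'])):
#     ...  # load the plan handle and submit the plan event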
class PostgresStatementMetrics(object): """Collects telemetry for SQL statements""" def __init__(self, config): self.config = config self.log = get_check_logger() self._state = StatementMetrics() def _execute_query(self, cursor, query, params=()): try: cursor.execute(query, params) return cursor.fetchall() except (psycopg2.ProgrammingError, psycopg2.errors.QueryCanceled) as e: self.log.warning('Statement-level metrics are unavailable: %s', e) return [] def _get_pg_stat_statements_columns(self, db): """ Load the list of the columns available under the `pg_stat_statements` table. This must be queried because version is not a reliable way to determine the available columns on `pg_stat_statements`. The database can be upgraded without upgrading extensions, even when the extension is included by default. """ # Querying over '*' with limit 0 allows fetching only the column names from the cursor without data query = STATEMENTS_QUERY.format( cols='*', pg_stat_statements_view=self.config.pg_stat_statements_view, limit=0, ) cursor = db.cursor() self._execute_query(cursor, query, params=(self.config.dbname, )) colnames = [desc[0] for desc in cursor.description] return colnames def collect_per_statement_metrics(self, db): try: return self._collect_per_statement_metrics(db) except Exception: db.rollback() self.log.exception( 'Unable to collect statement metrics due to an error') return [] def _collect_per_statement_metrics(self, db): metrics = [] available_columns = self._get_pg_stat_statements_columns(db) missing_columns = PG_STAT_STATEMENTS_REQUIRED_COLUMNS - set( available_columns) if len(missing_columns) > 0: self.log.warning( 'Unable to collect statement metrics because required fields are unavailable: %s', ', '.join(list(missing_columns)), ) return metrics desired_columns = (list(PG_STAT_STATEMENTS_METRIC_COLUMNS.keys()) + list(PG_STAT_STATEMENTS_OPTIONAL_COLUMNS) + list(PG_STAT_STATEMENTS_TAG_COLUMNS.keys())) query_columns = list( set(desired_columns) & set(available_columns) | set(PG_STAT_STATEMENTS_TAG_COLUMNS.keys())) rows = self._execute_query( db.cursor(cursor_factory=psycopg2.extras.DictCursor), STATEMENTS_QUERY.format( cols=', '.join(query_columns), pg_stat_statements_view=self.config.pg_stat_statements_view, limit=DEFAULT_STATEMENTS_LIMIT, ), params=(self.config.dbname, ), ) if not rows: return metrics def row_keyfunc(row): # old versions of pg_stat_statements don't have a query ID so fall back to the query string itself queryid = row['queryid'] if 'queryid' in row else row['query'] return (queryid, row['datname'], row['rolname']) rows = self._state.compute_derivative_rows( rows, PG_STAT_STATEMENTS_METRIC_COLUMNS.keys(), key=row_keyfunc) rows = apply_row_limits(rows, DEFAULT_STATEMENT_METRIC_LIMITS, tiebreaker_metric='calls', tiebreaker_reverse=True, key=row_keyfunc) for row in rows: try: normalized_query = datadog_agent.obfuscate_sql(row['query']) if not normalized_query: self.log.warning( "Obfuscation of query '%s' resulted in empty query", row['query']) continue except Exception as e: # If query obfuscation fails, it is acceptable to log the raw query here because the # pg_stat_statements table contains no parameters in the raw queries. self.log.warning("Failed to obfuscate query '%s': %s", row['query'], e) continue query_signature = compute_sql_signature(normalized_query) # All "Deep Database Monitoring" statement-level metrics are tagged with a `query_signature` # which uniquely identifies the normalized query family. 
            # Where possible, this hash should match the hash of APM "resources"
            # (https://docs.datadoghq.com/tracing/visualization/resource/) when the resource is a SQL query.
            # Postgres' query normalization in the `pg_stat_statements` table preserves most of the original
            # query, so we tag the `resource_hash` with the same value as the `query_signature`. The
            # `resource_hash` tag should match the *actual* APM resource hash most of the time, but not
            # always. So this is a best-effort approach to link these metrics to APM metrics.
            tags = ['query_signature:' + query_signature, 'resource_hash:' + query_signature]

            for column, tag_name in PG_STAT_STATEMENTS_TAG_COLUMNS.items():
                if column not in row:
                    continue
                value = row[column]
                if column == 'query':
                    value = normalize_query_tag(normalized_query)
                tags.append('{tag_name}:{value}'.format(tag_name=tag_name, value=value))

            for column, metric_name in PG_STAT_STATEMENTS_METRIC_COLUMNS.items():
                if column not in row:
                    continue
                value = row[column]
                if column == 'total_time':
                    # All "Deep Database Monitoring" timing metrics are in nanoseconds
                    # Postgres tracks pg_stat* timing stats in milliseconds
                    value = milliseconds_to_nanoseconds(value)
                metrics.append((metric_name, value, tags))

        return metrics
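# The `total_time` branch above depends on a unit conversion: pg_stat_statements reports timings
# in milliseconds while the DBM timing metrics are emitted in nanoseconds. A hypothetical
# reconstruction of the `milliseconds_to_nanoseconds` helper referenced (but not defined) in
# this section:
def milliseconds_to_nanoseconds(value):
    return value * 1000000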
def test_compute_derivative_rows_boundary_cases(self, fn_args):
    sm = StatementMetrics()
    sm.compute_derivative_rows(*fn_args)
    sm.compute_derivative_rows(*fn_args)
def test_compute_derivative_rows_happy_path(self):
    sm = StatementMetrics()
    rows1 = [
        {'count': 13, 'time': 2005, 'errors': 1, 'query': 'COMMIT', 'db': 'puppies', 'user': '******'},
        {'count': 25, 'time': 105, 'errors': 0, 'query': 'ROLLBACK', 'db': 'puppies', 'user': '******'},
        {'count': 1, 'time': 10005, 'errors': 0, 'query': 'select * from kennel', 'db': 'puppies', 'user': '******'},
        {
            'count': 99991882777665555,
            'time': 10005,
            'errors': 0,
            'query': 'update kennel set breed="dalmatian" where id = ?',
            'db': 'puppies',
            'user': '******',
        },
    ]

    def key(row):
        return (row['query'], row['db'], row['user'])

    metrics = ['count', 'time']

    assert [] == sm.compute_derivative_rows(rows1, metrics, key=key)
    # No changes should produce no rows
    assert [] == sm.compute_derivative_rows(rows1, metrics, key=key)

    rows2 = [
        {'count': 1, 'time': 1, 'errors': 1, 'query': 'SELECT CURRENT_TIME', 'db': 'puppies', 'user': '******'},
        add_to_dict(rows1[0], {'count': 0, 'time': 0, 'errors': 15}),
        add_to_dict(rows1[1], {'count': 1, 'time': 15, 'errors': 0}),
        add_to_dict(rows1[2], {'count': 20, 'time': 900, 'errors': 0}),
        add_to_dict(rows1[3], {'count': 7, 'time': 0.5, 'errors': 0}),
    ]
    expected = [
        # First row only incremented 'errors' which is not a tracked metric, so it is omitted from the output
        {'count': 1, 'time': 15, 'errors': 0, 'query': 'ROLLBACK', 'db': 'puppies', 'user': '******'},
        {'count': 20, 'time': 900, 'errors': 0, 'query': 'select * from kennel', 'db': 'puppies', 'user': '******'},
        {
            'count': 7,
            'time': 0.5,
            'errors': 0,
            'query': 'update kennel set breed="dalmatian" where id = ?',
            'db': 'puppies',
            'user': '******',
        },
    ]
    assert expected == sm.compute_derivative_rows(rows2, metrics, key=key)
    # No changes should produce no rows
    assert [] == sm.compute_derivative_rows(rows2, metrics, key=key)
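# The derivative-row tests rely on an `add_to_dict` helper that is not shown in this section.
# A plausible reconstruction, consistent with how the tests reuse earlier rows, returns a copy
# of the row with the given increments applied instead of mutating its input:
import copy


def add_to_dict(row, increments):
    new_row = copy.copy(row)
    for field, delta in increments.items():
        new_row[field] += delta
    return new_row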
class MySQLStatementMetrics(object): """ MySQLStatementMetrics collects database metrics per normalized MySQL statement """ def __init__(self, config): # type: (MySQLConfig) -> None self.config = config self.log = get_check_logger() self._state = StatementMetrics() def collect_per_statement_metrics(self, db): # type: (pymysql.connections.Connection) -> List[Metric] try: return self._collect_per_statement_metrics(db) except Exception: self.log.exception( 'Unable to collect statement metrics due to an error') return [] def _collect_per_statement_metrics(self, db): # type: (pymysql.connections.Connection) -> List[Metric] metrics = [] def keyfunc(row): return (row['schema'], row['digest']) monotonic_rows = self._query_summary_per_statement(db) monotonic_rows = self._merge_duplicate_rows(monotonic_rows, key=keyfunc) rows = self._state.compute_derivative_rows(monotonic_rows, STATEMENT_METRICS.keys(), key=keyfunc) rows = apply_row_limits( rows, DEFAULT_STATEMENT_METRIC_LIMITS, tiebreaker_metric='count', tiebreaker_reverse=True, key=keyfunc, ) for row in rows: tags = [] tags.append('digest:' + row['digest']) if row['schema'] is not None: tags.append('schema:' + row['schema']) try: obfuscated_statement = datadog_agent.obfuscate_sql( row['query']) except Exception as e: self.log.warning("Failed to obfuscate query '%s': %s", row['query'], e) continue tags.append('query_signature:' + compute_sql_signature(obfuscated_statement)) tags.append('query:' + normalize_query_tag(obfuscated_statement).strip()) for col, name in STATEMENT_METRICS.items(): value = row[col] metrics.append((name, value, tags)) return metrics @staticmethod def _merge_duplicate_rows(rows, key): # type: (List[PyMysqlRow], RowKeyFunction) -> List[PyMysqlRow] """ Merges the metrics from duplicate rows because the (schema, digest) identifier may not be unique, see: https://bugs.mysql.com/bug.php?id=79533 """ merged = {} # type: Dict[RowKey, PyMysqlRow] for row in rows: k = key(row) if k in merged: for m in STATEMENT_METRICS: merged[k][m] += row[m] else: merged[k] = copy.copy(row) return list(merged.values()) def _query_summary_per_statement(self, db): # type: (pymysql.connections.Connection) -> List[PyMysqlRow] """ Collects per-statement metrics from performance schema. Because the statement sums are cumulative, the results of the previous run are stored and subtracted from the current values to get the counts for the elapsed period. This is similar to monotonic_count, but several fields must be further processed from the delta values. """ sql_statement_summary = """\ SELECT `schema_name` as `schema`, `digest` as `digest`, `digest_text` as `query`, `count_star` as `count`, `sum_timer_wait` / 1000 as `time`, `sum_lock_time` / 1000 as `lock_time`, `sum_errors` as `errors`, `sum_rows_affected` as `rows_affected`, `sum_rows_sent` as `rows_sent`, `sum_rows_examined` as `rows_examined`, `sum_select_scan` as `select_scan`, `sum_select_full_join` as `select_full_join`, `sum_no_index_used` as `no_index_used`, `sum_no_good_index_used` as `no_good_index_used` FROM performance_schema.events_statements_summary_by_digest WHERE `digest_text` NOT LIKE 'EXPLAIN %' ORDER BY `count_star` DESC LIMIT 10000""" rows = [] # type: List[PyMysqlRow] try: with closing(db.cursor(pymysql.cursors.DictCursor)) as cursor: cursor.execute(sql_statement_summary) rows = cursor.fetchall() or [] # type: ignore except (pymysql.err.InternalError, pymysql.err.OperationalError) as e: self.log.warning( "Statement summary metrics are unavailable at this time: %s", e) return rows
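# The SELECT above divides `sum_timer_wait` and `sum_lock_time` by 1000 because
# performance_schema timer columns are reported in picoseconds; the division yields
# nanoseconds, the unit the resulting `time` and `lock_time` metrics use. (That unit detail is
# background knowledge rather than something stated in this file.) A worked example:
sum_timer_wait_ps = 2500000          # 2.5 microseconds as reported by MySQL
time_ns = sum_timer_wait_ps / 1000   # 2500.0 nanoseconds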
def test_compute_derivative_rows_stats_reset(self):
    sm = StatementMetrics()

    def key(row):
        return (row['query'], row['db'], row['user'])

    metrics = ['count', 'time']

    rows1 = [
        {'count': 13, 'time': 2005, 'errors': 1, 'query': 'COMMIT', 'db': 'puppies', 'user': '******'},
        {'count': 25, 'time': 105, 'errors': 0, 'query': 'ROLLBACK', 'db': 'puppies', 'user': '******'},
    ]
    rows2 = [
        add_to_dict(rows1[0], {'count': 0, 'time': 1, 'errors': 15}),
        add_to_dict(rows1[1], {'count': 1, 'time': 15, 'errors': 0}),
    ]
    # Simulate a stats reset by decreasing one of the metrics rather than increasing
    rows3 = [
        add_to_dict(rows2[1], {'count': 1, 'time': 15, 'errors': 0}),
        add_to_dict(rows2[0], {'count': -1, 'time': 0, 'errors': 15}),
    ]
    rows4 = [
        add_to_dict(rows3[1], {'count': 1, 'time': 1, 'errors': 0}),
        add_to_dict(rows3[0], {'count': 1, 'time': 1, 'errors': 1}),
    ]

    assert [] == sm.compute_derivative_rows(rows1, metrics, key=key)
    assert 2 == len(sm.compute_derivative_rows(rows2, metrics, key=key))
    assert [] == sm.compute_derivative_rows(rows3, metrics, key=key)
    assert 2 == len(sm.compute_derivative_rows(rows4, metrics, key=key))
class MySQLStatementMetrics(object): """ MySQLStatementMetrics collects database metrics per normalized MySQL statement """ def __init__(self, check, config): # (MySql, MySQLConfig) -> None self._check = check self._config = config self._db_hostname = None self.log = get_check_logger() self._state = StatementMetrics() # full_statement_text_cache: limit the ingestion rate of full statement text events per query_signature self._full_statement_text_cache = TTLCache( maxsize=self._config.full_statement_text_cache_max_size, ttl=60 * 60 / self._config.full_statement_text_samples_per_hour_per_query, ) # type: TTLCache def _db_hostname_cached(self): if self._db_hostname: return self._db_hostname self._db_hostname = resolve_db_host(self._config.host) return self._db_hostname def collect_per_statement_metrics(self, db, tags): # type: (pymysql.connections.Connection, List[str]) -> None try: rows = self._collect_per_statement_metrics(db) if not rows: return for event in self._rows_to_fqt_events(rows, tags): self._check.database_monitoring_query_sample( json.dumps(event, default=default_json_event_encoding)) # truncate query text to the maximum length supported by metrics tags for row in rows: row['digest_text'] = row['digest_text'][0:200] payload = { 'host': self._db_hostname_cached(), 'timestamp': time.time() * 1000, 'min_collection_interval': self._config.min_collection_interval, 'tags': tags, 'mysql_rows': rows, } self._check.database_monitoring_query_metrics( json.dumps(payload, default=default_json_event_encoding)) except Exception: self.log.exception( 'Unable to collect statement metrics due to an error') def _collect_per_statement_metrics(self, db): # type: (pymysql.connections.Connection) -> List[PyMysqlRow] monotonic_rows = self._query_summary_per_statement(db) monotonic_rows = self._normalize_queries(monotonic_rows) rows = self._state.compute_derivative_rows(monotonic_rows, METRICS_COLUMNS, key=_row_key) return rows def _query_summary_per_statement(self, db): # type: (pymysql.connections.Connection) -> List[PyMysqlRow] """ Collects per-statement metrics from performance schema. Because the statement sums are cumulative, the results of the previous run are stored and subtracted from the current values to get the counts for the elapsed period. This is similar to monotonic_count, but several fields must be further processed from the delta values. 
""" sql_statement_summary = """\ SELECT `schema_name`, `digest`, `digest_text`, `count_star`, `sum_timer_wait`, `sum_lock_time`, `sum_errors`, `sum_rows_affected`, `sum_rows_sent`, `sum_rows_examined`, `sum_select_scan`, `sum_select_full_join`, `sum_no_index_used`, `sum_no_good_index_used` FROM performance_schema.events_statements_summary_by_digest WHERE `digest_text` NOT LIKE 'EXPLAIN %' ORDER BY `count_star` DESC LIMIT 10000""" rows = [] # type: List[PyMysqlRow] try: with closing(db.cursor(pymysql.cursors.DictCursor)) as cursor: cursor.execute(sql_statement_summary) rows = cursor.fetchall() or [] # type: ignore except (pymysql.err.InternalError, pymysql.err.OperationalError) as e: self.log.warning( "Statement summary metrics are unavailable at this time: %s", e) return rows def _normalize_queries(self, rows): normalized_rows = [] for row in rows: normalized_row = dict(copy.copy(row)) try: obfuscated_statement = datadog_agent.obfuscate_sql( row['digest_text']) except Exception as e: self.log.warning("Failed to obfuscate query '%s': %s", row['digest_text'], e) continue normalized_row['digest_text'] = obfuscated_statement normalized_row['query_signature'] = compute_sql_signature( obfuscated_statement) normalized_rows.append(normalized_row) return normalized_rows def _rows_to_fqt_events(self, rows, tags): for row in rows: query_cache_key = _row_key(row) if query_cache_key in self._full_statement_text_cache: continue self._full_statement_text_cache[query_cache_key] = True row_tags = tags + ["schema:{}".format(row['schema_name']) ] if row['schema_name'] else tags yield { "timestamp": time.time() * 1000, "host": self._db_hostname_cached(), "ddsource": "mysql", "ddtags": ",".join(row_tags), "dbm_type": "fqt", "db": { "instance": row['schema_name'], "query_signature": row['query_signature'], "statement": row['digest_text'], }, "mysql": { "schema": row["schema_name"] }, }
class MySQLStatementMetrics(object): """ MySQLStatementMetrics collects database metrics per normalized MySQL statement """ def __init__(self, check, config): # (MySql, MySQLConfig) -> None self._check = check self._config = config self._db_hostname = None self.log = get_check_logger() self._state = StatementMetrics() def _db_hostname_cached(self): if self._db_hostname: return self._db_hostname self._db_hostname = resolve_db_host(self._config.host) return self._db_hostname def collect_per_statement_metrics(self, db, tags): # type: (pymysql.connections.Connection, List[str]) -> None try: rows = self._collect_per_statement_metrics(db) if not rows: return payload = { 'host': self._db_hostname_cached(), 'timestamp': time.time() * 1000, 'min_collection_interval': self._config.min_collection_interval, 'tags': tags, 'mysql_rows': rows, } self._check.database_monitoring_query_metrics( json.dumps(payload, default=default_json_event_encoding)) except Exception: self.log.exception( 'Unable to collect statement metrics due to an error') def _collect_per_statement_metrics(self, db): # type: (pymysql.connections.Connection) -> List[Metric] metrics = [] # type: List[Metric] def keyfunc(row): return (row['schema_name'], row['query_signature']) monotonic_rows = self._query_summary_per_statement(db) monotonic_rows = self._normalize_queries(monotonic_rows) rows = self._state.compute_derivative_rows(monotonic_rows, METRICS_COLUMNS, key=keyfunc) metrics.append(('dd.mysql.queries.query_rows_raw', len(rows), [])) return rows def _query_summary_per_statement(self, db): # type: (pymysql.connections.Connection) -> List[PyMysqlRow] """ Collects per-statement metrics from performance schema. Because the statement sums are cumulative, the results of the previous run are stored and subtracted from the current values to get the counts for the elapsed period. This is similar to monotonic_count, but several fields must be further processed from the delta values. """ sql_statement_summary = """\ SELECT `schema_name`, `digest`, `digest_text`, `count_star`, `sum_timer_wait`, `sum_lock_time`, `sum_errors`, `sum_rows_affected`, `sum_rows_sent`, `sum_rows_examined`, `sum_select_scan`, `sum_select_full_join`, `sum_no_index_used`, `sum_no_good_index_used` FROM performance_schema.events_statements_summary_by_digest WHERE `digest_text` NOT LIKE 'EXPLAIN %' ORDER BY `count_star` DESC LIMIT 10000""" rows = [] # type: List[PyMysqlRow] try: with closing(db.cursor(pymysql.cursors.DictCursor)) as cursor: cursor.execute(sql_statement_summary) rows = cursor.fetchall() or [] # type: ignore except (pymysql.err.InternalError, pymysql.err.OperationalError) as e: self.log.warning( "Statement summary metrics are unavailable at this time: %s", e) return rows def _normalize_queries(self, rows): normalized_rows = [] for row in rows: normalized_row = dict(copy.copy(row)) try: obfuscated_statement = datadog_agent.obfuscate_sql( row['digest_text']) except Exception as e: self.log.warning("Failed to obfuscate query '%s': %s", row['digest_text'], e) continue normalized_row['digest_text'] = obfuscated_statement normalized_row['query_signature'] = compute_sql_signature( obfuscated_statement) normalized_rows.append(normalized_row) return normalized_rows
class PostgresStatementMetrics(DBMAsyncJob): """Collects telemetry for SQL statements""" def __init__(self, check, config, shutdown_callback): collection_interval = float( config.statement_metrics_config.get('collection_interval', DEFAULT_COLLECTION_INTERVAL) ) if collection_interval <= 0: collection_interval = DEFAULT_COLLECTION_INTERVAL super(PostgresStatementMetrics, self).__init__( check, run_sync=is_affirmative(config.statement_metrics_config.get('run_sync', False)), enabled=is_affirmative(config.statement_metrics_config.get('enabled', True)), expected_db_exceptions=(psycopg2.errors.DatabaseError,), min_collection_interval=config.min_collection_interval, dbms="postgres", rate_limit=1 / float(collection_interval), job_name="query-metrics", shutdown_callback=shutdown_callback, ) self._metrics_collection_interval = collection_interval self._config = config self._state = StatementMetrics() self._stat_column_cache = [] self._obfuscate_options = to_native_string(json.dumps(self._config.obfuscator_options)) # full_statement_text_cache: limit the ingestion rate of full statement text events per query_signature self._full_statement_text_cache = TTLCache( maxsize=config.full_statement_text_cache_max_size, ttl=60 * 60 / config.full_statement_text_samples_per_hour_per_query, ) def _execute_query(self, cursor, query, params=()): try: self._log.debug("Running query [%s] %s", query, params) cursor.execute(query, params) return cursor.fetchall() except (psycopg2.ProgrammingError, psycopg2.errors.QueryCanceled) as e: # A failed query could've derived from incorrect columns within the cache. It's a rare edge case, # but the next time the query is run, it will retrieve the correct columns. self._stat_column_cache = [] raise e def _get_pg_stat_statements_columns(self): """ Load the list of the columns available under the `pg_stat_statements` table. This must be queried because version is not a reliable way to determine the available columns on `pg_stat_statements`. The database can be upgraded without upgrading extensions, even when the extension is included by default. """ if self._stat_column_cache: return self._stat_column_cache # Querying over '*' with limit 0 allows fetching only the column names from the cursor without data query = STATEMENTS_QUERY.format( cols='*', pg_stat_statements_view=self._config.pg_stat_statements_view, limit=0, filters="" ) cursor = self._check._get_db(self._config.dbname).cursor() self._execute_query(cursor, query, params=(self._config.dbname,)) col_names = [desc[0] for desc in cursor.description] if cursor.description else [] self._stat_column_cache = col_names return col_names def run_job(self): self._tags_no_db = [t for t in self._tags if not t.startswith('db:')] self.collect_per_statement_metrics() def _payload_pg_version(self): version = self._check.version if not version: return "" return 'v{major}.{minor}.{patch}'.format(major=version.major, minor=version.minor, patch=version.patch) def collect_per_statement_metrics(self): # exclude the default "db" tag from statement metrics & FQT events because this data is collected from # all databases on the host. For metrics the "db" tag is added during ingestion based on which database # each query came from. 
try: rows = self._collect_metrics_rows() if not rows: return for event in self._rows_to_fqt_events(rows): self._check.database_monitoring_query_sample(json.dumps(event, default=default_json_event_encoding)) # truncate query text to the maximum length supported by metrics tags for row in rows: row['query'] = row['query'][0:200] payload = { 'host': self._check.resolved_hostname, 'timestamp': time.time() * 1000, 'min_collection_interval': self._metrics_collection_interval, 'tags': self._tags_no_db, 'cloud_metadata': self._config.cloud_metadata, 'postgres_rows': rows, 'postgres_version': self._payload_pg_version(), 'ddagentversion': datadog_agent.get_version(), "ddagenthostname": self._check.agent_hostname, } self._check.database_monitoring_query_metrics(json.dumps(payload, default=default_json_event_encoding)) except Exception: self._log.exception('Unable to collect statement metrics due to an error') return [] def _load_pg_stat_statements(self): try: available_columns = set(self._get_pg_stat_statements_columns()) missing_columns = PG_STAT_STATEMENTS_REQUIRED_COLUMNS - available_columns if len(missing_columns) > 0: self._check.warning( warning_with_tags( "Unable to collect statement metrics because required fields are unavailable: %s.", ', '.join(sorted(list(missing_columns))), host=self._check.resolved_hostname, dbname=self._config.dbname, ), ) self._check.count( "dd.postgres.statement_metrics.error", 1, tags=self._tags + [ "error:database-missing_pg_stat_statements_required_columns", ] + self._check._get_debug_tags(), hostname=self._check.resolved_hostname, ) return [] query_columns = sorted(list(available_columns & PG_STAT_ALL_DESIRED_COLUMNS)) params = () filters = "" if self._config.dbstrict: filters = "AND pg_database.datname = %s" params = (self._config.dbname,) return self._execute_query( self._check._get_db(self._config.dbname).cursor(cursor_factory=psycopg2.extras.DictCursor), STATEMENTS_QUERY.format( cols=', '.join(query_columns), pg_stat_statements_view=self._config.pg_stat_statements_view, filters=filters, limit=DEFAULT_STATEMENTS_LIMIT, ), params=params, ) except psycopg2.Error as e: error_tag = "error:database-{}".format(type(e).__name__) if ( isinstance(e, psycopg2.errors.ObjectNotInPrerequisiteState) ) and 'pg_stat_statements must be loaded' in str(e.pgerror): error_tag = "error:database-{}-pg_stat_statements_not_loaded".format(type(e).__name__) self._check.record_warning( DatabaseConfigurationError.pg_stat_statements_not_loaded, warning_with_tags( "Unable to collect statement metrics because pg_stat_statements " "extension is not loaded in database '%s'. " "See https://docs.datadoghq.com/database_monitoring/setup_postgres/" "troubleshooting#%s for more details", self._config.dbname, DatabaseConfigurationError.pg_stat_statements_not_loaded.value, host=self._check.resolved_hostname, dbname=self._config.dbname, code=DatabaseConfigurationError.pg_stat_statements_not_loaded.value, ), ) elif isinstance(e, psycopg2.errors.UndefinedTable) and 'pg_stat_statements' in str(e.pgerror): error_tag = "error:database-{}-pg_stat_statements_not_created".format(type(e).__name__) self._check.record_warning( DatabaseConfigurationError.pg_stat_statements_not_created, warning_with_tags( "Unable to collect statement metrics because pg_stat_statements is not created " "in database '%s'. 
See https://docs.datadoghq.com/database_monitoring/setup_postgres/" "troubleshooting#%s for more details", self._config.dbname, DatabaseConfigurationError.pg_stat_statements_not_created.value, host=self._check.resolved_hostname, dbname=self._config.dbname, code=DatabaseConfigurationError.pg_stat_statements_not_created.value, ), ) else: self._check.warning( warning_with_tags( "Unable to collect statement metrics because of an error running queries " "in database '%s'. See https://docs.datadoghq.com/database_monitoring/troubleshooting for " "help: %s", self._config.dbname, str(e), host=self._check.resolved_hostname, dbname=self._config.dbname, ), ) self._check.count( "dd.postgres.statement_metrics.error", 1, tags=self._tags + [error_tag] + self._check._get_debug_tags(), hostname=self._check.resolved_hostname, ) return [] def _emit_pg_stat_statements_metrics(self): query = PG_STAT_STATEMENTS_COUNT_QUERY_LT_9_4 if self._check.version < V9_4 else PG_STAT_STATEMENTS_COUNT_QUERY try: rows = self._execute_query( self._check._get_db(self._config.dbname).cursor(cursor_factory=psycopg2.extras.DictCursor), query, ) count = 0 if rows: count = rows[0][0] self._check.count( "postgresql.pg_stat_statements.max", self._check.pg_settings.get("pg_stat_statements.max", 0), tags=self._tags, hostname=self._check.resolved_hostname, ) self._check.count( "postgresql.pg_stat_statements.count", count, tags=self._tags, hostname=self._check.resolved_hostname, ) except psycopg2.Error as e: self._log.warning("Failed to query for pg_stat_statements count: %s", e) def _collect_metrics_rows(self): rows = self._load_pg_stat_statements() if rows: self._emit_pg_stat_statements_metrics() rows = self._normalize_queries(rows) if not rows: return [] available_columns = set(rows[0].keys()) metric_columns = available_columns & PG_STAT_STATEMENTS_METRICS_COLUMNS rows = self._state.compute_derivative_rows(rows, metric_columns, key=_row_key) self._check.gauge( 'dd.postgres.queries.query_rows_raw', len(rows), tags=self._tags + self._check._get_debug_tags(), hostname=self._check.resolved_hostname, ) return rows def _normalize_queries(self, rows): normalized_rows = [] for row in rows: normalized_row = dict(copy.copy(row)) try: statement = obfuscate_sql_with_metadata(row['query'], self._obfuscate_options) except Exception as e: # obfuscation errors are relatively common so only log them during debugging self._log.debug("Failed to obfuscate query '%s': %s", row['query'], e) continue obfuscated_query = statement['query'] normalized_row['query'] = obfuscated_query normalized_row['query_signature'] = compute_sql_signature(obfuscated_query) metadata = statement['metadata'] normalized_row['dd_tables'] = metadata.get('tables', None) normalized_row['dd_commands'] = metadata.get('commands', None) normalized_rows.append(normalized_row) return normalized_rows def _rows_to_fqt_events(self, rows): for row in rows: query_cache_key = _row_key(row) if query_cache_key in self._full_statement_text_cache: continue self._full_statement_text_cache[query_cache_key] = True row_tags = self._tags_no_db + [ "db:{}".format(row['datname']), "rolname:{}".format(row['rolname']), ] yield { "timestamp": time.time() * 1000, "host": self._check.resolved_hostname, "ddagentversion": datadog_agent.get_version(), "ddsource": "postgres", "ddtags": ",".join(row_tags), "dbm_type": "fqt", "db": { "instance": row['datname'], "query_signature": row['query_signature'], "statement": row['query'], "metadata": { "tables": row['dd_tables'], "commands": row['dd_commands'], }, }, 
"postgres": { "datname": row["datname"], "rolname": row["rolname"], }, }
def test_compute_derivative_rows_with_duplicates(self):
    sm = StatementMetrics()

    def key(row):
        return (row['query_signature'], row['db'], row['user'])

    metrics = ['count', 'time']

    rows1 = [
        {
            'count': 13,
            'time': 2005,
            'errors': 1,
            'query': 'SELECT * FROM table1 where id = ANY(?)',
            'query_signature': 'sig1',
            'db': 'puppies',
            'user': '******',
        },
        {
            'count': 25,
            'time': 105,
            'errors': 0,
            'query': 'SELECT * FROM table1 where id = ANY(?, ?)',
            'query_signature': 'sig1',
            'db': 'puppies',
            'user': '******',
        },
    ]
    rows2 = [
        {
            'count': 14,
            'time': 2006,
            'errors': 32,
            'query': 'SELECT * FROM table1 where id = ANY(?)',
            'query_signature': 'sig1',
            'db': 'puppies',
            'user': '******',
        },
        {
            'count': 26,
            'time': 125,
            'errors': 1,
            'query': 'SELECT * FROM table1 where id = ANY(?, ?)',
            'query_signature': 'sig1',
            'db': 'puppies',
            'user': '******',
        },
    ]

    # Run a first check to initialize tracking
    sm.compute_derivative_rows(rows1, metrics, key=key)
    # Run the check again to compute the metrics
    metrics = sm.compute_derivative_rows(rows2, metrics, key=key)

    expected_merged_metrics = [
        {
            'count': 2,
            'time': 21,
            'errors': 32,
            'db': 'puppies',
            'query': 'SELECT * FROM table1 where id = ANY(?)',
            'query_signature': 'sig1',
            'user': '******',
        }
    ]
    assert expected_merged_metrics == metrics
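# Taken together, the derivative-row tests in this section pin down the expected behavior of
# StatementMetrics.compute_derivative_rows: duplicate keys are merged, tracked metric columns
# are diffed against the previous run, first-seen and unchanged rows are dropped, and any
# negative delta is treated as a stats reset that suppresses the whole batch. A standalone
# sketch of that behavior (state kept in a plain dict instead of the StatementMetrics class;
# not the library implementation):
def compute_derivative_rows_sketch(previous, rows, metrics, key):
    # Merge rows sharing a key: sum the tracked metrics, keep the first row's other columns.
    merged = {}
    for row in rows:
        k = key(row)
        if k in merged:
            for m in metrics:
                merged[k][m] += row[m]
        else:
            merged[k] = dict(row)

    result = []
    for k, row in merged.items():
        prev = previous.get(k)
        if prev is None:
            continue  # first time this key is seen: only record a baseline
        diffs = {m: row[m] - prev[m] for m in metrics}
        if any(v < 0 for v in diffs.values()):
            # A tracked counter went backwards: assume the server statistics were
            # reset and suppress the whole batch for this run.
            result = []
            break
        if all(v == 0 for v in diffs.values()):
            continue  # nothing changed for the tracked metrics
        result.append({**row, **diffs})

    # Re-baseline with the latest cumulative values on every run.
    previous.clear()
    previous.update(merged)
    return result


# Usage sketch mirroring test_compute_derivative_rows_with_duplicates above:
# state = {}
# compute_derivative_rows_sketch(state, rows1, ['count', 'time'], key=key)  # -> []
# compute_derivative_rows_sketch(state, rows2, ['count', 'time'], key=key)  # -> merged deltas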
class PostgresStatementMetrics(object):
    """Collects telemetry for SQL statements"""

    def __init__(self, check, config):
        self._check = check
        self._config = config
        self._db_hostname = None
        self._log = get_check_logger()
        self._state = StatementMetrics()
        self._stat_column_cache = []

        # full_statement_text_cache: limit the ingestion rate of full statement text events per query_signature
        self._full_statement_text_cache = TTLCache(
            maxsize=self._config.full_statement_text_cache_max_size,
            ttl=60 * 60 / self._config.full_statement_text_samples_per_hour_per_query,
        )

    def _execute_query(self, cursor, query, params=()):
        try:
            self._log.debug("Running query [%s] %s", query, params)
            cursor.execute(query, params)
            return cursor.fetchall()
        except (psycopg2.ProgrammingError, psycopg2.errors.QueryCanceled) as e:
            # A failed query could've derived from incorrect columns within the cache. It's a rare edge case,
            # but the next time the query is run, it will retrieve the correct columns.
            self._stat_column_cache = []
            self._log.warning('Statement-level metrics are unavailable: %s', e)
            return []

    def _get_pg_stat_statements_columns(self, db):
        """
        Load the list of the columns available under the `pg_stat_statements` table. This must be queried because
        version is not a reliable way to determine the available columns on `pg_stat_statements`. The database can
        be upgraded without upgrading extensions, even when the extension is included by default.
        """
        if self._stat_column_cache:
            return self._stat_column_cache

        # Querying over '*' with limit 0 allows fetching only the column names from the cursor without data
        query = STATEMENTS_QUERY.format(
            cols='*', pg_stat_statements_view=self._config.pg_stat_statements_view, limit=0, filters=""
        )
        cursor = db.cursor()
        self._execute_query(cursor, query, params=(self._config.dbname,))
        col_names = [desc[0] for desc in cursor.description] if cursor.description else []
        self._stat_column_cache = col_names
        return col_names

    def _db_hostname_cached(self):
        if self._db_hostname:
            return self._db_hostname
        self._db_hostname = resolve_db_host(self._config.host)
        return self._db_hostname

    def collect_per_statement_metrics(self, db, db_version, tags):
        try:
            rows = self._collect_metrics_rows(db)
            if not rows:
                return
            for event in self._rows_to_fqt_events(rows, tags):
                self._check.database_monitoring_query_sample(json.dumps(event, default=default_json_event_encoding))
            # truncate query text to the maximum length supported by metrics tags
            for row in rows:
                row['query'] = row['query'][0:200]
            payload = {
                'host': self._db_hostname_cached(),
                'timestamp': time.time() * 1000,
                'min_collection_interval': self._config.min_collection_interval,
                'tags': tags,
                'postgres_rows': rows,
                'postgres_version': 'v{major}.{minor}.{patch}'.format(
                    major=db_version.major, minor=db_version.minor, patch=db_version.patch
                ),
            }
            self._check.database_monitoring_query_metrics(json.dumps(payload, default=default_json_event_encoding))
        except Exception:
            db.rollback()
            self._log.exception('Unable to collect statement metrics due to an error')
            return []

    def _load_pg_stat_statements(self, db):
        available_columns = set(self._get_pg_stat_statements_columns(db))
        missing_columns = PG_STAT_STATEMENTS_REQUIRED_COLUMNS - available_columns
        if len(missing_columns) > 0:
            self._log.warning(
                'Unable to collect statement metrics because required fields are unavailable: %s',
                ', '.join(list(missing_columns)),
            )
            return []

        query_columns = sorted(list(available_columns & PG_STAT_ALL_DESIRED_COLUMNS))
        params = ()
        filters = ""
        if self._config.dbstrict:
            filters = "AND pg_database.datname = %s"
            params = (self._config.dbname,)
        return self._execute_query(
            db.cursor(cursor_factory=psycopg2.extras.DictCursor),
            STATEMENTS_QUERY.format(
                cols=', '.join(query_columns),
                pg_stat_statements_view=self._config.pg_stat_statements_view,
                filters=filters,
                limit=DEFAULT_STATEMENTS_LIMIT,
            ),
            params=params,
        )

    def _collect_metrics_rows(self, db):
        rows = self._load_pg_stat_statements(db)

        rows = self._normalize_queries(rows)
        if not rows:
            return []

        available_columns = set(rows[0].keys())
        metric_columns = available_columns & PG_STAT_STATEMENTS_METRICS_COLUMNS
        rows = self._state.compute_derivative_rows(rows, metric_columns, key=_row_key)
        self._check.gauge('dd.postgres.queries.query_rows_raw', len(rows))

        return rows

    def _normalize_queries(self, rows):
        normalized_rows = []
        for row in rows:
            normalized_row = dict(copy.copy(row))
            try:
                obfuscated_statement = datadog_agent.obfuscate_sql(row['query'])
            except Exception as e:
                # obfuscation errors are relatively common so only log them during debugging
                self._log.debug("Failed to obfuscate query '%s': %s", row['query'], e)
                continue

            normalized_row['query'] = obfuscated_statement
            normalized_row['query_signature'] = compute_sql_signature(obfuscated_statement)
            normalized_rows.append(normalized_row)

        return normalized_rows

    def _rows_to_fqt_events(self, rows, tags):
        for row in rows:
            query_cache_key = _row_key(row)
            if query_cache_key in self._full_statement_text_cache:
                continue
            self._full_statement_text_cache[query_cache_key] = True
            row_tags = tags + [
                "db:{}".format(row['datname']),
                "rolname:{}".format(row['rolname']),
            ]
            yield {
                "timestamp": time.time() * 1000,
                "host": self._db_hostname_cached(),
                "ddsource": "postgres",
                "ddtags": ",".join(row_tags),
                "dbm_type": "fqt",
                "db": {
                    "instance": row['datname'],
                    "query_signature": row['query_signature'],
                    "statement": row['query'],
                },
                "postgres": {
                    "datname": row["datname"],
                    "rolname": row["rolname"],
                },
            }
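# The `_full_statement_text_cache` used by `_rows_to_fqt_events` above implements a simple
# rate limit for full query text (FQT) events: emit at most a few events per query_signature
# per hour by remembering recently submitted signatures in a TTL cache. A minimal sketch with
# hypothetical limits (one event per signature per hour, up to 10,000 tracked signatures):
from cachetools import TTLCache

samples_per_hour_per_query = 1
fqt_cache = TTLCache(maxsize=10000, ttl=60 * 60 / samples_per_hour_per_query)


def should_emit_fqt_event(query_signature):
    # Emit only if the signature is not already cached; the entry expires after the TTL,
    # which re-opens the budget for that signature.
    if query_signature in fqt_cache:
        return False
    fqt_cache[query_signature] = True
    return True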