def _collect_per_statement_metrics(self, db):
    # type: (pymysql.connections.Connection) -> List[Metric]
    metrics = []  # type: List[Metric]

    def keyfunc(row):
        return (row['schema'], row['digest'])

    # Fetch the cumulative per-statement counters and convert them into per-interval deltas.
    monotonic_rows = self._query_summary_per_statement(db)
    monotonic_rows = self._merge_duplicate_rows(monotonic_rows, key=keyfunc)
    rows = self._state.compute_derivative_rows(monotonic_rows, STATEMENT_METRICS.keys(), key=keyfunc)
    metrics.append(('dd.mysql.queries.query_rows_raw', len(rows), []))

    rows = generate_synthetic_rows(rows)
    # Cap the number of rows submitted per check run.
    rows = apply_row_limits(
        rows,
        self.config.statement_metrics_limits or DEFAULT_STATEMENT_METRICS_LIMITS,
        tiebreaker_metric='count',
        tiebreaker_reverse=True,
        key=keyfunc,
    )
    metrics.append(('dd.mysql.queries.query_rows_limited', len(rows), []))

    for row in rows:
        tags = []
        tags.append('digest:' + row['digest'])
        if row['schema'] is not None:
            tags.append('schema:' + row['schema'])

        try:
            obfuscated_statement = datadog_agent.obfuscate_sql(row['query'])
        except Exception as e:
            # Skip statements that cannot be obfuscated.
            self.log.warning("Failed to obfuscate query '%s': %s", row['query'], e)
            continue

        tags.append('query_signature:' + compute_sql_signature(obfuscated_statement))
        tags.append('query:' + normalize_query_tag(obfuscated_statement).strip())

        for col, name in STATEMENT_METRICS.items():
            value = row[col]
            metrics.append((name, value, tags))

    return metrics
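
# The `_state.compute_derivative_rows(...)` call above turns the monotonically increasing
# performance_schema counters into per-interval deltas. The shipped implementation lives in
# the shared check library; the sketch below only illustrates the snapshot-diff technique
# (diff each row against the previously observed snapshot, keyed by `keyfunc`). The class
# name, attribute names, and skip conditions here are assumptions, not the real API.
class StatementMetricsState(object):
    def __init__(self):
        self._previous = {}  # key(row) -> last observed row

    def compute_derivative_rows(self, rows, metrics, key):
        deltas = []
        current = {key(row): row for row in rows}
        for k, row in current.items():
            prev = self._previous.get(k)
            if prev is None:
                # First time this statement is seen: no baseline, so no delta yet.
                continue
            diffs = {m: row[m] - prev[m] for m in metrics if m in row}
            if any(v < 0 for v in diffs.values()):
                # Counters moved backwards (e.g. stats were reset); skip this interval.
                continue
            if all(v == 0 for v in diffs.values()):
                continue
            merged = dict(row)
            merged.update(diffs)
            deltas.append(merged)
        self._previous = current
        return deltas
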
def _collect_per_statement_metrics(self, db):
    metrics = []

    available_columns = self._get_pg_stat_statements_columns(db)
    missing_columns = PG_STAT_STATEMENTS_REQUIRED_COLUMNS - set(available_columns)
    if len(missing_columns) > 0:
        self.log.warning(
            'Unable to collect statement metrics because required fields are unavailable: %s',
            ', '.join(list(missing_columns)),
        )
        return metrics

    desired_columns = (
        list(PG_STAT_STATEMENTS_METRIC_COLUMNS.keys())
        + list(PG_STAT_STATEMENTS_OPTIONAL_COLUMNS)
        + list(PG_STAT_STATEMENTS_TAG_COLUMNS.keys())
    )
    query_columns = list(
        set(desired_columns) & set(available_columns) | set(PG_STAT_STATEMENTS_TAG_COLUMNS.keys())
    )
    rows = self._execute_query(
        db.cursor(cursor_factory=psycopg2.extras.DictCursor),
        STATEMENTS_QUERY.format(
            cols=', '.join(query_columns),
            pg_stat_statements_view=self.config.pg_stat_statements_view,
            limit=DEFAULT_STATEMENTS_LIMIT,
        ),
        params=(self.config.dbname,),
    )
    if not rows:
        return metrics

    def row_keyfunc(row):
        # old versions of pg_stat_statements don't have a query ID so fall back to the query string itself
        queryid = row['queryid'] if 'queryid' in row else row['query']
        return (queryid, row['datname'], row['rolname'])

    rows = self._state.compute_derivative_rows(rows, PG_STAT_STATEMENTS_METRIC_COLUMNS.keys(), key=row_keyfunc)
    rows = apply_row_limits(
        rows, DEFAULT_STATEMENT_METRIC_LIMITS, tiebreaker_metric='calls', tiebreaker_reverse=True, key=row_keyfunc
    )

    for row in rows:
        try:
            normalized_query = datadog_agent.obfuscate_sql(row['query'])
            if not normalized_query:
                self.log.warning("Obfuscation of query '%s' resulted in empty query", row['query'])
                continue
        except Exception as e:
            # If query obfuscation fails, it is acceptable to log the raw query here because the
            # pg_stat_statements table contains no parameters in the raw queries.
            self.log.warning("Failed to obfuscate query '%s': %s", row['query'], e)
            continue

        query_signature = compute_sql_signature(normalized_query)

        # All "Deep Database Monitoring" statement-level metrics are tagged with a `query_signature`
        # which uniquely identifies the normalized query family. Where possible, this hash should
        # match the hash of APM "resources" (https://docs.datadoghq.com/tracing/visualization/resource/)
        # when the resource is a SQL query. Postgres' query normalization in the `pg_stat_statements` table
        # preserves most of the original query, so we tag the `resource_hash` with the same value as the
        # `query_signature`. The `resource_hash` tag should match the *actual* APM resource hash most of
        # the time, but not always. So this is a best-effort approach to link these metrics to APM metrics.
        tags = ['query_signature:' + query_signature, 'resource_hash:' + query_signature]

        for column, tag_name in PG_STAT_STATEMENTS_TAG_COLUMNS.items():
            if column not in row:
                continue
            value = row[column]
            if column == 'query':
                value = normalize_query_tag(normalized_query)
            tags.append('{tag_name}:{value}'.format(tag_name=tag_name, value=value))

        for column, metric_name in PG_STAT_STATEMENTS_METRIC_COLUMNS.items():
            if column not in row:
                continue
            value = row[column]
            if column == 'total_time':
                # All "Deep Database Monitoring" timing metrics are in nanoseconds
                # Postgres tracks pg_stat* timing stats in milliseconds
                value = milliseconds_to_nanoseconds(value)
            metrics.append((metric_name, value, tags))

    return metrics
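
# The `total_time` handling above relies on a unit-conversion helper from the shared
# utilities. A minimal sketch consistent with the inline comment (Postgres reports
# pg_stat_statements timings in milliseconds, while the statement-level metrics are
# emitted in nanoseconds); the constant name is an assumption:
MILLISECOND_IN_NANOSECONDS = 1000000

def milliseconds_to_nanoseconds(value):
    # e.g. a total_time of 2.5 ms -> 2,500,000 ns
    return value * MILLISECOND_IN_NANOSECONDS
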
def test_apply_row_limits(self):
    def assert_any_order(a, b):
        assert sorted(a, key=lambda row: row['_']) == sorted(b, key=lambda row: row['_'])

    rows = [
        {'_': 0, 'count': 2, 'time': 1000},
        {'_': 1, 'count': 20, 'time': 5000},
        {'_': 2, 'count': 20, 'time': 8000},
        {'_': 3, 'count': 180, 'time': 8000},
        {'_': 4, 'count': 0, 'time': 10},
        {'_': 5, 'count': 60, 'time': 500},
        {'_': 6, 'count': 90, 'time': 5000},
        {'_': 7, 'count': 50, 'time': 5000},
        {'_': 8, 'count': 40, 'time': 100},
        {'_': 9, 'count': 30, 'time': 900},
        {'_': 10, 'count': 80, 'time': 800},
        {'_': 11, 'count': 110, 'time': 7000},
    ]

    assert_any_order(
        [],
        apply_row_limits(rows, {'count': (0, 0), 'time': (0, 0)}, 'count', True, key=lambda row: row['_']),
    )

    expected = [
        {'_': 3, 'count': 180, 'time': 8000},
        {'_': 4, 'count': 0, 'time': 10},  # The bottom 1 row for both 'count' and 'time'
        {'_': 2, 'count': 20, 'time': 8000},
    ]
    assert_any_order(
        expected,
        apply_row_limits(rows, {'count': (1, 1), 'time': (1, 1)}, 'count', True, key=lambda row: row['_']),
    )

    expected = [
        {'_': 5, 'count': 60, 'time': 500},
        {'_': 10, 'count': 80, 'time': 800},
        {'_': 6, 'count': 90, 'time': 5000},
        {'_': 11, 'count': 110, 'time': 7000},
        {'_': 3, 'count': 180, 'time': 8000},
        {'_': 4, 'count': 0, 'time': 10},
        {'_': 0, 'count': 2, 'time': 1000},
        {'_': 2, 'count': 20, 'time': 8000},
        {'_': 8, 'count': 40, 'time': 100},
    ]
    assert_any_order(
        expected,
        apply_row_limits(rows, {'count': (5, 2), 'time': (2, 2)}, 'count', True, key=lambda row: row['_']),
    )

    assert_any_order(
        rows,
        apply_row_limits(rows, {'count': (6, 6), 'time': (0, 0)}, 'time', False, key=lambda row: row['_']),
    )

    assert_any_order(
        rows,
        apply_row_limits(rows, {'count': (0, 0), 'time': (4, 8)}, 'time', False, key=lambda row: row['_']),
    )

    assert_any_order(
        rows,
        apply_row_limits(rows, {'count': (20, 20), 'time': (12, 5)}, 'time', False, key=lambda row: row['_']),
    )
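
# `apply_row_limits` is imported from the shared check utilities; the test above pins down
# its contract: for each metric, keep the top-K and bottom-K rows by that metric's value
# (ties broken by the tiebreaker metric, with the direction flipped when `tiebreaker_reverse`
# is True), then union the survivors, deduplicated by `key`. The sketch below is an
# illustrative reimplementation of that contract, not the shipped code; it assumes numeric
# metric values.
def apply_row_limits_sketch(rows, metric_limits, tiebreaker_metric, tiebreaker_reverse, key):
    kept = {}  # key(row) -> row
    tiebreaker_sign = -1 if tiebreaker_reverse else 1
    for metric, (top_k, bottom_k) in metric_limits.items():
        ordered = sorted(rows, key=lambda row: (row[metric], tiebreaker_sign * row[tiebreaker_metric]))
        bottom = ordered[:bottom_k]
        top = ordered[-top_k:] if top_k > 0 else []
        for row in bottom + top:
            kept[key(row)] = row
    return list(kept.values())
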
def _collect_per_statement_metrics(self, db):
    metrics = []

    available_columns = self._get_pg_stat_statements_columns(db)
    missing_columns = PG_STAT_STATEMENTS_REQUIRED_COLUMNS - set(available_columns)
    if len(missing_columns) > 0:
        self.log.warning(
            'Unable to collect statement metrics because required fields are unavailable: %s',
            ', '.join(list(missing_columns)),
        )
        return metrics

    desired_columns = (
        list(PG_STAT_STATEMENTS_METRIC_COLUMNS.keys())
        + list(PG_STAT_STATEMENTS_OPTIONAL_COLUMNS)
        + list(PG_STAT_STATEMENTS_TAG_COLUMNS.keys())
    )
    query_columns = list(
        set(desired_columns) & set(available_columns) | set(PG_STAT_STATEMENTS_TAG_COLUMNS.keys())
    )
    rows = self._execute_query(
        db.cursor(cursor_factory=psycopg2.extras.DictCursor),
        STATEMENTS_QUERY.format(
            cols=', '.join(query_columns),
            pg_stat_statements_view=self.config.pg_stat_statements_view,
            limit=DEFAULT_STATEMENTS_LIMIT,
        ),
        params=(self.config.dbname,),
    )
    if not rows:
        return metrics

    def row_keyfunc(row):
        return (row['query_signature'], row['datname'], row['rolname'])

    rows = self._normalize_queries(rows)
    rows = self._state.compute_derivative_rows(rows, PG_STAT_STATEMENTS_METRIC_COLUMNS.keys(), key=row_keyfunc)
    metrics.append(('dd.postgres.queries.query_rows_raw', len(rows), []))

    rows = generate_synthetic_rows(rows)
    rows = apply_row_limits(
        rows,
        self.config.statement_metrics_limits or DEFAULT_STATEMENT_METRICS_LIMITS,
        tiebreaker_metric='calls',
        tiebreaker_reverse=True,
        key=row_keyfunc,
    )
    metrics.append(('dd.postgres.queries.query_rows_limited', len(rows), []))

    for row in rows:
        # All "Deep Database Monitoring" statement-level metrics are tagged with a `query_signature`
        # which uniquely identifies the normalized query family. Where possible, this hash should
        # match the hash of APM "resources" (https://docs.datadoghq.com/tracing/visualization/resource/)
        # when the resource is a SQL query. Postgres' query normalization in the `pg_stat_statements` table
        # preserves most of the original query, so we tag the `resource_hash` with the same value as the
        # `query_signature`. The `resource_hash` tag should match the *actual* APM resource hash most of
        # the time, but not always. So this is a best-effort approach to link these metrics to APM metrics.
        tags = [
            'query_signature:' + row['query_signature'],
            'resource_hash:' + row['query_signature'],
        ]

        for column, tag_name in PG_STAT_STATEMENTS_TAG_COLUMNS.items():
            if column not in row:
                continue
            value = row[column]
            if column == 'query':
                value = normalize_query_tag(row['query'])
            tags.append('{tag_name}:{value}'.format(tag_name=tag_name, value=value))

        for column, metric_name in PG_STAT_STATEMENTS_METRIC_COLUMNS.items():
            if column not in row:
                continue
            value = row[column]
            if column == 'total_time':
                # All "Deep Database Monitoring" timing metrics are in nanoseconds
                # Postgres tracks pg_stat* timing stats in milliseconds
                value = milliseconds_to_nanoseconds(value)
            metrics.append((metric_name, value, tags))

    return metrics
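
# `_normalize_queries` is not shown in this version of the method. The sketch below is
# consistent with the inline obfuscation logic of the earlier version above; it is an
# assumption about the helper, not its verbatim body: obfuscate each raw query, attach
# its `query_signature`, and drop rows that cannot be obfuscated or come back empty.
def _normalize_queries_sketch(self, rows):
    normalized = []
    for row in rows:
        try:
            obfuscated_statement = datadog_agent.obfuscate_sql(row['query'])
            if not obfuscated_statement:
                self.log.warning("Obfuscation of query '%s' resulted in empty query", row['query'])
                continue
        except Exception as e:
            # The pg_stat_statements query text contains no bound parameters, so logging it is acceptable.
            self.log.warning("Failed to obfuscate query '%s': %s", row['query'], e)
            continue
        normalized_row = dict(row)
        normalized_row['query'] = obfuscated_statement
        normalized_row['query_signature'] = compute_sql_signature(obfuscated_statement)
        normalized.append(normalized_row)
    return normalized
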