Code Example #1
    def _collect_per_statement_metrics(self, db):
        # type: (pymysql.connections.Connection) -> List[Metric]
        metrics = []  # type: List[Metric]

        def keyfunc(row):
            return (row['schema'], row['digest'])

        monotonic_rows = self._query_summary_per_statement(db)
        monotonic_rows = self._merge_duplicate_rows(monotonic_rows,
                                                    key=keyfunc)
        rows = self._state.compute_derivative_rows(monotonic_rows,
                                                   STATEMENT_METRICS.keys(),
                                                   key=keyfunc)
        metrics.append(('dd.mysql.queries.query_rows_raw', len(rows), []))

        rows = generate_synthetic_rows(rows)
        rows = apply_row_limits(
            rows,
            self.config.statement_metrics_limits
            or DEFAULT_STATEMENT_METRICS_LIMITS,
            tiebreaker_metric='count',
            tiebreaker_reverse=True,
            key=keyfunc,
        )
        metrics.append(('dd.mysql.queries.query_rows_limited', len(rows), []))

        for row in rows:
            tags = []
            tags.append('digest:' + row['digest'])
            if row['schema'] is not None:
                tags.append('schema:' + row['schema'])

            try:
                obfuscated_statement = datadog_agent.obfuscate_sql(
                    row['query'])
            except Exception as e:
                self.log.warning("Failed to obfuscate query '%s': %s",
                                 row['query'], e)
                continue
            tags.append('query_signature:' +
                        compute_sql_signature(obfuscated_statement))
            tags.append('query:' +
                        normalize_query_tag(obfuscated_statement).strip())

            for col, name in STATEMENT_METRICS.items():
                value = row[col]
                metrics.append((name, value, tags))

        return metrics
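
The per-interval values in this example come from `compute_derivative_rows`, which diffs the monotonically increasing counters exposed by `performance_schema` between check runs, keyed by `(schema, digest)`. A minimal sketch of that idea follows; it is an illustration only, not the actual `_state` implementation, and the function name, cache shape, and reset handling are assumptions:

# Sketch only: turn monotonic counter rows into per-interval rows, keyed the
# same way as the collector above. The real helper lives on the check's state
# object and its behaviour may differ.
def compute_derivative_rows_sketch(previous_rows, current_rows, metric_columns, key):
    # previous_rows: dict mapping key(row) -> last-seen row, mutated in place
    derived = []
    for row in current_rows:
        k = key(row)
        prev = previous_rows.get(k)
        previous_rows[k] = row
        if prev is None:
            # No baseline yet for this query family; skip it this run.
            continue
        diffs = {col: row[col] - prev[col] for col in metric_columns}
        if any(v < 0 for v in diffs.values()):
            # Counters went backwards (e.g. performance_schema was truncated);
            # treat it as a reset and drop the row for this interval.
            continue
        new_row = dict(row)
        new_row.update(diffs)
        derived.append(new_row)
    return derived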
Code Example #2
    def _collect_per_statement_metrics(self, db):
        metrics = []

        available_columns = self._get_pg_stat_statements_columns(db)
        missing_columns = PG_STAT_STATEMENTS_REQUIRED_COLUMNS - set(
            available_columns)
        if len(missing_columns) > 0:
            self.log.warning(
                'Unable to collect statement metrics because required fields are unavailable: %s',
                ', '.join(list(missing_columns)),
            )
            return metrics

        desired_columns = (list(PG_STAT_STATEMENTS_METRIC_COLUMNS.keys()) +
                           list(PG_STAT_STATEMENTS_OPTIONAL_COLUMNS) +
                           list(PG_STAT_STATEMENTS_TAG_COLUMNS.keys()))
        query_columns = list(
            set(desired_columns) & set(available_columns)
            | set(PG_STAT_STATEMENTS_TAG_COLUMNS.keys()))
        rows = self._execute_query(
            db.cursor(cursor_factory=psycopg2.extras.DictCursor),
            STATEMENTS_QUERY.format(
                cols=', '.join(query_columns),
                pg_stat_statements_view=self.config.pg_stat_statements_view,
                limit=DEFAULT_STATEMENTS_LIMIT,
            ),
            params=(self.config.dbname, ),
        )
        if not rows:
            return metrics

        def row_keyfunc(row):
            # old versions of pg_stat_statements don't have a query ID so fall back to the query string itself
            queryid = row['queryid'] if 'queryid' in row else row['query']
            return (queryid, row['datname'], row['rolname'])

        rows = self._state.compute_derivative_rows(
            rows, PG_STAT_STATEMENTS_METRIC_COLUMNS.keys(), key=row_keyfunc)
        rows = apply_row_limits(rows,
                                DEFAULT_STATEMENT_METRIC_LIMITS,
                                tiebreaker_metric='calls',
                                tiebreaker_reverse=True,
                                key=row_keyfunc)

        for row in rows:
            try:
                normalized_query = datadog_agent.obfuscate_sql(row['query'])
                if not normalized_query:
                    self.log.warning(
                        "Obfuscation of query '%s' resulted in empty query",
                        row['query'])
                    continue
            except Exception as e:
                # If query obfuscation fails, it is acceptable to log the raw query here because the
                # pg_stat_statements table contains no parameters in the raw queries.
                self.log.warning("Failed to obfuscate query '%s': %s",
                                 row['query'], e)
                continue

            query_signature = compute_sql_signature(normalized_query)

            # All "Deep Database Monitoring" statement-level metrics are tagged with a `query_signature`
            # which uniquely identifies the normalized query family. Where possible, this hash should
            # match the hash of APM "resources" (https://docs.datadoghq.com/tracing/visualization/resource/)
            # when the resource is a SQL query. Postgres' query normalization in the `pg_stat_statements` table
            # preserves most of the original query, so we tag the `resource_hash` with the same value as the
            # `query_signature`. The `resource_hash` tag should match the *actual* APM resource hash most of
            # the time, but not always. So this is a best-effort approach to link these metrics to APM metrics.
            tags = [
                'query_signature:' + query_signature,
                'resource_hash:' + query_signature
            ]

            for column, tag_name in PG_STAT_STATEMENTS_TAG_COLUMNS.items():
                if column not in row:
                    continue
                value = row[column]
                if column == 'query':
                    value = normalize_query_tag(normalized_query)
                tags.append('{tag_name}:{value}'.format(tag_name=tag_name,
                                                        value=value))

            for column, metric_name in PG_STAT_STATEMENTS_METRIC_COLUMNS.items():
                if column not in row:
                    continue
                value = row[column]
                if column == 'total_time':
                    # All "Deep Database Monitoring" timing metrics are in nanoseconds
                    # Postgres tracks pg_stat* timing stats in milliseconds
                    value = milliseconds_to_nanoseconds(value)
                metrics.append((metric_name, value, tags))

        return metrics
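
Both the MySQL and Postgres collectors tag rows with `compute_sql_signature(...)` applied to the obfuscated statement. One plausible shape for that helper, shown purely as an illustration (the canonicalization step and hash choice below are assumptions, not the shipped implementation):

import hashlib

# Sketch only: derive a short, stable query_signature from an obfuscated
# statement. The real helper may canonicalize or hash differently.
def compute_sql_signature_sketch(obfuscated_statement):
    # Collapse whitespace so trivially different spellings of the same
    # normalized query map to the same signature.
    canonical = ' '.join(obfuscated_statement.split())
    # A truncated hex digest keeps the tag value short while staying stable.
    return hashlib.md5(canonical.encode('utf-8')).hexdigest()[:16]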
Code Example #3
    def test_apply_row_limits(self):
        def assert_any_order(a, b):
            assert sorted(a, key=lambda row: row['_']) == sorted(
                b, key=lambda row: row['_'])

        rows = [
            {'_': 0, 'count': 2, 'time': 1000},
            {'_': 1, 'count': 20, 'time': 5000},
            {'_': 2, 'count': 20, 'time': 8000},
            {'_': 3, 'count': 180, 'time': 8000},
            {'_': 4, 'count': 0, 'time': 10},
            {'_': 5, 'count': 60, 'time': 500},
            {'_': 6, 'count': 90, 'time': 5000},
            {'_': 7, 'count': 50, 'time': 5000},
            {'_': 8, 'count': 40, 'time': 100},
            {'_': 9, 'count': 30, 'time': 900},
            {'_': 10, 'count': 80, 'time': 800},
            {'_': 11, 'count': 110, 'time': 7000},
        ]
        assert_any_order(
            [],
            apply_row_limits(rows, {'count': (0, 0), 'time': (0, 0)},
                             'count', True, key=lambda row: row['_']))

        expected = [
            {'_': 3, 'count': 180, 'time': 8000},
            {'_': 4, 'count': 0, 'time': 10},  # The bottom 1 row for both 'count' and 'time'
            {'_': 2, 'count': 20, 'time': 8000},
        ]
        assert_any_order(
            expected,
            apply_row_limits(rows, {'count': (1, 1), 'time': (1, 1)},
                             'count', True, key=lambda row: row['_']))

        expected = [
            {'_': 5, 'count': 60, 'time': 500},
            {'_': 10, 'count': 80, 'time': 800},
            {'_': 6, 'count': 90, 'time': 5000},
            {'_': 11, 'count': 110, 'time': 7000},
            {'_': 3, 'count': 180, 'time': 8000},
            {'_': 4, 'count': 0, 'time': 10},
            {'_': 0, 'count': 2, 'time': 1000},
            {'_': 2, 'count': 20, 'time': 8000},
            {'_': 8, 'count': 40, 'time': 100},
        ]
        assert_any_order(
            expected,
            apply_row_limits(rows, {'count': (5, 2), 'time': (2, 2)},
                             'count', True, key=lambda row: row['_']))

        assert_any_order(
            rows,
            apply_row_limits(rows, {'count': (6, 6), 'time': (0, 0)},
                             'time', False, key=lambda row: row['_']),
        )

        assert_any_order(
            rows,
            apply_row_limits(rows, {'count': (0, 0), 'time': (4, 8)},
                             'time', False, key=lambda row: row['_']),
        )

        assert_any_order(
            rows,
            apply_row_limits(rows, {'count': (20, 20), 'time': (12, 5)},
                             'time', False, key=lambda row: row['_']),
        )
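
Code Example #3 exercises `apply_row_limits`. The behaviour the assertions encode is: for each metric, keep the top-k and bottom-k rows ranked by that metric (ties broken by the tiebreaker metric, optionally reversed), then return the union of everything kept. The sketch below is consistent with those assertions but is an illustration of the contract, not the shipped implementation:

# Sketch only: keep the top-k and bottom-k rows per metric and union the
# survivors. A dict keyed by key(row) de-duplicates rows kept by several metrics.
def apply_row_limits_sketch(rows, metric_limits, tiebreaker_metric,
                            tiebreaker_reverse, key):
    # metric_limits maps metric name -> (top_k, bottom_k)
    if not rows:
        return rows
    kept = {}
    tiebreak_sign = -1 if tiebreaker_reverse else 1
    for metric, (top_k, bottom_k) in metric_limits.items():
        ordered = sorted(
            rows,
            key=lambda r: (r[metric], tiebreak_sign * r[tiebreaker_metric]),
        )
        bottom = ordered[:bottom_k]
        top = ordered[max(0, len(ordered) - top_k):] if top_k else []
        for row in bottom + top:
            kept[key(row)] = row
    return list(kept.values())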
Code Example #4
    def _collect_per_statement_metrics(self, db):
        metrics = []

        available_columns = self._get_pg_stat_statements_columns(db)
        missing_columns = PG_STAT_STATEMENTS_REQUIRED_COLUMNS - set(
            available_columns)
        if len(missing_columns) > 0:
            self.log.warning(
                'Unable to collect statement metrics because required fields are unavailable: %s',
                ', '.join(list(missing_columns)),
            )
            return metrics

        desired_columns = (list(PG_STAT_STATEMENTS_METRIC_COLUMNS.keys()) +
                           list(PG_STAT_STATEMENTS_OPTIONAL_COLUMNS) +
                           list(PG_STAT_STATEMENTS_TAG_COLUMNS.keys()))
        query_columns = list(
            set(desired_columns) & set(available_columns)
            | set(PG_STAT_STATEMENTS_TAG_COLUMNS.keys()))
        rows = self._execute_query(
            db.cursor(cursor_factory=psycopg2.extras.DictCursor),
            STATEMENTS_QUERY.format(
                cols=', '.join(query_columns),
                pg_stat_statements_view=self.config.pg_stat_statements_view,
                limit=DEFAULT_STATEMENTS_LIMIT,
            ),
            params=(self.config.dbname, ),
        )
        if not rows:
            return metrics

        def row_keyfunc(row):
            return (row['query_signature'], row['datname'], row['rolname'])

        rows = self._normalize_queries(rows)
        rows = self._state.compute_derivative_rows(
            rows, PG_STAT_STATEMENTS_METRIC_COLUMNS.keys(), key=row_keyfunc)
        metrics.append(('dd.postgres.queries.query_rows_raw', len(rows), []))

        rows = generate_synthetic_rows(rows)
        rows = apply_row_limits(
            rows,
            self.config.statement_metrics_limits
            or DEFAULT_STATEMENT_METRICS_LIMITS,
            tiebreaker_metric='calls',
            tiebreaker_reverse=True,
            key=row_keyfunc,
        )
        metrics.append(
            ('dd.postgres.queries.query_rows_limited', len(rows), []))

        for row in rows:
            # All "Deep Database Monitoring" statement-level metrics are tagged with a `query_signature`
            # which uniquely identifies the normalized query family. Where possible, this hash should
            # match the hash of APM "resources" (https://docs.datadoghq.com/tracing/visualization/resource/)
            # when the resource is a SQL query. Postgres' query normalization in the `pg_stat_statements` table
            # preserves most of the original query, so we tag the `resource_hash` with the same value as the
            # `query_signature`. The `resource_hash` tag should match the *actual* APM resource hash most of
            # the time, but not always. So this is a best-effort approach to link these metrics to APM metrics.
            tags = [
                'query_signature:' + row['query_signature'],
                'resource_hash:' + row['query_signature']
            ]

            for column, tag_name in PG_STAT_STATEMENTS_TAG_COLUMNS.items():
                if column not in row:
                    continue
                value = row[column]
                if column == 'query':
                    value = normalize_query_tag(row['query'])
                tags.append('{tag_name}:{value}'.format(tag_name=tag_name,
                                                        value=value))

            for column, metric_name in PG_STAT_STATEMENTS_METRIC_COLUMNS.items():
                if column not in row:
                    continue
                value = row[column]
                if column == 'total_time':
                    # All "Deep Database Monitoring" timing metrics are in nanoseconds
                    # Postgres tracks pg_stat* timing stats in milliseconds
                    value = milliseconds_to_nanoseconds(value)
                metrics.append((metric_name, value, tags))

        return metrics
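
Code Example #4 moves obfuscation out of the row loop into `self._normalize_queries(rows)` and keys the derivative computation on the resulting `query_signature`. A sketch of what that step plausibly does, mirroring the inline handling from Code Example #2 (the exact body is an assumption, not the shipped implementation):

    def _normalize_queries(self, rows):
        # Sketch of the referenced helper: obfuscate each raw query, attach a
        # stable query_signature, and drop rows whose query cannot be obfuscated.
        normalized = []
        for row in rows:
            try:
                obfuscated = datadog_agent.obfuscate_sql(row['query'])
            except Exception as e:
                # Safe to log the raw query: pg_stat_statements stores queries
                # without bound parameters.
                self.log.warning("Failed to obfuscate query '%s': %s",
                                 row['query'], e)
                continue
            normalized_row = dict(row)
            normalized_row['query'] = obfuscated
            normalized_row['query_signature'] = compute_sql_signature(obfuscated)
            normalized.append(normalized_row)
        return normalized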