def collect_per_statement_metrics(self):
    # exclude the default "db" tag from statement metrics & FQT events because this data is collected from
    # all databases on the host. For metrics the "db" tag is added during ingestion based on which database
    # each query came from.
    try:
        rows = self._collect_metrics_rows()
        if not rows:
            return
        for event in self._rows_to_fqt_events(rows):
            self._check.database_monitoring_query_sample(json.dumps(event, default=default_json_event_encoding))
        # truncate query text to the maximum length supported by metrics tags
        for row in rows:
            row['query'] = row['query'][0:200]
        payload = {
            'host': self._check.resolved_hostname,
            'timestamp': time.time() * 1000,
            'min_collection_interval': self._metrics_collection_interval,
            'tags': self._tags_no_db,
            'cloud_metadata': self._config.cloud_metadata,
            'postgres_rows': rows,
            'postgres_version': self._payload_pg_version(),
            'ddagentversion': datadog_agent.get_version(),
            "ddagenthostname": self._check.agent_hostname,
        }
        self._check.database_monitoring_query_metrics(json.dumps(payload, default=default_json_event_encoding))
    except Exception:
        self._log.exception('Unable to collect statement metrics due to an error')
        return []
def collect_per_statement_metrics(self, db, db_version, tags):
    try:
        rows = self._collect_metrics_rows(db)
        if not rows:
            return
        for event in self._rows_to_fqt_events(rows, tags):
            self._check.database_monitoring_query_sample(json.dumps(event, default=default_json_event_encoding))
        # truncate query text to the maximum length supported by metrics tags
        for row in rows:
            row['query'] = row['query'][0:200]
        payload = {
            'host': self._db_hostname_cached(),
            'timestamp': time.time() * 1000,
            'min_collection_interval': self._config.min_collection_interval,
            'tags': tags,
            'postgres_rows': rows,
            'postgres_version': 'v{major}.{minor}.{patch}'.format(
                major=db_version.major, minor=db_version.minor, patch=db_version.patch
            ),
        }
        self._check.database_monitoring_query_metrics(json.dumps(payload, default=default_json_event_encoding))
    except Exception:
        db.rollback()
        self._log.exception('Unable to collect statement metrics due to an error')
        return []
def collect_per_statement_metrics(self, db, tags):
    # type: (pymysql.connections.Connection, List[str]) -> None
    try:
        rows = self._collect_per_statement_metrics(db)
        if not rows:
            return
        for event in self._rows_to_fqt_events(rows, tags):
            self._check.database_monitoring_query_sample(json.dumps(event, default=default_json_event_encoding))
        # truncate query text to the maximum length supported by metrics tags
        for row in rows:
            row['digest_text'] = row['digest_text'][0:200]
        payload = {
            'host': self._db_hostname_cached(),
            'timestamp': time.time() * 1000,
            'min_collection_interval': self._config.min_collection_interval,
            'tags': tags,
            'mysql_rows': rows,
        }
        self._check.database_monitoring_query_metrics(json.dumps(payload, default=default_json_event_encoding))
    except Exception:
        self.log.exception('Unable to collect statement metrics due to an error')
def collect_per_statement_metrics(self):
    # Detect a database misconfiguration by checking if the performance schema is enabled since mysql
    # just returns no rows without errors if the performance schema is disabled
    if not self._check.performance_schema_enabled:
        self._check.record_warning(
            DatabaseConfigurationError.performance_schema_not_enabled,
            warning_with_tags(
                'Unable to collect statement metrics because the performance schema is disabled. '
                'See https://docs.datadoghq.com/database_monitoring/setup_mysql/'
                'troubleshooting#%s for more details',
                DatabaseConfigurationError.performance_schema_not_enabled.value,
                code=DatabaseConfigurationError.performance_schema_not_enabled.value,
                host=self._check.resolved_hostname,
            ),
        )
        return

    rows = self._collect_per_statement_metrics()
    if not rows:
        return

    for event in self._rows_to_fqt_events(rows):
        self._check.database_monitoring_query_sample(json.dumps(event, default=default_json_event_encoding))

    # truncate query text to the maximum length supported by metrics tags
    for row in rows:
        row['digest_text'] = row['digest_text'][0:200] if row['digest_text'] is not None else None

    payload = {
        'host': self._check.resolved_hostname,
        'timestamp': time.time() * 1000,
        'mysql_version': self._check.version.version + '+' + self._check.version.build,
        'mysql_flavor': self._check.version.flavor,
        "ddagenthostname": self._check.agent_hostname,
        'ddagentversion': datadog_agent.get_version(),
        'min_collection_interval': self._metric_collection_interval,
        'tags': self._tags,
        'cloud_metadata': self._config.cloud_metadata,
        'mysql_rows': rows,
    }
    self._check.database_monitoring_query_metrics(json.dumps(payload, default=default_json_event_encoding))
def _collect_statement_samples(self):
    start_time = time.time()
    pg_activity_cols = self._get_pg_stat_activity_cols_cached(PG_STAT_ACTIVITY_COLS)
    rows = self._get_new_pg_stat_activity(pg_activity_cols)
    rows = self._filter_and_normalize_statement_rows(rows)
    event_samples = self._collect_plans(rows)
    submitted_count = 0
    for e in event_samples:
        self._check.database_monitoring_query_sample(json.dumps(e, default=default_json_event_encoding))
        submitted_count += 1

    if self._report_activity_event():
        active_connections = self._get_active_connections()
        activity_event = self._create_activity_event(rows, active_connections)
        self._check.database_monitoring_query_activity(
            json.dumps(activity_event, default=default_json_event_encoding)
        )
        self._check.histogram(
            "dd.postgres.collect_activity_snapshot.time", (time.time() - start_time) * 1000, tags=self._tags
        )

    elapsed_ms = (time.time() - start_time) * 1000
    self._check.histogram(
        "dd.postgres.collect_statement_samples.time",
        elapsed_ms,
        tags=self._tags + self._check._get_debug_tags(),
        hostname=self._check.resolved_hostname,
    )
    self._check.count(
        "dd.postgres.collect_statement_samples.events_submitted.count",
        submitted_count,
        tags=self._tags + self._check._get_debug_tags(),
        hostname=self._check.resolved_hostname,
    )
    self._check.gauge(
        "dd.postgres.collect_statement_samples.seen_samples_cache.len",
        len(self._seen_samples_ratelimiter),
        tags=self._tags + self._check._get_debug_tags(),
        hostname=self._check.resolved_hostname,
    )
    self._check.gauge(
        "dd.postgres.collect_statement_samples.explained_statements_cache.len",
        len(self._explained_statements_ratelimiter),
        tags=self._tags + self._check._get_debug_tags(),
        hostname=self._check.resolved_hostname,
    )
    self._check.gauge(
        "dd.postgres.collect_statement_samples.explain_errors_cache.len",
        len(self._explain_errors_cache),
        tags=self._tags + self._check._get_debug_tags(),
        hostname=self._check.resolved_hostname,
    )
def __init__(self, check, config, shutdown_callback):
    collection_interval = float(
        config.statement_metrics_config.get('collection_interval', DEFAULT_COLLECTION_INTERVAL)
    )
    if collection_interval <= 0:
        collection_interval = DEFAULT_COLLECTION_INTERVAL
    super(PostgresStatementMetrics, self).__init__(
        check,
        run_sync=is_affirmative(config.statement_metrics_config.get('run_sync', False)),
        enabled=is_affirmative(config.statement_metrics_config.get('enabled', True)),
        expected_db_exceptions=(psycopg2.errors.DatabaseError,),
        min_collection_interval=config.min_collection_interval,
        dbms="postgres",
        rate_limit=1 / float(collection_interval),
        job_name="query-metrics",
        shutdown_callback=shutdown_callback,
    )
    self._metrics_collection_interval = collection_interval
    self._config = config
    self._state = StatementMetrics()
    self._stat_column_cache = []
    self._obfuscate_options = to_native_string(json.dumps(self._config.obfuscator_options))
    # full_statement_text_cache: limit the ingestion rate of full statement text events per query_signature
    self._full_statement_text_cache = TTLCache(
        maxsize=config.full_statement_text_cache_max_size,
        ttl=60 * 60 / config.full_statement_text_samples_per_hour_per_query,
    )
def refresh_scrapers(self):
    # Create assume_role credentials if assume_role ARN is specified in config
    assume_role = self.config.assume_role
    try:
        if assume_role:
            self.log.info('Assume role %s found. Creating temporary credentials using role...', assume_role)
            sts = boto3.client('sts')
            response = sts.assume_role(
                RoleArn=assume_role, RoleSessionName='dd-msk-check-session', DurationSeconds=3600
            )
            access_key_id = response['Credentials']['AccessKeyId']
            secret_access_key = response['Credentials']['SecretAccessKey']
            session_token = response['Credentials']['SessionToken']
            client = boto3.client(
                'kafka',
                aws_access_key_id=access_key_id,
                aws_secret_access_key=secret_access_key,
                aws_session_token=session_token,
                config=self._boto_config,
                region_name=self._region_name,
            )
        else:
            # Always create a new client to account for changes in auth
            client = boto3.client(
                'kafka',
                config=self._boto_config,
                region_name=self._region_name,
            )
        response = client.list_nodes(ClusterArn=self.config.cluster_arn)
        self.log.debug('Received list_nodes response: %s', json.dumps(response))
    except Exception as e:
        self.service_check(self.SERVICE_CHECK_CONNECT, self.CRITICAL, message=str(e), tags=self._service_check_tags)
        raise
    else:
        self.service_check(self.SERVICE_CHECK_CONNECT, self.OK, tags=self._service_check_tags)

    scrapers = {}
    for node_info in response['NodeInfoList']:
        broker_info = node_info['BrokerNodeInfo']
        broker_id_tag = f'broker_id:{broker_info["BrokerId"]}'

        for endpoint in broker_info['Endpoints']:
            for port, metrics in self._exporter_data:
                if port:
                    url = f'{self._endpoint_prefix}://{endpoint}:{port}{self.config.prometheus_metrics_path}'
                    if url in self.scrapers:
                        scrapers[url] = self.scrapers[url]
                        continue

                    scraper = self.create_scraper(
                        {'openmetrics_endpoint': url, 'metrics': metrics, **self.instance}
                    )
                    scraper.static_tags += self._static_tags
                    scraper.set_dynamic_tags(broker_id_tag)
                    self.configure_additional_transformers(scraper.metric_transformer.transformer_data)
                    scrapers[url] = scraper

    self.scrapers = scrapers
def query():
    connection = None
    for string in sys.stdin:
        if connection is None:
            connection_string = string.strip()
            try:
                connection = pyodbc.connect(connection_string)
            except Exception as e:
                print("{}".format(e), file=sys.stderr, flush=True)
                # Make the next query end immediately and fetch the error
                print('ENDOFQUERY', flush=True)
        else:
            query = string.strip()
            try:
                rows = []
                with closing(connection.execute(query)) as c:
                    rows = c.fetchall()
                for row in rows:
                    # json.dumps returns a str on Python 3, so it is printed directly
                    # (the original `.decode("utf-8")` call would raise AttributeError here)
                    print(
                        json.dumps([item if item is None else str(item) for item in row]),
                        flush=True,
                    )
            except Exception as e:
                print("{}".format(e), file=sys.stderr, flush=True)
            print('ENDOFQUERY', flush=True)
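# A minimal driver sketch for the stdin/stdout protocol implemented by query() above,
# assuming it runs as a separate child process: the first line written to its stdin is
# the ODBC connection string, each following line is a query, and the parent reads
# JSON-encoded rows until the 'ENDOFQUERY' sentinel. The entry point "query_loop.py"
# and the connection string below are hypothetical names used only for illustration.
import json
import subprocess

def run_query(proc, sql):
    # Send one query, then collect JSON-encoded rows until the ENDOFQUERY sentinel.
    proc.stdin.write(sql + "\n")
    proc.stdin.flush()
    rows = []
    while True:
        line = proc.stdout.readline().strip()
        if line == 'ENDOFQUERY' or line == '':
            break
        rows.append(json.loads(line))
    return rows

proc = subprocess.Popen(
    ["python", "query_loop.py"],  # hypothetical script wrapping query()
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    text=True,
)
proc.stdin.write("DRIVER={ODBC Driver 18 for SQL Server};SERVER=localhost;UID=sa;PWD=...\n")
proc.stdin.flush()
print(run_query(proc, "SELECT 1"))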
def __init__(self, check, config, connection_args):
    self.collection_interval = float(
        config.activity_config.get("collection_interval", MySQLActivity.DEFAULT_COLLECTION_INTERVAL)
    )
    if self.collection_interval <= 0:
        self.collection_interval = MySQLActivity.DEFAULT_COLLECTION_INTERVAL
    super(MySQLActivity, self).__init__(
        check,
        run_sync=is_affirmative(config.activity_config.get("run_sync", False)),
        enabled=is_affirmative(config.activity_config.get("enabled", True)),
        expected_db_exceptions=(pymysql.err.OperationalError, pymysql.err.InternalError),
        min_collection_interval=config.min_collection_interval,
        dbms="mysql",
        rate_limit=1 / float(self.collection_interval),
        job_name="query-activity",
        shutdown_callback=self._close_db_conn,
    )
    self._check = check
    self._config = config
    self._log = check.log
    self._connection_args = connection_args
    self._db = None
    self._db_version = None
    self._obfuscator_options = to_native_string(json.dumps(self._config.obfuscator_options))
def submit_events(self, events):
    """
    Submit the statement sample events to the event intake
    :return: submitted_count, failed_count
    """
    submitted_count = 0
    failed_count = 0
    for chunk in _chunks(events, 100):
        for http, url in self._endpoints:
            try:
                r = http.request(
                    'post',
                    url,
                    data=json.dumps(chunk, cls=EventEncoder),
                    timeout=5,
                    headers={'Content-Type': 'application/json'},
                )
                r.raise_for_status()
                logger.debug("Submitted %s statement samples to %s", len(chunk), url)
                submitted_count += len(chunk)
            except requests.HTTPError as e:
                logger.warning("Failed to submit statement samples to %s: %s", url, e)
                failed_count += len(chunk)
            except Exception:
                logger.exception("Failed to submit statement samples to %s", url)
                failed_count += len(chunk)
    return submitted_count, failed_count
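# The _chunks helper used above is not shown in this excerpt. A minimal sketch of what
# it is assumed to do: split the event iterable into batches of at most `n` items so
# each POST body stays a bounded size.
from itertools import islice

def _chunks(items, n):
    # Yield successive lists of up to n items from any iterable.
    it = iter(items)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            return
        yield chunk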
def _collect_statement_samples(self):
    self._rate_limiter.sleep()
    start_time = time.time()
    rows = self._get_new_pg_stat_activity()
    rows = self._filter_valid_statement_rows(rows)
    events = self._explain_pg_stat_activity(rows)
    submitted_count = 0
    for e in events:
        self._check.database_monitoring_query_sample(json.dumps(e, default=default_json_event_encoding))
        submitted_count += 1
    elapsed_ms = (time.time() - start_time) * 1000
    self._check.histogram("dd.postgres.collect_statement_samples.time", elapsed_ms, tags=self._tags)
    self._check.count(
        "dd.postgres.collect_statement_samples.events_submitted.count", submitted_count, tags=self._tags
    )
    self._check.gauge(
        "dd.postgres.collect_statement_samples.seen_samples_cache.len",
        len(self._seen_samples_cache),
        tags=self._tags,
    )
    self._check.gauge(
        "dd.postgres.collect_statement_samples.explained_statements_cache.len",
        len(self._explained_statements_cache),
        tags=self._tags,
    )
def __init__(self, check, config, connection_args):
    # (MySql, MySQLConfig) -> None
    collection_interval = float(config.statement_metrics_config.get('collection_interval', 10))
    if collection_interval <= 0:
        collection_interval = 10
    super(MySQLStatementMetrics, self).__init__(
        check,
        rate_limit=1 / float(collection_interval),
        run_sync=is_affirmative(config.statement_metrics_config.get('run_sync', False)),
        enabled=is_affirmative(config.statement_metrics_config.get('enabled', True)),
        expected_db_exceptions=(pymysql.err.DatabaseError,),
        min_collection_interval=config.min_collection_interval,
        dbms="mysql",
        job_name="statement-metrics",
        shutdown_callback=self._close_db_conn,
    )
    self._metric_collection_interval = collection_interval
    self._connection_args = connection_args
    self._db = None
    self._config = config
    self.log = get_check_logger()
    self._state = StatementMetrics()
    self._obfuscate_options = to_native_string(json.dumps(self._config.obfuscator_options))
    # full_statement_text_cache: limit the ingestion rate of full statement text events per query_signature
    self._full_statement_text_cache = TTLCache(
        maxsize=self._config.full_statement_text_cache_max_size,
        ttl=60 * 60 / self._config.full_statement_text_samples_per_hour_per_query,
    )  # type: TTLCache
def compute_exec_plan_signature(normalized_json_plan):
    """
    Given an already normalized json string query execution plan, generate its 64-bit hex signature.
    TODO: try to push this logic into the agent go code to avoid the two extra json serialization steps here
    """
    if not normalized_json_plan:
        return None
    with_sorted_keys = json.dumps(json.loads(normalized_json_plan), **sort_keys_kwargs)
    return format(mmh3.hash64(with_sorted_keys, signed=False)[0], 'x')
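# A minimal usage sketch for compute_exec_plan_signature above. The plan JSON strings
# are made up for illustration, and this assumes the module-level `sort_keys_kwargs`
# enables key sorting (e.g. {'sort_keys': True} for the stdlib json module), so plans
# that differ only in key order produce the same 64-bit hex signature.
plan_a = '{"Plan": {"Node Type": "Seq Scan", "Relation Name": "users"}}'
plan_b = '{"Plan": {"Relation Name": "users", "Node Type": "Seq Scan"}}'
assert compute_exec_plan_signature(plan_a) == compute_exec_plan_signature(plan_b)
assert compute_exec_plan_signature(None) is None  # empty input short-circuits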
def _collect_statement_samples(self):
    self._read_version_info()
    self._log.debug("collecting statement samples")
    events_statements_table, collection_interval = self._get_sample_collection_strategy()
    if not events_statements_table:
        return
    self._set_rate_limit(1.0 / collection_interval)

    start_time = time.time()
    rows = self._get_new_events_statements(events_statements_table, self._events_statements_row_limit)
    rows = self._filter_valid_statement_rows(rows)
    events = self._collect_plans_for_statements(rows)
    submitted_count = 0
    tags = (
        self._tags
        + ["events_statements_table:{}".format(events_statements_table)]
        + self._check._get_debug_tags()
    )
    for e in events:
        self._check.database_monitoring_query_sample(json.dumps(e, default=default_json_event_encoding))
        submitted_count += 1
    self._check.histogram(
        "dd.mysql.collect_statement_samples.time",
        (time.time() - start_time) * 1000,
        tags=tags,
        hostname=self._check.resolved_hostname,
    )
    self._check.count(
        "dd.mysql.collect_statement_samples.events_submitted.count",
        submitted_count,
        tags=tags,
        hostname=self._check.resolved_hostname,
    )
    self._check.gauge(
        "dd.mysql.collect_statement_samples.seen_samples_cache.len",
        len(self._seen_samples_ratelimiter),
        tags=tags,
        hostname=self._check.resolved_hostname,
    )
    self._check.gauge(
        "dd.mysql.collect_statement_samples.explained_statements_cache.len",
        len(self._explained_statements_ratelimiter),
        tags=tags,
        hostname=self._check.resolved_hostname,
    )
    self._check.gauge(
        "dd.mysql.collect_statement_samples.collection_strategy_cache.len",
        len(self._collection_strategy_cache),
        tags=tags,
        hostname=self._check.resolved_hostname,
    )
def obfuscate_sql(self, query, options=None):
    # Full obfuscation implementation is in go code.
    if options:
        # Options provided is a JSON string because the Go stub requires it, whereas
        # the python stub does not for things such as testing.
        if json.loads(options).get('return_json_metadata', False):
            return json.dumps({'query': re.sub(r'\s+', ' ', query or '').strip(), 'metadata': {}})
    return re.sub(r'\s+', ' ', query or '').strip()
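# A short usage sketch for the python stub above. `AgentStub` is an assumed class name
# used only for illustration (the snippet shows just the method): with
# return_json_metadata set in the JSON options string, callers get a JSON document
# containing the whitespace-collapsed query and an empty metadata object; otherwise
# they get the collapsed query string directly.
import json

stub = AgentStub()
raw = "SELECT *\n  FROM   users\n WHERE  id = 1"
print(stub.obfuscate_sql(raw))
# SELECT * FROM users WHERE id = 1
print(stub.obfuscate_sql(raw, options=json.dumps({'return_json_metadata': True})))
# {"query": "SELECT * FROM users WHERE id = 1", "metadata": {}}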
def _collect_activity(self):
    # type: () -> None
    with closing(self._get_db_connection().cursor(pymysql.cursors.DictCursor)) as cursor:
        connections = self._get_active_connections(cursor)
        rows = self._get_activity(cursor)
        rows = self._normalize_rows(rows)
        event = self._create_activity_event(rows, connections)
        payload = json.dumps(event, default=self._json_event_encoding)
        self._check.database_monitoring_query_activity(payload)
        self._check.histogram(
            "dd.mysql.activity.collect_activity.payload_size",
            len(payload),
            tags=self._tags + self._check._get_debug_tags(),
        )
def collect_statement_metrics_and_plans(self):
    """
    Collects statement metrics and plans.
    :return:
    """
    plans_submitted = 0
    deadline = time.time() + self.collection_interval

    # re-use the check's conn module, but set extra_key=dbm- to ensure we get our own
    # raw connection. adodbapi and pyodbc modules are thread safe, but connections are not.
    with self.check.connection.open_managed_default_connection(key_prefix=self._conn_key_prefix):
        with self.check.connection.get_managed_cursor(key_prefix=self._conn_key_prefix) as cursor:
            rows = self._collect_metrics_rows(cursor)
            if not rows:
                return
            for event in self._rows_to_fqt_events(rows):
                self.check.database_monitoring_query_sample(json.dumps(event, default=default_json_event_encoding))
            payload = self._to_metrics_payload(rows)
            self.check.database_monitoring_query_metrics(json.dumps(payload, default=default_json_event_encoding))
            for event in self._collect_plans(rows, cursor, deadline):
                self.check.database_monitoring_query_sample(json.dumps(event, default=default_json_event_encoding))
                plans_submitted += 1

    self.check.count(
        "dd.sqlserver.statements.plans_submitted.count", plans_submitted, **self.check.debug_stats_kwargs()
    )
    self.check.gauge(
        "dd.sqlserver.statements.seen_plans_cache.len",
        len(self._seen_plans_ratelimiter),
        **self.check.debug_stats_kwargs()
    )
    self.check.gauge(
        "dd.sqlserver.statements.fqt_cache.len",
        len(self._full_statement_text_cache),
        **self.check.debug_stats_kwargs()
    )
def collect_per_statement_metrics(self, db, tags):
    try:
        rows = self._collect_metrics_rows(db)
        if not rows:
            return
        payload = {
            'host': self._db_hostname_cached(),
            'timestamp': time.time() * 1000,
            'min_collection_interval': self._config.min_collection_interval,
            'tags': tags,
            'postgres_rows': rows,
        }
        self._check.database_monitoring_query_metrics(json.dumps(payload, default=default_json_event_encoding))
    except Exception:
        db.rollback()
        self._log.exception('Unable to collect statement metrics due to an error')
        return []
def _collect_statement_samples(self):
    self._log.debug("collecting statement samples")
    self._rate_limiter.sleep()
    events_statements_table, rate_limit = self._get_sample_collection_strategy()
    if not events_statements_table:
        return
    if self._rate_limiter.rate_limit_s != rate_limit:
        self._rate_limiter = ConstantRateLimiter(rate_limit)
    start_time = time.time()
    tags = self._tags + ["events_statements_table:{}".format(events_statements_table)]
    rows = self._get_new_events_statements(events_statements_table, self._events_statements_row_limit)
    rows = self._filter_valid_statement_rows(rows)
    events = self._collect_plans_for_statements(rows)
    submitted_count = 0
    for e in events:
        self._check.database_monitoring_query_sample(json.dumps(e, default=default_json_event_encoding))
        submitted_count += 1
    self._check.histogram("dd.mysql.collect_statement_samples.time", (time.time() - start_time) * 1000, tags=tags)
    self._check.count("dd.mysql.collect_statement_samples.events_submitted.count", submitted_count, tags=tags)
    self._check.gauge(
        "dd.mysql.collect_statement_samples.seen_samples_cache.len", len(self._seen_samples_cache), tags=tags
    )
    self._check.gauge(
        "dd.mysql.collect_statement_samples.explained_statements_cache.len",
        len(self._explained_statements_cache),
        tags=tags,
    )
    self._check.gauge(
        "dd.mysql.collect_statement_samples.collection_strategy_cache.len",
        len(self._collection_strategy_cache),
        tags=tags,
    )
def collect_per_statement_metrics(self, db, tags):
    # type: (pymysql.connections.Connection, List[str]) -> None
    try:
        rows = self._collect_per_statement_metrics(db)
        if not rows:
            return
        payload = {
            'host': self._db_hostname_cached(),
            'timestamp': time.time() * 1000,
            'min_collection_interval': self._config.min_collection_interval,
            'tags': tags,
            'mysql_rows': rows,
        }
        self._check.database_monitoring_query_metrics(json.dumps(payload, default=default_json_event_encoding))
    except Exception:
        self.log.exception('Unable to collect statement metrics due to an error')
def collect_activity(self):
    """
    Collects all current activity for the SQLServer instance.
    :return:
    """
    # re-use the check's conn module, but set extra_key=dbm-activity- to ensure we get our own
    # raw connection. adodbapi and pyodbc modules are thread safe, but connections are not.
    with self.check.connection.open_managed_default_connection(key_prefix=self._conn_key_prefix):
        with self.check.connection.get_managed_cursor(key_prefix=self._conn_key_prefix) as cursor:
            connections = self._get_active_connections(cursor)
            request_cols = self._get_exec_requests_cols_cached(cursor, DM_EXEC_REQUESTS_COLS)
            rows = self._get_activity(cursor, request_cols)
            normalized_rows = self._normalize_queries_and_filter_rows(rows, MAX_PAYLOAD_BYTES)
            event = self._create_activity_event(normalized_rows, connections)
            payload = json.dumps(event, default=default_json_event_encoding)
            self._check.database_monitoring_query_activity(payload)

    self.check.histogram(
        "dd.sqlserver.activity.collect_activity.payload_size", len(payload), **self.check.debug_stats_kwargs()
    )
for i in range(5, 10):
    assert cache.acquire(i), "cache should be empty again so these keys should go in OK"


class TestDBExcepption(BaseException):
    pass


@pytest.mark.parametrize(
    "obfuscator_return_value,expected_value",
    [
        (
            json.dumps(
                {
                    'query': 'SELECT * FROM datadog',
                    'metadata': {'tables_csv': 'datadog,', 'commands': ['SELECT'], 'comments': None},
                }
            ),
            {
                'query': 'SELECT * FROM datadog',
                'metadata': {'commands': ['SELECT'], 'comments': None, 'tables': ['datadog']},
            },
        ),
        (
            # Whitespace test
            " {\"query\":\"SELECT * FROM datadog\",\"metadata\":{\"tables_csv\":\"datadog\",\"commands\":[\"SELECT\"],"
            "\"comments\":null}} ",
            {
                'query': 'SELECT * FROM datadog',
                'metadata': {'commands': ['SELECT'], 'comments': None, 'tables': ['datadog']},
            },
def _obfuscate_sql(sql_query, options=None):
    # `metadata` is not defined in this snippet; it is expected to be provided by the
    # enclosing (test) scope that builds this helper.
    return json.dumps({'query': sql_query, 'metadata': metadata})
def submit_events(self, events):
    events = list(events)
    self._payloads.append(json.dumps(events, default=default_encoding))
    return len(events), 0
def __init__(self, check, config, connection_args):
    collection_interval = float(config.statement_metrics_config.get('collection_interval', 1))
    if collection_interval <= 0:
        collection_interval = 1
    super(MySQLStatementSamples, self).__init__(
        check,
        rate_limit=1 / collection_interval,
        run_sync=is_affirmative(config.statement_samples_config.get('run_sync', False)),
        enabled=is_affirmative(config.statement_samples_config.get('enabled', True)),
        min_collection_interval=config.min_collection_interval,
        dbms="mysql",
        expected_db_exceptions=(pymysql.err.DatabaseError,),
        job_name="statement-samples",
        shutdown_callback=self._close_db_conn,
    )
    self._config = config
    self._version_processed = False
    self._connection_args = connection_args
    # checkpoint at zero so we pull the whole history table on the first run
    self._checkpoint = 0
    self._last_check_run = 0
    self._db = None
    self._configured_collection_interval = self._config.statement_samples_config.get('collection_interval', -1)
    self._events_statements_row_limit = self._config.statement_samples_config.get(
        'events_statements_row_limit', 5000
    )
    self._explain_procedure = self._config.statement_samples_config.get('explain_procedure', 'explain_statement')
    self._fully_qualified_explain_procedure = self._config.statement_samples_config.get(
        'fully_qualified_explain_procedure', 'datadog.explain_statement'
    )
    self._events_statements_temp_table = self._config.statement_samples_config.get(
        'events_statements_temp_table_name', 'datadog.temp_events'
    )
    self._events_statements_enable_procedure = self._config.statement_samples_config.get(
        'events_statements_enable_procedure', 'datadog.enable_events_statements_consumers'
    )
    self._preferred_events_statements_tables = EVENTS_STATEMENTS_PREFERRED_TABLES
    self._has_window_functions = False
    events_statements_table = self._config.statement_samples_config.get('events_statements_table', None)
    if events_statements_table:
        if events_statements_table in DEFAULT_EVENTS_STATEMENTS_COLLECTION_INTERVAL:
            self._log.debug("Configured preferred events_statements_table: %s", events_statements_table)
            self._preferred_events_statements_tables = [events_statements_table]
        else:
            self._log.warning(
                "Invalid events_statements_table: %s. Must be one of %s. Falling back to trying all tables.",
                events_statements_table,
                ', '.join(DEFAULT_EVENTS_STATEMENTS_COLLECTION_INTERVAL.keys()),
            )
    self._explain_strategies = {
        'PROCEDURE': self._run_explain_procedure,
        'FQ_PROCEDURE': self._run_fully_qualified_explain_procedure,
        'STATEMENT': self._run_explain,
    }
    self._preferred_explain_strategies = ['PROCEDURE', 'FQ_PROCEDURE', 'STATEMENT']
    self._obfuscate_options = to_native_string(json.dumps(self._config.obfuscator_options))
    self._init_caches()
def _collect_plan_for_statement(self, row):
    # limit the rate of explains done to the database
    cache_key = (row['datname'], row['query_signature'])
    if not self._explained_statements_ratelimiter.acquire(cache_key):
        return None

    # Plans have several important signatures to tag events with. Note that for postgres, the
    # query_signature and resource_hash will be the same value.
    # - `plan_signature` - hash computed from the normalized JSON plan to group identical plan trees
    # - `resource_hash` - hash computed off the raw sql text to match apm resources
    # - `query_signature` - hash computed from the raw sql text to match query metrics
    plan_dict, explain_err_code, err_msg = self._run_and_track_explain(
        row['datname'], row['query'], row['statement'], row['query_signature']
    )
    collection_errors = None
    if explain_err_code:
        collection_errors = [{'code': explain_err_code.value, 'message': err_msg if err_msg else None}]

    plan, normalized_plan, obfuscated_plan, plan_signature = None, None, None, None
    if plan_dict:
        plan = json.dumps(plan_dict)
        # if we're using the orjson implementation then json.dumps returns bytes
        plan = plan.decode('utf-8') if isinstance(plan, bytes) else plan
        normalized_plan = datadog_agent.obfuscate_sql_exec_plan(plan, normalize=True)
        obfuscated_plan = datadog_agent.obfuscate_sql_exec_plan(plan)
        plan_signature = compute_exec_plan_signature(normalized_plan)

    statement_plan_sig = (row['query_signature'], plan_signature)
    if self._seen_samples_ratelimiter.acquire(statement_plan_sig):
        event = {
            "host": self._check.resolved_hostname,
            "ddagentversion": datadog_agent.get_version(),
            "ddsource": "postgres",
            "ddtags": ",".join(self._dbtags(row['datname'])),
            "timestamp": time.time() * 1000,
            "network": {
                "client": {
                    "ip": row.get('client_addr', None),
                    "port": row.get('client_port', None),
                    "hostname": row.get('client_hostname', None),
                }
            },
            "db": {
                "instance": row.get('datname', None),
                "plan": {
                    "definition": obfuscated_plan,
                    "signature": plan_signature,
                    "collection_errors": collection_errors,
                },
                "query_signature": row['query_signature'],
                "resource_hash": row['query_signature'],
                "application": row.get('application_name', None),
                "user": row['usename'],
                "statement": row['statement'],
                "metadata": {
                    "tables": row['dd_tables'],
                    "commands": row['dd_commands'],
                    "comments": row['dd_comments'],
                },
                "query_truncated": self._get_truncation_state(
                    self._get_track_activity_query_size(), row['query']
                ).value,
            },
            'postgres': {k: v for k, v in row.items() if k not in pg_stat_activity_sample_exclude_keys},
        }
        if row['state'] in {'idle', 'idle in transaction'}:
            if row['state_change'] and row['query_start']:
                event['duration'] = (row['state_change'] - row['query_start']).total_seconds() * 1e9
                # If the transaction is idle then we have a more specific "end time" than the current time at
                # which we're collecting this event. According to the postgres docs, all of the timestamps in
                # pg_stat_activity are `timestamp with time zone` so the timezone should always be present. However,
                # if there is something wrong and it's missing then we can't use `state_change` for the timestamp
                # of the event else we risk the timestamp being significantly off and the event getting dropped
                # during ingestion.
                if row['state_change'].tzinfo:
                    event['timestamp'] = get_timestamp(row['state_change']) * 1000
        return event
    return None
def _collect_plan_for_statement(self, row):
    try:
        obfuscated_statement = datadog_agent.obfuscate_sql(row['query'])
    except Exception as e:
        self._log.debug("Failed to obfuscate statement: %s", e)
        self._check.count("dd.postgres.statement_samples.error", 1, tags=self._tags + ["error:sql-obfuscate"])
        return None

    # limit the rate of explains done to the database
    query_signature = compute_sql_signature(obfuscated_statement)
    if query_signature in self._explained_statements_cache:
        return None
    self._explained_statements_cache[query_signature] = True

    # Plans have several important signatures to tag events with. Note that for postgres, the
    # query_signature and resource_hash will be the same value.
    # - `plan_signature` - hash computed from the normalized JSON plan to group identical plan trees
    # - `resource_hash` - hash computed off the raw sql text to match apm resources
    # - `query_signature` - hash computed from the raw sql text to match query metrics
    plan_dict = self._run_explain(row['query'], obfuscated_statement)
    plan, normalized_plan, obfuscated_plan, plan_signature, plan_cost = None, None, None, None, None
    if plan_dict:
        plan = json.dumps(plan_dict)
        normalized_plan = datadog_agent.obfuscate_sql_exec_plan(plan, normalize=True)
        obfuscated_plan = datadog_agent.obfuscate_sql_exec_plan(plan)
        plan_signature = compute_exec_plan_signature(normalized_plan)
        plan_cost = plan_dict.get('Plan', {}).get('Total Cost', 0.0) or 0.0

    statement_plan_sig = (query_signature, plan_signature)
    if statement_plan_sig not in self._seen_samples_cache:
        self._seen_samples_cache[statement_plan_sig] = True
        event = {
            "host": self._db_hostname,
            "service": self._service,
            "ddsource": "postgres",
            "ddtags": self._tags_str,
            "network": {
                "client": {
                    "ip": row.get('client_addr', None),
                    "port": row.get('client_port', None),
                    "hostname": row.get('client_hostname', None),
                }
            },
            "db": {
                "instance": row.get('datname', None),
                "plan": {"definition": obfuscated_plan, "cost": plan_cost, "signature": plan_signature},
                "query_signature": query_signature,
                "resource_hash": query_signature,
                "application": row.get('application_name', None),
                "user": row['usename'],
                "statement": obfuscated_statement,
            },
            'postgres': {k: v for k, v in row.items() if k not in pg_stat_activity_sample_exclude_keys},
        }
        if row['state'] in {'idle', 'idle in transaction'}:
            if row['state_change'] and row['query_start']:
                event['duration'] = (row['state_change'] - row['query_start']).total_seconds() * 1e9
                event['timestamp'] = time.mktime(row['state_change'].timetuple()) * 1000
        else:
            event['timestamp'] = time.time() * 1000
        return event
def __init__(self, check, config, shutdown_callback):
    collection_interval = float(
        config.statement_samples_config.get('collection_interval', DEFAULT_COLLECTION_INTERVAL)
    )
    if collection_interval <= 0:
        collection_interval = DEFAULT_COLLECTION_INTERVAL
    super(PostgresStatementSamples, self).__init__(
        check,
        rate_limit=1 / collection_interval,
        run_sync=is_affirmative(config.statement_samples_config.get('run_sync', False)),
        enabled=is_affirmative(config.statement_samples_config.get('enabled', True)),
        dbms="postgres",
        min_collection_interval=config.min_collection_interval,
        expected_db_exceptions=(psycopg2.errors.DatabaseError,),
        job_name="query-samples",
        shutdown_callback=shutdown_callback,
    )
    self._check = check
    self._config = config
    self._tags_no_db = None
    self._activity_last_query_start = None
    # The value is loaded when connecting to the main database
    self._explain_function = config.statement_samples_config.get('explain_function', 'datadog.explain_statement')
    self._obfuscate_options = to_native_string(json.dumps(self._config.obfuscator_options))

    self._collection_strategy_cache = TTLCache(
        maxsize=config.statement_samples_config.get('collection_strategy_cache_maxsize', 1000),
        ttl=config.statement_samples_config.get('collection_strategy_cache_ttl', 300),
    )

    self._explain_errors_cache = TTLCache(
        maxsize=config.statement_samples_config.get('explain_errors_cache_maxsize', 5000),
        # only try to re-explain invalid statements once per day
        ttl=config.statement_samples_config.get('explain_errors_cache_ttl', 24 * 60 * 60),
    )

    # explained_statements_ratelimiter: limit how often we try to re-explain the same query
    self._explained_statements_ratelimiter = RateLimitingTTLCache(
        maxsize=int(config.statement_samples_config.get('explained_queries_cache_maxsize', 5000)),
        ttl=60 * 60 / int(config.statement_samples_config.get('explained_queries_per_hour_per_query', 60)),
    )

    # seen_samples_ratelimiter: limit the ingestion rate per (query_signature, plan_signature)
    self._seen_samples_ratelimiter = RateLimitingTTLCache(
        # assuming ~100 bytes per entry (query & plan signature, key hash, 4 pointers (ordered dict), expiry time)
        # total size: 10k * 100 = 1 Mb
        maxsize=int(config.statement_samples_config.get('seen_samples_cache_maxsize', 10000)),
        ttl=60 * 60 / int(config.statement_samples_config.get('samples_per_hour_per_query', 15)),
    )

    self._activity_coll_enabled = is_affirmative(self._config.statement_activity_config.get('enabled', True))
    # activity events cannot be reported more often than regular samples
    self._activity_coll_interval = max(
        self._config.statement_activity_config.get('collection_interval', DEFAULT_ACTIVITY_COLLECTION_INTERVAL),
        collection_interval,
    )
    self._activity_max_rows = self._config.statement_activity_config.get('payload_row_limit', 3500)
    # Keep track of last time we sent an activity event
    self._time_since_last_activity_event = 0
    self._pg_stat_activity_cols = None
def _collect_plan_for_statement(self, row):
    try:
        obfuscated_statement = datadog_agent.obfuscate_sql(row['query'])
    except Exception as e:
        self._log.debug("Failed to obfuscate statement: %s", e)
        self._check.count("dd.postgres.statement_samples.error", 1, tags=self._tags + ["error:sql-obfuscate"])
        return None

    # limit the rate of explains done to the database
    query_signature = compute_sql_signature(obfuscated_statement)
    if query_signature in self._explained_statements_cache:
        return None
    self._explained_statements_cache[query_signature] = True

    # Plans have several important signatures to tag events with. Note that for postgres, the
    # query_signature and resource_hash will be the same value.
    # - `plan_signature` - hash computed from the normalized JSON plan to group identical plan trees
    # - `resource_hash` - hash computed off the raw sql text to match apm resources
    # - `query_signature` - hash computed from the raw sql text to match query metrics
    plan_dict = self._run_explain(row['query'], obfuscated_statement)
    plan, normalized_plan, obfuscated_plan, plan_signature, plan_cost = None, None, None, None, None
    if plan_dict:
        plan = json.dumps(plan_dict)
        # if we're using the orjson implementation then json.dumps returns bytes
        plan = plan.decode('utf-8') if isinstance(plan, bytes) else plan
        normalized_plan = datadog_agent.obfuscate_sql_exec_plan(plan, normalize=True)
        obfuscated_plan = datadog_agent.obfuscate_sql_exec_plan(plan)
        plan_signature = compute_exec_plan_signature(normalized_plan)
        plan_cost = plan_dict.get('Plan', {}).get('Total Cost', 0.0) or 0.0

    statement_plan_sig = (query_signature, plan_signature)
    if statement_plan_sig not in self._seen_samples_cache:
        self._seen_samples_cache[statement_plan_sig] = True
        event = {
            "host": self._db_hostname,
            "service": self._service,
            "ddsource": "postgres",
            "ddtags": self._tags_str,
            "network": {
                "client": {
                    "ip": row.get('client_addr', None),
                    "port": row.get('client_port', None),
                    "hostname": row.get('client_hostname', None),
                }
            },
            "db": {
                "instance": row.get('datname', None),
                "plan": {"definition": obfuscated_plan, "cost": plan_cost, "signature": plan_signature},
                "query_signature": query_signature,
                "resource_hash": query_signature,
                "application": row.get('application_name', None),
                "user": row['usename'],
                "statement": obfuscated_statement,
            },
            'postgres': {k: v for k, v in row.items() if k not in pg_stat_activity_sample_exclude_keys},
        }
        event['timestamp'] = time.time() * 1000
        if row['state'] in {'idle', 'idle in transaction'}:
            if row['state_change'] and row['query_start']:
                event['duration'] = (row['state_change'] - row['query_start']).total_seconds() * 1e9
                # If the transaction is idle then we have a more specific "end time" than the current time at
                # which we're collecting this event. According to the postgres docs, all of the timestamps in
                # pg_stat_activity are `timestamp with time zone` so the timezone should always be present. However,
                # if there is something wrong and it's missing then we can't use `state_change` for the timestamp
                # of the event else we risk the timestamp being significantly off and the event getting dropped
                # during ingestion.
                if row['state_change'].tzinfo:
                    event['timestamp'] = get_timestamp(row['state_change']) * 1000
        return event
def __init__(self, name, init_config, instances):
    super(SQLServer, self).__init__(name, init_config, instances)

    self._resolved_hostname = None
    self._agent_hostname = None
    self.connection = None
    self.failed_connections = {}
    self.instance_metrics = []
    self.instance_per_type_metrics = defaultdict(set)
    self.do_check = True

    self.tags = self.instance.get("tags", [])
    self.reported_hostname = self.instance.get('reported_hostname')
    self.autodiscovery = is_affirmative(self.instance.get('database_autodiscovery'))
    self.autodiscovery_include = self.instance.get('autodiscovery_include', ['.*'])
    self.autodiscovery_exclude = self.instance.get('autodiscovery_exclude', [])
    self.autodiscovery_db_service_check = is_affirmative(self.instance.get('autodiscovery_db_service_check', True))
    self.min_collection_interval = self.instance.get('min_collection_interval', 15)
    self._compile_patterns()
    self.autodiscovery_interval = self.instance.get('autodiscovery_interval', DEFAULT_AUTODISCOVERY_INTERVAL)
    self.databases = set()
    self.ad_last_check = 0

    self.proc = self.instance.get('stored_procedure')
    self.proc_type_mapping = {'gauge': self.gauge, 'rate': self.rate, 'histogram': self.histogram}
    self.custom_metrics = init_config.get('custom_metrics', [])

    # DBM
    self.dbm_enabled = self.instance.get('dbm', False)
    self.statement_metrics_config = self.instance.get('query_metrics', {}) or {}
    self.statement_metrics = SqlserverStatementMetrics(self)
    self.activity_config = self.instance.get('query_activity', {}) or {}
    self.activity = SqlserverActivity(self)
    self.cloud_metadata = {}
    aws = self.instance.get('aws', {})
    gcp = self.instance.get('gcp', {})
    azure = self.instance.get('azure', {})
    if aws:
        self.cloud_metadata.update({'aws': aws})
    if gcp:
        self.cloud_metadata.update({'gcp': gcp})
    if azure:
        self.cloud_metadata.update({'azure': azure})
    obfuscator_options_config = self.instance.get('obfuscator_options', {}) or {}
    self.obfuscator_options = to_native_string(
        json.dumps(
            {
                # Valid values for this can be found at
                # https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/semantic_conventions/database.md#connection-level-attributes
                'dbms': 'mssql',
                'replace_digits': is_affirmative(
                    obfuscator_options_config.get(
                        'replace_digits',
                        obfuscator_options_config.get('quantize_sql_tables', False),
                    )
                ),
                'keep_sql_alias': is_affirmative(obfuscator_options_config.get('keep_sql_alias', True)),
                'return_json_metadata': is_affirmative(obfuscator_options_config.get('collect_metadata', True)),
                'table_names': is_affirmative(obfuscator_options_config.get('collect_tables', True)),
                'collect_commands': is_affirmative(obfuscator_options_config.get('collect_commands', True)),
                'collect_comments': is_affirmative(obfuscator_options_config.get('collect_comments', True)),
            }
        )
    )
    self.static_info_cache = TTLCache(
        maxsize=100,
        # cache these for a full day
        ttl=60 * 60 * 24,
    )

    # Query declarations
    check_queries = []
    if is_affirmative(self.instance.get('include_ao_metrics', False)):
        check_queries.extend(
            [
                QUERY_AO_AVAILABILITY_GROUPS,
                QUERY_AO_FAILOVER_CLUSTER,
                QUERY_AO_FAILOVER_CLUSTER_MEMBER,
            ]
        )
    if is_affirmative(self.instance.get('include_fci_metrics', False)):
        check_queries.extend([QUERY_FAILOVER_CLUSTER_INSTANCE])
    self._check_queries = self._new_query_executor(check_queries)
    self.check_initializations.append(self._check_queries.compile_queries)

    self.server_state_queries = self._new_query_executor([QUERY_SERVER_STATIC_INFO])
    self.check_initializations.append(self.server_state_queries.compile_queries)

    # use QueryManager to process custom queries
    self._query_manager = QueryManager(
        self, self.execute_query_raw, tags=self.tags, hostname=self.resolved_hostname
    )

    self._dynamic_queries = None

    self.check_initializations.append(self.config_checks)
    self.check_initializations.append(self._query_manager.compile_queries)
    self.check_initializations.append(self.initialize_connection)