Example #1
class PostgreSql(AgentCheck):
    """Collects per-database, and optionally per-relation metrics, custom metrics"""

    SOURCE_TYPE_NAME = 'postgresql'
    SERVICE_CHECK_NAME = 'postgres.can_connect'
    METADATA_TRANSFORMERS = {'version': transform_version}

    def __init__(self, name, init_config, instances):
        super(PostgreSql, self).__init__(name, init_config, instances)
        self.db = None
        self._version = None
        self._is_aurora = None
        # Deprecate custom_metrics in favor of custom_queries
        if 'custom_metrics' in self.instance:
            self.warning(
                "DEPRECATION NOTICE: Please use the new custom_queries option "
                "rather than the now deprecated custom_metrics"
            )
        self.config = PostgresConfig(self.instance)
        self.metrics_cache = PostgresMetricsCache(self.config)
        self.statement_metrics = PostgresStatementMetrics(self.config)
        self._clean_state()

    def _clean_state(self):
        self._version = None
        self._is_aurora = None
        self.metrics_cache.clean_state()

    def _get_replication_role(self):
        cursor = self.db.cursor()
        cursor.execute('SELECT pg_is_in_recovery();')
        role = cursor.fetchone()[0]
        # value fetched for role is of <type 'bool'>
        return "standby" if role else "master"

    @property
    def version(self):
        if self._version is None:
            raw_version = get_raw_version(self.db)
            self._version = parse_version(raw_version)
            self.set_metadata('version', raw_version)
        return self._version

    @property
    def is_aurora(self):
        if self._is_aurora is None:
            self._is_aurora = is_aurora(self.db)
        return self._is_aurora

    def _build_relations_config(self, yamlconfig):
        """Builds a dictionary from relations configuration while maintaining compatibility"""
        config = {}

        for element in yamlconfig:
            if isinstance(element, str):
                config[element] = {'relation_name': element, 'schemas': [ALL_SCHEMAS]}
            elif isinstance(element, dict):
                if not ('relation_name' in element or 'relation_regex' in element):
                    self.log.warning(
                        "Parameter 'relation_name' or 'relation_regex' is required for relation element %s", element
                    )
                    continue
                if 'relation_name' in element and 'relation_regex' in element:
                    self.log.warning(
                        "Expecting only of parameters 'relation_name', 'relation_regex' for relation element %s",
                        element,
                    )
                    continue
                schemas = element.get('schemas', [])
                if not isinstance(schemas, list):
                    self.log.warning("Expected a list of schemas for %s", element)
                    continue
                name = element.get('relation_name') or element['relation_regex']
                config[name] = element.copy()
                if len(schemas) == 0:
                    config[name]['schemas'] = [ALL_SCHEMAS]
            else:
                self.log.warning('Unhandled relations config type: %s', element)
        return config

    def _run_query_scope(self, cursor, scope, is_custom_metrics, relations_config, cols, descriptors, is_relations):
        if scope is None:
            return None
        if scope == REPLICATION_METRICS or not self.version >= V9:
            log_func = self.log.debug
        else:
            log_func = self.log.warning

        results = None
        try:
            query = fmt.format(scope['query'], metrics_columns=", ".join(cols))
            # if this is a relation-specific query, we need to list all relations last
            if is_relations:
                schema_field = get_schema_field(descriptors)
                relations_filter = build_relations_filter(relations_config, schema_field)
                self.log.debug("Running query: %s with relations matching: %s", query, relations_filter)
                cursor.execute(query.format(relations=relations_filter))
            else:
                self.log.debug("Running query: %s", query)
                cursor.execute(query.replace(r'%', r'%%'))

            results = cursor.fetchall()
        except psycopg2.errors.FeatureNotSupported as e:
            # This happens for example when trying to get replication metrics from readers in Aurora. Let's ignore it.
            log_func(e)
            self.db.rollback()
            self._is_aurora = None
        except psycopg2.errors.UndefinedFunction as e:
            log_func(e)
            log_func(
                "It seems the PG version has been incorrectly identified as %s. "
                "A reattempt to identify the right version will happen on next agent run." % self._version
            )
            self._clean_state()
            self.db.rollback()
        except (psycopg2.ProgrammingError, psycopg2.errors.QueryCanceled) as e:
            log_func("Not all metrics may be available: %s" % str(e))
            self.db.rollback()

        if not results:
            return None

        if is_custom_metrics and len(results) > MAX_CUSTOM_RESULTS:
            self.warning(
                "Query: %s returned more than %s results (%s). Truncating", query, MAX_CUSTOM_RESULTS, len(results)
            )
            results = results[:MAX_CUSTOM_RESULTS]

        if is_relations and len(results) > self.config.max_relations:
            self.warning(
                "Query: %s returned more than %s results (%s). "
                "Truncating. You can edit this limit by setting the `max_relations` config option",
                query,
                self.config.max_relations,
                len(results),
            )
            results = results[: self.config.max_relations]

        return results

    def _query_scope(self, cursor, scope, instance_tags, is_custom_metrics, relations_config):
        if scope is None:
            return None
        # build query
        cols = list(scope['metrics'])  # list of metrics to query, in some order
        # we must remember that order to parse results

        # A descriptor is the association of a Postgres column name (e.g. 'schemaname')
        # to a tag name (e.g. 'schema').
        descriptors = scope['descriptors']
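        # Illustrative shape of a scope (metric and column names here are hypothetical):
        #   {'query': 'SELECT relname, {metrics_columns} FROM pg_stat_user_tables',
        #    'metrics': {'n_live_tup': ('postgresql.live_rows', AgentCheck.gauge)},
        #    'descriptors': [('relname', 'table')],
        #    'relation': True}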
        is_relations = scope['relation'] and len(relations_config) > 0

        results = self._run_query_scope(
            cursor, scope, is_custom_metrics, relations_config, cols, descriptors, is_relations
        )
        if not results:
            return None

        # Parse and submit results.

        num_results = 0

        for row in results:
            # A row contains descriptor values on the left (used for tagging), and
            # metric values on the right (used as values for metrics).
            # E.g.: (descriptor, descriptor, ..., value, value, value, value, ...)
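            # With the illustrative scope sketched above, a row ('mytable', 42)
            # would split into descriptor_values ('mytable',), giving the tag
            # 'table:mytable', and column_values (42,), submitted as postgresql.live_rows.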

            expected_number_of_columns = len(descriptors) + len(cols)
            if len(row) != expected_number_of_columns:
                raise RuntimeError(
                    'Row does not contain enough values: '
                    'expected {} ({} descriptors + {} columns), got {}'.format(
                        expected_number_of_columns, len(descriptors), len(cols), len(row)
                    )
                )

            descriptor_values = row[: len(descriptors)]
            column_values = row[len(descriptors) :]

            # build a map of descriptors and their values
            desc_map = {name: value for (_, name), value in zip(descriptors, descriptor_values)}

            # Build tags.

            # Add tags from the instance.
            # Special-case the "db" tag, which overrides the one that is passed as instance_tag
            # The reason is that pg_stat_database returns all databases regardless of the
            # connection.
            if not scope['relation'] and not scope.get('use_global_db_tag', False):
                tags = [t for t in instance_tags if not t.startswith("db:")]
            else:
                tags = copy.copy(instance_tags)

            # Add tags from descriptors.
            tags += [("%s:%s" % (k, v)) for (k, v) in iteritems(desc_map)]

            # Submit metrics to the Agent.
            for column, value in zip(cols, column_values):
                name, submit_metric = scope['metrics'][column]
                submit_metric(self, name, value, tags=set(tags))

            num_results += 1

        return num_results

    def _collect_stats(self, instance_tags):
        """Query pg_stat_* for various metrics
        If relations is not an empty list, gather per-relation metrics
        on top of that.
        If custom_metrics is not an empty list, gather custom metrics defined in postgres.yaml
        """
        db_instance_metrics = self.metrics_cache.get_instance_metrics(self.version)
        bgw_instance_metrics = self.metrics_cache.get_bgw_metrics(self.version)
        archiver_instance_metrics = self.metrics_cache.get_archiver_metrics(self.version)

        metric_scope = [CONNECTION_METRICS]

        if self.config.collect_function_metrics:
            metric_scope.append(FUNCTION_METRICS)
        if self.config.collect_count_metrics:
            metric_scope.append(self.metrics_cache.get_count_metrics())

        # Do we need relation-specific metrics?
        relations_config = {}
        if self.config.relations:
            metric_scope += [LOCK_METRICS, REL_METRICS, IDX_METRICS, SIZE_METRICS, STATIO_METRICS]
            relations_config = self._build_relations_config(self.config.relations)

        replication_metrics = self.metrics_cache.get_replication_metrics(self.version, self.is_aurora)
        if replication_metrics:
            replication_metrics_query = copy.deepcopy(REPLICATION_METRICS)
            replication_metrics_query['metrics'] = replication_metrics
            metric_scope.append(replication_metrics_query)

        cursor = self.db.cursor()
        results_len = self._query_scope(cursor, db_instance_metrics, instance_tags, False, relations_config)
        if results_len is not None:
            self.gauge("postgresql.db.count", results_len, tags=[t for t in instance_tags if not t.startswith("db:")])

        self._query_scope(cursor, bgw_instance_metrics, instance_tags, False, relations_config)
        self._query_scope(cursor, archiver_instance_metrics, instance_tags, False, relations_config)

        if self.config.collect_activity_metrics:
            activity_metrics = self.metrics_cache.get_activity_metrics(self.version)
            self._query_scope(cursor, activity_metrics, instance_tags, False, relations_config)

        for scope in list(metric_scope) + self.config.custom_metrics:
            self._query_scope(cursor, scope, instance_tags, scope in self.config.custom_metrics, relations_config)

        cursor.close()

    def _connect(self):
        """Get and memoize connections to instances"""
        if self.db and self.db.closed:
            # Reset the connection object to retry to connect
            self.db = None

        if self.db:
            if self.db.status != psycopg2.extensions.STATUS_READY:
                # Some transaction went wrong and the connection is in an unhealthy state. Let's fix that
                self.db.rollback()
        else:
            if self.config.host == 'localhost' and self.config.password == '':
                # Use ident method
                connection_string = "user=%s dbname=%s application_name=%s" % (
                    self.config.user,
                    self.config.dbname,
                    self.config.application_name,
                )
                if self.config.query_timeout:
                    connection_string += " options='-c statement_timeout=%s'" % self.config.query_timeout
                self.db = psycopg2.connect(connection_string)
            else:
                args = {
                    'host': self.config.host,
                    'user': self.config.user,
                    'password': self.config.password,
                    'database': self.config.dbname,
                    'sslmode': self.config.ssl_mode,
                    'application_name': self.config.application_name,
                }
                if self.config.port:
                    args['port'] = self.config.port
                if self.config.query_timeout:
                    args['options'] = '-c statement_timeout=%s' % self.config.query_timeout
                self.db = psycopg2.connect(**args)

    def _collect_custom_queries(self, tags):
        """
        Given a list of custom_queries, execute each query and parse the result for metrics
        """
        for custom_query in self.config.custom_queries:
            metric_prefix = custom_query.get('metric_prefix')
            if not metric_prefix:
                self.log.error("custom query field `metric_prefix` is required")
                continue
            metric_prefix = metric_prefix.rstrip('.')

            query = custom_query.get('query')
            if not query:
                self.log.error("custom query field `query` is required for metric_prefix `%s`", metric_prefix)
                continue

            columns = custom_query.get('columns')
            if not columns:
                self.log.error("custom query field `columns` is required for metric_prefix `%s`", metric_prefix)
                continue

            cursor = self.db.cursor()
            with closing(cursor) as cursor:
                try:
                    self.log.debug("Running query: %s", query)
                    cursor.execute(query)
                except (psycopg2.ProgrammingError, psycopg2.errors.QueryCanceled) as e:
                    self.log.error("Error executing query for metric_prefix %s: %s", metric_prefix, str(e))
                    self.db.rollback()
                    continue

                for row in cursor:
                    if not row:
                        self.log.debug("query result for metric_prefix %s: returned an empty result", metric_prefix)
                        continue

                    if len(columns) != len(row):
                        self.log.error(
                            "query result for metric_prefix %s: expected %s columns, got %s",
                            metric_prefix,
                            len(columns),
                            len(row),
                        )
                        continue

                    metric_info = []
                    query_tags = list(custom_query.get('tags', []))
                    query_tags.extend(tags)

                    for column, value in zip(columns, row):
                        # Columns can be ignored via configuration.
                        if not column:
                            continue

                        name = column.get('name')
                        if not name:
                            self.log.error("column field `name` is required for metric_prefix `%s`", metric_prefix)
                            break

                        column_type = column.get('type')
                        if not column_type:
                            self.log.error(
                                "column field `type` is required for column `%s` of metric_prefix `%s`",
                                name,
                                metric_prefix,
                            )
                            break

                        if column_type == 'tag':
                            query_tags.append('{}:{}'.format(name, value))
                        else:
                            if not hasattr(self, column_type):
                                self.log.error(
                                    "invalid submission method `%s` for column `%s` of metric_prefix `%s`",
                                    column_type,
                                    name,
                                    metric_prefix,
                                )
                                break
                            try:
                                metric_info.append(('{}.{}'.format(metric_prefix, name), float(value), column_type))
                            except (ValueError, TypeError):
                                self.log.error(
                                    "non-numeric value `%s` for metric column `%s` of metric_prefix `%s`",
                                    value,
                                    name,
                                    metric_prefix,
                                )
                                break

                    # Only submit metrics if there were absolutely no errors - all or nothing.
                    else:
                        for info in metric_info:
                            metric, value, method = info
                            getattr(self, method)(metric, value, tags=set(query_tags))

    def _collect_per_statement_metrics(self, tags):
        metrics = self.statement_metrics.collect_per_statement_metrics(self.db)
        for metric_name, metric_value, metrics_tags in metrics:
            self.count(metric_name, metric_value, tags=list(set(metrics_tags + tags)))

    def check(self, _):
        tags = copy.copy(self.config.tags)
        # Collect metrics
        try:
            # Check version
            self._connect()
            if self.config.tag_replication_role:
                tags.extend(["replication_role:{}".format(self._get_replication_role())])
            self.log.debug("Running check against version %s", str(self.version))
            self._collect_stats(tags)
            self._collect_custom_queries(tags)
            if self.config.deep_database_monitoring:
                self._collect_per_statement_metrics(tags)
        except Exception as e:
            self.log.error("Unable to collect postgres metrics.")
            self._clean_state()
            self.db = None
            message = u'Error establishing connection to postgres://{}:{}/{}, error is {}'.format(
                self.config.host, self.config.port, self.config.dbname, str(e)
            )
            self.service_check(
                self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=self.config.service_check_tags, message=message
            )
            raise e
        else:
            message = u'Established connection to postgres://%s:%s/%s' % (
                self.config.host,
                self.config.port,
                self.config.dbname,
            )
            self.service_check(
                self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=self.config.service_check_tags, message=message
            )
            try:
                # commit to close the current query transaction
                self.db.commit()
            except Exception as e:
                self.log.warning("Unable to commit: %s", e)
            self._version = None  # We don't want to cache versions between runs to capture minor updates for metadata
Example #2
class PostgreSql(AgentCheck):
    """Collects per-database, and optionally per-relation metrics, custom metrics"""

    SOURCE_TYPE_NAME = 'postgresql'
    SERVICE_CHECK_NAME = 'postgres.can_connect'
    METADATA_TRANSFORMERS = {'version': VersionUtils.transform_version}

    def __init__(self, name, init_config, instances):
        super(PostgreSql, self).__init__(name, init_config, instances)
        self.db = None
        self._resolved_hostname = None
        self._agent_hostname = None
        self._version = None
        self._is_aurora = None
        self._version_utils = VersionUtils()
        # Deprecate custom_metrics in favor of custom_queries
        if 'custom_metrics' in self.instance:
            self.warning(
                "DEPRECATION NOTICE: Please use the new custom_queries option "
                "rather than the now deprecated custom_metrics")
        self._config = PostgresConfig(self.instance)
        self.pg_settings = {}
        self._warnings_by_code = {}
        self.metrics_cache = PostgresMetricsCache(self._config)
        self.statement_metrics = PostgresStatementMetrics(
            self, self._config, shutdown_callback=self._close_db_pool)
        self.statement_samples = PostgresStatementSamples(
            self, self._config, shutdown_callback=self._close_db_pool)
        self._relations_manager = RelationsManager(self._config.relations)
        self._clean_state()
        self.check_initializations.append(
            lambda: RelationsManager.validate_relations_config(
                self._config.relations))
        # map[dbname -> psycopg connection]
        self._db_pool = {}
        self._db_pool_lock = threading.Lock()

    def cancel(self):
        self.statement_samples.cancel()
        self.statement_metrics.cancel()

    def _clean_state(self):
        self.log.debug("Cleaning state")
        self._version = None
        self._is_aurora = None
        self.metrics_cache.clean_state()

    def _get_debug_tags(self):
        return ['agent_hostname:{}'.format(self.agent_hostname)]

    def _get_service_check_tags(self):
        service_check_tags = []
        service_check_tags.extend(self._config.tags)
        return list(service_check_tags)

    def _get_replication_role(self):
        cursor = self.db.cursor()
        cursor.execute('SELECT pg_is_in_recovery();')
        role = cursor.fetchone()[0]
        # value fetched for role is of <type 'bool'>
        return "standby" if role else "master"

    def _collect_wal_metrics(self, instance_tags):
        wal_file_age = self._get_wal_file_age()
        if wal_file_age is not None:
            self.gauge(
                "postgresql.wal_age",
                wal_file_age,
                tags=[t for t in instance_tags if not t.startswith("db:")],
                hostname=self.resolved_hostname,
            )

    def _get_wal_dir(self):
        if self.version >= V10:
            wal_dir = "pg_wal"
        else:
            wal_dir = "pg_xlog"

        wal_log_dir = os.path.join(self._config.data_directory, wal_dir)

        return wal_log_dir

    def _get_wal_file_age(self):
        wal_log_dir = self._get_wal_dir()
        if not os.path.isdir(wal_log_dir):
            self.log.warning(
                "Cannot access WAL log directory: %s. Ensure that you are "
                "running the agent on your local postgres database.",
                wal_log_dir,
            )
            return None

        all_dir_contents = os.listdir(wal_log_dir)
        all_files = [
            f for f in all_dir_contents
            if os.path.isfile(os.path.join(wal_log_dir, f))
        ]

        # file extensions that are not valid WAL files
        excluded_file_exts = [".backup", ".history"]
        all_wal_files = [
            os.path.join(wal_log_dir, file_name) for file_name in all_files
            if not any(file_name.endswith(ext) for ext in excluded_file_exts)
        ]
        if len(all_wal_files) < 1:
            self.log.warning("No WAL files found in directory: %s.",
                             wal_log_dir)
            return None

        oldest_file = min(all_wal_files, key=os.path.getctime)
        now = time()
        oldest_file_age = now - os.path.getctime(oldest_file)
        return oldest_file_age

    @property
    def version(self):
        if self._version is None:
            raw_version = self._version_utils.get_raw_version(self.db)
            self._version = self._version_utils.parse_version(raw_version)
            self.set_metadata('version', raw_version)
        return self._version

    @property
    def is_aurora(self):
        if self._is_aurora is None:
            self._is_aurora = self._version_utils.is_aurora(self.db)
        return self._is_aurora

    @property
    def resolved_hostname(self):
        # type: () -> str
        if self._resolved_hostname is None:
            if self._config.reported_hostname:
                self._resolved_hostname = self._config.reported_hostname
            elif self._config.dbm_enabled or self.disable_generic_tags:
                self._resolved_hostname = self.resolve_db_host()
            else:
                self._resolved_hostname = self.agent_hostname
        return self._resolved_hostname

    @property
    def agent_hostname(self):
        # type: () -> str
        if self._agent_hostname is None:
            self._agent_hostname = datadog_agent.get_hostname()
        return self._agent_hostname

    def resolve_db_host(self):
        return agent_host_resolver(self._config.host)

    def _run_query_scope(self, cursor, scope, is_custom_metrics, cols,
                         descriptors):
        if scope is None:
            return None
        if scope == REPLICATION_METRICS or not self.version >= V9:
            log_func = self.log.debug
        else:
            log_func = self.log.warning

        results = None
        is_relations = scope.get(
            'relation') and self._relations_manager.has_relations
        try:
            query = fmt.format(scope['query'], metrics_columns=", ".join(cols))
            # if this is a relation-specific query, we need to list all relations last
            if is_relations:
                schema_field = get_schema_field(descriptors)
                formatted_query = self._relations_manager.filter_relation_query(
                    query, schema_field)
                cursor.execute(formatted_query)
            else:
                self.log.debug("Running query: %s", str(query))
                cursor.execute(query.replace(r'%', r'%%'))

            results = cursor.fetchall()
        except psycopg2.errors.FeatureNotSupported as e:
            # This happens for example when trying to get replication metrics from readers in Aurora. Let's ignore it.
            log_func(e)
            self.db.rollback()
            self.log.debug("Disabling replication metrics")
            self._is_aurora = False
            self.metrics_cache.replication_metrics = {}
        except psycopg2.errors.UndefinedFunction as e:
            log_func(e)
            log_func(
                "It seems the PG version has been incorrectly identified as %s. "
                "A reattempt to identify the right version will happen on next agent run."
                % self._version)
            self._clean_state()
            self.db.rollback()
        except (psycopg2.ProgrammingError, psycopg2.errors.QueryCanceled) as e:
            log_func("Not all metrics may be available: %s" % str(e))
            self.db.rollback()

        if not results:
            return None

        if is_custom_metrics and len(results) > MAX_CUSTOM_RESULTS:
            self.warning(
                "Query: %s returned more than %s results (%s). Truncating",
                query, MAX_CUSTOM_RESULTS, len(results))
            results = results[:MAX_CUSTOM_RESULTS]

        if is_relations and len(results) > self._config.max_relations:
            self.warning(
                "Query: %s returned more than %s results (%s). "
                "Truncating. You can edit this limit by setting the `max_relations` config option",
                query,
                self._config.max_relations,
                len(results),
            )
            results = results[:self._config.max_relations]

        return results

    def _query_scope(self, cursor, scope, instance_tags, is_custom_metrics):
        if scope is None:
            return None
        # build query
        cols = list(
            scope['metrics'])  # list of metrics to query, in some order
        # we must remember that order to parse results

        # A descriptor is the association of a Postgres column name (e.g. 'schemaname')
        # to a tag name (e.g. 'schema').
        descriptors = scope['descriptors']
        results = self._run_query_scope(cursor, scope, is_custom_metrics, cols,
                                        descriptors)
        if not results:
            return None

        # Parse and submit results.

        num_results = 0

        for row in results:
            # A row contains descriptor values on the left (used for tagging), and
            # metric values on the right (used as values for metrics).
            # E.g.: (descriptor, descriptor, ..., value, value, value, value, ...)

            expected_number_of_columns = len(descriptors) + len(cols)
            if len(row) != expected_number_of_columns:
                raise RuntimeError(
                    'Row does not contain enough values: '
                    'expected {} ({} descriptors + {} columns), got {}'.format(
                        expected_number_of_columns, len(descriptors),
                        len(cols), len(row)))

            descriptor_values = row[:len(descriptors)]
            column_values = row[len(descriptors):]

            # build a map of descriptors and their values
            desc_map = {
                name: value
                for (_, name), value in zip(descriptors, descriptor_values)
            }

            # Build tags.

            # Add tags from the instance.
            # Special-case the "db" tag, which overrides the one that is passed as instance_tag
            # The reason is that pg_stat_database returns all databases regardless of the
            # connection.
            if not scope['relation'] and not scope.get('use_global_db_tag',
                                                       False):
                tags = [t for t in instance_tags if not t.startswith("db:")]
            else:
                tags = copy.copy(instance_tags)

            # Add tags from descriptors.
            tags += [("%s:%s" % (k, v)) for (k, v) in iteritems(desc_map)]

            # Submit metrics to the Agent.
            for column, value in zip(cols, column_values):
                name, submit_metric = scope['metrics'][column]
                submit_metric(self,
                              name,
                              value,
                              tags=set(tags),
                              hostname=self.resolved_hostname)

            num_results += 1

        return num_results

    def _collect_stats(self, instance_tags):
        """Query pg_stat_* for various metrics
        If relations is not an empty list, gather per-relation metrics
        on top of that.
        If custom_metrics is not an empty list, gather custom metrics defined in postgres.yaml
        """
        db_instance_metrics = self.metrics_cache.get_instance_metrics(
            self.version)
        bgw_instance_metrics = self.metrics_cache.get_bgw_metrics(self.version)
        archiver_instance_metrics = self.metrics_cache.get_archiver_metrics(
            self.version)

        metric_scope = [CONNECTION_METRICS]

        if self._config.collect_function_metrics:
            metric_scope.append(FUNCTION_METRICS)
        if self._config.collect_count_metrics:
            metric_scope.append(self.metrics_cache.get_count_metrics())

        # Do we need relation-specific metrics?
        if self._config.relations:
            metric_scope.extend(RELATION_METRICS)
            if self._config.collect_bloat_metrics:
                metric_scope.extend([INDEX_BLOAT, TABLE_BLOAT])

        replication_metrics = self.metrics_cache.get_replication_metrics(
            self.version, self.is_aurora)
        if replication_metrics:
            replication_metrics_query = copy.deepcopy(REPLICATION_METRICS)
            replication_metrics_query['metrics'] = replication_metrics
            metric_scope.append(replication_metrics_query)

        replication_stats_metrics = self.metrics_cache.get_replication_stats_metrics(
            self.version)
        if replication_stats_metrics:
            metric_scope.append(replication_stats_metrics)

        cursor = self.db.cursor()
        results_len = self._query_scope(cursor, db_instance_metrics,
                                        instance_tags, False)
        if results_len is not None:
            self.gauge(
                "postgresql.db.count",
                results_len,
                tags=[t for t in instance_tags if not t.startswith("db:")],
                hostname=self.resolved_hostname,
            )

        self._query_scope(cursor, bgw_instance_metrics, instance_tags, False)
        self._query_scope(cursor, archiver_instance_metrics, instance_tags,
                          False)

        if self._config.collect_activity_metrics:
            activity_metrics = self.metrics_cache.get_activity_metrics(
                self.version)
            self._query_scope(cursor, activity_metrics, instance_tags, False)

        for scope in list(metric_scope) + self._config.custom_metrics:
            self._query_scope(cursor, scope, instance_tags,
                              scope in self._config.custom_metrics)

        cursor.close()

    def _new_connection(self, dbname):
        if self._config.host == 'localhost' and self._config.password == '':
            # Use ident method
            connection_string = "user=%s dbname=%s application_name=%s" % (
                self._config.user,
                dbname,
                self._config.application_name,
            )
            if self._config.query_timeout:
                connection_string += " options='-c statement_timeout=%s'" % self._config.query_timeout
            conn = psycopg2.connect(connection_string)
        else:
            args = {
                'host': self._config.host,
                'user': self._config.user,
                'password': self._config.password,
                'database': dbname,
                'sslmode': self._config.ssl_mode,
                'application_name': self._config.application_name,
            }
            if self._config.port:
                args['port'] = self._config.port
            if self._config.query_timeout:
                args['options'] = (
                    '-c statement_timeout=%s' % self._config.query_timeout)
            if self._config.ssl_cert:
                args['sslcert'] = self._config.ssl_cert
            if self._config.ssl_root_cert:
                args['sslrootcert'] = self._config.ssl_root_cert
            if self._config.ssl_key:
                args['sslkey'] = self._config.ssl_key
            if self._config.ssl_password:
                args['sslpassword'] = self._config.ssl_password
            conn = psycopg2.connect(**args)
        # Autocommit is enabled by default for safety for all new connections (to prevent long-lived transactions).
        conn.set_session(autocommit=True)
        return conn

    def _connect(self):
        """Get and memoize connections to instances"""
        if self.db and self.db.closed:
            # Reset the connection object to retry to connect
            self.db = None

        if self.db:
            if self.db.status != psycopg2.extensions.STATUS_READY:
                # Some transaction went wrong and the connection is in an unhealthy state. Let's fix that
                self.db.rollback()
        else:
            self.db = self._new_connection(self._config.dbname)

    # Reload pg_settings on a new connection to the main db
    def _load_pg_settings(self, db):
        try:
            with db.cursor(
                    cursor_factory=psycopg2.extras.DictCursor) as cursor:
                self.log.debug("Running query [%s]", PG_SETTINGS_QUERY)
                cursor.execute(
                    PG_SETTINGS_QUERY,
                    ("pg_stat_statements.max", "track_activity_query_size"),
                )
                rows = cursor.fetchall()
                self.pg_settings.clear()
                for setting in rows:
                    name, val = setting
                    self.pg_settings[name] = val
        except (psycopg2.DatabaseError, psycopg2.OperationalError) as err:
            self.log.warning("Failed to query for pg_settings: %s", repr(err))
            self.count(
                "dd.postgres.error",
                1,
                tags=self._config.tags + ["error:load-pg-settings"] +
                self._get_debug_tags(),
                hostname=self.resolved_hostname,
            )

    def _get_db(self, dbname):
        """
        Returns a memoized psycopg2 connection to `dbname` with autocommit
        Threadsafe as long as no transactions are used
        :param dbname:
        :return: a psycopg2 connection
        """
        # TODO: migrate the rest of this check to use a connection from this pool
        with self._db_pool_lock:
            db = self._db_pool.get(dbname)
            if not db or db.closed:
                self.log.debug("initializing connection to dbname=%s", dbname)
                db = self._new_connection(dbname)
                db.set_session(autocommit=True)
                self._db_pool[dbname] = db
                if self._config.dbname == dbname:
                    # reload settings for the main DB only once every time the connection is reestablished
                    self._load_pg_settings(db)
            if db.status != psycopg2.extensions.STATUS_READY:
                # Some transaction went wrong and the connection is in an unhealthy state. Let's fix that
                db.rollback()
            return db

    def _close_db_pool(self):
        # TODO: add automatic aging out of connections after some time
        with self._db_pool_lock:
            for dbname, db in self._db_pool.items():
                if db and not db.closed:
                    try:
                        db.close()
                    except Exception:
                        self.log.exception(
                            "failed to close DB connection for db=%s", dbname)
                self._db_pool[dbname] = None

    def _collect_custom_queries(self, tags):
        """
        Given a list of custom_queries, execute each query and parse the result for metrics
        """
        for custom_query in self._config.custom_queries:
            metric_prefix = custom_query.get('metric_prefix')
            if not metric_prefix:
                self.log.error(
                    "custom query field `metric_prefix` is required")
                continue
            metric_prefix = metric_prefix.rstrip('.')

            query = custom_query.get('query')
            if not query:
                self.log.error(
                    "custom query field `query` is required for metric_prefix `%s`",
                    metric_prefix)
                continue

            columns = custom_query.get('columns')
            if not columns:
                self.log.error(
                    "custom query field `columns` is required for metric_prefix `%s`",
                    metric_prefix)
                continue

            cursor = self.db.cursor()
            with closing(cursor) as cursor:
                try:
                    self.log.debug("Running query: %s", query)
                    cursor.execute(query)
                except (psycopg2.ProgrammingError,
                        psycopg2.errors.QueryCanceled) as e:
                    self.log.error(
                        "Error executing query for metric_prefix %s: %s",
                        metric_prefix, str(e))
                    self.db.rollback()
                    continue

                for row in cursor:
                    if not row:
                        self.log.debug(
                            "query result for metric_prefix %s: returned an empty result",
                            metric_prefix)
                        continue

                    if len(columns) != len(row):
                        self.log.error(
                            "query result for metric_prefix %s: expected %s columns, got %s",
                            metric_prefix,
                            len(columns),
                            len(row),
                        )
                        continue

                    metric_info = []
                    query_tags = list(custom_query.get('tags', []))
                    query_tags.extend(tags)

                    for column, value in zip(columns, row):
                        # Columns can be ignored via configuration.
                        if not column:
                            continue

                        name = column.get('name')
                        if not name:
                            self.log.error(
                                "column field `name` is required for metric_prefix `%s`",
                                metric_prefix)
                            break

                        column_type = column.get('type')
                        if not column_type:
                            self.log.error(
                                "column field `type` is required for column `%s` of metric_prefix `%s`",
                                name,
                                metric_prefix,
                            )
                            break

                        if column_type == 'tag':
                            query_tags.append('{}:{}'.format(name, value))
                        else:
                            if not hasattr(self, column_type):
                                self.log.error(
                                    "invalid submission method `%s` for column `%s` of metric_prefix `%s`",
                                    column_type,
                                    name,
                                    metric_prefix,
                                )
                                break
                            try:
                                metric_info.append(
                                    ('{}.{}'.format(metric_prefix, name),
                                     float(value), column_type))
                            except (ValueError, TypeError):
                                self.log.error(
                                    "non-numeric value `%s` for metric column `%s` of metric_prefix `%s`",
                                    value,
                                    name,
                                    metric_prefix,
                                )
                                break

                    # Only submit metrics if there were absolutely no errors - all or nothing.
                    else:
                        for info in metric_info:
                            metric, value, method = info
                            getattr(self, method)(
                                metric, value, tags=set(query_tags),
                                hostname=self.resolved_hostname)

    def record_warning(self, code, message):
        # type: (DatabaseConfigurationError, str) -> None
        self._warnings_by_code[code] = message

    def _report_warnings(self):
        messages = self._warnings_by_code.values()
        # Reset the warnings for the next check run
        self._warnings_by_code = {}

        for warning in messages:
            self.warning(warning)

    def check(self, _):
        tags = copy.copy(self._config.tags)
        # Collect metrics
        try:
            # Check version
            self._connect()
            if self._config.tag_replication_role:
                tags.extend([
                    "replication_role:{}".format(self._get_replication_role())
                ])
            self.log.debug("Running check against version %s: is_aurora: %s",
                           str(self.version), str(self.is_aurora))
            self._collect_stats(tags)
            self._collect_custom_queries(tags)
            if self._config.dbm_enabled:
                self.statement_metrics.run_job_loop(tags)
                self.statement_samples.run_job_loop(tags)
            if self._config.collect_wal_metrics:
                self._collect_wal_metrics(tags)

        except Exception as e:
            self.log.exception("Unable to collect postgres metrics.")
            self._clean_state()
            self.db = None
            message = u'Error establishing connection to postgres://{}:{}/{}, error is {}'.format(
                self._config.host, self._config.port, self._config.dbname,
                str(e))
            self.service_check(
                self.SERVICE_CHECK_NAME,
                AgentCheck.CRITICAL,
                tags=self._get_service_check_tags(),
                message=message,
                hostname=self.resolved_hostname,
            )
            raise e
        else:
            self.service_check(
                self.SERVICE_CHECK_NAME,
                AgentCheck.OK,
                tags=self._get_service_check_tags(),
                hostname=self.resolved_hostname,
            )
            try:
                # commit to close the current query transaction
                self.db.commit()
            except Exception as e:
                self.log.warning("Unable to commit: %s", e)
            self._version = None  # We don't want to cache versions between runs to capture minor updates for metadata
        finally:
            # Add the warnings saved during the execution of the check
            self._report_warnings()