Example #1
def test_custom_metrics_multiple_results(aggregator, check):
    con = mock.MagicMock()
    cursor = mock.MagicMock()
    data = [["tag_value1", "1"], ["tag_value2", "2"]]
    cursor.fetchall.side_effect = lambda: iter(data)
    con.cursor.return_value = cursor

    custom_queries = [{
        "metric_prefix": "oracle.test1",
        "query": "mocked",
        "columns": [
            {"name": "tag_name", "type": "tag"},
            {"name": "metric", "type": "gauge"},
        ],
        "tags": ["query_tags1"],
    }]

    check.instance['custom_queries'] = custom_queries
    check._fix_custom_queries()
    check._connection = con
    query_manager = QueryManager(check,
                                 check.execute_query_raw,
                                 tags=['custom_tag'])
    query_manager.compile_queries()

    query_manager.execute()

    aggregator.assert_metric(
        "oracle.test1.metric",
        value=1,
        count=1,
        tags=["tag_name:tag_value1", "query_tags1", "custom_tag"])
    aggregator.assert_metric(
        "oracle.test1.metric",
        value=2,
        count=1,
        tags=["tag_name:tag_value2", "query_tags1", "custom_tag"])
Example #2
class TeradataCheck(AgentCheck, ConfigMixin):
    __NAMESPACE__ = 'teradata'

    def __init__(self, name, init_config, instances):
        super(TeradataCheck, self).__init__(name, init_config, instances)

        self._connect_params = None
        self._connection = None
        self._tags = []
        self._query_errors = 0
        self._tables_filter = None

        manager_queries = deepcopy(DEFAULT_QUERIES)
        if is_affirmative(self.instance.get('collect_res_usage_metrics',
                                            False)):
            manager_queries.extend(COLLECT_RES_USAGE)
        if is_affirmative(
                self.instance.get('collect_table_disk_metrics', False)):
            manager_queries.extend(COLLECT_ALL_SPACE)

        self._query_manager = QueryManager(
            self,
            self._execute_query_raw,
            queries=manager_queries,
            tags=self._tags,
            error_handler=self._executor_error_handler,
        )
        self.check_initializations.append(self.initialize_config)
        self.check_initializations.append(self._query_manager.compile_queries)

    def check(self, _):
        # type: (Any) -> None
        self._query_errors = 0

        try:
            with self.connect() as conn:
                if conn:
                    self._connection = conn
                    self._query_manager.execute()
            self.submit_health_checks()
        except Exception as e:
            self.service_check(SERVICE_CHECK_CONNECT,
                               ServiceCheck.CRITICAL,
                               tags=self._tags)
            raise e

    def initialize_config(self):
        # type: (Any) -> None
        self._connect_params = json.dumps({
            'host': self.config.server,
            'account': self.config.account,
            'database': self.config.database,
            'dbs_port': str(self.config.port),
            'logmech': self.config.auth_mechanism,
            'logdata': self.config.auth_data,
            'user': self.config.username,
            'password': self.config.password,
            'https_port': str(self.config.https_port),
            'sslmode': self.config.ssl_mode,
            'sslprotocol': self.config.ssl_protocol,
        })

        global_tags = [
            'teradata_server:{}'.format(self.instance.get('server')),
            'teradata_port:{}'.format(self.instance.get('port', 1025)),
        ]
        self._tags = list(self.config.tags)
        self._tags.extend(global_tags)
        self._query_manager.tags = self._tags

        self._tables_filter = create_tables_filter(self.config.tables)

    def _execute_query_raw(self, query):
        # type: (AnyStr) -> Iterable[Sequence]
        with closing(self._connection.cursor()) as cursor:
            query = query.format(self.config.database)
            cursor.execute(query)
            if cursor.rowcount < 1:
                self._query_errors += 1
                self.log.warning('Failed to fetch records from query: `%s`.',
                                 query)
                return None
            for row in cursor.fetchall():
                query_name = re.search(r'(DBC.[^\s]+)', query).group(1)
                try:
                    yield self._queries_processor(row, query_name)
                except Exception as e:
                    self.log.debug(
                        'Unable to process row returned from query "%s", skipping row %s. %s',
                        query_name, row, e)
                    yield row

    def _executor_error_handler(self, error):
        # type: (AnyStr) -> AnyStr
        self._query_errors += 1
        return error

    @contextmanager
    def connect(self):
        # type: () -> Iterator[teradatasql.connection]
        conn = None
        if TERADATASQL_IMPORT_ERROR:
            self.log.error(
                'Teradata SQL Driver module is unavailable. Please double check your installation and refer to the '
                'Datadog documentation for more information. %s',
                TERADATASQL_IMPORT_ERROR,
            )
            raise TERADATASQL_IMPORT_ERROR
        self.log.info('Connecting to Teradata database %s on server %s.',
                      self.config.database, self.config.server)
        try:
            conn = teradatasql.connect(self._connect_params)
            self.log.info('Connected to Teradata.')
            yield conn
        except Exception as e:
            self.log.error('Unable to connect to Teradata. %s.', e)
            raise e
        finally:
            if conn:
                conn.close()

    def submit_health_checks(self):
        # type: () -> None
        connect_status = ServiceCheck.OK
        query_status = ServiceCheck.CRITICAL if self._query_errors else ServiceCheck.OK

        self.service_check(SERVICE_CHECK_QUERY, query_status, tags=self._tags)
        self.service_check(SERVICE_CHECK_CONNECT,
                           connect_status,
                           tags=self._tags)

    def _queries_processor(self, row, query_name):
        # type: (Sequence, AnyStr) -> Sequence
        """
        Validate timestamps, filter tables, and normalize empty tags.
        """
        unprocessed_row = row

        # Return database version immediately
        if query_name == 'DBC.DBCInfoV':
            submit_version(self, row)
            return unprocessed_row

        # Only Resource Usage rows include timestamps; they also do not include tags.
        if query_name == 'DBC.ResSpmaView':
            processed_row = timestamp_validator(self, unprocessed_row)
            return processed_row

        # Only AllSpaceV rows include table tags
        if (query_name == 'DBC.AllSpaceV'
                and is_affirmative(self.config.collect_table_disk_metrics)
                and self._tables_filter):
            tables_filtered_row = filter_tables(self._tables_filter,
                                                unprocessed_row)
            if tables_filtered_row:
                processed_row = tags_normalizer(tables_filtered_row,
                                                query_name)
                return processed_row
            # Discard row if empty (table is filtered out)
            return tables_filtered_row
        processed_row = tags_normalizer(unprocessed_row, query_name)
        self.log.trace('Row processor returned: %s. \nFrom query: "%s"',
                       processed_row, query_name)
        return processed_row
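
A detail worth noting in `_execute_query_raw` above: the executor is a generator, so rows are post-processed lazily as QueryManager iterates, and a row that fails processing is yielded unmodified instead of aborting the whole query. The same idea in isolation (a standalone sketch, not code from the check):

def processed_rows(rows, processor, log):
    # Yield each row through `processor`, falling back to the raw row on failure.
    for row in rows:
        try:
            yield processor(row)
        except Exception as e:
            log.debug('Unable to process row %s: %s', row, e)
            yield row
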
Example #3
class RethinkDBCheck(AgentCheck):
    """
    Collect metrics from a RethinkDB cluster.
    """

    __NAMESPACE__ = 'rethinkdb'
    SERVICE_CHECK_CONNECT = 'can_connect'

    def __init__(self, *args, **kwargs):
        # type: (*Any, **Any) -> None
        super(RethinkDBCheck, self).__init__(*args, **kwargs)

        self._config = Config(cast(Instance, self.instance))

        if self._config.password:
            self.register_secret(self._config.password)

        self._conn = None  # type: Optional[rethinkdb.net.Connection]

        manager_queries = [
            queries.ClusterMetrics,
            queries.ServerMetrics,
            queries.DatabaseConfigMetrics,
            queries.DatabaseTableMetrics,
            queries.TableConfigMetrics,
            queries.ReplicaMetrics,
            queries.ShardMetrics,
            queries.JobMetrics,
            queries.CurrentIssuesMetrics,
        ]  # type: list

        if self.is_metadata_collection_enabled:
            manager_queries.append(queries.VersionMetadata)

        self._query_manager = QueryManager(
            self,
            executor=self._execute_raw_query,
            queries=manager_queries,
            tags=self._config.tags,
        )
        self._query_funcs = {}  # type: Dict[str, Callable]

        self.check_initializations.append(self._query_manager.compile_queries)

    def _execute_raw_query(self, query):
        # type: (str) -> List[tuple]
        query_func = self._query_funcs.get(query)

        if query_func is None:
            # QueryManager only supports `str` queries.
            # So here's the workaround: we make `query` refer to the import paths of query functions, then import here.
            # Cache the results so imports only happen on the first check run.
            module_name, _, func_name = query.partition(':')
            module = importlib.import_module(module_name, package='datadog_checks.rethinkdb')
            query_func = getattr(module, func_name)
            self._query_funcs[query] = query_func

        return query_func(self._conn)

    @contextmanager
    def connect_submitting_service_checks(self):
        # type: () -> Iterator[None]
        config = self._config
        tags = config.service_check_tags

        try:
            with rethinkdb.r.connect(
                host=config.host,
                port=config.port,
                user=config.user,
                password=config.password,
                ssl={'ca_certs': config.tls_ca_cert} if config.tls_ca_cert is not None else {},
            ) as conn:
                self._conn = conn
                yield
        except rethinkdb.errors.ReqlDriverError as exc:
            message = 'Could not connect to RethinkDB server: {!r}'.format(exc)
            self.log.error(message)
            self.service_check(self.SERVICE_CHECK_CONNECT, self.CRITICAL, tags=tags, message=message)
            raise
        except Exception as exc:
            message = 'Unexpected error while executing RethinkDB check: {!r}'.format(exc)
            self.log.error(message)
            self.service_check(self.SERVICE_CHECK_CONNECT, self.CRITICAL, tags=tags, message=message)
            raise
        else:
            self.service_check(self.SERVICE_CHECK_CONNECT, self.OK, tags=tags)
        finally:
            self._conn = None

    def collect_metrics(self):  # Exposed for mocking purposes.
        # type: () -> None
        self._query_manager.execute()

    def check(self, instance):
        # type: (Any) -> None
        with self.connect_submitting_service_checks():
            self.collect_metrics()
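
The `_execute_raw_query` workaround above treats each query string as an import path of the form `module:function`. A standalone sketch of that resolution step, exercised with a standard-library function since the real query modules are not shown here:

import importlib


def resolve_query_func(query, package=None):
    # Split "some.module:function_name" into an import path and an attribute name.
    module_name, _, func_name = query.partition(':')
    module = importlib.import_module(module_name, package=package)
    return getattr(module, func_name)


assert resolve_query_func('math:sqrt')(4.0) == 2.0
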
Example #4
class SQLServer(AgentCheck):
    __NAMESPACE__ = 'sqlserver'

    def __init__(self, name, init_config, instances):
        super(SQLServer, self).__init__(name, init_config, instances)

        self.connection = None
        self.failed_connections = {}
        self.instance_metrics = []
        self.instance_per_type_metrics = defaultdict(list)
        self.do_check = True

        self.autodiscovery = is_affirmative(self.instance.get('database_autodiscovery'))
        self.autodiscovery_include = self.instance.get('autodiscovery_include', ['.*'])
        self.autodiscovery_exclude = self.instance.get('autodiscovery_exclude', [])
        self._compile_patterns()
        self.autodiscovery_interval = self.instance.get('autodiscovery_interval', DEFAULT_AUTODISCOVERY_INTERVAL)
        self.databases = set()
        self.ad_last_check = 0

        self.proc = self.instance.get('stored_procedure')
        self.proc_type_mapping = {'gauge': self.gauge, 'rate': self.rate, 'histogram': self.histogram}
        self.custom_metrics = init_config.get('custom_metrics', [])

        # use QueryManager to process custom queries
        self._query_manager = QueryManager(self, self.execute_query_raw, queries=[], tags=self.instance.get("tags", []))
        self.check_initializations.append(self.config_checks)
        self.check_initializations.append(self._query_manager.compile_queries)
        self.check_initializations.append(self.initialize_connection)

    def config_checks(self):
        if self.autodiscovery and self.instance.get('database'):
            self.log.warning(
                'sqlserver `database_autodiscovery` and `database` options defined in same instance - '
                'autodiscovery will take precedence.'
            )
        if not self.autodiscovery and (self.autodiscovery_include or self.autodiscovery_exclude):
            self.log.warning(
                "Autodiscovery is disabled, autodiscovery_include and autodiscovery_exclude will be ignored"
            )

    def initialize_connection(self):
        self.connection = Connection(self.init_config, self.instance, self.handle_service_check)

        # Pre-process the list of metrics to collect
        try:
            # check to see if the database exists before we try any connections to it
            db_exists, context = self.connection.check_database()

            if db_exists:
                if self.instance.get('stored_procedure') is None:
                    with self.connection.open_managed_default_connection():
                        with self.connection.get_managed_cursor() as cursor:
                            self.autodiscover_databases(cursor)
                        self._make_metric_list_to_collect(self.custom_metrics)
            else:
                # How much do we care that the DB doesn't exist?
                ignore = is_affirmative(self.instance.get("ignore_missing_database", False))
                if ignore:
                    # Not much: it is expected, so leave the checks disabled
                    self.do_check = False
                    self.log.warning("Database %s does not exist. Disabling checks for this instance.", context)
                else:
                    # Yes we do: raise so the invalid database gets fixed
                    msg = "Database {} does not exist. Please resolve invalid database and restart agent".format(
                        context
                    )
                    raise ConfigurationError(msg)

        except SQLConnectionError as e:
            self.log.exception("Error connecting to database: %s", e)
        except ConfigurationError:
            raise
        except Exception as e:
            self.log.exception("Initialization exception %s", e)

    def handle_service_check(self, status, host, database, message=None):
        custom_tags = self.instance.get("tags", [])
        if custom_tags is None:
            custom_tags = []
        service_check_tags = ['host:{}'.format(host), 'db:{}'.format(database)]
        service_check_tags.extend(custom_tags)
        service_check_tags = list(set(service_check_tags))

        self.service_check(SERVICE_CHECK_NAME, status, tags=service_check_tags, message=message, raw=True)

    def _compile_patterns(self):
        self._include_patterns = self._compile_valid_patterns(self.autodiscovery_include)
        self._exclude_patterns = self._compile_valid_patterns(self.autodiscovery_exclude)

    def _compile_valid_patterns(self, patterns):
        valid_patterns = []

        for pattern in patterns:
            # Ignore empty patterns as they match everything
            if not pattern:
                continue

            try:
                re.compile(pattern, re.IGNORECASE)
            except Exception:
                self.log.warning('%s is not a valid regular expression and will be ignored', pattern)
            else:
                valid_patterns.append(pattern)

        if valid_patterns:
            return re.compile('|'.join(valid_patterns), re.IGNORECASE)
        else:
            # create unmatchable regex - https://stackoverflow.com/a/1845097/2157429
            return re.compile(r'(?!x)x')

    def autodiscover_databases(self, cursor):
        if not self.autodiscovery:
            return False

        now = time.time()
        if now - self.ad_last_check > self.autodiscovery_interval:
            self.log.info('Performing database autodiscovery')
            cursor.execute(AUTODISCOVERY_QUERY)
            all_dbs = set(row.name for row in cursor.fetchall())
            excluded_dbs = set([d for d in all_dbs if self._exclude_patterns.match(d)])
            included_dbs = set([d for d in all_dbs if self._include_patterns.match(d)])

            self.log.debug(
                'Autodiscovered databases: %s, excluding: %s, including: %s', all_dbs, excluded_dbs, included_dbs
            )

            # keep included dbs but remove any that were explicitly excluded
            filtered_dbs = all_dbs.intersection(included_dbs) - excluded_dbs

            self.log.debug('Resulting filtered databases: %s', filtered_dbs)
            self.ad_last_check = now

            if filtered_dbs != self.databases:
                self.log.debug('Databases updated from previous autodiscovery check.')
                self.databases = filtered_dbs
                return True
        return False

    def _make_metric_list_to_collect(self, custom_metrics):
        """
        Store the list of metrics to collect by instance_key.
        Will also create and cache cursors to query the db.
        """

        metrics_to_collect = []
        tags = self.instance.get('tags', [])

        # Load instance-level (previously Performance) metrics
        # If several check instances are querying the same server host, it can be wise to turn these off
        # to avoid sending duplicate metrics
        if is_affirmative(self.instance.get('include_instance_metrics', True)):
            self._add_performance_counters(
                chain(INSTANCE_METRICS, INSTANCE_METRICS_TOTAL), metrics_to_collect, tags, db=None
            )

        # populated through autodiscovery
        if self.databases:
            for db in self.databases:
                self._add_performance_counters(INSTANCE_METRICS_TOTAL, metrics_to_collect, tags, db=db)

        # Load database statistics
        for name, table, column in DATABASE_METRICS:
            # include database as a filter option
            db_names = self.databases or [self.instance.get('database', self.connection.DEFAULT_DATABASE)]
            for db_name in db_names:
                cfg = {'name': name, 'table': table, 'column': column, 'instance_name': db_name, 'tags': tags}
                metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load AlwaysOn metrics
        if is_affirmative(self.instance.get('include_ao_metrics', False)):
            for name, table, column in AO_METRICS + AO_METRICS_PRIMARY + AO_METRICS_SECONDARY:
                db_name = 'master'
                cfg = {
                    'name': name,
                    'table': table,
                    'column': column,
                    'instance_name': db_name,
                    'tags': tags,
                    'ao_database': self.instance.get('ao_database', None),
                    'availability_group': self.instance.get('availability_group', None),
                    'only_emit_local': is_affirmative(self.instance.get('only_emit_local', False)),
                }
                metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load FCI metrics
        if is_affirmative(self.instance.get('include_fci_metrics', False)):
            for name, table, column in FCI_METRICS:
                cfg = {
                    'name': name,
                    'table': table,
                    'column': column,
                    'tags': tags,
                }
                metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load metrics from scheduler and task tables, if enabled
        if is_affirmative(self.instance.get('include_task_scheduler_metrics', False)):
            for name, table, column in TASK_SCHEDULER_METRICS:
                cfg = {'name': name, 'table': table, 'column': column, 'tags': tags}
                metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load DB Fragmentation metrics
        if is_affirmative(self.instance.get('include_db_fragmentation_metrics', False)):
            db_fragmentation_object_names = self.instance.get('db_fragmentation_object_names', [])
            db_names = self.databases or [self.instance.get('database', self.connection.DEFAULT_DATABASE)]

            if not db_fragmentation_object_names:
                self.log.debug(
                    "No fragmentation object names specified, will return fragmentation metrics for all "
                    "object_ids of current database(s): %s",
                    db_names,
                )

            for db_name in db_names:
                for name, table, column in DATABASE_FRAGMENTATION_METRICS:
                    cfg = {
                        'name': name,
                        'table': table,
                        'column': column,
                        'instance_name': db_name,
                        'tags': tags,
                        'db_fragmentation_object_names': db_fragmentation_object_names,
                    }
                    metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load any custom metrics from conf.d/sqlserver.yaml
        for cfg in custom_metrics:
            sql_type = None
            base_name = None

            custom_tags = tags + cfg.get('tags', [])
            cfg['tags'] = custom_tags

            db_table = cfg.get('table', DEFAULT_PERFORMANCE_TABLE)
            if db_table not in VALID_TABLES:
                self.log.error('%s has an invalid table name: %s', cfg['name'], db_table)
                continue

            if cfg.get('database', None) and cfg.get('database') != self.instance.get('database'):
                self.log.debug(
                    'Skipping custom metric %s for database %s, check instance configured for database %s',
                    cfg['name'],
                    cfg.get('database'),
                    self.instance.get('database'),
                )
                continue

            if db_table == DEFAULT_PERFORMANCE_TABLE:
                user_type = cfg.get('type')
                if user_type is not None and user_type not in VALID_METRIC_TYPES:
                    self.log.error('%s has an invalid metric type: %s', cfg['name'], user_type)
                sql_type = None
                try:
                    if user_type is None:
                        sql_type, base_name = self.get_sql_type(cfg['counter_name'])
                except Exception:
                    self.log.warning("Can't load the metric %s, ignoring", cfg['name'], exc_info=True)
                    continue

                metrics_to_collect.append(
                    self.typed_metric(
                        cfg_inst=cfg, table=db_table, base_name=base_name, user_type=user_type, sql_type=sql_type
                    )
                )

            else:
                for column in cfg['columns']:
                    metrics_to_collect.append(
                        self.typed_metric(
                            cfg_inst=cfg, table=db_table, base_name=base_name, sql_type=sql_type, column=column
                        )
                    )

        self.instance_metrics = metrics_to_collect
        self.log.debug("metrics to collect %s", metrics_to_collect)

        # create an organized grouping of metric names to their metric classes
        for m in metrics_to_collect:
            cls = m.__class__.__name__
            name = m.sql_name or m.column
            self.log.debug("Adding metric class %s named %s", cls, name)

            self.instance_per_type_metrics[cls].append(name)
            if m.base_name:
                self.instance_per_type_metrics[cls].append(m.base_name)

    def _add_performance_counters(self, metrics, metrics_to_collect, tags, db=None):
        for name, counter_name, instance_name in metrics:
            try:
                sql_type, base_name = self.get_sql_type(counter_name)
                cfg = {
                    'name': name,
                    'counter_name': counter_name,
                    'instance_name': db or instance_name,
                    'tags': tags,
                }

                metrics_to_collect.append(
                    self.typed_metric(
                        cfg_inst=cfg, table=DEFAULT_PERFORMANCE_TABLE, base_name=base_name, sql_type=sql_type
                    )
                )
            except SQLConnectionError:
                raise
            except Exception:
                self.log.warning("Can't load the metric %s, ignoring", name, exc_info=True)
                continue

    def get_sql_type(self, counter_name):
        """
        Return the type of the performance counter so that we can report it to
        Datadog correctly.
        If the sql_type is one that needs a base (PERF_RAW_LARGE_FRACTION and
        PERF_AVERAGE_BULK), the name of the base counter will also be returned.
        """
        with self.connection.get_managed_cursor() as cursor:
            cursor.execute(COUNTER_TYPE_QUERY, (counter_name,))
            (sql_type,) = cursor.fetchone()
            if sql_type == PERF_LARGE_RAW_BASE:
                self.log.warning("Metric %s is of type Base and shouldn't be reported this way", counter_name)
            base_name = None
            if sql_type in [PERF_AVERAGE_BULK, PERF_RAW_LARGE_FRACTION]:
                # This is an ugly hack. For certain types of metrics (PERF_RAW_LARGE_FRACTION
                # and PERF_AVERAGE_BULK), we need two metrics: the metric specified and
                # a base metric to get the ratio. There is no unique schema, so we generate
                # the possible candidates and look at which ones exist in the db.
                candidates = (
                    counter_name + " base",
                    counter_name.replace("(ms)", "base"),
                    counter_name.replace("Avg ", "") + " base",
                )
                try:
                    cursor.execute(BASE_NAME_QUERY, candidates)
                    base_name = cursor.fetchone().counter_name.strip()
                    self.log.debug("Got base metric: %s for metric: %s", base_name, counter_name)
                except Exception as e:
                    self.log.warning("Could not get counter_name of base for metric: %s", e)

        return sql_type, base_name

    def typed_metric(self, cfg_inst, table, base_name=None, user_type=None, sql_type=None, column=None):
        """
        Create the appropriate BaseSqlServerMetric object; each subclass implements its own
        way of fetching the metrics.
        If a `type` was specified in the config, it is used to report the value
        directly fetched from SQL Server. Otherwise, the type is chosen based on the
        sql_type, according to Microsoft's documentation.
        """
        if table == DEFAULT_PERFORMANCE_TABLE:
            metric_type_mapping = {
                PERF_COUNTER_BULK_COUNT: (self.rate, metrics.SqlSimpleMetric),
                PERF_COUNTER_LARGE_RAWCOUNT: (self.gauge, metrics.SqlSimpleMetric),
                PERF_LARGE_RAW_BASE: (self.gauge, metrics.SqlSimpleMetric),
                PERF_RAW_LARGE_FRACTION: (self.gauge, metrics.SqlFractionMetric),
                PERF_AVERAGE_BULK: (self.gauge, metrics.SqlIncrFractionMetric),
            }
            if user_type is not None:
                # user type overrides any other value
                metric_type = getattr(self, user_type)
                cls = metrics.SqlSimpleMetric

            else:
                metric_type, cls = metric_type_mapping[sql_type]
        else:
            # Lookup metrics classes by their associated table
            metric_type_str, cls = metrics.TABLE_MAPPING[table]
            metric_type = getattr(self, metric_type_str)

        return cls(cfg_inst, base_name, metric_type, column, self.log)

    def check(self, _):
        if self.do_check:
            if self.proc:
                self.do_stored_procedure_check()
            else:
                self.collect_metrics()
        else:
            self.log.debug("Skipping check")

    def collect_metrics(self):
        """Fetch the metrics from all of the associated database tables."""

        with self.connection.open_managed_default_connection():
            with self.connection.get_managed_cursor() as cursor:
                # Initiate autodiscovery, or rebuild the metric list if the server was down at check __init__ and it is still missing.
                if self.autodiscover_databases(cursor) or not self.instance_metrics:
                    self._make_metric_list_to_collect(self.custom_metrics)

                instance_results = {}

                # Execute the `fetch_all` operations first to minimize the database calls
                for cls, metric_names in six.iteritems(self.instance_per_type_metrics):
                    if not metric_names:
                        instance_results[cls] = None, None
                    else:
                        try:
                            rows, cols = getattr(metrics, cls).fetch_all_values(cursor, metric_names, self.log)
                        except Exception as e:
                            self.log.error("Error running `fetch_all` for metrics %s - skipping.  Error: %s", cls, e)
                            rows, cols = None, None

                        instance_results[cls] = rows, cols

                # Using the cached data, extract and report individual metrics
                for metric in self.instance_metrics:
                    if type(metric) is metrics.SqlIncrFractionMetric:
                        # special case, since it uses the same results as SqlFractionMetric
                        rows, cols = instance_results['SqlFractionMetric']
                        if rows is not None:
                            metric.fetch_metric(rows, cols)
                    else:
                        rows, cols = instance_results[metric.__class__.__name__]
                        if rows is not None:
                            metric.fetch_metric(rows, cols)

            # reuse connection for any custom queries
            self._query_manager.execute()

    def execute_query_raw(self, query):
        with self.connection.get_managed_cursor() as cursor:
            cursor.execute(query)
            return cursor.fetchall()

    def do_stored_procedure_check(self):
        """
        Fetch the metrics from the stored proc
        """

        proc = self.proc
        guardSql = self.instance.get('proc_only_if')
        custom_tags = self.instance.get("tags", [])

        if (guardSql and self.proc_check_guard(guardSql)) or not guardSql:
            self.connection.open_db_connections(self.connection.DEFAULT_DB_KEY)
            cursor = self.connection.get_cursor(self.connection.DEFAULT_DB_KEY)

            try:
                self.log.debug("Calling Stored Procedure : %s", proc)
                if self.connection.get_connector() == 'adodbapi':
                    cursor.callproc(proc)
                else:
                    # pyodbc does not support callproc; use execute instead.
                    # Reference: https://github.com/mkleehammer/pyodbc/wiki/Calling-Stored-Procedures
                    call_proc = '{{CALL {}}}'.format(proc)
                    cursor.execute(call_proc)

                rows = cursor.fetchall()
                self.log.debug("Row count (%s) : %s", proc, cursor.rowcount)

                for row in rows:
                    tags = [] if row.tags is None or row.tags == '' else row.tags.split(',')
                    tags.extend(custom_tags)

                    if row.type.lower() in self.proc_type_mapping:
                        self.proc_type_mapping[row.type](row.metric, row.value, tags, raw=True)
                    else:
                        self.log.warning(
                            '%s is not a recognised type from procedure %s, metric %s', row.type, proc, row.metric
                        )

            except Exception as e:
                self.log.warning("Could not call procedure %s: %s", proc, e)
                raise e

            self.connection.close_cursor(cursor)
            self.connection.close_db_connections(self.connection.DEFAULT_DB_KEY)
        else:
            self.log.info("Skipping call to %s due to only_if", proc)

    def proc_check_guard(self, sql):
        """
        Check whether the guard SQL returns a single column containing 0 or 1.
        Return True if 1, else False.
        """
        self.connection.open_db_connections(self.connection.PROC_GUARD_DB_KEY)
        cursor = self.connection.get_cursor(self.connection.PROC_GUARD_DB_KEY)

        should_run = False
        try:
            cursor.execute(sql, ())
            result = cursor.fetchone()
            should_run = result[0] == 1
        except Exception as e:
            self.log.error("Failed to run proc_only_if sql %s : %s", sql, e)

        self.connection.close_cursor(cursor)
        self.connection.close_db_connections(self.connection.PROC_GUARD_DB_KEY)
        return should_run
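
The autodiscovery helpers above OR-join the configured patterns into a single case-insensitive regex and fall back to an unmatchable regex when a list is empty, so `.match()` can always be called. A standalone sketch of the filtering logic with made-up database names:

import re


def compile_patterns(patterns):
    valid = [p for p in patterns if p]
    return re.compile('|'.join(valid), re.IGNORECASE) if valid else re.compile(r'(?!x)x')


include = compile_patterns(['.*'])
exclude = compile_patterns(['^model$', '^tempdb$'])
all_dbs = {'master', 'tempdb', 'model', 'orders'}  # hypothetical database names
filtered = {d for d in all_dbs if include.match(d)} - {d for d in all_dbs if exclude.match(d)}
assert filtered == {'master', 'orders'}
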
Example #5
class Oracle(AgentCheck):
    __NAMESPACE__ = 'oracle'

    ORACLE_DRIVER_CLASS = "oracle.jdbc.OracleDriver"
    JDBC_CONNECT_STRING = "jdbc:oracle:thin:@//{}/{}"
    CX_CONNECT_STRING = "{}/{}@//{}/{}"

    SERVICE_CHECK_NAME = 'can_connect'

    def __init__(self, name, init_config, instances):
        super(Oracle, self).__init__(name, init_config, instances)
        (
            self._server,
            self._user,
            self._password,
            self._service,
            self._jdbc_driver,
            self._tags,
            only_custom_queries,
        ) = self._get_config(self.instance)

        self.check_initializations.append(self.validate_config)

        self._connection = None

        manager_queries = []
        if not only_custom_queries:
            manager_queries.extend([
                queries.ProcessMetrics, queries.SystemMetrics,
                queries.TableSpaceMetrics
            ])

        self._fix_custom_queries()

        self._query_manager = QueryManager(
            self,
            self.execute_query_raw,
            queries=manager_queries,
            tags=self._tags,
        )
        self.check_initializations.append(self._query_manager.compile_queries)

    def _fix_custom_queries(self):
        """
        For backward compatibility reasons, if a custom query specifies a
        `metric_prefix`, change the submission name to contain it.
        """
        custom_queries = self.instance.get('custom_queries', [])
        global_custom_queries = self.init_config.get('global_custom_queries',
                                                     [])
        for query in itertools.chain(custom_queries, global_custom_queries):
            prefix = query.get('metric_prefix')
            if prefix and prefix != self.__NAMESPACE__:
                if prefix.startswith(self.__NAMESPACE__ + '.'):
                    prefix = prefix[len(self.__NAMESPACE__) + 1:]
                for column in query.get('columns', []):
                    if column.get('type') != 'tag':
                        column['name'] = '{}.{}'.format(prefix, column['name'])

    def validate_config(self):
        if not self._server or not self._user:
            raise ConfigurationError("Oracle host and user are needed")

    def execute_query_raw(self, query):
        with closing(self._connection.cursor()) as cursor:
            cursor.execute(query)
            # JDBC doesn't support iter protocol
            return cursor.fetchall()

    def check(self, _):
        self.create_connection()
        with closing(self._connection):
            self._query_manager.execute()
            self._connection = None

    def _get_config(self, instance):
        server = instance.get('server')
        user = instance.get('user')
        password = instance.get('password')
        service = instance.get('service_name')
        jdbc_driver = instance.get('jdbc_driver_path')
        tags = instance.get('tags') or []
        only_custom_queries = instance.get('only_custom_queries', False)

        return server, user, password, service, jdbc_driver, tags, only_custom_queries

    def create_connection(self):
        service_check_tags = ['server:%s' % self._server]
        service_check_tags.extend(self._tags)

        try:
            # Check if the instantclient is available
            cx_Oracle.clientversion()
        except cx_Oracle.DatabaseError as e:
            # Fallback to JDBC
            use_oracle_client = False
            self.log.debug(
                'Oracle instant client unavailable, falling back to JDBC: %s',
                e)
            connect_string = self.JDBC_CONNECT_STRING.format(
                self._server, self._service)
        else:
            use_oracle_client = True
            self.log.debug('Running cx_Oracle version %s', cx_Oracle.version)
            connect_string = self.CX_CONNECT_STRING.format(
                self._user, self._password, self._server, self._service)

        try:
            if use_oracle_client:
                connection = cx_Oracle.connect(connect_string)
            elif JDBC_IMPORT_ERROR:
                self.log.error(
                    "Oracle client is unavailable and the integration is unable to import JDBC libraries. You may not "
                    "have the Microsoft Visual C++ Runtime 2015 installed on your system. Please double check your "
                    "installation and refer to the Datadog documentation for more information."
                )
                raise JDBC_IMPORT_ERROR
            else:
                try:
                    if jpype.isJVMStarted(
                    ) and not jpype.isThreadAttachedToJVM():
                        jpype.attachThreadToJVM()
                        jpype.java.lang.Thread.currentThread(
                        ).setContextClassLoader(
                            jpype.java.lang.ClassLoader.getSystemClassLoader())
                    connection = jdb.connect(self.ORACLE_DRIVER_CLASS,
                                             connect_string,
                                             [self._user, self._password],
                                             self._jdbc_driver)
                except Exception as e:
                    if "Class {} not found".format(
                            self.ORACLE_DRIVER_CLASS) in str(e):
                        msg = """Cannot run the Oracle check until either the Oracle instant client or the JDBC Driver
                        is available.
                        For the Oracle instant client, see:
                        http://www.oracle.com/technetwork/database/features/instant-client/index.html
                        You will also need to ensure the `LD_LIBRARY_PATH` is also updated so the libs are reachable.

                        For the JDBC Driver, see:
                        http://www.oracle.com/technetwork/database/application-development/jdbc/downloads/index.html
                        You will also need to ensure the jar is either listed in your $CLASSPATH or in the yaml
                        configuration file of the check.
                        """
                        self.log.error(msg)
                    raise

            self.log.debug("Connected to Oracle DB")
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.OK,
                               tags=service_check_tags)
        except Exception as e:
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.CRITICAL,
                               tags=service_check_tags)
            self.log.error(e)
            raise
        self._connection = connection
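
A worked illustration of `_fix_custom_queries` above: a `metric_prefix` of "oracle.test1" has the "oracle." namespace stripped, and non-tag columns are renamed so that the final submission (namespace plus column name) becomes "oracle.test1.metric", which is exactly what the assertions in Example #1 expect.

query = {
    'metric_prefix': 'oracle.test1',
    'query': 'mocked',
    'columns': [{'name': 'tag_name', 'type': 'tag'}, {'name': 'metric', 'type': 'gauge'}],
}

prefix = query['metric_prefix']
if prefix.startswith('oracle.'):
    prefix = prefix[len('oracle.'):]
for column in query['columns']:
    if column.get('type') != 'tag':
        column['name'] = '{}.{}'.format(prefix, column['name'])

assert query['columns'][1]['name'] == 'test1.metric'
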
Example #6
class ClickhouseCheck(AgentCheck):
    __NAMESPACE__ = 'clickhouse'
    SERVICE_CHECK_CONNECT = 'can_connect'

    def __init__(self, name, init_config, instances):
        super(ClickhouseCheck, self).__init__(name, init_config, instances)

        self._server = self.instance.get('server', '')
        self._port = self.instance.get('port')
        self._db = self.instance.get('db', 'default')
        self._user = self.instance.get('user', 'default')
        self._password = self.instance.get('password', '')
        self._connect_timeout = float(self.instance.get('connect_timeout', 10))
        self._read_timeout = float(self.instance.get('read_timeout', 10))
        self._compression = self.instance.get('compression', False)
        self._tls_verify = is_affirmative(
            self.instance.get('tls_verify', False))
        self._tags = self.instance.get('tags', [])

        # Add global tags
        self._tags.append('server:{}'.format(self._server))
        self._tags.append('port:{}'.format(self._port))
        self._tags.append('db:{}'.format(self._db))

        self._error_sanitizer = ErrorSanitizer(self._password)
        self.check_initializations.append(self.validate_config)

        # We'll connect on the first check run
        self._client = None
        self.check_initializations.append(self.create_connection)

        self._query_manager = QueryManager(
            self,
            self.execute_query_raw,
            queries=[
                queries.SystemMetrics,
                queries.SystemEvents,
                queries.SystemAsynchronousMetrics,
                queries.SystemParts,
                queries.SystemReplicas,
                queries.SystemDictionaries,
            ],
            tags=self._tags,
            error_handler=self._error_sanitizer.clean,
        )
        self.check_initializations.append(self._query_manager.compile_queries)

    def check(self, _):
        self._query_manager.execute()
        self.collect_version()

    def collect_version(self):
        version = list(self.execute_query_raw('SELECT version()'))[0][0]

        # The version comes in like `19.15.2.2` though sometimes there is no patch part
        version_parts = {
            name: part for name, part in zip(('year', 'major', 'minor', 'patch'), version.split('.'))
        }

        self.set_metadata('version',
                          version,
                          scheme='parts',
                          final_scheme='calver',
                          part_map=version_parts)

    def execute_query_raw(self, query):
        return self._client.execute_iter(query)

    def validate_config(self):
        if not self._server:
            raise ConfigurationError('the `server` setting is required')

    def create_connection(self):
        try:
            client = clickhouse_driver.Client(
                host=self._server,
                port=self._port,
                user=self._user,
                password=self._password,
                database=self._db,
                connect_timeout=self._connect_timeout,
                send_receive_timeout=self._read_timeout,
                sync_request_timeout=self._connect_timeout,
                compression=self._compression,
                secure=self._tls_verify,
                # Don't pollute the Agent logs
                settings={'calculate_text_stack_trace': False},
                # Make every client unique for server logs
                client_name='datadog-{}'.format(self.check_id),
            )
            client.connection.connect()
        except Exception as e:
            error = 'Unable to connect to ClickHouse: {}'.format(
                self._error_sanitizer.clean(self._error_sanitizer.scrub(
                    str(e))))
            self.service_check(self.SERVICE_CHECK_CONNECT,
                               self.CRITICAL,
                               message=error,
                               tags=self._tags)

            # When an exception is raised in the context of another one, both will be printed. To avoid
            # this we set the context to None. https://www.python.org/dev/peps/pep-0409/
            raise_from(type(e)(error), None)
        else:
            self.service_check(self.SERVICE_CHECK_CONNECT,
                               self.OK,
                               tags=self._tags)
            self._client = client
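
The `collect_version` method above splits the ClickHouse version string into calver parts; when the patch segment is absent, `zip()` simply stops early and the key is omitted. A standalone illustration:

version = '19.15.2.2'
version_parts = {name: part for name, part in zip(('year', 'major', 'minor', 'patch'), version.split('.'))}
assert version_parts == {'year': '19', 'major': '15', 'minor': '2', 'patch': '2'}

# With only three segments, 'patch' never appears in the mapping:
assert 'patch' not in dict(zip(('year', 'major', 'minor', 'patch'), '19.15.2'.split('.')))
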
Example #7
class Oracle(AgentCheck):
    __NAMESPACE__ = 'oracle'

    ORACLE_DRIVER_CLASS = "oracle.jdbc.OracleDriver"
    JDBC_CONNECTION_STRING = "jdbc:oracle:thin:@//{}/{}"
    JDBC_CONNECTION_STRING_TCPS = "jdbc:oracle:thin:@{}"

    SERVICE_CHECK_NAME = 'can_connect'
    SERVICE_CHECK_CAN_QUERY = "can_query"

    def __init__(self, name, init_config, instances):
        super(Oracle, self).__init__(name, init_config, instances)
        self._server = self.instance.get('server')
        self._user = self.instance.get('username') or self.instance.get('user')
        self._password = self.instance.get('password')
        self._service = self.instance.get('service_name')
        self._protocol = self.instance.get("protocol", PROTOCOL_TCP)
        self._jdbc_driver = self.instance.get('jdbc_driver_path')
        self._jdbc_truststore_path = self.instance.get('jdbc_truststore_path')
        self._jdbc_truststore_type = self.instance.get('jdbc_truststore_type')
        self._jdbc_truststore_password = self.instance.get(
            'jdbc_truststore_password', '')
        self._tags = self.instance.get('tags') or []
        self._service_check_tags = ['server:{}'.format(self._server)]
        self._service_check_tags.extend(self._tags)

        self._cached_connection = None

        manager_queries = []
        if not self.instance.get('only_custom_queries', False):
            manager_queries.extend([
                queries.ProcessMetrics, queries.SystemMetrics,
                queries.TableSpaceMetrics
            ])

        self._fix_custom_queries()

        self._query_manager = QueryManager(
            self,
            self.execute_query_raw,
            queries=manager_queries,
            error_handler=self.handle_query_error,
            tags=self._tags,
        )

        # Runtime validations are only py3, so this is for manually validating config on py2
        if PY2:
            self.check_initializations.append(self.validate_config)
        self.check_initializations.append(self._query_manager.compile_queries)

        self._query_errors = 0
        self._connection_errors = 0

    def _fix_custom_queries(self):
        """
        For backward compatibility reasons, if a custom query specifies a
        `metric_prefix`, change the submission name to contain it.
        """
        custom_queries = self.instance.get('custom_queries', [])
        global_custom_queries = self.init_config.get('global_custom_queries',
                                                     [])
        for query in itertools.chain(custom_queries, global_custom_queries):
            prefix = query.get('metric_prefix')
            if prefix and prefix != self.__NAMESPACE__:
                if prefix.startswith(self.__NAMESPACE__ + '.'):
                    prefix = prefix[len(self.__NAMESPACE__) + 1:]
                for column in query.get('columns', []):
                    if column.get('type') != 'tag':
                        column['name'] = '{}.{}'.format(prefix, column['name'])

    def validate_config(self):
        if not self._server or not self._user:
            raise ConfigurationError("Oracle host and user are needed")

        if not self._protocol or self._protocol.upper() not in VALID_PROTOCOLS:
            raise ConfigurationError(
                "Protocol %s is not valid, must either be TCP or TCPS" %
                self._protocol)

        if self._jdbc_driver and self._protocol.upper() == PROTOCOL_TCPS:
            if not (self._jdbc_truststore_type and self._jdbc_truststore_path):
                raise ConfigurationError(
                    "TCPS connections to Oracle via JDBC requires both `jdbc_truststore_type` and "
                    "`jdbc_truststore_path` configuration options ")

            if self._jdbc_truststore_type and self._jdbc_truststore_type.upper(
            ) not in VALID_TRUSTSTORE_TYPES:
                raise ConfigurationError(
                    "Truststore type %s is not valid, must be one of %s" %
                    (self._jdbc_truststore_type, VALID_TRUSTSTORE_TYPES))

    def execute_query_raw(self, query):
        with closing(self._connection.cursor()) as cursor:
            cursor.execute(query)
            # JDBC doesn't support iter protocol
            return cursor.fetchall()

    def handle_query_error(self, error):
        self._query_errors += 1
        if self._cached_connection is None:
            self.log.debug(
                "Couldn't close the connection after a query failure because there was no connection"
            )
            return error

        try:
            self._cached_connection.close()
        except Exception as e:
            self.log.warning(
                "Couldn't close the connection after a query failure: %s",
                str(e))
        self._cached_connection = None

        return error

    def check(self, _):
        if self.instance.get('user'):
            self._log_deprecation('_config_renamed', 'user', 'username')

        self._query_errors = 0
        self._connection_errors = 0

        self._query_manager.execute()

        if self._query_errors:
            self.service_check(self.SERVICE_CHECK_CAN_QUERY,
                               self.CRITICAL,
                               tags=self._service_check_tags)
        else:
            self.service_check(self.SERVICE_CHECK_CAN_QUERY,
                               self.OK,
                               tags=self._service_check_tags)

        if self._connection_errors:
            self.service_check(self.SERVICE_CHECK_NAME,
                               self.CRITICAL,
                               tags=self._service_check_tags)
        else:
            self.service_check(self.SERVICE_CHECK_NAME,
                               self.OK,
                               tags=self._service_check_tags)

    @property
    def _connection(self):
        """Creates a connection or raises an exception"""
        if self._cached_connection is None:
            if self.can_use_oracle_client():
                self._cached_connection = self._oracle_client_connect()
            elif JDBC_IMPORT_ERROR:
                self._connection_errors += 1
                self.log.error(
                    "Oracle client is unavailable and the integration is unable to import JDBC libraries. You may not "
                    "have the Microsoft Visual C++ Runtime 2015 installed on your system. Please double check your "
                    "installation and refer to the Datadog documentation for more information."
                )
                raise JDBC_IMPORT_ERROR
            else:
                self._cached_connection = self._jdbc_connect()
        return self._cached_connection

    def can_use_oracle_client(self):
        try:
            # Check if the instantclient is available
            cx_Oracle.clientversion()
        except cx_Oracle.DatabaseError as e:
            # Fallback to JDBC
            self.log.debug(
                'Oracle instant client unavailable, falling back to JDBC: %s',
                e)
            return False
        else:
            self.log.debug('Running cx_Oracle version %s', cx_Oracle.version)
            return True

    def _oracle_client_connect(self):
        dsn = self._get_dsn()
        self.log.debug("Connecting via Oracle Instant Client with DSN: %s",
                       dsn)
        try:
            connection = cx_Oracle.connect(user=self._user,
                                           password=self._password,
                                           dsn=dsn)
            self.log.debug(
                "Connected to Oracle DB using Oracle Instant Client")
            return connection
        except cx_Oracle.DatabaseError as e:
            self._connection_errors += 1
            self.log.error(
                "Failed to connect to Oracle DB using Oracle Instant Client, error: %s",
                str(e))
            raise

    def _get_dsn(self):
        host = self._server
        port = 1521
        try:
            if ':' in self._server:
                host, port = self._server.split(':')
                port = int(port)
        except Exception:
            self._connection_errors += 1
            raise ConfigurationError(
                'server needs to be in the <HOST>:<PORT> format, "%s" provided'
                % self._server)

        if self._protocol == PROTOCOL_TCPS:
            dsn = '(DESCRIPTION=(ADDRESS=(PROTOCOL={})(HOST={})(PORT={}))(CONNECT_DATA=(SERVICE_NAME={})))'.format(
                self._protocol, host, port, self._service)
            return dsn
        else:
            return cx_Oracle.makedsn(host, port, service_name=self._service)

    def _jdbc_connect(self):
        jdbc_connect_properties = {
            'user': self._user,
            'password': self._password
        }

        if self._protocol == PROTOCOL_TCPS:
            connect_string = self.JDBC_CONNECTION_STRING_TCPS.format(
                self._get_dsn())
            jdbc_connect_properties[
                'javax.net.ssl.trustStoreType'] = self._jdbc_truststore_type
            jdbc_connect_properties[
                'javax.net.ssl.trustStorePassword'] = self._jdbc_truststore_password
            jdbc_connect_properties[
                'javax.net.ssl.trustStore'] = self._jdbc_truststore_path
        else:
            connect_string = self.JDBC_CONNECTION_STRING.format(
                self._server, self._service)

        self.log.debug("Connecting via JDBC with connection string: %s",
                       connect_string)
        try:
            with jdbc_lock:
                if jpype.isJVMStarted() and not jpype.isThreadAttachedToJVM():
                    jpype.attachThreadToJVM()
                    jpype.java.lang.Thread.currentThread(
                    ).setContextClassLoader(
                        jpype.java.lang.ClassLoader.getSystemClassLoader())
                connection = jdb.connect(self.ORACLE_DRIVER_CLASS,
                                         connect_string,
                                         jdbc_connect_properties,
                                         self._jdbc_driver)
            self.log.debug("Connected to Oracle DB using JDBC connector")
            return connection
        except Exception as e:
            self._connection_errors += 1
            if "Class {} not found".format(self.ORACLE_DRIVER_CLASS) in str(e):
                msg = """Cannot run the Oracle check until either the Oracle instant client or the JDBC Driver
                is available.
                For the Oracle instant client, see:
                http://www.oracle.com/technetwork/database/features/instant-client/index.html
                You will also need to ensure the `LD_LIBRARY_PATH` is also updated so the libs are reachable.

                For the JDBC Driver, see:
                http://www.oracle.com/technetwork/database/application-development/jdbc/downloads/index.html
                You will also need to ensure the jar is either listed in your $CLASSPATH or in the yaml
                configuration file of the check.
                """
                self.log.error(msg)
            raise
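
The `_get_dsn` helper above splits the configured server on ':' (defaulting the port to 1521) and, for TCPS, builds a full connect descriptor by hand instead of using cx_Oracle.makedsn(). A hypothetical sketch of the TCPS branch with made-up host and service names:

def tcps_dsn(server, service_name, default_port=1521):
    host, _, port = server.partition(':')
    port = int(port) if port else default_port
    return '(DESCRIPTION=(ADDRESS=(PROTOCOL=TCPS)(HOST={})(PORT={}))(CONNECT_DATA=(SERVICE_NAME={})))'.format(
        host, port, service_name
    )


dsn = tcps_dsn('db.example.com:2484', 'orcl')  # hypothetical values
assert '(HOST=db.example.com)(PORT=2484)' in dsn
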
Example #8
class SnowflakeCheck(AgentCheck):
    """
    Collect Snowflake account usage metrics
    """

    __NAMESPACE__ = 'snowflake'

    SERVICE_CHECK_CONNECT = 'can_connect'

    def __init__(self, *args, **kwargs):
        super(SnowflakeCheck, self).__init__(*args, **kwargs)
        self._config = Config(self.instance)
        self._conn = None

        self.proxy_host = self.init_config.get('proxy_host', None)
        self.proxy_port = self.init_config.get('proxy_port', None)
        self.proxy_user = self.init_config.get('proxy_user', None)
        self.proxy_password = self.init_config.get('proxy_password', None)

        # Add default tags like account to all metrics
        self._tags = self._config.tags + ['account:{}'.format(self._config.account)]

        if self._config.password:
            self.register_secret(self._config.password)

        if self._config.private_key_password:
            self.register_secret(self._config.private_key_password)

        if self._config.role == 'ACCOUNTADMIN':
            self.log.info(
                'Snowflake `role` is set as `ACCOUNTADMIN` which should be used cautiously, '
                'refer to docs about custom roles.'
            )

        self.metric_queries = []
        self.errors = []
        for mgroup in self._config.metric_groups:
            try:
                if not self._config.aggregate_last_24_hours:
                    for query in range(len(METRIC_GROUPS[mgroup])):
                        METRIC_GROUPS[mgroup][query]['query'] = METRIC_GROUPS[mgroup][query]['query'].replace(
                            'DATEADD(hour, -24, current_timestamp())', 'date_trunc(day, current_date)'
                        )
                self.metric_queries.extend(METRIC_GROUPS[mgroup])
            except KeyError:
                self.errors.append(mgroup)

        if self.errors:
            self.log.warning('Invalid metric_groups found in snowflake conf.yaml: %s', (', '.join(self.errors)))
        if not self.metric_queries and not self._config.custom_queries_defined:
            raise ConfigurationError('No valid metric_groups or custom query configured, please list at least one.')

        self._query_manager = QueryManager(self, self.execute_query_raw, queries=self.metric_queries, tags=self._tags)
        self.check_initializations.append(self._query_manager.compile_queries)

    def read_token(self):
        if self._config.token_path:
            self.log.debug("Renewing Snowflake client token")
            with open(self._config.token_path, 'r', encoding="UTF-8") as f:
                self._config.token = f.read()

        return self._config.token

    def read_key(self):
        if self._config.private_key_path:
            self.log.debug("Reading Snowflake client key for key pair authentication")
            # https://docs.snowflake.com/en/user-guide/python-connector-example.html#using-key-pair-authentication-key-pair-rotation
            with open(self._config.private_key_path, "rb") as key:
                p_key = serialization.load_pem_private_key(
                    key.read(), password=ensure_bytes(self._config.private_key_password), backend=default_backend()
                )

                pkb = p_key.private_bytes(
                    encoding=serialization.Encoding.DER,
                    format=serialization.PrivateFormat.PKCS8,
                    encryption_algorithm=serialization.NoEncryption(),
                )

                return pkb

        return None

    def check(self, _):
        if self.instance.get('user'):
            self._log_deprecation('_config_renamed', 'user', 'username')

        self.connect()

        if self._conn is not None:
            # Execute queries
            self._query_manager.execute()

            self._collect_version()

            self.log.debug("Closing connection to Snowflake...")
            self._conn.close()

    def execute_query_raw(self, query):
        """
        Execute the query and return all rows, or an empty list if no records were fetched.
        """
        with closing(self._conn.cursor()) as cursor:
            cursor.execute(query)

            if cursor.rowcount is None or cursor.rowcount < 1:
                self.log.debug("Failed to fetch records from query: `%s`", query)
                return []
            return cursor.fetchall()

    def connect(self):
        self.log.debug(
            "Establishing a new connection to Snowflake: account=%s, user=%s, database=%s, schema=%s, warehouse=%s, "
            "role=%s, timeout=%s, authenticator=%s, ocsp_response_cache_filename=%s, proxy_host=%s, proxy_port=%s",
            self._config.account,
            self._config.user,
            self._config.database,
            self._config.schema,
            self._config.warehouse,
            self._config.role,
            self._config.login_timeout,
            self._config.authenticator,
            self._config.ocsp_response_cache_filename,
            self.proxy_host,
            self.proxy_port,
        )

        try:
            conn = sf.connect(
                user=self._config.user,
                password=self._config.password,
                account=self._config.account,
                database=self._config.database,
                schema=self._config.schema,
                warehouse=self._config.warehouse,
                role=self._config.role,
                passcode_in_password=self._config.passcode_in_password,
                passcode=self._config.passcode,
                client_prefetch_threads=self._config.client_prefetch_threads,
                login_timeout=self._config.login_timeout,
                ocsp_response_cache_filename=self._config.ocsp_response_cache_filename,
                authenticator=self._config.authenticator,
                token=self.read_token(),
                private_key=self.read_key(),
                client_session_keep_alive=self._config.client_keep_alive,
                proxy_host=self.proxy_host,
                proxy_port=self.proxy_port,
                proxy_user=self.proxy_user,
                proxy_password=self.proxy_password,
            )
        except Exception as e:
            msg = "Unable to connect to Snowflake: {}".format(e)
            self.service_check(self.SERVICE_CHECK_CONNECT, self.CRITICAL, message=msg, tags=self._tags)
            self.warning(msg)
        else:
            self.service_check(self.SERVICE_CHECK_CONNECT, self.OK, tags=self._tags)
            self._conn = conn

    @AgentCheck.metadata_entrypoint
    def _collect_version(self):
        try:
            raw_version = self.execute_query_raw("select current_version();")
            version = raw_version[0][0]
        except Exception as e:
            self.log.error("Error collecting version for Snowflake: %s", e)
        else:
            if version:
                self.set_metadata('version', version)

    # override
    def _normalize_tags_type(self, tags, device_name=None, metric_name=None):
        if self.disable_generic_tags:
            return super(SnowflakeCheck, self)._normalize_tags_type(tags, device_name, metric_name)

        # If disable_generic_tags is not enabled, for each generic tag we emit both the generic and the
        # non-generic version to ease the transition.
        normalized_tags = []
        for tag in tags:
            if tag is not None:
                try:
                    tag = to_native_string(tag)
                except UnicodeError:
                    self.log.warning('Encoding error with tag `%s` for metric `%s`, ignoring tag', tag, metric_name)
                    continue
                normalized_tags.extend(list({tag, self.degeneralise_tag(tag)}))
        return normalized_tags
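When `aggregate_last_24_hours` is disabled, the constructor above rewrites each metric-group query from a rolling 24-hour window to "since midnight today" via a plain string substitution. The following is a self-contained sketch of that rewrite; the metric-group name and query are illustrative stand-ins, not the integration's real definitions.

# Illustrative stand-in for one METRIC_GROUPS entry (not the real Snowflake definitions).
METRIC_GROUPS = {
    'snowflake.billing': [
        {
            'name': 'billing.virtual_warehouse',
            'query': (
                "SELECT SUM(credits_used) FROM metering_history "
                "WHERE start_time >= DATEADD(hour, -24, current_timestamp())"
            ),
        }
    ]
}

aggregate_last_24_hours = False
if not aggregate_last_24_hours:
    for queries in METRIC_GROUPS.values():
        for entry in queries:
            # Replace the rolling 24h window with "since the start of the current day".
            entry['query'] = entry['query'].replace(
                'DATEADD(hour, -24, current_timestamp())', 'date_trunc(day, current_date)'
            )

print(METRIC_GROUPS['snowflake.billing'][0]['query'])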
Beispiel #9
0
class VoltDBCheck(AgentCheck):
    __NAMESPACE__ = 'voltdb'

    def __init__(self, name, init_config, instances):
        # type: (str, dict, list) -> None
        super(VoltDBCheck, self).__init__(name, init_config, instances)

        self._config = Config(cast(Instance, self.instance),
                              debug=self.log.debug)
        self.register_secret(self._config.password)
        self._client = Client(
            url=self._config.url,
            http_get=self.http.get,
            username=self._config.username,
            password=self._config.password,
            password_hashed=self._config.password_hashed,
        )

        manager_queries = [
            queries.CPUMetrics,
            queries.MemoryMetrics,
            queries.SnapshotStatusMetrics,
            queries.CommandLogMetrics,
            queries.ProcedureMetrics,
            queries.LatencyMetrics,
            queries.GCMetrics,
            queries.IOStatsMetrics,
            queries.TableMetrics,
            queries.IndexMetrics,
        ]

        if BASE_PARSED_VERSION < pkg_resources.parse_version('15.0.0'):
            # On Agent < 7.24.0 we must pass `Query` objects instead of dicts.
            manager_queries = [Query(query)
                               for query in manager_queries]  # type: ignore

        self._query_manager = QueryManager(
            self,
            self._execute_query_raw,
            queries=manager_queries,
            tags=self._config.tags,
        )
        self.check_initializations.append(self._query_manager.compile_queries)

    def _raise_for_status_with_details(self, response):
        # type: (requests.Response) -> None
        try:
            response.raise_for_status()
        except Exception as exc:
            message = 'Error response from VoltDB: {}'.format(exc)
            try:
                # Try including detailed error message from response.
                details = response.json()['statusstring']
            except Exception:
                pass
            else:
                message += ' (details: {})'.format(details)
            raise_from(Exception(message), exc)

    def _fetch_version(self):
        # type: () -> Optional[str]
        # See: https://docs.voltdb.com/UsingVoltDB/sysprocsysteminfo.php#sysprocsysinforetvalovervw
        response = self._client.request('@SystemInformation',
                                        parameters=['OVERVIEW'])
        self._raise_for_status_with_details(response)

        data = response.json()
        rows = data['results'][0]['data']  # type: List[tuple]

        # NOTE: there will be one VERSION row per server in the cluster.
        # Arbitrarily use the first one we see.
        for _, column, value in rows:
            if column == 'VERSION':
                return self._transform_version(value)

        self.log.debug('VERSION column not found: %s',
                       [column for _, column, _ in rows])
        return None

    def _transform_version(self, raw):
        # type: (str) -> Optional[str]
        # VoltDB does not include .0 patch numbers (eg 10.0, not 10.0.0).
        # Need to ensure they're present so the version is always in 3 parts: major.minor.patch.
        try:
            major, rest = raw.split('.', 1)
        except ValueError:
            self.log.debug('Malformed version string: %s', raw)
            return None
        minor, found, patch = rest.partition('.')
        if not found:
            patch = '0'
        return '{}.{}.{}'.format(major, minor, patch)

    @AgentCheck.metadata_entrypoint
    def _submit_version(self, version):
        # type: (str) -> None
        self.set_metadata('version', version)

    def _check_can_connect_and_submit_version(self):
        # type: () -> None
        host, port = self._config.netloc
        tags = ['host:{}'.format(host), 'port:{}'.format(port)
                ] + self._config.tags

        try:
            version = self._fetch_version()
        except Exception as exc:
            message = 'Unable to connect to VoltDB: {}'.format(exc)
            self.service_check('can_connect',
                               self.CRITICAL,
                               message=message,
                               tags=tags)
            raise

        self.service_check('can_connect', self.OK, tags=tags)

        if version is not None:
            self._submit_version(version)

    def _execute_query_raw(self, query):
        # type: (str) -> List[tuple]
        # Ad-hoc format, close to the HTTP API format.
        # Eg 'A:[B, C]' -> '?Procedure=A&Parameters=[B, C]'
        procedure, _, parameters = query.partition(":")

        response = self._client.request(procedure, parameters=parameters)
        self._raise_for_status_with_details(response)

        data = response.json()
        return data['results'][0]['data']

    def check(self, _):
        # type: (Any) -> None
        self._check_can_connect_and_submit_version()
        self._query_manager.execute()
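`_transform_version` above only pads a two-part VoltDB version to `major.minor.patch`; a quick standalone check of that behaviour:

def transform_version(raw):
    # Mirrors _transform_version above: ensure the version always has three parts.
    try:
        major, rest = raw.split('.', 1)
    except ValueError:
        return None  # malformed version string
    minor, found, patch = rest.partition('.')
    if not found:
        patch = '0'
    return '{}.{}.{}'.format(major, minor, patch)

assert transform_version('10.1') == '10.1.0'
assert transform_version('10.1.2') == '10.1.2'
assert transform_version('10') is None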
Beispiel #10
0
class ProxysqlCheck(AgentCheck):

    SERVICE_CHECK_NAME = "can_connect"
    __NAMESPACE__ = "proxysql"

    def __init__(self, name, init_config, instances):
        super(ProxysqlCheck, self).__init__(name, init_config, instances)
        self.host = self.instance.get("host", "")
        self.port = int(self.instance.get("port", 0))
        self.user = self.instance.get("username", "")
        self.password = str(self.instance.get("password", ""))

        if not all((self.host, self.port, self.user, self.password)):
            raise ConfigurationError(
                "ProxySQL host, port, username and password are needed")

        self.tls_verify = self.instance.get("tls_verify", False)
        self.validate_hostname = self.instance.get("validate_hostname", True)
        self.tls_ca_cert = self.instance.get("tls_ca_cert")
        self.connect_timeout = self.instance.get("connect_timeout", 10)
        self.read_timeout = self.instance.get("read_timeout")

        self.tags = self.instance.get("tags", [])
        self.tags.append("proxysql_server:{}".format(self.host))
        self.tags.append("proxysql_port:{}".format(self.port))

        manager_queries = [STATS_MYSQL_GLOBAL]
        if self.is_metadata_collection_enabled():
            # Add the query to collect the ProxySQL version
            manager_queries.append(VERSION_METADATA)

        additional_metrics = self.instance.get("additional_metrics", [])
        for additional_group in additional_metrics:
            if additional_group not in ADDITIONAL_METRICS_MAPPING:
                raise ConfigurationError(
                    "There is no additional metric group called '{}' for the ProxySQL integration, it should be one "
                    "of ({})".format(
                        additional_group,
                        ", ".join(ADDITIONAL_METRICS_MAPPING),
                    ))
            manager_queries.append(
                ADDITIONAL_METRICS_MAPPING[additional_group])
        self._connection = None
        self._query_manager = QueryManager(self,
                                           self.execute_query_raw,
                                           queries=manager_queries,
                                           tags=self.tags)
        self.check_initializations.append(self._query_manager.compile_queries)

    def check(self, _):
        with self.connect() as conn:
            self._connection = conn
            self._query_manager.execute()

    def execute_query_raw(self, query):
        with closing(self._connection.cursor()) as cursor:
            cursor.execute(query)
            if cursor.rowcount < 1:
                self.log.warning("Failed to fetch records from query: `%s`.",
                                 query)
                return []

            return cursor.fetchall()

    @contextmanager
    def connect(self):
        if self.tls_verify:
            # If ca_cert is None, will load the default certificates
            ssl_context = make_secure_ssl_client_context(
                ca_cert=self.tls_ca_cert,
                check_hostname=self.validate_hostname)
        else:
            ssl_context = make_insecure_ssl_client_context()

        db = None
        try:
            db = pymysql.connect(
                host=self.host,
                user=self.user,
                port=self.port,
                passwd=self.password,
                connect_timeout=self.connect_timeout,
                read_timeout=self.read_timeout,
                ssl=ssl_context,
            )
            self.log.debug("Connected to ProxySQL")
            yield db
        except Exception:
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.CRITICAL,
                               tags=self.tags)
            self.log.exception("Can't connect to ProxySQL")
            raise
        else:
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.OK,
                               tags=self.tags)
        finally:
            if db:
                db.close()
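The `additional_metrics` handling above is a straight lookup against `ADDITIONAL_METRICS_MAPPING`, failing fast on unknown group names. Here is a minimal sketch of that validation with made-up group names; the real integration defines its own mapping of group name to query.

# Made-up mapping for illustration only; the real check imports its own query definitions.
ADDITIONAL_METRICS_MAPPING = {
    'command_counters_metrics': {'name': 'stats_mysql_commands_counters'},
    'connection_pool_metrics': {'name': 'stats_mysql_connection_pool'},
}

def resolve_additional_queries(additional_metrics):
    queries = []
    for group in additional_metrics:
        if group not in ADDITIONAL_METRICS_MAPPING:
            raise ValueError(
                "There is no additional metric group called '{}', it should be one of ({})".format(
                    group, ", ".join(ADDITIONAL_METRICS_MAPPING)
                )
            )
        queries.append(ADDITIONAL_METRICS_MAPPING[group])
    return queries

print(resolve_additional_queries(['connection_pool_metrics']))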
Beispiel #11
0
class SnowflakeCheck(AgentCheck):
    """
    Collect Snowflake account usage metrics
    """

    __NAMESPACE__ = 'snowflake'

    SERVICE_CHECK_CONNECT = 'snowflake.can_connect'

    MONKEY_PATCH_LOCK = threading.Lock()

    def __init__(self, *args, **kwargs):
        super(SnowflakeCheck, self).__init__(*args, **kwargs)
        self.config = Config(self.instance)
        self._conn = None

        # Add default tags like account to all metrics
        self._tags = self.config.tags + [
            'account:{}'.format(self.config.account)
        ]

        if self.config.password:
            self.register_secret(self.config.password)

        if self.config.role == 'ACCOUNTADMIN':
            self.log.info(
                'Snowflake `role` is set as `ACCOUNTADMIN` which should be used cautiously, '
                'refer to docs about custom roles.')

        self.metric_queries = []
        self.errors = []
        for mgroup in self.config.metric_groups:
            try:
                self.metric_queries.extend(METRIC_GROUPS[mgroup])
            except KeyError:
                self.errors.append(mgroup)

        if self.errors:
            self.log.warning(
                'Invalid metric_groups found in snowflake conf.yaml: %s',
                (', '.join(self.errors)))
        if not self.metric_queries:
            raise ConfigurationError(
                'No valid metric_groups configured, please list at least one.')

        self._proxies = self.http.options['proxies']  # SKIP_HTTP_VALIDATION
        self._query_manager = QueryManager(self,
                                           self.execute_query_raw,
                                           queries=self.metric_queries,
                                           tags=self._tags)
        self.check_initializations.append(self._query_manager.compile_queries)

    def check(self, _):
        self.connect()

        if self._conn is not None:
            # Execute queries
            self._query_manager.execute()

            self._collect_version()

            self.log.debug("Closing connection to Snowflake...")
            self._conn.close()

    def execute_query_raw(self, query):
        """
        Execute the query and return all rows, or an empty list if no records were fetched.
        """
        with closing(self._conn.cursor()) as cursor:
            cursor.execute(query)

            if cursor.rowcount is None or cursor.rowcount < 1:
                self.log.debug("Failed to fetch records from query: `%s`",
                               query)
                return []
            return cursor.fetchall()

    def connect(self):
        self.log.debug(
            "Establishing a new connection to Snowflake: account=%s, user=%s, database=%s, schema=%s, warehouse=%s, "
            "role=%s, login_timeout=%s, authenticator=%s, ocsp_response_cache_filename=%s",
            self.config.account,
            self.config.user,
            self.config.database,
            self.config.schema,
            self.config.warehouse,
            self.config.role,
            self.config.login_timeout,
            self.config.authenticator,
            self.config.ocsp_response_cache_filename,
        )

        try:
            with self.MONKEY_PATCH_LOCK:
                # Monkey patch proxies to request_exec
                SnowflakeRestful._request_exec = self._make_snowflake_request_func(
                    self._proxies, SnowflakeRestful._request_exec)
                conn = sf.connect(
                    user=self.config.user,
                    password=self.config.password,
                    account=self.config.account,
                    database=self.config.database,
                    schema=self.config.schema,
                    warehouse=self.config.warehouse,
                    role=self.config.role,
                    passcode_in_password=self.config.passcode_in_password,
                    passcode=self.config.passcode,
                    client_prefetch_threads=self.config.client_prefetch_threads,
                    login_timeout=self.config.login_timeout,
                    ocsp_response_cache_filename=self.config.ocsp_response_cache_filename,
                    authenticator=self.config.authenticator,
                    token=self.config.token,
                    client_session_keep_alive=self.config.client_keep_alive,
                )
        except Exception as e:
            msg = "Unable to connect to Snowflake: {}".format(e)
            self.service_check(self.SERVICE_CHECK_CONNECT,
                               self.CRITICAL,
                               message=msg,
                               tags=self._tags)
            self.warning(msg)
        else:
            self.service_check(self.SERVICE_CHECK_CONNECT,
                               self.OK,
                               tags=self._tags)
            self._conn = conn

    def _make_snowflake_request_func(self, proxies, method):
        """
        This is a workaround to include proxy config in the Snowflake connection.
        The current Snowflake logic applies global proxy configs via env vars.

        TODO: Remove when https://github.com/snowflakedb/snowflake-connector-python/pull/352 gets merged
        """
        def _request_exec(*args, **kwargs):
            session = kwargs.get('session') or args[1]
            session.proxies = proxies
            try:
                return method(*args, **kwargs)
            except Exception as e:
                msg = "Encountered error while attempting to connect to Snowflake "
                if proxies:
                    self.log.error("%s via proxy settings: %s", msg, str(e))
                else:
                    self.log.error("%s: %s", msg, str(e))
                return

        return _request_exec

    @AgentCheck.metadata_entrypoint
    def _collect_version(self):
        try:
            raw_version = self.execute_query_raw("select current_version();")
            version = raw_version[0][0]
        except Exception as e:
            self.log.error("Error collecting version for Snowflake: %s", e)
        else:
            if version:
                self.set_metadata('version', version)
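`_make_snowflake_request_func` above wraps the connector's request method so that every outgoing session picks up the check's proxy settings. Below is a dependency-free sketch of the same wrapping pattern, using stub objects in place of SnowflakeRestful and the requests session.

class StubSession(object):
    # Stand-in for the requests session the Snowflake connector passes around.
    proxies = None

def original_request_exec(rest_client, session, url):
    # Stand-in for SnowflakeRestful._request_exec; just reports what it received.
    return session.proxies, url

def make_request_func(proxies, method):
    def _request_exec(*args, **kwargs):
        session = kwargs.get('session') or args[1]
        session.proxies = proxies  # inject the check-level proxy config
        return method(*args, **kwargs)
    return _request_exec

wrapped = make_request_func({'https': 'http://proxy.local:3128'}, original_request_exec)
print(wrapped(None, StubSession(), 'https://example.snowflakecomputing.com'))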
Beispiel #12
0
class SQLServer(AgentCheck):
    __NAMESPACE__ = 'sqlserver'

    SERVICE_CHECK_NAME = 'sqlserver.can_connect'

    # Default performance table metrics - Database Instance level
    # datadog metric name, counter name, instance name
    INSTANCE_METRICS = [
        # SQLServer:General Statistics
        ('sqlserver.stats.connections', 'User Connections', ''
         ),  # LARGE_RAWCOUNT
        ('sqlserver.stats.procs_blocked', 'Processes blocked',
         ''),  # LARGE_RAWCOUNT
        # SQLServer:Access Methods
        ('sqlserver.access.page_splits', 'Page Splits/sec', ''),  # BULK_COUNT
        # SQLServer:Memory Manager
        ('sqlserver.memory.memory_grants_pending', 'Memory Grants Pending', ''
         ),
        ('sqlserver.memory.total_server_memory', 'Total Server Memory (KB)',
         ''),
        # SQLServer:Buffer Manager
        ('sqlserver.buffer.cache_hit_ratio', 'Buffer cache hit ratio', ''
         ),  # RAW_LARGE_FRACTION
        ('sqlserver.buffer.page_life_expectancy', 'Page life expectancy',
         ''),  # LARGE_RAWCOUNT
        ('sqlserver.buffer.page_reads', 'Page reads/sec',
         ''),  # LARGE_RAWCOUNT
        ('sqlserver.buffer.page_writes', 'Page writes/sec',
         ''),  # LARGE_RAWCOUNT
        ('sqlserver.buffer.checkpoint_pages', 'Checkpoint pages/sec',
         ''),  # BULK_COUNT
        # SQLServer:SQL Statistics
        ('sqlserver.stats.auto_param_attempts', 'Auto-Param Attempts/sec', ''),
        ('sqlserver.stats.failed_auto_param_attempts',
         'Failed Auto-Params/sec', ''),
        ('sqlserver.stats.safe_auto_param_attempts', 'Safe Auto-Params/sec',
         ''),
        ('sqlserver.stats.batch_requests', 'Batch Requests/sec',
         ''),  # BULK_COUNT
        ('sqlserver.stats.sql_compilations', 'SQL Compilations/sec',
         ''),  # BULK_COUNT
        ('sqlserver.stats.sql_recompilations', 'SQL Re-Compilations/sec',
         ''),  # BULK_COUNT
    ]

    # Performance table metrics, initially configured to track at instance-level only
    # With auto-discovery enabled, these metrics will be extended accordingly
    # datadog metric name, counter name, instance name
    INSTANCE_METRICS_TOTAL = [
        # SQLServer:Locks
        ('sqlserver.stats.lock_waits', 'Lock Waits/sec', '_Total'
         ),  # BULK_COUNT
        # SQLServer:Plan Cache
        ('sqlserver.cache.object_counts', 'Cache Object Counts', '_Total'),
        ('sqlserver.cache.pages', 'Cache Pages', '_Total'),
        # SQLServer:Databases
        ('sqlserver.database.backup_restore_throughput',
         'Backup/Restore Throughput/sec', '_Total'),
        ('sqlserver.database.log_bytes_flushed', 'Log Bytes Flushed/sec',
         '_Total'),
        ('sqlserver.database.log_flushes', 'Log Flushes/sec', '_Total'),
        ('sqlserver.database.log_flush_wait', 'Log Flush Wait Time', '_Total'),
        ('sqlserver.database.transactions', 'Transactions/sec',
         '_Total'),  # BULK_COUNT
        ('sqlserver.database.write_transactions', 'Write Transactions/sec',
         '_Total'),  # BULK_COUNT
        ('sqlserver.database.active_transactions', 'Active Transactions',
         '_Total'),  # BULK_COUNT
    ]

    # AlwaysOn metrics
    # datadog metric name, sql table, column name, tag
    AO_METRICS = [
        ('sqlserver.ao.ag_sync_health',
         'sys.dm_hadr_availability_group_states', 'synchronization_health'),
        ('sqlserver.ao.replica_sync_state',
         'sys.dm_hadr_database_replica_states', 'synchronization_state'),
        ('sqlserver.ao.replica_failover_mode', 'sys.availability_replicas',
         'failover_mode'),
        ('sqlserver.ao.replica_failover_readiness',
         'sys.availability_replicas', 'is_failover_ready'),
    ]

    AO_METRICS_PRIMARY = [
        ('sqlserver.ao.primary_replica_health',
         'sys.dm_hadr_availability_group_states', 'primary_recovery_health'),
    ]

    AO_METRICS_SECONDARY = [
        ('sqlserver.ao.secondary_replica_health',
         'sys.dm_hadr_availability_group_states', 'secondary_recovery_health'),
    ]

    # AlwaysOn metrics for Failover Cluster Instances (FCI).
    # This is in a separate category than other AlwaysOn metrics
    # because FCI specifies a different SQLServer setup
    # compared to Availability Groups (AG).
    # datadog metric name, sql table, column name
    # FCI status enum:
    #   0 = Up, 1 = Down, 2 = Paused, 3 = Joining, -1 = Unknown
    FCI_METRICS = [
        ('sqlserver.fci.status', 'sys.dm_os_cluster_nodes', 'status'),
        ('sqlserver.fci.is_current_owner', 'sys.dm_os_cluster_nodes',
         'is_current_owner'),
    ]

    # Non-performance table metrics - can be database specific
    # datadog metric name, sql table, column name
    TASK_SCHEDULER_METRICS = [
        ('sqlserver.scheduler.current_tasks_count', 'sys.dm_os_schedulers',
         'current_tasks_count'),
        ('sqlserver.scheduler.current_workers_count', 'sys.dm_os_schedulers',
         'current_workers_count'),
        ('sqlserver.scheduler.active_workers_count', 'sys.dm_os_schedulers',
         'active_workers_count'),
        ('sqlserver.scheduler.runnable_tasks_count', 'sys.dm_os_schedulers',
         'runnable_tasks_count'),
        ('sqlserver.scheduler.work_queue_count', 'sys.dm_os_schedulers',
         'work_queue_count'),
        ('sqlserver.task.context_switches_count', 'sys.dm_os_tasks',
         'context_switches_count'),
        ('sqlserver.task.pending_io_count', 'sys.dm_os_tasks',
         'pending_io_count'),
        ('sqlserver.task.pending_io_byte_count', 'sys.dm_os_tasks',
         'pending_io_byte_count'),
        ('sqlserver.task.pending_io_byte_average', 'sys.dm_os_tasks',
         'pending_io_byte_average'),
    ]

    # Non-performance table metrics
    # datadog metric name, sql table, column name
    # Files State enum:
    #   0 = Online, 1 = Restoring, 2 = Recovering, 3 = Recovery_Pending,
    #   4 = Suspect, 5 = Unknown, 6 = Offline, 7 = Defunct
    # Database State enum:
    #   0 = Online, 1 = Restoring, 2 = Recovering, 3 = Recovery_Pending,
    #   4 = Suspect, 5 = Emergency, 6 = Offline, 7 = Copying, 10 = Offline_Secondary
    # Is Sync with Backup enum:
    #   0 = False, 1 = True
    DATABASE_METRICS = [
        ('sqlserver.database.files.size', 'sys.database_files', 'size'),
        ('sqlserver.database.files.state', 'sys.database_files', 'state'),
        ('sqlserver.database.state', 'sys.databases', 'state'),
        ('sqlserver.database.is_sync_with_backup', 'sys.databases',
         'is_sync_with_backup'),
        ('sqlserver.database.backup_count', 'msdb.dbo.backupset',
         'backup_set_id_count'),
    ]

    DATABASE_FRAGMENTATION_METRICS = [
        (
            'sqlserver.database.avg_fragmentation_in_percent',
            'sys.dm_db_index_physical_stats',
            'avg_fragmentation_in_percent',
        ),
        ('sqlserver.database.fragment_count', 'sys.dm_db_index_physical_stats',
         'fragment_count'),
        (
            'sqlserver.database.avg_fragment_size_in_pages',
            'sys.dm_db_index_physical_stats',
            'avg_fragment_size_in_pages',
        ),
    ]

    def __init__(self, name, init_config, instances):
        super(SQLServer, self).__init__(name, init_config, instances)

        self.connection = None
        self.failed_connections = {}
        self.instance_metrics = []
        self.instance_per_type_metrics = defaultdict(list)
        self.do_check = True

        self.autodiscovery = is_affirmative(
            self.instance.get('database_autodiscovery'))
        if self.autodiscovery and self.instance.get('database'):
            self.log.warning(
                'sqlserver `database_autodiscovery` and `database` options defined in same instance - '
                'autodiscovery will take precedence.')
        self.autodiscovery_include = self.instance.get('autodiscovery_include',
                                                       ['.*'])
        self.autodiscovery_exclude = self.instance.get('autodiscovery_exclude',
                                                       [])
        self._compile_patterns()
        self.autodiscovery_interval = self.instance.get(
            'autodiscovery_interval', DEFAULT_AUTODISCOVERY_INTERVAL)
        self.databases = set()
        self.ad_last_check = 0

        self.proc = self.instance.get('stored_procedure')
        self.proc_type_mapping = {
            'gauge': self.gauge,
            'rate': self.rate,
            'histogram': self.histogram
        }
        self.custom_metrics = init_config.get('custom_metrics', [])

        # use QueryManager to process custom queries
        self._query_manager = QueryManager(self,
                                           self.execute_query_raw,
                                           queries=[],
                                           tags=self.instance.get("tags", []))
        self.check_initializations.append(self._query_manager.compile_queries)
        self.check_initializations.append(self.initialize_connection)

    def initialize_connection(self):
        self.connection = Connection(self.init_config, self.instance,
                                     self.handle_service_check, self.log)

        # Pre-process the list of metrics to collect
        try:
            # check to see if the database exists before we try any connections to it
            db_exists, context = self.connection.check_database()

            if db_exists:
                if self.instance.get('stored_procedure') is None:
                    with self.connection.open_managed_default_connection():
                        with self.connection.get_managed_cursor() as cursor:
                            self.autodiscover_databases(cursor)
                        self._make_metric_list_to_collect(self.custom_metrics)
            else:
                # How much do we care that the DB doesn't exist?
                ignore = is_affirmative(
                    self.instance.get("ignore_missing_database", False))
                if ignore:
                    # not much : we expect it. leave checks disabled
                    self.do_check = False
                    self.log.warning(
                        "Database %s does not exist. Disabling checks for this instance.",
                        context)
                else:
                    # yes we do. Keep trying
                    msg = "Database {} does not exist. Please resolve invalid database and restart agent".format(
                        context)
                    raise ConfigurationError(msg)

        except SQLConnectionError as e:
            self.log.exception("Error connecting to database: %s", e)
        except ConfigurationError:
            raise
        except Exception as e:
            self.log.exception("Initialization exception %s", e)

    def handle_service_check(self, status, host, database, message=None):
        custom_tags = self.instance.get("tags", [])
        if custom_tags is None:
            custom_tags = []
        service_check_tags = ['host:{}'.format(host), 'db:{}'.format(database)]
        service_check_tags.extend(custom_tags)
        service_check_tags = list(set(service_check_tags))

        self.service_check(self.SERVICE_CHECK_NAME,
                           status,
                           tags=service_check_tags,
                           message=message,
                           raw=True)

    def _compile_patterns(self):
        self._include_patterns = self._compile_valid_patterns(
            self.autodiscovery_include)
        self._exclude_patterns = self._compile_valid_patterns(
            self.autodiscovery_exclude)

    def _compile_valid_patterns(self, patterns):
        valid_patterns = []

        for pattern in patterns:
            # Ignore empty patterns as they match everything
            if not pattern:
                continue

            try:
                re.compile(pattern, re.IGNORECASE)
            except Exception:
                self.log.warning(
                    '%s is not a valid regular expression and will be ignored',
                    pattern)
            else:
                valid_patterns.append(pattern)

        if valid_patterns:
            return re.compile('|'.join(valid_patterns), re.IGNORECASE)
        else:
            # create unmatchable regex - https://stackoverflow.com/a/1845097/2157429
            return re.compile(r'(?!x)x')

    def autodiscover_databases(self, cursor):
        if not self.autodiscovery:
            return False

        now = time.time()
        if now - self.ad_last_check > self.autodiscovery_interval:
            self.log.info('Performing database autodiscovery')
            cursor.execute(AUTODISCOVERY_QUERY)
            all_dbs = set(row.name for row in cursor.fetchall())
            excluded_dbs = set(
                [d for d in all_dbs if self._exclude_patterns.match(d)])
            included_dbs = set(
                [d for d in all_dbs if self._include_patterns.match(d)])

            self.log.debug(
                'Autodiscovered databases: %s, excluding: %s, including: %s',
                all_dbs, excluded_dbs, included_dbs)

            # keep included dbs but remove any that were explicitly excluded
            filtered_dbs = all_dbs.intersection(included_dbs) - excluded_dbs

            self.log.debug('Resulting filtered databases: %s', filtered_dbs)
            self.ad_last_check = now

            if filtered_dbs != self.databases:
                self.log.debug(
                    'Databases updated from previous autodiscovery check.')
                self.databases = filtered_dbs
                return True
        return False

    def _make_metric_list_to_collect(self, custom_metrics):
        """
        Store the list of metrics to collect by instance_key.
        Will also create and cache cursors to query the db.
        """

        metrics_to_collect = []
        tags = self.instance.get('tags', [])

        # Load instance-level (previously Performance) metrics
        # If several check instances are querying the same server host, it can be wise to turn these off
        # to avoid sending duplicate metrics
        if is_affirmative(self.instance.get('include_instance_metrics', True)):
            self._add_performance_counters(chain(self.INSTANCE_METRICS,
                                                 self.INSTANCE_METRICS_TOTAL),
                                           metrics_to_collect,
                                           tags,
                                           db=None)

        # populated through autodiscovery
        if self.databases:
            for db in self.databases:
                self._add_performance_counters(self.INSTANCE_METRICS_TOTAL,
                                               metrics_to_collect,
                                               tags,
                                               db=db)

        # Load database statistics
        for name, table, column in self.DATABASE_METRICS:
            # include database as a filter option
            db_names = self.databases or [
                self.instance.get('database', self.connection.DEFAULT_DATABASE)
            ]
            for db_name in db_names:
                cfg = {
                    'name': name,
                    'table': table,
                    'column': column,
                    'instance_name': db_name,
                    'tags': tags
                }
                metrics_to_collect.append(
                    self.typed_metric(cfg_inst=cfg, table=table,
                                      column=column))

        # Load AlwaysOn metrics
        if is_affirmative(self.instance.get('include_ao_metrics', False)):
            for name, table, column in self.AO_METRICS + self.AO_METRICS_PRIMARY + self.AO_METRICS_SECONDARY:
                db_name = 'master'
                cfg = {
                    'name':
                    name,
                    'table':
                    table,
                    'column':
                    column,
                    'instance_name':
                    db_name,
                    'tags':
                    tags,
                    'ao_database':
                    self.instance.get('ao_database', None),
                    'availability_group':
                    self.instance.get('availability_group', None),
                    'only_emit_local':
                    is_affirmative(self.instance.get('only_emit_local',
                                                     False)),
                }
                metrics_to_collect.append(
                    self.typed_metric(cfg_inst=cfg, table=table,
                                      column=column))

        # Load FCI metrics
        if is_affirmative(self.instance.get('include_fci_metrics', False)):
            for name, table, column in self.FCI_METRICS:
                cfg = {
                    'name': name,
                    'table': table,
                    'column': column,
                    'tags': tags,
                }
                metrics_to_collect.append(
                    self.typed_metric(cfg_inst=cfg, table=table,
                                      column=column))

        # Load metrics from scheduler and task tables, if enabled
        if is_affirmative(
                self.instance.get('include_task_scheduler_metrics', False)):
            for name, table, column in self.TASK_SCHEDULER_METRICS:
                cfg = {
                    'name': name,
                    'table': table,
                    'column': column,
                    'tags': tags
                }
                metrics_to_collect.append(
                    self.typed_metric(cfg_inst=cfg, table=table,
                                      column=column))

        # Load DB Fragmentation metrics
        if is_affirmative(
                self.instance.get('include_db_fragmentation_metrics', False)):
            db_fragmentation_object_names = self.instance.get(
                'db_fragmentation_object_names', [])
            db_names = self.databases or [
                self.instance.get('database', self.connection.DEFAULT_DATABASE)
            ]

            if not db_fragmentation_object_names:
                self.log.debug(
                    "No fragmentation object names specified, will return fragmentation metrics for all "
                    "object_ids of current database(s): %s",
                    db_names,
                )

            for db_name in db_names:
                for name, table, column in self.DATABASE_FRAGMENTATION_METRICS:
                    cfg = {
                        'name':
                        name,
                        'table':
                        table,
                        'column':
                        column,
                        'instance_name':
                        db_name,
                        'tags':
                        tags,
                        'db_fragmentation_object_names':
                        db_fragmentation_object_names,
                    }
                    metrics_to_collect.append(
                        self.typed_metric(cfg_inst=cfg,
                                          table=table,
                                          column=column))

        # Load any custom metrics from conf.d/sqlserver.yaml
        for cfg in custom_metrics:
            sql_type = None
            base_name = None

            custom_tags = tags + cfg.get('tags', [])
            cfg['tags'] = custom_tags

            db_table = cfg.get('table', DEFAULT_PERFORMANCE_TABLE)
            if db_table not in VALID_TABLES:
                self.log.error('%s has an invalid table name: %s', cfg['name'],
                               db_table)
                continue

            if cfg.get('database', None) and cfg.get(
                    'database') != self.instance.get('database'):
                self.log.debug(
                    'Skipping custom metric %s for database %s, check instance configured for database %s',
                    cfg['name'],
                    cfg.get('database'),
                    self.instance.get('database'),
                )
                continue

            if db_table == DEFAULT_PERFORMANCE_TABLE:
                user_type = cfg.get('type')
                if user_type is not None and user_type not in VALID_METRIC_TYPES:
                    self.log.error('%s has an invalid metric type: %s',
                                   cfg['name'], user_type)
                sql_type = None
                try:
                    if user_type is None:
                        sql_type, base_name = self.get_sql_type(
                            cfg['counter_name'])
                except Exception:
                    self.log.warning("Can't load the metric %s, ignoring",
                                     cfg['name'],
                                     exc_info=True)
                    continue

                metrics_to_collect.append(
                    self.typed_metric(cfg_inst=cfg,
                                      table=db_table,
                                      base_name=base_name,
                                      user_type=user_type,
                                      sql_type=sql_type))

            else:
                for column in cfg['columns']:
                    metrics_to_collect.append(
                        self.typed_metric(cfg_inst=cfg,
                                          table=db_table,
                                          base_name=base_name,
                                          sql_type=sql_type,
                                          column=column))

        self.instance_metrics = metrics_to_collect
        self.log.debug("metrics to collect %s", metrics_to_collect)

        # create an organized grouping of metric names to their metric classes
        for m in metrics_to_collect:
            cls = m.__class__.__name__
            name = m.sql_name or m.column
            self.log.debug("Adding metric class %s named %s", cls, name)

            self.instance_per_type_metrics[cls].append(name)
            if m.base_name:
                self.instance_per_type_metrics[cls].append(m.base_name)

    def _add_performance_counters(self,
                                  metrics,
                                  metrics_to_collect,
                                  tags,
                                  db=None):
        for name, counter_name, instance_name in metrics:
            try:
                sql_type, base_name = self.get_sql_type(counter_name)
                cfg = {
                    'name': name,
                    'counter_name': counter_name,
                    'instance_name': db or instance_name,
                    'tags': tags,
                }

                metrics_to_collect.append(
                    self.typed_metric(cfg_inst=cfg,
                                      table=DEFAULT_PERFORMANCE_TABLE,
                                      base_name=base_name,
                                      sql_type=sql_type))
            except SQLConnectionError:
                raise
            except Exception:
                self.log.warning("Can't load the metric %s, ignoring",
                                 name,
                                 exc_info=True)
                continue

    def get_sql_type(self, counter_name):
        """
        Return the type of the performance counter so that we can report it to
        Datadog correctly.
        If the sql_type is one that needs a base (PERF_RAW_LARGE_FRACTION and
        PERF_AVERAGE_BULK), the name of the base counter will also be returned.
        """
        with self.connection.get_managed_cursor() as cursor:
            cursor.execute(COUNTER_TYPE_QUERY, (counter_name, ))
            (sql_type, ) = cursor.fetchone()
            if sql_type == PERF_LARGE_RAW_BASE:
                self.log.warning(
                    "Metric %s is of type Base and shouldn't be reported this way",
                    counter_name)
            base_name = None
            if sql_type in [PERF_AVERAGE_BULK, PERF_RAW_LARGE_FRACTION]:
                # This is an ugly hack. For certain types of metric (PERF_RAW_LARGE_FRACTION
                # and PERF_AVERAGE_BULK), we need two metrics: the metric specified and
                # a base metric to get the ratio. There is no unique schema, so we generate
                # the possible candidates and look at which ones exist in the db.
                candidates = (
                    counter_name + " base",
                    counter_name.replace("(ms)", "base"),
                    counter_name.replace("Avg ", "") + " base",
                )
                try:
                    cursor.execute(BASE_NAME_QUERY, candidates)
                    base_name = cursor.fetchone().counter_name.strip()
                    self.log.debug("Got base metric: %s for metric: %s",
                                   base_name, counter_name)
                except Exception as e:
                    self.log.warning(
                        "Could not get counter_name of base for metric: %s", e)

        return sql_type, base_name

    def typed_metric(self,
                     cfg_inst,
                     table,
                     base_name=None,
                     user_type=None,
                     sql_type=None,
                     column=None):
        """
        Create the appropriate BaseSqlServerMetric object, each implementing its method to
        fetch the metrics properly.
        If a `type` was specified in the config, it is used to report the value
        directly fetched from SQLServer. Otherwise, it is decided based on the
        sql_type, according to Microsoft's documentation.
        """
        if table == DEFAULT_PERFORMANCE_TABLE:
            metric_type_mapping = {
                PERF_COUNTER_BULK_COUNT: (self.rate, metrics.SqlSimpleMetric),
                PERF_COUNTER_LARGE_RAWCOUNT:
                (self.gauge, metrics.SqlSimpleMetric),
                PERF_LARGE_RAW_BASE: (self.gauge, metrics.SqlSimpleMetric),
                PERF_RAW_LARGE_FRACTION:
                (self.gauge, metrics.SqlFractionMetric),
                PERF_AVERAGE_BULK: (self.gauge, metrics.SqlIncrFractionMetric),
            }
            if user_type is not None:
                # user type overrides any other value
                metric_type = getattr(self, user_type)
                cls = metrics.SqlSimpleMetric

            else:
                metric_type, cls = metric_type_mapping[sql_type]
        else:
            # Lookup metrics classes by their associated table
            metric_type_str, cls = metrics.TABLE_MAPPING[table]
            metric_type = getattr(self, metric_type_str)

        return cls(cfg_inst, base_name, metric_type, column, self.log)

    def check(self, _):
        if self.do_check:
            if self.proc:
                self.do_stored_procedure_check()
            else:
                self.collect_metrics()
        else:
            self.log.debug("Skipping check")

    def collect_metrics(self):
        """Fetch the metrics from all of the associated database tables."""

        with self.connection.open_managed_default_connection():
            with self.connection.get_managed_cursor() as cursor:
                # Run autodiscovery; also rebuild the metric list if it is empty (e.g. the server was down at check __init__).
                if self.autodiscover_databases(
                        cursor) or not self.instance_metrics:
                    self._make_metric_list_to_collect(self.custom_metrics)

                instance_results = {}

                # Execute the `fetch_all` operations first to minimize the database calls
                for cls, metric_names in six.iteritems(
                        self.instance_per_type_metrics):
                    if not metric_names:
                        instance_results[cls] = None, None
                    else:
                        rows, cols = getattr(metrics, cls).fetch_all_values(
                            cursor, metric_names, self.log)
                        instance_results[cls] = rows, cols

                # Using the cached data, extract and report individual metrics
                for metric in self.instance_metrics:
                    if type(metric) is metrics.SqlIncrFractionMetric:
                        # special case, since it uses the same results as SqlFractionMetric
                        rows, cols = instance_results['SqlFractionMetric']
                        metric.fetch_metric(rows, cols)
                    else:
                        rows, cols = instance_results[
                            metric.__class__.__name__]
                        metric.fetch_metric(rows, cols)

            # reuse connection for any custom queries
            self._query_manager.execute()

    def execute_query_raw(self, query):
        with self.connection.get_managed_cursor() as cursor:
            cursor.execute(query)
            return cursor.fetchall()

    def do_stored_procedure_check(self):
        """
        Fetch the metrics from the stored proc
        """

        proc = self.proc
        guardSql = self.instance.get('proc_only_if')
        custom_tags = self.instance.get("tags", [])

        if (guardSql and self.proc_check_guard(guardSql)) or not guardSql:
            self.connection.open_db_connections(self.connection.DEFAULT_DB_KEY)
            cursor = self.connection.get_cursor(self.connection.DEFAULT_DB_KEY)

            try:
                self.log.debug("Calling Stored Procedure : %s", proc)
                if self.connection.get_connector() == 'adodbapi':
                    cursor.callproc(proc)
                else:
                    # pyodbc does not support callproc; use execute instead.
                    # Reference: https://github.com/mkleehammer/pyodbc/wiki/Calling-Stored-Procedures
                    call_proc = '{{CALL {}}}'.format(proc)
                    cursor.execute(call_proc)

                rows = cursor.fetchall()
                self.log.debug("Row count (%s) : %s", proc, cursor.rowcount)

                for row in rows:
                    tags = [] if row.tags is None or row.tags == '' else row.tags.split(
                        ',')
                    tags.extend(custom_tags)

                    if row.type.lower() in self.proc_type_mapping:
                        self.proc_type_mapping[row.type](row.metric,
                                                         row.value,
                                                         tags,
                                                         raw=True)
                    else:
                        self.log.warning(
                            '%s is not a recognised type from procedure %s, metric %s',
                            row.type, proc, row.metric)

            except Exception as e:
                self.log.warning("Could not call procedure %s: %s", proc, e)
                raise e

            self.connection.close_cursor(cursor)
            self.connection.close_db_connections(
                self.connection.DEFAULT_DB_KEY)
        else:
            self.log.info("Skipping call to %s due to only_if", proc)

    def proc_check_guard(self, sql):
        """
        Check whether the guard SQL returns a single column containing 0 or 1.
        Return True if the value is 1, otherwise False.
        """
        self.connection.open_db_connections(self.connection.PROC_GUARD_DB_KEY)
        cursor = self.connection.get_cursor(self.connection.PROC_GUARD_DB_KEY)

        should_run = False
        try:
            cursor.execute(sql, ())
            result = cursor.fetchone()
            should_run = result[0] == 1
        except Exception as e:
            self.log.error("Failed to run proc_only_if sql %s : %s", sql, e)

        self.connection.close_cursor(cursor)
        self.connection.close_db_connections(self.connection.PROC_GUARD_DB_KEY)
        return should_run
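The autodiscovery filtering above compiles the include/exclude lists into two alternation patterns (with an unmatchable fallback when a list is empty) and intersects the results. A small standalone sketch of that filtering on example database names:

import re

def compile_valid_patterns(patterns):
    valid = []
    for pattern in patterns:
        if not pattern:
            continue  # empty patterns would match everything
        try:
            re.compile(pattern, re.IGNORECASE)
        except re.error:
            continue  # invalid patterns are logged and ignored by the check
        valid.append(pattern)
    if valid:
        return re.compile('|'.join(valid), re.IGNORECASE)
    # Unmatchable regex so an empty/invalid list never matches anything.
    return re.compile(r'(?!x)x')

include = compile_valid_patterns(['.*'])
exclude = compile_valid_patterns(['^model$', '^tempdb$'])
all_dbs = {'master', 'model', 'tempdb', 'orders'}
filtered = {d for d in all_dbs if include.match(d)} - {d for d in all_dbs if exclude.match(d)}
print(sorted(filtered))  # ['master', 'orders']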
Beispiel #13
0
class SQLServer(AgentCheck):
    __NAMESPACE__ = 'sqlserver'

    def __init__(self, name, init_config, instances):
        super(SQLServer, self).__init__(name, init_config, instances)

        self._resolved_hostname = None
        self._agent_hostname = None
        self.connection = None
        self.failed_connections = {}
        self.instance_metrics = []
        self.instance_per_type_metrics = defaultdict(set)
        self.do_check = True

        self.tags = self.instance.get("tags", [])
        self.reported_hostname = self.instance.get('reported_hostname')
        self.autodiscovery = is_affirmative(self.instance.get('database_autodiscovery'))
        self.autodiscovery_include = self.instance.get('autodiscovery_include', ['.*'])
        self.autodiscovery_exclude = self.instance.get('autodiscovery_exclude', [])
        self.autodiscovery_db_service_check = is_affirmative(self.instance.get('autodiscovery_db_service_check', True))
        self.min_collection_interval = self.instance.get('min_collection_interval', 15)
        self._compile_patterns()
        self.autodiscovery_interval = self.instance.get('autodiscovery_interval', DEFAULT_AUTODISCOVERY_INTERVAL)
        self.databases = set()
        self.ad_last_check = 0

        self.proc = self.instance.get('stored_procedure')
        self.proc_type_mapping = {'gauge': self.gauge, 'rate': self.rate, 'histogram': self.histogram}
        self.custom_metrics = init_config.get('custom_metrics', [])

        # DBM
        self.dbm_enabled = self.instance.get('dbm', False)
        self.statement_metrics_config = self.instance.get('query_metrics', {}) or {}
        self.statement_metrics = SqlserverStatementMetrics(self)
        self.activity_config = self.instance.get('query_activity', {}) or {}
        self.activity = SqlserverActivity(self)
        self.cloud_metadata = {}
        aws = self.instance.get('aws', {})
        gcp = self.instance.get('gcp', {})
        azure = self.instance.get('azure', {})
        if aws:
            self.cloud_metadata.update({'aws': aws})
        if gcp:
            self.cloud_metadata.update({'gcp': gcp})
        if azure:
            self.cloud_metadata.update({'azure': azure})
        obfuscator_options_config = self.instance.get('obfuscator_options', {}) or {}
        self.obfuscator_options = to_native_string(
            json.dumps(
                {
                    # Valid values for this can be found at
                    # https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/semantic_conventions/database.md#connection-level-attributes
                    'dbms': 'mssql',
                    'replace_digits': is_affirmative(
                        obfuscator_options_config.get(
                            'replace_digits',
                            obfuscator_options_config.get('quantize_sql_tables', False),
                        )
                    ),
                    'keep_sql_alias': is_affirmative(obfuscator_options_config.get('keep_sql_alias', True)),
                    'return_json_metadata': is_affirmative(obfuscator_options_config.get('collect_metadata', True)),
                    'table_names': is_affirmative(obfuscator_options_config.get('collect_tables', True)),
                    'collect_commands': is_affirmative(obfuscator_options_config.get('collect_commands', True)),
                    'collect_comments': is_affirmative(obfuscator_options_config.get('collect_comments', True)),
                }
            )
        )
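        # With the defaults above this serializes to something like (illustrative):
        #   {"dbms": "mssql", "replace_digits": false, "keep_sql_alias": true, ...}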

        self.static_info_cache = TTLCache(
            maxsize=100,
            # cache these for a full day
            ttl=60 * 60 * 24,
        )

        # Query declarations
        check_queries = []
        if is_affirmative(self.instance.get('include_ao_metrics', False)):
            check_queries.extend(
                [
                    QUERY_AO_AVAILABILITY_GROUPS,
                    QUERY_AO_FAILOVER_CLUSTER,
                    QUERY_AO_FAILOVER_CLUSTER_MEMBER,
                ]
            )
        if is_affirmative(self.instance.get('include_fci_metrics', False)):
            check_queries.extend([QUERY_FAILOVER_CLUSTER_INSTANCE])
        self._check_queries = self._new_query_executor(check_queries)
        self.check_initializations.append(self._check_queries.compile_queries)

        self.server_state_queries = self._new_query_executor([QUERY_SERVER_STATIC_INFO])
        self.check_initializations.append(self.server_state_queries.compile_queries)

        # use QueryManager to process custom queries
        self._query_manager = QueryManager(
            self, self.execute_query_raw, tags=self.tags, hostname=self.resolved_hostname
        )

        self._dynamic_queries = None

        self.check_initializations.append(self.config_checks)
        self.check_initializations.append(self._query_manager.compile_queries)
        self.check_initializations.append(self.initialize_connection)

    def cancel(self):
        self.statement_metrics.cancel()
        self.activity.cancel()

    def config_checks(self):
        if self.autodiscovery and self.instance.get('database'):
            self.log.warning(
                'sqlserver `database_autodiscovery` and `database` options defined in same instance - '
                'autodiscovery will take precedence.'
            )
        if not self.autodiscovery and (self.autodiscovery_include or self.autodiscovery_exclude):
            self.log.warning(
                "Autodiscovery is disabled, autodiscovery_include and autodiscovery_exclude will be ignored"
            )

    def split_sqlserver_host_port(self, host):
        """
        Splits the host & port out of the provided SQL Server host connection string, returning (host, port).
        """
        if not host:
            return host, None
        host_split = [s.strip() for s in host.split(',')]
        if len(host_split) == 1:
            return host_split[0], None
        if len(host_split) == 2:
            return host_split
        # else len > 2
        s_host, s_port = host_split[0:2]
        self.log.warning(
            "invalid sqlserver host string has more than one comma: %s. using only 1st two items: host:%s, port:%s",
            host,
            s_host,
            s_port,
        )
        return s_host, s_port

    def _new_query_executor(self, queries):
        return QueryExecutor(
            self.execute_query_raw,
            self,
            queries=queries,
            tags=self.tags,
            hostname=self.resolved_hostname,
        )

    @property
    def resolved_hostname(self):
        if self._resolved_hostname is None:
            if self.reported_hostname:
                self._resolved_hostname = self.reported_hostname
            elif self.dbm_enabled:
                host, port = self.split_sqlserver_host_port(self.instance.get('host'))
                self._resolved_hostname = resolve_db_host(host)
            else:
                self._resolved_hostname = self.agent_hostname
        return self._resolved_hostname

    def load_static_information(self):
        expected_keys = {STATIC_INFO_VERSION, STATIC_INFO_MAJOR_VERSION, STATIC_INFO_ENGINE_EDITION}
        missing_keys = expected_keys - set(self.static_info_cache.keys())
        if missing_keys:
            with self.connection.open_managed_default_connection():
                with self.connection.get_managed_cursor() as cursor:
                    if STATIC_INFO_VERSION not in self.static_info_cache:
                        cursor.execute("select @@version")
                        results = cursor.fetchall()
                        if results and len(results) > 0 and len(results[0]) > 0 and results[0][0]:
                            version = results[0][0]
                            self.static_info_cache[STATIC_INFO_VERSION] = version
                            self.static_info_cache[STATIC_INFO_MAJOR_VERSION] = parse_sqlserver_major_version(version)
                            if not self.static_info_cache[STATIC_INFO_MAJOR_VERSION]:
                                self.log.warning("failed to parse SQL Server major version from version: %s", version)
                        else:
                            self.log.warning("failed to load version static information due to empty results")
                    if STATIC_INFO_ENGINE_EDITION not in self.static_info_cache:
                        cursor.execute("SELECT CAST(ServerProperty('EngineEdition') AS INT) AS Edition")
                        result = cursor.fetchone()
                        if result:
                            self.static_info_cache[STATIC_INFO_ENGINE_EDITION] = result[0]
                        else:
                            self.log.warning("failed to load version static information due to empty results")

    def debug_tags(self):
        return self.tags + ['agent_hostname:{}'.format(self.agent_hostname)]

    def debug_stats_kwargs(self, tags=None):
        tags = tags if tags else []
        return {
            "tags": self.debug_tags() + tags,
            "hostname": self.resolved_hostname,
            "raw": True,
        }

    @property
    def agent_hostname(self):
        # type: () -> str
        if self._agent_hostname is None:
            self._agent_hostname = datadog_agent.get_hostname()
        return self._agent_hostname

    def initialize_connection(self):
        self.connection = Connection(self.init_config, self.instance, self.handle_service_check)

        # Pre-process the list of metrics to collect
        try:
            # check to see if the database exists before we try any connections to it
            db_exists, context = self.connection.check_database()

            if db_exists:
                if self.instance.get('stored_procedure') is None:
                    with self.connection.open_managed_default_connection():
                        with self.connection.get_managed_cursor() as cursor:
                            self.autodiscover_databases(cursor)
                        self._make_metric_list_to_collect(self.custom_metrics)
            else:
                # How much do we care that the DB doesn't exist?
                ignore = is_affirmative(self.instance.get("ignore_missing_database", False))
                if ignore:
                    # not much: we expect it, so leave checks disabled
                    self.do_check = False
                    self.log.warning("Database %s does not exist. Disabling checks for this instance.", context)
                else:
                    # yes we do. Keep trying
                    msg = "Database {} does not exist. Please resolve invalid database and restart agent".format(
                        context
                    )
                    raise ConfigurationError(msg)

        except SQLConnectionError as e:
            self.log.exception("Error connecting to database: %s", e)
        except ConfigurationError:
            raise
        except Exception as e:
            self.log.exception("Initialization exception %s", e)

    def handle_service_check(self, status, host, database, message=None, is_default=True):
        custom_tags = self.instance.get("tags", [])
        disable_generic_tags = self.instance.get('disable_generic_tags', False)
        if custom_tags is None:
            custom_tags = []
        if disable_generic_tags:
            service_check_tags = ['sqlserver_host:{}'.format(host), 'db:{}'.format(database)]
        else:
            service_check_tags = ['host:{}'.format(host), 'sqlserver_host:{}'.format(host), 'db:{}'.format(database)]
        service_check_tags.extend(custom_tags)
        service_check_tags = list(set(service_check_tags))

        if status is AgentCheck.OK:
            message = None

        if is_default:
            self.service_check(SERVICE_CHECK_NAME, status, tags=service_check_tags, message=message, raw=True)
        if self.autodiscovery and self.autodiscovery_db_service_check:
            self.service_check(DATABASE_SERVICE_CHECK_NAME, status, tags=service_check_tags, message=message, raw=True)

    def _compile_patterns(self):
        self._include_patterns = self._compile_valid_patterns(self.autodiscovery_include)
        self._exclude_patterns = self._compile_valid_patterns(self.autodiscovery_exclude)

    def _compile_valid_patterns(self, patterns):
        valid_patterns = []

        for pattern in patterns:
            # Ignore empty patterns as they match everything
            if not pattern:
                continue

            try:
                re.compile(pattern, re.IGNORECASE)
            except Exception:
                self.log.warning('%s is not a valid regular expression and will be ignored', pattern)
            else:
                valid_patterns.append(pattern)

        if valid_patterns:
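            # e.g. (illustrative) autodiscovery_include: ['master', 'model'] compiles
            # to the single case-insensitive regex 'master|model'.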
            return re.compile('|'.join(valid_patterns), re.IGNORECASE)
        else:
            # create unmatchable regex - https://stackoverflow.com/a/1845097/2157429
            return re.compile(r'(?!x)x')

    def autodiscover_databases(self, cursor):
        if not self.autodiscovery:
            return False

        now = time.time()
        if now - self.ad_last_check > self.autodiscovery_interval:
            self.log.info('Performing database autodiscovery')
            cursor.execute(AUTODISCOVERY_QUERY)
            all_dbs = set(row.name for row in cursor.fetchall())
            excluded_dbs = set([d for d in all_dbs if self._exclude_patterns.match(d)])
            included_dbs = set([d for d in all_dbs if self._include_patterns.match(d)])

            self.log.debug(
                'Autodiscovered databases: %s, excluding: %s, including: %s', all_dbs, excluded_dbs, included_dbs
            )

            # keep included dbs but remove any that were explicitly excluded
            filtered_dbs = all_dbs.intersection(included_dbs) - excluded_dbs

            self.log.debug('Resulting filtered databases: %s', filtered_dbs)
            self.ad_last_check = now

            if filtered_dbs != self.databases:
                self.log.debug('Databases updated from previous autodiscovery check.')
                self.databases = filtered_dbs
                return True
        return False

    def _make_metric_list_to_collect(self, custom_metrics):
        """
        Store the list of metrics to collect by instance_key.
        Will also create and cache cursors to query the db.
        """

        metrics_to_collect = []
        tags = self.instance.get('tags', [])

        # Load instance-level (previously Performance) metrics
        # If several check instances are querying the same server host, it can be wise to turn these off
        # to avoid sending duplicate metrics
        if is_affirmative(self.instance.get('include_instance_metrics', True)):
            common_metrics = INSTANCE_METRICS
            if not self.dbm_enabled:
                common_metrics = common_metrics + DBM_MIGRATED_METRICS

            self._add_performance_counters(
                chain(common_metrics, INSTANCE_METRICS_TOTAL), metrics_to_collect, tags, db=None
            )

        # populated through autodiscovery
        if self.databases:
            for db in self.databases:
                self._add_performance_counters(INSTANCE_METRICS_TOTAL, metrics_to_collect, tags, db=db)

        # Load database statistics
        for name, table, column in DATABASE_METRICS:
            # include database as a filter option
            db_names = self.databases or [self.instance.get('database', self.connection.DEFAULT_DATABASE)]
            for db_name in db_names:
                cfg = {'name': name, 'table': table, 'column': column, 'instance_name': db_name, 'tags': tags}
                metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load AlwaysOn metrics
        if is_affirmative(self.instance.get('include_ao_metrics', False)):
            for name, table, column in AO_METRICS + AO_METRICS_PRIMARY + AO_METRICS_SECONDARY:
                db_name = 'master'
                cfg = {
                    'name': name,
                    'table': table,
                    'column': column,
                    'instance_name': db_name,
                    'tags': tags,
                    'ao_database': self.instance.get('ao_database', None),
                    'availability_group': self.instance.get('availability_group', None),
                    'only_emit_local': is_affirmative(self.instance.get('only_emit_local', False)),
                }
                metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load metrics from scheduler and task tables, if enabled
        if is_affirmative(self.instance.get('include_task_scheduler_metrics', False)):
            for name, table, column in TASK_SCHEDULER_METRICS:
                cfg = {'name': name, 'table': table, 'column': column, 'tags': tags}
                metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load sys.master_files metrics
        if is_affirmative(self.instance.get('include_master_files_metrics', False)):
            for name, table, column in DATABASE_MASTER_FILES:
                cfg = {'name': name, 'table': table, 'column': column, 'tags': tags}
                metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load DB Fragmentation metrics
        if is_affirmative(self.instance.get('include_db_fragmentation_metrics', False)):
            db_fragmentation_object_names = self.instance.get('db_fragmentation_object_names', [])
            db_names = self.databases or [self.instance.get('database', self.connection.DEFAULT_DATABASE)]

            if not db_fragmentation_object_names:
                self.log.debug(
                    "No fragmentation object names specified, will return fragmentation metrics for all "
                    "object_ids of current database(s): %s",
                    db_names,
                )

            for db_name in db_names:
                for name, table, column in DATABASE_FRAGMENTATION_METRICS:
                    cfg = {
                        'name': name,
                        'table': table,
                        'column': column,
                        'instance_name': db_name,
                        'tags': tags,
                        'db_fragmentation_object_names': db_fragmentation_object_names,
                    }
                    metrics_to_collect.append(self.typed_metric(cfg_inst=cfg, table=table, column=column))

        # Load any custom metrics from conf.d/sqlserver.yaml
        for cfg in custom_metrics:
            sql_type = None
            base_name = None

            custom_tags = tags + cfg.get('tags', [])
            cfg['tags'] = custom_tags

            db_table = cfg.get('table', DEFAULT_PERFORMANCE_TABLE)
            if db_table not in VALID_TABLES:
                self.log.error('%s has an invalid table name: %s', cfg['name'], db_table)
                continue

            if cfg.get('database', None) and cfg.get('database') != self.instance.get('database'):
                self.log.debug(
                    'Skipping custom metric %s for database %s, check instance configured for database %s',
                    cfg['name'],
                    cfg.get('database'),
                    self.instance.get('database'),
                )
                continue

            if db_table == DEFAULT_PERFORMANCE_TABLE:
                user_type = cfg.get('type')
                if user_type is not None and user_type not in VALID_METRIC_TYPES:
                    self.log.error('%s has an invalid metric type: %s', cfg['name'], user_type)
                sql_type = None
                try:
                    if user_type is None:
                        sql_type, base_name = self.get_sql_type(cfg['counter_name'])
                except Exception:
                    self.log.warning("Can't load the metric %s, ignoring", cfg['name'], exc_info=True)
                    continue

                metrics_to_collect.append(
                    self.typed_metric(
                        cfg_inst=cfg, table=db_table, base_name=base_name, user_type=user_type, sql_type=sql_type
                    )
                )

            else:
                for column in cfg['columns']:
                    metrics_to_collect.append(
                        self.typed_metric(
                            cfg_inst=cfg, table=db_table, base_name=base_name, sql_type=sql_type, column=column
                        )
                    )

        self.instance_metrics = metrics_to_collect
        self.log.debug("metrics to collect %s", metrics_to_collect)

        # create an organized grouping of metric names to their metric classes
        for m in metrics_to_collect:
            cls = m.__class__.__name__
            name = m.sql_name or m.column
            self.log.debug("Adding metric class %s named %s", cls, name)

            self.instance_per_type_metrics[cls].add(name)
            if m.base_name:
                self.instance_per_type_metrics[cls].add(m.base_name)

    def _add_performance_counters(self, metrics, metrics_to_collect, tags, db=None):
        if db is not None:
            tags = tags + ['database:{}'.format(db)]
        for name, counter_name, instance_name in metrics:
            try:
                sql_type, base_name = self.get_sql_type(counter_name)
                cfg = {
                    'name': name,
                    'counter_name': counter_name,
                    'instance_name': db or instance_name,
                    'tags': tags,
                }

                metrics_to_collect.append(
                    self.typed_metric(
                        cfg_inst=cfg, table=DEFAULT_PERFORMANCE_TABLE, base_name=base_name, sql_type=sql_type
                    )
                )
            except SQLConnectionError:
                raise
            except Exception:
                self.log.warning("Can't load the metric %s, ignoring", name, exc_info=True)
                continue

    def get_sql_type(self, counter_name):
        """
        Return the type of the performance counter so that we can report it to
        Datadog correctly.
        If the sql_type is one that needs a base (PERF_RAW_LARGE_FRACTION and
        PERF_AVERAGE_BULK), the name of the base counter is also returned.
        """
        with self.connection.get_managed_cursor() as cursor:
            cursor.execute(COUNTER_TYPE_QUERY, (counter_name,))
            (sql_type,) = cursor.fetchone()
            if sql_type == PERF_LARGE_RAW_BASE:
                self.log.warning("Metric %s is of type Base and shouldn't be reported this way", counter_name)
            base_name = None
            if sql_type in [PERF_AVERAGE_BULK, PERF_RAW_LARGE_FRACTION]:
                # This is an ugly hack. For certain types of metric (PERF_RAW_LARGE_FRACTION
                # and PERF_AVERAGE_BULK), we need two metrics: the metric specified and
                # a base metric to compute the ratio. There is no unique naming scheme, so we
                # generate the possible candidates and look at which ones exist in the db.
                candidates = (
                    counter_name + " base",
                    counter_name.replace("(ms)", "base"),
                    counter_name.replace("Avg ", "") + " base",
                )
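                # For example (illustrative): the counter "Buffer cache hit ratio"
                # is typically paired with the base counter "Buffer cache hit ratio base".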
                try:
                    cursor.execute(BASE_NAME_QUERY, candidates)
                    base_name = cursor.fetchone().counter_name.strip()
                    self.log.debug("Got base metric: %s for metric: %s", base_name, counter_name)
                except Exception as e:
                    self.log.warning("Could not get counter_name of base for metric: %s", e)

        return sql_type, base_name

    def typed_metric(self, cfg_inst, table, base_name=None, user_type=None, sql_type=None, column=None):
        """
        Create the appropriate BaseSqlServerMetric object; each subclass implements its own
        method to fetch its metrics properly.
        If a `type` was specified in the config, it is used to report the value
        fetched directly from SQL Server. Otherwise, the type is decided based on the
        sql_type, according to Microsoft's documentation.
        """
        if table == DEFAULT_PERFORMANCE_TABLE:
            metric_type_mapping = {
                PERF_COUNTER_BULK_COUNT: (self.rate, metrics.SqlSimpleMetric),
                PERF_COUNTER_LARGE_RAWCOUNT: (self.gauge, metrics.SqlSimpleMetric),
                PERF_LARGE_RAW_BASE: (self.gauge, metrics.SqlSimpleMetric),
                PERF_RAW_LARGE_FRACTION: (self.gauge, metrics.SqlFractionMetric),
                PERF_AVERAGE_BULK: (self.gauge, metrics.SqlIncrFractionMetric),
            }
            if user_type is not None:
                # user type overrides any other value
                metric_type = getattr(self, user_type)
                cls = metrics.SqlSimpleMetric

            else:
                metric_type, cls = metric_type_mapping[sql_type]
        else:
            # Lookup metrics classes by their associated table
            metric_type_str, cls = metrics.TABLE_MAPPING[table]
            metric_type = getattr(self, metric_type_str)

        cfg_inst['hostname'] = self.resolved_hostname

        return cls(cfg_inst, base_name, metric_type, column, self.log)

    def check(self, _):
        if self.do_check:
            self.load_static_information()
            if self.proc:
                self.do_stored_procedure_check()
            else:
                self.collect_metrics()
            if self.autodiscovery and self.autodiscovery_db_service_check:
                for db_name in self.databases:
                    if db_name != self.connection.DEFAULT_DATABASE:
                        try:
                            self.connection.check_database_conns(db_name)
                        except Exception as e:
                            # service_check errors on auto discovered databases should not abort the check
                            self.log.warning("failed service check for auto discovered database: %s", e)

            if self.dbm_enabled:
                self.statement_metrics.run_job_loop(self.tags)
                self.activity.run_job_loop(self.tags)

        else:
            self.log.debug("Skipping check")

    @property
    def dynamic_queries(self):
        """
        Initializes dynamic queries which depend on static information loaded from the database
        """
        if self._dynamic_queries:
            return self._dynamic_queries

        major_version = self.static_info_cache.get(STATIC_INFO_MAJOR_VERSION)
        if not major_version:
            self.log.warning("missing major_version, cannot initialize dynamic queries")
            return None

        queries = [get_query_file_stats(major_version)]
        self._dynamic_queries = self._new_query_executor(queries)
        self._dynamic_queries.compile_queries()
        self.log.debug("initialized dynamic queries")
        return self._dynamic_queries

    def collect_metrics(self):
        """Fetch the metrics from all of the associated database tables."""

        with self.connection.open_managed_default_connection():
            with self.connection.get_managed_cursor() as cursor:
                # Run autodiscovery; also rebuild the metric list if it is missing
                # (e.g. because the server was down at check __init__).
                if self.autodiscover_databases(cursor) or not self.instance_metrics:
                    self._make_metric_list_to_collect(self.custom_metrics)

                instance_results = {}

                # Execute the `fetch_all` operations first to minimize the database calls
                for cls, metric_names in six.iteritems(self.instance_per_type_metrics):
                    if not metric_names:
                        instance_results[cls] = None, None
                    else:
                        try:
                            db_names = self.databases or [
                                self.instance.get('database', self.connection.DEFAULT_DATABASE)
                            ]
                            rows, cols = getattr(metrics, cls).fetch_all_values(
                                cursor, list(metric_names), self.log, databases=db_names
                            )
                        except Exception as e:
                            self.log.error("Error running `fetch_all` for metrics %s - skipping.  Error: %s", cls, e)
                            rows, cols = None, None

                        instance_results[cls] = rows, cols

                # Using the cached data, extract and report individual metrics
                for metric in self.instance_metrics:
                    if type(metric) is metrics.SqlIncrFractionMetric:
                        # special case, since it uses the same results as SqlFractionMetric
                        key = 'SqlFractionMetric'
                    else:
                        key = metric.__class__.__name__

                    if key not in instance_results:
                        self.log.warning("No %s metrics found, skipping", str(key))
                    else:
                        rows, cols = instance_results[key]
                        if rows is not None:
                            metric.fetch_metric(rows, cols)

            # Neither pyodbc nor adodbapi can read the results of a query if the number of affected
            # rows is returned as part of the result set, so we disable row counts for the entire
            # connection. This matters mostly for custom_queries and the stored_procedure feature.
            # https://docs.microsoft.com/en-us/sql/t-sql/statements/set-nocount-transact-sql
            with self.connection.get_managed_cursor() as cursor:
                cursor.execute("SET NOCOUNT ON")
            try:
                # Server state queries require VIEW SERVER STATE permissions, which some managed database
                # versions do not support.
                if self.static_info_cache.get(STATIC_INFO_ENGINE_EDITION) not in [
                    ENGINE_EDITION_SQL_DATABASE,
                ]:
                    self.server_state_queries.execute()

                self._check_queries.execute()
                if self.dynamic_queries:
                    self.dynamic_queries.execute()
                # reuse connection for any custom queries
                self._query_manager.execute()
            finally:
                with self.connection.get_managed_cursor() as cursor:
                    cursor.execute("SET NOCOUNT OFF")

    def execute_query_raw(self, query):
        with self.connection.get_managed_cursor() as cursor:
            cursor.execute(query)
            return cursor.fetchall()

    def do_stored_procedure_check(self):
        """
        Fetch the metrics from the stored proc
        """

        proc = self.proc
        guardSql = self.instance.get('proc_only_if')
        custom_tags = self.instance.get("tags", [])

        if (guardSql and self.proc_check_guard(guardSql)) or not guardSql:
            self.connection.open_db_connections(self.connection.DEFAULT_DB_KEY)
            cursor = self.connection.get_cursor(self.connection.DEFAULT_DB_KEY)

            try:
                self.log.debug("Calling Stored Procedure : %s", proc)
                if self.connection.get_connector() == 'adodbapi':
                    cursor.callproc(proc)
                else:
                    # pyodbc does not support callproc; use execute instead.
                    # Reference: https://github.com/mkleehammer/pyodbc/wiki/Calling-Stored-Procedures
                    call_proc = '{{CALL {}}}'.format(proc)
                    cursor.execute(call_proc)

                rows = cursor.fetchall()
                self.log.debug("Row count (%s) : %s", proc, cursor.rowcount)

                for row in rows:
                    tags = [] if row.tags is None or row.tags == '' else row.tags.split(',')
                    tags.extend(custom_tags)

                    if row.type.lower() in self.proc_type_mapping:
                        self.proc_type_mapping[row.type](row.metric, row.value, tags, raw=True)
                    else:
                        self.log.warning(
                            '%s is not a recognised type from procedure %s, metric %s', row.type, proc, row.metric
                        )

            except Exception as e:
                self.log.warning("Could not call procedure %s: %s", proc, e)
                raise e

            self.connection.close_cursor(cursor)
            self.connection.close_db_connections(self.connection.DEFAULT_DB_KEY)
        else:
            self.log.info("Skipping call to %s due to only_if", proc)

    def proc_check_guard(self, sql):
        """
        Check whether the guard SQL returns a single column containing 0 or 1.
        Return True if the value is 1, otherwise False.
        """
        self.connection.open_db_connections(self.connection.PROC_GUARD_DB_KEY)
        cursor = self.connection.get_cursor(self.connection.PROC_GUARD_DB_KEY)

        should_run = False
        try:
            cursor.execute(sql, ())
            result = cursor.fetchone()
            should_run = result[0] == 1
        except Exception as e:
            self.log.error("Failed to run proc_only_if sql %s : %s", sql, e)

        self.connection.close_cursor(cursor)
        self.connection.close_db_connections(self.connection.PROC_GUARD_DB_KEY)
        return should_run
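
# A minimal usage sketch (illustrative; assumes the datadog_checks base package and a
# reachable SQL Server instance):
#   instance = {'host': 'localhost,1433', 'username': 'datadog', 'password': '<secret>'}
#   check = SQLServer('sqlserver', {}, [instance])
#   check.run()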
Beispiel #14
0
class MySql(AgentCheck):
    SERVICE_CHECK_NAME = 'mysql.can_connect'
    SLAVE_SERVICE_CHECK_NAME = 'mysql.replication.slave_running'
    DEFAULT_MAX_CUSTOM_QUERIES = 20

    def __init__(self, name, init_config, instances):
        super(MySql, self).__init__(name, init_config, instances)
        self.qcache_stats = {}
        self.version = None
        self.config = MySQLConfig(self.instance)

        # Create a new connection on every check run
        self._conn = None

        self._query_manager = QueryManager(self,
                                           self.execute_query_raw,
                                           queries=[],
                                           tags=self.config.tags)
        self._statement_metrics = MySQLStatementMetrics(self.config)
        self.check_initializations.append(self._query_manager.compile_queries)
        self.innodb_stats = InnoDBMetrics()
        self.check_initializations.append(self.config.configuration_checks)

    def execute_query_raw(self, query):
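        # Note (added): pymysql's SSCursor streams results unbuffered from the server,
        # so large custom-query result sets are not held in memory all at once.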
        with closing(self._conn.cursor(pymysql.cursors.SSCursor)) as cursor:
            cursor.execute(query)
            for row in cursor.fetchall_unbuffered():
                yield row

    @AgentCheck.metadata_entrypoint
    def _send_metadata(self):
        self.set_metadata('version',
                          self.version.version + '+' + self.version.build)
        self.set_metadata('flavor', self.version.flavor)

    @classmethod
    def get_library_versions(cls):
        return {'pymysql': pymysql.__version__}

    def check(self, _):
        self._set_qcache_stats()
        with self._connect() as db:
            try:
                self._conn = db

                # version collection
                self.version = get_version(db)
                self._send_metadata()

                # Metric collection
                self._collect_metrics(db)
                self._collect_system_metrics(self.config.host, db,
                                             self.config.tags)
                if self.config.deep_database_monitoring:
                    self._collect_statement_metrics(db, self.config.tags)

                # keeping track of these:
                self._put_qcache_stats()

                # Custom queries
                self._query_manager.execute()

            except Exception as e:
                self.log.exception("error!")
                raise e
            finally:
                self._conn = None

    def _set_qcache_stats(self):
        host_key = self._get_host_key()
        qcache_st = self.qcache_stats.get(host_key, (None, None, None))

        self._qcache_hits = qcache_st[0]
        self._qcache_inserts = qcache_st[1]
        self._qcache_not_cached = qcache_st[2]

    def _put_qcache_stats(self):
        host_key = self._get_host_key()
        self.qcache_stats[host_key] = (self._qcache_hits, self._qcache_inserts,
                                       self._qcache_not_cached)

    def _get_host_key(self):
        if self.config.defaults_file:
            return self.config.defaults_file

        hostkey = self.config.host
        if self.config.mysql_sock:
            hostkey = "{0}:{1}".format(hostkey, self.config.mysql_sock)
        elif self.config.port:
            hostkey = "{0}:{1}".format(hostkey, self.config.port)

        return hostkey

    def _get_connection_args(self):
        ssl = dict(self.config.ssl) if self.config.ssl else None
        connection_args = {
            'ssl': ssl,
            'connect_timeout': self.config.connect_timeout,
        }
        if self.config.charset:
            connection_args['charset'] = self.config.charset

        if self.config.defaults_file != '':
            connection_args['read_default_file'] = self.config.defaults_file
            return connection_args

        connection_args.update({
            'user': self.config.user,
            'passwd': self.config.password
        })
        if self.config.mysql_sock != '':
            self.service_check_tags = [
                'server:{0}'.format(self.config.mysql_sock),
                'port:unix_socket',
            ] + self.config.tags
            connection_args.update({'unix_socket': self.config.mysql_sock})
        else:
            connection_args.update({'host': self.config.host})

        if self.config.port:
            connection_args.update({'port': self.config.port})
        return connection_args

    @contextmanager
    def _connect(self):
        service_check_tags = [
            'server:{0}'.format(
                (self.config.mysql_sock
                 if self.config.mysql_sock != '' else self.config.host)),
            'port:{}'.format(
                self.config.port if self.config.port else 'unix_socket'),
        ] + self.config.tags
        db = None
        try:
            connect_args = self._get_connection_args()
            db = pymysql.connect(**connect_args)
            self.log.debug("Connected to MySQL")
            self.service_check_tags = list(set(service_check_tags))
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.OK,
                               tags=service_check_tags)
            yield db
        except Exception:
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.CRITICAL,
                               tags=service_check_tags)
            raise
        finally:
            if db:
                db.close()

    def _collect_metrics(self, db):

        # Get aggregate of all VARS we want to collect
        metrics = STATUS_VARS

        # collect results from db
        results = self._get_stats_from_status(db)
        results.update(self._get_stats_from_variables(db))

        if not is_affirmative(
                self.config.options.get(
                    'disable_innodb_metrics',
                    False)) and self._is_innodb_engine_enabled(db):
            results.update(self.innodb_stats.get_stats_from_innodb_status(db))
            self.innodb_stats.process_innodb_stats(results,
                                                   self.config.options,
                                                   metrics)

        # Binary log statistics
        if self._get_variable_enabled(results, 'log_bin'):
            results['Binlog_space_usage_bytes'] = self._get_binary_log_stats(
                db)

        # Compute key cache utilization metric
        key_blocks_unused = collect_scalar('Key_blocks_unused', results)
        key_cache_block_size = collect_scalar('key_cache_block_size', results)
        key_buffer_size = collect_scalar('key_buffer_size', results)
        results['Key_buffer_size'] = key_buffer_size
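        # Utilization formula (as computed below, with illustrative numbers):
        #   Key_cache_utilization = 1 - (Key_blocks_unused * key_cache_block_size) / key_buffer_size
        #   e.g. 1 - (13396 * 1024) / 16777216 ~ 0.18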

        try:
            # can be null if the unit is missing in the user config (e.g. 4 instead of 4G)
            if key_buffer_size != 0:
                key_cache_utilization = 1 - (
                    (key_blocks_unused * key_cache_block_size) /
                    key_buffer_size)
                results['Key_cache_utilization'] = key_cache_utilization

            results['Key_buffer_bytes_used'] = collect_scalar(
                'Key_blocks_used', results) * key_cache_block_size
            results['Key_buffer_bytes_unflushed'] = (
                collect_scalar('Key_blocks_not_flushed', results) *
                key_cache_block_size)
        except TypeError as e:
            self.log.error(
                "Not all Key metrics are available, unable to compute: %s", e)

        metrics.update(VARIABLES_VARS)
        metrics.update(INNODB_VARS)
        metrics.update(BINLOG_VARS)

        if is_affirmative(
                self.config.options.get('extra_status_metrics', False)):
            self.log.debug("Collecting Extra Status Metrics")
            metrics.update(OPTIONAL_STATUS_VARS)

            if self.version.version_compatible((5, 6, 6)):
                metrics.update(OPTIONAL_STATUS_VARS_5_6_6)

        if is_affirmative(self.config.options.get('galera_cluster', False)):
            # already in the result set after 'SHOW STATUS'; just add the vars to collect
            self.log.debug("Collecting Galera Metrics.")
            metrics.update(GALERA_VARS)

        performance_schema_enabled = self._get_variable_enabled(
            results, 'performance_schema')
        above_560 = self.version.version_compatible((5, 6, 0))
        if (is_affirmative(
                self.config.options.get('extra_performance_metrics', False))
                and above_560 and performance_schema_enabled):
            # report avg query response time per schema to Datadog
            results[
                'perf_digest_95th_percentile_avg_us'] = self._get_query_exec_time_95th_us(
                    db)
            results['query_run_time_avg'] = self._query_exec_time_per_schema(
                db)
            metrics.update(PERFORMANCE_VARS)

        if is_affirmative(self.config.options.get('schema_size_metrics',
                                                  False)):
            # report the size of each schema to Datadog
            results['information_schema_size'] = self._query_size_per_schema(
                db)
            metrics.update(SCHEMA_VARS)

        if is_affirmative(self.config.options.get('replication', False)):
            replication_metrics = self._collect_replication_metrics(
                db, results, above_560)
            metrics.update(replication_metrics)
            self._check_replication_status(results)

        # "synthetic" metrics
        metrics.update(SYNTHETIC_VARS)
        self._compute_synthetic_results(results)

        # remove uncomputed metrics
        for k in SYNTHETIC_VARS:
            if k not in results:
                metrics.pop(k, None)

        # add duped metrics - reporting some as both rate and gauge
        dupes = [
            ('Table_locks_waited', 'Table_locks_waited_rate'),
            ('Table_locks_immediate', 'Table_locks_immediate_rate'),
        ]
        for src, dst in dupes:
            if src in results:
                results[dst] = results[src]

        self._submit_metrics(metrics, results, self.config.tags)

        # Collect custom query metrics
        # Max of 20 queries allowed
        if isinstance(self.config.queries, list):
            for check in self.config.queries[:self.config.max_custom_queries]:
                total_tags = self.config.tags + check.get('tags', [])
                self._collect_dict(check['type'],
                                   {check['field']: check['metric']},
                                   check['query'],
                                   db,
                                   tags=total_tags)

            if len(self.config.queries) > self.config.max_custom_queries:
                self.warning(
                    "Maximum number (%s) of custom queries reached.  Skipping the rest.",
                    self.config.max_custom_queries)

    def _collect_replication_metrics(self, db, results, above_560):
        # Get replica stats
        is_mariadb = self.version.flavor == "MariaDB"
        replication_channel = self.config.options.get('replication_channel')
        results.update(
            self._get_replica_stats(db, is_mariadb, replication_channel))
        nonblocking = is_affirmative(
            self.config.options.get('replication_non_blocking_status', False))
        results.update(self._get_slave_status(db, above_560, nonblocking))
        return REPLICA_VARS

    def _check_replication_status(self, results):
        # get slave running from the global status page
        slave_running_status = AgentCheck.UNKNOWN
        # Slave_IO_Running: Whether the I/O thread for reading the source's binary log is running.
        # You want this to be Yes unless you have not yet started replication or have explicitly stopped it.
        slave_io_running = collect_type('Slave_IO_Running', results, dict)
        # Slave_SQL_Running: Whether the SQL thread for executing events in the relay log is running.
        slave_sql_running = collect_type('Slave_SQL_Running', results, dict)
        if slave_io_running:
            slave_io_running = any(v.lower().strip() == 'yes'
                                   for v in itervalues(slave_io_running))
        if slave_sql_running:
            slave_sql_running = any(v.lower().strip() == 'yes'
                                    for v in itervalues(slave_sql_running))
        binlog_running = results.get('Binlog_enabled', False)
        # slaves will only be collected if the user has PROCESS privileges.
        slaves = collect_scalar('Slaves_connected', results)
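        # Summary of the status logic below (as implemented):
        #   both replication threads down -> CRITICAL; exactly one down -> WARNING;
        #   master with connected replicas and binlog enabled -> OK, otherwise WARNING;
        #   replica with both threads running -> OK.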

        if not (slave_io_running is None and slave_sql_running is None):
            if not slave_io_running and not slave_sql_running:
                self.log.debug(
                    "Slave_IO_Running and Slave_SQL_Running are not ok")
                slave_running_status = AgentCheck.CRITICAL
            elif not slave_io_running or not slave_sql_running:
                self.log.debug(
                    "Either Slave_IO_Running or Slave_SQL_Running are not ok")
                slave_running_status = AgentCheck.WARNING

        if slave_running_status == AgentCheck.UNKNOWN:
            if self._is_master(slaves, results):  # master
                if slaves > 0 and binlog_running:
                    self.log.debug(
                        "Host is master, there are replicas and binlog is running"
                    )
                    slave_running_status = AgentCheck.OK
                else:
                    slave_running_status = AgentCheck.WARNING
            else:  # replica (or standalone)
                if not (slave_io_running is None
                        and slave_sql_running is None):
                    if slave_io_running and slave_sql_running:
                        self.log.debug(
                            "Slave_IO_Running and Slave_SQL_Running are ok")
                        slave_running_status = AgentCheck.OK

        # deprecated in favor of service_check("mysql.replication.slave_running")
        self.gauge(self.SLAVE_SERVICE_CHECK_NAME,
                   1 if slave_running_status == AgentCheck.OK else 0,
                   tags=self.config.tags)
        self.service_check(self.SLAVE_SERVICE_CHECK_NAME,
                           slave_running_status,
                           tags=self.service_check_tags)

    def _collect_statement_metrics(self, db, tags):
        tags = self.service_check_tags + tags
        metrics = self._statement_metrics.collect_per_statement_metrics(db)
        for metric_name, metric_value, metric_tags in metrics:
            self.count(metric_name,
                       metric_value,
                       tags=list(set(tags + metric_tags)))

    def _is_master(self, slaves, results):
        # Master_Host is only reported on replicas
        master_host = collect_string('Master_Host', results)
        if slaves > 0 or not master_host:
            return True

        return False

    def _submit_metrics(self, variables, db_results, tags):
        for variable, metric in iteritems(variables):
            metric_name, metric_type = metric
            for tag, value in collect_all_scalars(variable, db_results):
                metric_tags = list(tags)
                if tag:
                    metric_tags.append(tag)
                if value is not None:
                    if metric_type == RATE:
                        self.rate(metric_name, value, tags=metric_tags)
                    elif metric_type == GAUGE:
                        self.gauge(metric_name, value, tags=metric_tags)
                    elif metric_type == COUNT:
                        self.count(metric_name, value, tags=metric_tags)
                    elif metric_type == MONOTONIC:
                        self.monotonic_count(metric_name,
                                             value,
                                             tags=metric_tags)

    def _collect_dict(self, metric_type, field_metric_map, query, db, tags):
        """
        Query status and get a dictionary back.
        Extract each field out of the dictionary
        and stuff it in the corresponding metric.

        query: show status...
        field_metric_map: {"Seconds_behind_master": "mysqlSecondsBehindMaster"}
        """
        try:
            with closing(db.cursor()) as cursor:
                cursor.execute(query)
                result = cursor.fetchone()
                if result is not None:
                    for field, metric in list(iteritems(field_metric_map)):
                        # Find the column name in the cursor description to identify the column index
                        # http://www.python.org/dev/peps/pep-0249/
                        # cursor.description is a tuple of (column_name, ..., ...)
                        try:
                            col_idx = [
                                d[0].lower() for d in cursor.description
                            ].index(field.lower())
                            self.log.debug("Collecting metric: %s", metric)
                            if result[col_idx] is not None:
                                self.log.debug("Collecting done, value %s",
                                               result[col_idx])
                                if metric_type == GAUGE:
                                    self.gauge(metric,
                                               float(result[col_idx]),
                                               tags=tags)
                                elif metric_type == RATE:
                                    self.rate(metric,
                                              float(result[col_idx]),
                                              tags=tags)
                                else:
                                    self.gauge(metric,
                                               float(result[col_idx]),
                                               tags=tags)
                            else:
                                self.log.debug(
                                    "Received value is None for index %d",
                                    col_idx)
                        except ValueError:
                            self.log.exception(
                                "Cannot find %s in the columns %s", field,
                                cursor.description)
        except Exception:
            self.warning("Error while running %s\n%s", query,
                         traceback.format_exc())
            self.log.exception("Error while running %s", query)

    def _collect_system_metrics(self, host, db, tags):
        pid = None
        # The server needs to run locally, accessed by TCP or socket
        if host in ["localhost", "127.0.0.1", "0.0.0.0"] or db.port == long(0):
            pid = self._get_server_pid(db)

        if pid:
            self.log.debug("System metrics for mysql w/ pid: %s", pid)
            # At last, get mysql cpu data out of psutil or procfs

            try:
                ucpu, scpu = None, None
                if PSUTIL_AVAILABLE:
                    proc = psutil.Process(pid)

                    ucpu = proc.cpu_times()[0]
                    scpu = proc.cpu_times()[1]

                if ucpu and scpu:
                    self.rate("mysql.performance.user_time", ucpu, tags=tags)
                    # should really be system_time
                    self.rate("mysql.performance.kernel_time", scpu, tags=tags)
                    self.rate("mysql.performance.cpu_time",
                              ucpu + scpu,
                              tags=tags)

            except Exception:
                self.warning(
                    "Error while reading mysql (pid: %s) procfs data\n%s", pid,
                    traceback.format_exc())

    def _get_pid_file_variable(self, db):
        """
        Get the `pid_file` variable
        """
        pid_file = None
        try:
            with closing(db.cursor()) as cursor:
                cursor.execute("SHOW VARIABLES LIKE 'pid_file'")
                pid_file = cursor.fetchone()[1]
        except Exception:
            self.warning("Error while fetching pid_file variable of MySQL.")

        return pid_file

    def _get_server_pid(self, db):
        pid = None

        # Try to get the pid from the pid file; it can fail for permission reasons
        pid_file = self._get_pid_file_variable(db)
        if pid_file is not None:
            self.log.debug("pid file: %s", str(pid_file))
            try:
                with open(pid_file, 'rb') as f:
                    pid = int(f.readline())
            except IOError:
                self.log.debug("Cannot read mysql pid file %s", pid_file)

        # If pid has not been found, read it from ps
        if pid is None and PSUTIL_AVAILABLE:
            for proc in psutil.process_iter():
                try:
                    if proc.name() == PROC_NAME:
                        pid = proc.pid
                except (psutil.AccessDenied, psutil.ZombieProcess,
                        psutil.NoSuchProcess):
                    continue
                except Exception:
                    self.log.exception(
                        "Error while fetching mysql pid from psutil")

        return pid

    @classmethod
    def _get_stats_from_status(cls, db):
        with closing(db.cursor()) as cursor:
            cursor.execute("SHOW /*!50002 GLOBAL */ STATUS;")
            results = dict(cursor.fetchall())

            return results

    @classmethod
    def _get_stats_from_variables(cls, db):
        with closing(db.cursor()) as cursor:
            cursor.execute("SHOW GLOBAL VARIABLES;")
            results = dict(cursor.fetchall())

            return results

    def _get_binary_log_stats(self, db):
        try:
            with closing(db.cursor()) as cursor:
                cursor.execute("SHOW BINARY LOGS;")
                cursor_results = cursor.fetchall()
                master_logs = {
                    result[0]: result[1]
                    for result in cursor_results
                }

                binary_log_space = 0
                for value in itervalues(master_logs):
                    binary_log_space += value

                return binary_log_space
        except (pymysql.err.InternalError, pymysql.err.OperationalError) as e:
            self.warning(
                "Privileges error accessing the BINARY LOGS (must grant REPLICATION CLIENT): %s",
                e)
            return None

    def _is_innodb_engine_enabled(self, db):
        # Whether the InnoDB engine is available can be determined either from the
        # output of SHOW ENGINES or from the information_schema.ENGINES table.
        # The latter is chosen because it involves no string parsing.
        try:
            with closing(db.cursor()) as cursor:
                cursor.execute(SQL_INNODB_ENGINES)
                return cursor.rowcount > 0

        except (pymysql.err.InternalError, pymysql.err.OperationalError,
                pymysql.err.NotSupportedError) as e:
            self.warning(
                "Possibly innodb stats unavailable - error querying engines table: %s",
                e)
            return False

    def _get_replica_stats(self, db, is_mariadb, replication_channel):
        replica_results = defaultdict(dict)
        try:
            with closing(db.cursor(pymysql.cursors.DictCursor)) as cursor:
                if is_mariadb and replication_channel:
                    cursor.execute(
                        "SET @@default_master_connection = '{0}';".format(
                            replication_channel))
                    cursor.execute("SHOW SLAVE STATUS;")
                elif replication_channel:
                    cursor.execute(
                        "SHOW SLAVE STATUS FOR CHANNEL '{0}';".format(
                            replication_channel))
                else:
                    cursor.execute("SHOW SLAVE STATUS;")

                results = cursor.fetchall()
                self.log.debug("Getting replication status: %s", results)
                for slave_result in results:
                    # MySQL <5.7 does not have Channel_Name.
                    # For MySQL >=5.7 'Channel_Name' is set to an empty string by default
                    channel = replication_channel or slave_result.get(
                        'Channel_Name') or 'default'
                    for key, value in iteritems(slave_result):
                        if value is not None:
                            replica_results[key]['channel:{0}'.format(
                                channel)] = value
        except (pymysql.err.InternalError, pymysql.err.OperationalError) as e:
            errno, msg = e.args
            if errno == 1617 and msg == "There is no master connection '{0}'".format(
                    replication_channel):
                # MariaDB complains when you try to get slave status with a
                # connection name on the master; without a connection name it
                # responds with an empty string, as expected.
                # MySQL behaves the same with or without a connection name.
                pass
            else:
                self.warning(
                    "Privileges error getting replication status (must grant REPLICATION CLIENT): %s",
                    e)

        try:
            with closing(db.cursor(pymysql.cursors.DictCursor)) as cursor:
                cursor.execute("SHOW MASTER STATUS;")
                binlog_results = cursor.fetchone()
                if binlog_results:
                    replica_results.update({'Binlog_enabled': True})
        except (pymysql.err.InternalError, pymysql.err.OperationalError) as e:
            self.warning(
                "Privileges error getting binlog information (must grant REPLICATION CLIENT): %s",
                e)

        return replica_results

    def _get_slave_status(self, db, above_560, nonblocking):
        """
        Retrieve the slaves' statuses using:
        1. The `performance_schema.threads` table. Non-blocking, requires version > 5.6.0
        2. The `information_schema.processlist` table. Blocking
        """
        try:
            with closing(db.cursor()) as cursor:
                if above_560 and nonblocking:
                    # Query `performance_schema.threads` instead of `
                    # information_schema.processlist` to avoid mutex impact on performance.
                    cursor.execute(SQL_WORKER_THREADS)
                else:
                    cursor.execute(SQL_PROCESS_LIST)
                slave_results = cursor.fetchall()
                slaves = 0
                for _ in slave_results:
                    slaves += 1

                return {'Slaves_connected': slaves}

        except (pymysql.err.InternalError, pymysql.err.OperationalError) as e:
            self.warning(
                "Privileges error accessing the process tables (must grant PROCESS): %s",
                e)
            return {}

    @classmethod
    def _are_values_numeric(cls, array):
        return all(v.isdigit() for v in array)

    def _get_variable_enabled(self, results, var):
        enabled = collect_string(var, results)
        return enabled and enabled.lower().strip() == 'on'

    def _get_query_exec_time_95th_us(self, db):
        # Fetches the 95th percentile query execution time and returns the value
        # in microseconds
        try:
            with closing(db.cursor()) as cursor:
                cursor.execute(SQL_95TH_PERCENTILE)

                if cursor.rowcount < 1:
                    self.warning(
                        "Failed to fetch records from the perf schema \
                                 'events_statements_summary_by_digest' table.")
                    return None

                row = cursor.fetchone()
                query_exec_time_95th_per = row[0]

                return query_exec_time_95th_per
        except (pymysql.err.InternalError, pymysql.err.OperationalError) as e:
            self.warning(
                "95th percentile performance metrics unavailable at this time: %s",
                e)
            return None

    def _query_exec_time_per_schema(self, db):
        # Fetches the avg query execution time per schema and returns the
        # value in microseconds
        try:
            with closing(db.cursor()) as cursor:
                cursor.execute(SQL_AVG_QUERY_RUN_TIME)

                if cursor.rowcount < 1:
                    self.warning(
                        "Failed to fetch records from the perf schema \
                                 'events_statements_summary_by_digest' table.")
                    return None

                schema_query_avg_run_time = {}
                for row in cursor.fetchall():
                    schema_name = str(row[0])
                    avg_us = long(row[1])

                    # set the tag as the dictionary key
                    schema_query_avg_run_time["schema:{0}".format(
                        schema_name)] = avg_us

                return schema_query_avg_run_time
        except (pymysql.err.InternalError, pymysql.err.OperationalError) as e:
            self.warning(
                "Avg exec time performance metrics unavailable at this time: %s",
                e)
            return None

    def _query_size_per_schema(self, db):
        # Fetches the size of each schema and returns the values keyed by schema tag
        try:
            with closing(db.cursor()) as cursor:
                cursor.execute(SQL_QUERY_SCHEMA_SIZE)

                if cursor.rowcount < 1:
                    self.warning(
                        "Failed to fetch records from the information schema 'tables' table."
                    )
                    return None

                schema_size = {}
                for row in cursor.fetchall():
                    schema_name = str(row[0])
                    size = long(row[1])

                    # set the tag as the dictionary key
                    schema_size["schema:{0}".format(schema_name)] = size

                return schema_size
        except (pymysql.err.InternalError, pymysql.err.OperationalError) as e:
            self.warning(
                "Avg exec time performance metrics unavailable at this time: %s",
                e)

        return {}

    def _compute_synthetic_results(self, results):
        if ('Qcache_hits' in results) and ('Qcache_inserts'
                                           in results) and ('Qcache_not_cached'
                                                            in results):
            if not int(results['Qcache_hits']):
                results['Qcache_utilization'] = 0
            else:
                results['Qcache_utilization'] = (
                    float(results['Qcache_hits']) /
                    (int(results['Qcache_inserts']) +
                     int(results['Qcache_not_cached']) +
                     int(results['Qcache_hits'])) * 100)

            if all(v is not None
                   for v in (self._qcache_hits, self._qcache_inserts,
                             self._qcache_not_cached)):
                if not (int(results['Qcache_hits']) - self._qcache_hits):
                    results['Qcache_instant_utilization'] = 0
                else:
                    top = float(results['Qcache_hits']) - self._qcache_hits
                    bottom = (
                        (int(results['Qcache_inserts']) - self._qcache_inserts)
                        + (int(results['Qcache_not_cached']) -
                           self._qcache_not_cached) +
                        (int(results['Qcache_hits']) - self._qcache_hits))
                    results['Qcache_instant_utilization'] = (top /
                                                             bottom) * 100

            # update all three, or none - for consistent samples.
            self._qcache_hits = int(results['Qcache_hits'])
            self._qcache_inserts = int(results['Qcache_inserts'])
            self._qcache_not_cached = int(results['Qcache_not_cached'])
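
A note on _compute_synthetic_results above: it derives two ratios from the SHOW STATUS counters, a lifetime Qcache utilization and an instantaneous one computed from the deltas since the previous check run. Below is a minimal standalone sketch of that arithmetic; the counter values are hypothetical and only the formulas mirror the check.

# Standalone sketch of the Qcache utilization arithmetic used above.
# The sample counter values are hypothetical; only the formulas mirror the check.


def qcache_utilization(hits, inserts, not_cached):
    """Lifetime utilization: hits as a percentage of all cache lookups."""
    if not hits:
        return 0
    return float(hits) / (inserts + not_cached + hits) * 100


def qcache_instant_utilization(prev, curr):
    """Delta-based utilization between two (hits, inserts, not_cached) samples."""
    d_hits = curr[0] - prev[0]
    if not d_hits:
        return 0
    d_total = (curr[1] - prev[1]) + (curr[2] - prev[2]) + d_hits
    return (float(d_hits) / d_total) * 100


previous = (1000, 200, 50)  # hypothetical counters stored from the last run
current = (1600, 260, 70)   # hypothetical latest SHOW STATUS counters

print(qcache_utilization(*current))                    # ~82.9
print(qcache_instant_utilization(previous, current))   # ~88.2
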
Beispiel #15
0
class MySql(AgentCheck):
    SERVICE_CHECK_NAME = 'mysql.can_connect'
    SLAVE_SERVICE_CHECK_NAME = 'mysql.replication.slave_running'
    REPLICA_SERVICE_CHECK_NAME = 'mysql.replication.replica_running'
    GROUP_REPLICATION_SERVICE_CHECK_NAME = 'mysql.replication.group.status'
    DEFAULT_MAX_CUSTOM_QUERIES = 20

    def __init__(self, name, init_config, instances):
        super(MySql, self).__init__(name, init_config, instances)
        self.qcache_stats = {}
        self.version = None
        self.is_mariadb = None
        self._resolved_hostname = None
        self._agent_hostname = None
        self._is_aurora = None
        self._config = MySQLConfig(self.instance)

        # Create a new connection on every check run
        self._conn = None

        self._query_manager = QueryManager(self, self.execute_query_raw, queries=[])
        self.check_initializations.append(self._query_manager.compile_queries)
        self.innodb_stats = InnoDBMetrics()
        self.check_initializations.append(self._config.configuration_checks)
        self.performance_schema_enabled = None
        self._warnings_by_code = {}
        self._statement_metrics = MySQLStatementMetrics(self, self._config, self._get_connection_args())
        self._statement_samples = MySQLStatementSamples(self, self._config, self._get_connection_args())
        self._query_activity = MySQLActivity(self, self._config, self._get_connection_args())

    def execute_query_raw(self, query):
        with closing(self._conn.cursor(pymysql.cursors.SSCursor)) as cursor:
            cursor.execute(query)
            for row in cursor.fetchall_unbuffered():
                yield row

    @AgentCheck.metadata_entrypoint
    def _send_metadata(self):
        self.set_metadata('version', self.version.version + '+' + self.version.build)
        self.set_metadata('flavor', self.version.flavor)

    @property
    def resolved_hostname(self):
        if self._resolved_hostname is None:
            if self._config.reported_hostname:
                self._resolved_hostname = self._config.reported_hostname
            elif self._config.dbm_enabled or self.disable_generic_tags:
                self._resolved_hostname = self.resolve_db_host()
            else:
                self._resolved_hostname = self.agent_hostname
        return self._resolved_hostname

    @property
    def agent_hostname(self):
        # type: () -> str
        if self._agent_hostname is None:
            self._agent_hostname = datadog_agent.get_hostname()
        return self._agent_hostname

    def check_performance_schema_enabled(self, db):
        if self.performance_schema_enabled is None:
            with closing(db.cursor()) as cursor:
                cursor.execute("SHOW VARIABLES LIKE 'performance_schema'")
                results = dict(cursor.fetchall())
                self.performance_schema_enabled = self._get_variable_enabled(results, 'performance_schema')

        return self.performance_schema_enabled

    def resolve_db_host(self):
        return agent_host_resolver(self._config.host)

    def _get_debug_tags(self):
        return ['agent_hostname:{}'.format(datadog_agent.get_hostname())]

    @classmethod
    def get_library_versions(cls):
        return {'pymysql': pymysql.__version__}

    def check(self, _):
        if self.instance.get('user'):
            self._log_deprecation('_config_renamed', 'user', 'username')

        if self.instance.get('pass'):
            self._log_deprecation('_config_renamed', 'pass', 'password')

        tags = list(self._config.tags)
        self._set_qcache_stats()
        with self._connect() as db:
            try:
                self._conn = db

                # version collection
                self.version = get_version(db)
                self._send_metadata()

                self.is_mariadb = self.version.flavor == "MariaDB"
                if self._get_is_aurora(db):
                    tags = tags + self._get_runtime_aurora_tags(db)

                self.check_performance_schema_enabled(db)

                # Metric collection
                if not self._config.only_custom_queries:
                    self._collect_metrics(db, tags=tags)
                    self._collect_system_metrics(self._config.host, db, tags)

                if self._config.dbm_enabled:
                    dbm_tags = list(set(self.service_check_tags) | set(tags))
                    self._statement_metrics.run_job_loop(dbm_tags)
                    self._statement_samples.run_job_loop(dbm_tags)
                    self._query_activity.run_job_loop(dbm_tags)

                # keep track of the Qcache counters for the next check run
                self._put_qcache_stats()

                # Custom queries
                self._query_manager.execute(extra_tags=tags)

            except Exception as e:
                self.log.exception("error!")
                raise e
            finally:
                self._conn = None
                self._report_warnings()

    def cancel(self):
        self._statement_samples.cancel()
        self._statement_metrics.cancel()
        self._query_activity.cancel()

    def _set_qcache_stats(self):
        host_key = self._get_host_key()
        qcache_st = self.qcache_stats.get(host_key, (None, None, None))

        self._qcache_hits = qcache_st[0]
        self._qcache_inserts = qcache_st[1]
        self._qcache_not_cached = qcache_st[2]

    def _put_qcache_stats(self):
        host_key = self._get_host_key()
        self.qcache_stats[host_key] = (self._qcache_hits, self._qcache_inserts, self._qcache_not_cached)

    def _get_host_key(self):
        if self._config.defaults_file:
            return self._config.defaults_file

        hostkey = self._config.host
        if self._config.mysql_sock:
            hostkey = "{0}:{1}".format(hostkey, self._config.mysql_sock)
        elif self._config.port:
            hostkey = "{0}:{1}".format(hostkey, self._config.port)

        return hostkey

    def _get_connection_args(self):
        ssl = dict(self._config.ssl) if self._config.ssl else None
        connection_args = {
            'ssl': ssl,
            'connect_timeout': self._config.connect_timeout,
            'autocommit': True,
        }
        if self._config.charset:
            connection_args['charset'] = self._config.charset

        if self._config.defaults_file != '':
            connection_args['read_default_file'] = self._config.defaults_file
            return connection_args

        connection_args.update({'user': self._config.user, 'passwd': self._config.password})
        if self._config.mysql_sock != '':
            self.service_check_tags = self._service_check_tags(self._config.mysql_sock)
            connection_args.update({'unix_socket': self._config.mysql_sock})
        else:
            connection_args.update({'host': self._config.host})

        if self._config.port:
            connection_args.update({'port': self._config.port})
        return connection_args

    def _service_check_tags(self, server=None):
        # type: (Optional[str]) -> List[str]
        if server is None:
            server = self._config.mysql_sock if self._config.mysql_sock != '' else self._config.host
        service_check_tags = [
            'port:{}'.format(self._config.port if self._config.port else 'unix_socket'),
        ] + self._config.tags
        if not self.disable_generic_tags:
            service_check_tags.append('server:{0}'.format(server))
        return service_check_tags

    @contextmanager
    def _connect(self):
        service_check_tags = self._service_check_tags()
        db = None
        try:
            connect_args = self._get_connection_args()
            db = pymysql.connect(**connect_args)
            self.log.debug("Connected to MySQL")
            self.service_check_tags = list(set(service_check_tags))
            self.service_check(
                self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags, hostname=self.resolved_hostname
            )
            yield db
        except Exception:
            self.service_check(
                self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=service_check_tags, hostname=self.resolved_hostname
            )
            raise
        finally:
            if db:
                db.close()

    def _collect_metrics(self, db, tags):

        # Get aggregate of all VARS we want to collect
        metrics = copy.deepcopy(STATUS_VARS)

        # collect results from db
        results = self._get_stats_from_status(db)
        results.update(self._get_stats_from_variables(db))

        if not is_affirmative(
            self._config.options.get('disable_innodb_metrics', False)
        ) and self._is_innodb_engine_enabled(db):
            results.update(self.innodb_stats.get_stats_from_innodb_status(db))
            self.innodb_stats.process_innodb_stats(results, self._config.options, metrics)

        # Binary log statistics
        if self._get_variable_enabled(results, 'log_bin'):
            results['Binlog_space_usage_bytes'] = self._get_binary_log_stats(db)

        # Compute key cache utilization metric
        key_blocks_unused = collect_scalar('Key_blocks_unused', results)
        key_cache_block_size = collect_scalar('key_cache_block_size', results)
        key_buffer_size = collect_scalar('key_buffer_size', results)
        results['Key_buffer_size'] = key_buffer_size

        try:
            # can be null if the unit is missing in the user config (e.g. 4 instead of 4G)
            if key_buffer_size != 0:
                key_cache_utilization = 1 - ((key_blocks_unused * key_cache_block_size) / key_buffer_size)
                results['Key_cache_utilization'] = key_cache_utilization

            results['Key_buffer_bytes_used'] = collect_scalar('Key_blocks_used', results) * key_cache_block_size
            results['Key_buffer_bytes_unflushed'] = (
                collect_scalar('Key_blocks_not_flushed', results) * key_cache_block_size
            )
        except TypeError as e:
            self.log.error("Not all Key metrics are available, unable to compute: %s", e)

        metrics.update(VARIABLES_VARS)
        metrics.update(INNODB_VARS)
        metrics.update(BINLOG_VARS)

        if is_affirmative(self._config.options.get('extra_status_metrics', self._config.dbm_enabled)):
            self.log.debug("Collecting Extra Status Metrics")
            metrics.update(OPTIONAL_STATUS_VARS)

            if self.version.version_compatible((5, 6, 6)):
                metrics.update(OPTIONAL_STATUS_VARS_5_6_6)

        if is_affirmative(self._config.options.get('galera_cluster', False)):
            # already in the result set after 'SHOW STATUS'; just add the vars to collect
            self.log.debug("Collecting Galera Metrics.")
            metrics.update(GALERA_VARS)

        above_560 = self.version.version_compatible((5, 6, 0))
        if (
            is_affirmative(self._config.options.get('extra_performance_metrics', False))
            and above_560
            and self.performance_schema_enabled
        ):
            # report query performance metrics (95th percentile exec time, avg run time per schema) to Datadog
            results['perf_digest_95th_percentile_avg_us'] = self._get_query_exec_time_95th_us(db)
            results['query_run_time_avg'] = self._query_exec_time_per_schema(db)
            metrics.update(PERFORMANCE_VARS)

        if is_affirmative(self._config.options.get('schema_size_metrics', False)):
            # report the size of schemas in MiB to Datadog
            results['information_schema_size'] = self._query_size_per_schema(db)
            metrics.update(SCHEMA_VARS)

        if is_affirmative(self._config.options.get('table_size_metrics', False)):
            # report size of tables in MiB to Datadog
            (table_index_size, table_data_size) = self._query_size_per_table(db)
            results['information_table_index_size'] = table_index_size
            results['information_table_data_size'] = table_data_size
            metrics.update(TABLE_VARS)

        if is_affirmative(self._config.options.get('system_table_size_metrics', False)):
            # report size of system tables in MiB to Datadog
            (table_index_size, table_data_size) = self._query_size_per_table(db, system_tables=True)
            results['information_table_index_size'] = table_index_size
            results['information_table_data_size'] = table_data_size
            metrics.update(TABLE_VARS)

        if is_affirmative(self._config.options.get('replication', self._config.dbm_enabled)):
            if self.performance_schema_enabled and self._is_group_replication_active(db):
                self.log.debug('Collecting group replication metrics.')
                self._collect_group_replica_metrics(db, results)
            else:
                replication_metrics = self._collect_replication_metrics(db, results, above_560)
                metrics.update(replication_metrics)
                self._check_replication_status(results)

        if len(self._config.additional_status) > 0:
            additional_status_dict = {}
            for status_dict in self._config.additional_status:
                status_name = status_dict["name"]
                status_metric = status_dict["metric_name"]
                if status_name in metrics.keys():
                    collected_metric = metrics.get(status_name)[0]
                    self.log.debug(
                        "Skipping status variable %s for metric %s as it is already collected by %s",
                        status_name,
                        status_metric,
                        collected_metric,
                    )
                else:
                    additional_status_dict[status_dict["name"]] = (status_dict["metric_name"], status_dict["type"])
            metrics.update(additional_status_dict)

        if len(self._config.additional_variable) > 0:
            additional_variable_dict = {}
            for variable_dict in self._config.additional_variable:
                variable_name = variable_dict["name"]
                variable_metric = variable_dict["metric_name"]
                if variable_name in metrics.keys():
                    collected_metric = metrics.get(variable_name)[0]
                    self.log.debug(
                        "Skipping variable %s for metric %s as it is already collected by %s",
                        variable_name,
                        variable_metric,
                        collected_metric,
                    )
                else:
                    additional_variable_dict[variable_name] = (variable_metric, variable_dict["type"])

            metrics.update(additional_variable_dict)

        # "synthetic" metrics
        metrics.update(SYNTHETIC_VARS)
        self._compute_synthetic_results(results)

        # remove uncomputed metrics
        for k in SYNTHETIC_VARS:
            if k not in results:
                metrics.pop(k, None)

        # add duped metrics - reporting some as both rate and gauge
        dupes = [
            ('Table_locks_waited', 'Table_locks_waited_rate'),
            ('Table_locks_immediate', 'Table_locks_immediate_rate'),
        ]
        for src, dst in dupes:
            if src in results:
                results[dst] = results[src]

        self._submit_metrics(metrics, results, tags)

        # Collect custom query metrics
        # Max of 20 queries allowed
        if isinstance(self._config.queries, list):
            for check in self._config.queries[: self._config.max_custom_queries]:
                total_tags = tags + check.get('tags', [])
                self._collect_dict(
                    check['type'], {check['field']: check['metric']}, check['query'], db, tags=total_tags
                )

            if len(self._config.queries) > self._config.max_custom_queries:
                self.warning(
                    "Maximum number (%s) of custom queries reached. Skipping the rest.", self._config.max_custom_queries
                )

    def _collect_replication_metrics(self, db, results, above_560):
        # Get replica stats
        replication_channel = self._config.options.get('replication_channel')
        results.update(self._get_replica_stats(db, self.is_mariadb, replication_channel))
        nonblocking = is_affirmative(self._config.options.get('replication_non_blocking_status', False))
        results.update(self._get_replica_status(db, above_560, nonblocking))
        return REPLICA_VARS

    def _collect_group_replica_metrics(self, db, results):
        try:
            with closing(db.cursor()) as cursor:
                cursor.execute(SQL_GROUP_REPLICATION_MEMBER)
                replica_results = cursor.fetchone()
                status = self.OK
                additional_tags = []
                if replica_results is None or len(replica_results) < 3:
                    self.log.warning(
                        'Unable to get group replica status, setting mysql.replication.group.status as CRITICAL'
                    )
                    status = self.CRITICAL
                else:
                    status = self.OK if replica_results[1] == 'ONLINE' else self.CRITICAL
                    additional_tags = [
                        'channel_name:{}'.format(replica_results[0]),
                        'member_state:{}'.format(replica_results[1]),
                        'member_role:{}'.format(replica_results[2]),
                    ]
                    self.gauge('mysql.replication.group.member_status', 1, tags=additional_tags + self._config.tags)

                self.service_check(
                    self.GROUP_REPLICATION_SERVICE_CHECK_NAME,
                    status=status,
                    tags=self._service_check_tags() + additional_tags,
                )

                cursor.execute(SQL_GROUP_REPLICATION_METRICS)
                r = cursor.fetchone()

                if r is None:
                    self.log.warning('Unable to get group replication metrics')
                    return {}

                results = {
                    'Transactions_count': r[1],
                    'Transactions_check': r[2],
                    'Conflict_detected': r[3],
                    'Transactions_row_validating': r[4],
                    'Transactions_remote_applier_queue': r[5],
                    'Transactions_remote_applied': r[6],
                    'Transactions_local_proposed': r[7],
                    'Transactions_local_rollback': r[8],
                }
                # Submit metrics now so it's possible to attach `channel_name` tag
                self._submit_metrics(
                    GROUP_REPLICATION_VARS, results, self._config.tags + ['channel_name:{}'.format(r[0])]
                )

                return GROUP_REPLICATION_VARS
        except Exception as e:
            self.warning("Internal error happened during the group replication check: %s", e)
            return {}

    def _check_replication_status(self, results):
        # Replica_IO_Running: Whether the I/O thread for reading the source's binary log is running.
        # You want this to be Yes unless you have not yet started replication or have explicitly stopped it.
        replica_io_running = collect_type('Slave_IO_Running', results, dict)
        if replica_io_running is None:
            replica_io_running = collect_type('Replica_IO_Running', results, dict)
        # Replica_SQL_Running: Whether the SQL thread for executing events in the relay log is running.
        replica_sql_running = collect_type('Slave_SQL_Running', results, dict)
        if replica_sql_running is None:
            replica_sql_running = collect_type('Replica_SQL_Running', results, dict)
        if replica_io_running:
            replica_io_running = any(v.lower().strip() == 'yes' for v in itervalues(replica_io_running))
        if replica_sql_running:
            replica_sql_running = any(v.lower().strip() == 'yes' for v in itervalues(replica_sql_running))
        binlog_running = results.get('Binlog_enabled', False)

        # replicas will only be collected if user has PROCESS privileges.
        replicas = collect_scalar('Slaves_connected', results)
        if replicas is None:
            replicas = collect_scalar('Replicas_connected', results)

        # If the host acts as a source
        source_repl_running_status = AgentCheck.UNKNOWN
        if self._is_source_host(replicas, results):
            if replicas > 0 and binlog_running:
                self.log.debug("Host is master, there are replicas and binlog is running")
                source_repl_running_status = AgentCheck.OK
            else:
                source_repl_running_status = AgentCheck.WARNING

            self._submit_replication_status(source_repl_running_status, ['replication_mode:source'])

        # If the host acts as a replica
        # A host can be both a source and a replica
        # See https://dev.mysql.com/doc/refman/8.0/en/replication-solutions-performance.html
        # get the replica running status from the global status page
        replica_running_status = AgentCheck.UNKNOWN
        if self._is_replica_host(replicas, results):
            if not (replica_io_running is None and replica_sql_running is None):
                if not replica_io_running and not replica_sql_running:
                    self.log.debug("Replica_IO_Running and Replica_SQL_Running are not ok")
                    replica_running_status = AgentCheck.CRITICAL
                elif not replica_io_running or not replica_sql_running:
                    self.log.debug("Either Replica_IO_Running or Replica_SQL_Running are not ok")
                    replica_running_status = AgentCheck.WARNING
                else:
                    self.log.debug("Replica_IO_Running and Replica_SQL_Running are ok")
                    replica_running_status = AgentCheck.OK

                self._submit_replication_status(replica_running_status, ['replication_mode:replica'])

    def _submit_replication_status(self, status, additional_tags):
        # deprecated in favor of service_check("mysql.replication.slave_running")
        self.gauge(
            name=self.SLAVE_SERVICE_CHECK_NAME,
            value=1 if status == AgentCheck.OK else 0,
            tags=self._config.tags + additional_tags,
            hostname=self.resolved_hostname,
        )
        # deprecated in favor of service_check("mysql.replication.replica_running")
        self.service_check(
            self.SLAVE_SERVICE_CHECK_NAME,
            status,
            tags=self.service_check_tags + additional_tags,
            hostname=self.resolved_hostname,
        )
        self.service_check(
            self.REPLICA_SERVICE_CHECK_NAME,
            status,
            tags=self.service_check_tags + additional_tags,
            hostname=self.resolved_hostname,
        )

    def _is_source_host(self, replicas, results):
        # type: (float, Dict[str, Any]) -> bool
        # master uuid only collected in replicas
        source_host = collect_string('Master_Host', results) or collect_string('Source_Host', results)
        if replicas > 0 or not source_host:
            return True

        return False

    def _is_replica_host(self, replicas, results):
        return collect_string('Master_Host', results) or collect_string('Source_Host', results)

    def _is_group_replication_active(self, db):
        with closing(db.cursor()) as cursor:
            cursor.execute(SQL_GROUP_REPLICATION_PLUGIN_STATUS)
            r = cursor.fetchone()

            # Plugin is installed
            if r is not None and r[0].lower() == 'active':
                self.log.debug('Group replication plugin is detected and active')
                return True
        self.log.debug('Group replication plugin not detected')
        return False

    def _submit_metrics(self, variables, db_results, tags):
        for variable, metric in iteritems(variables):
            if isinstance(metric, list):
                for m in metric:
                    metric_name, metric_type = m
                    self.__submit_metric(metric_name, metric_type, variable, db_results, tags)
            else:
                metric_name, metric_type = metric
                self.__submit_metric(metric_name, metric_type, variable, db_results, tags)

    def __submit_metric(self, metric_name, metric_type, variable, db_results, tags):
        for tag, value in collect_all_scalars(variable, db_results):
            metric_tags = list(tags)
            if tag:
                if "," in tag:
                    t_split = tag.split(",")
                    for t in t_split:
                        metric_tags.append(t)
                else:
                    metric_tags.append(tag)
            if value is not None:
                if metric_type == RATE:
                    self.rate(metric_name, value, tags=metric_tags, hostname=self.resolved_hostname)
                elif metric_type == GAUGE:
                    self.gauge(metric_name, value, tags=metric_tags, hostname=self.resolved_hostname)
                elif metric_type == COUNT:
                    self.count(metric_name, value, tags=metric_tags, hostname=self.resolved_hostname)
                elif metric_type == MONOTONIC:
                    self.monotonic_count(metric_name, value, tags=metric_tags, hostname=self.resolved_hostname)

    def _collect_dict(self, metric_type, field_metric_map, query, db, tags):
        """
        Query status and get a dictionary back.
        Extract each field out of the dictionary
        and stuff it in the corresponding metric.

        query: show status...
        field_metric_map: {"Seconds_behind_master": "mysqlSecondsBehindMaster"}
        """
        try:
            with closing(db.cursor()) as cursor:
                cursor.execute(query)
                result = cursor.fetchone()
                if result is not None:
                    for field, metric in list(iteritems(field_metric_map)):
                        # Find the column name in the cursor description to identify the column index
                        # http://www.python.org/dev/peps/pep-0249/
                        # each entry of cursor.description is a sequence whose first item is the column name
                        try:
                            col_idx = [d[0].lower() for d in cursor.description].index(field.lower())
                            self.log.debug("Collecting metric: %s", metric)
                            if result[col_idx] is not None:
                                self.log.debug("Collecting done, value %s", result[col_idx])
                                if metric_type == GAUGE:
                                    self.gauge(
                                        metric, float(result[col_idx]), tags=tags, hostname=self.resolved_hostname
                                    )
                                elif metric_type == RATE:
                                    self.rate(
                                        metric, float(result[col_idx]), tags=tags, hostname=self.resolved_hostname
                                    )
                                else:
                                    self.gauge(
                                        metric, float(result[col_idx]), tags=tags, hostname=self.resolved_hostname
                                    )
                            else:
                                self.log.debug("Received value is None for index %d", col_idx)
                        except ValueError:
                            self.log.exception("Cannot find %s in the columns %s", field, cursor.description)
        except Exception:
            self.warning("Error while running %s\n%s", query, traceback.format_exc())
            self.log.exception("Error while running %s", query)

    def _get_runtime_aurora_tags(self, db):
        runtime_tags = []

        try:
            with closing(db.cursor()) as cursor:
                cursor.execute(SQL_REPLICATION_ROLE_AWS_AURORA)
                replication_role = cursor.fetchone()[0]

                if replication_role in {'writer', 'reader'}:
                    runtime_tags.append('replication_role:' + replication_role)
        except Exception:
            self.log.warning("Error occurred while fetching Aurora runtime tags: %s", traceback.format_exc())

        return runtime_tags

    def _collect_system_metrics(self, host, db, tags):
        pid = None
        # The server needs to run locally, accessed by TCP or socket
        if host in ["localhost", "127.0.0.1", "0.0.0.0"] or db.port == long(0):
            pid = self._get_server_pid(db)

        if pid:
            self.log.debug("System metrics for mysql w/ pid: %s", pid)
            # At last, get mysql cpu data out of psutil or procfs

            try:
                if PSUTIL_AVAILABLE:
                    self.log.debug("psutil is available, attempting to collect mysql.performance.* metrics")
                    proc = psutil.Process(pid)

                    ucpu = proc.cpu_times()[0]
                    scpu = proc.cpu_times()[1]

                    if ucpu and scpu:
                        self.rate("mysql.performance.user_time", ucpu, tags=tags, hostname=self.resolved_hostname)
                        # should really be system_time
                        self.rate("mysql.performance.kernel_time", scpu, tags=tags, hostname=self.resolved_hostname)
                        self.rate("mysql.performance.cpu_time", ucpu + scpu, tags=tags, hostname=self.resolved_hostname)
                else:
                    self.log.debug("psutil is not available, will not collect mysql.performance.* metrics")
            except Exception:
                self.warning("Error while reading mysql (pid: %s) procfs data\n%s", pid, traceback.format_exc())

    def _get_pid_file_variable(self, db):
        """
        Get the `pid_file` variable
        """
        pid_file = None
        try:
            with closing(db.cursor()) as cursor:
                cursor.execute("SHOW VARIABLES LIKE 'pid_file'")
                pid_file = cursor.fetchone()[1]
        except Exception:
            self.warning("Error while fetching pid_file variable of MySQL.")

        return pid_file

    def _get_server_pid(self, db):
        pid = None

        # Try to get pid from pid file, it can fail for permission reason
        pid_file = self._get_pid_file_variable(db)
        if pid_file is not None:
            self.log.debug("pid file: %s", str(pid_file))
            try:
                with open(pid_file, 'rb') as f:
                    pid = int(f.readline())
            except IOError:
                self.log.debug("Cannot read mysql pid file %s", pid_file)

        process_name = [PROC_NAME]
        if self.is_mariadb and self.version.version_compatible((10, 5, 0)):
            process_name.append("mariadbd")

        # If pid has not been found, read it from ps
        if pid is None and PSUTIL_AVAILABLE:
            for proc in psutil.process_iter():
                try:
                    if proc.name() in process_name:
                        pid = proc.pid
                except (psutil.AccessDenied, psutil.ZombieProcess, psutil.NoSuchProcess):
                    continue
                except Exception:
                    self.log.exception("Error while fetching mysql pid from psutil")

        return pid

    def _get_is_aurora(self, db):
        """
        Tests if the instance is an AWS Aurora database and caches the result.
        """
        if self._is_aurora is not None:
            return self._is_aurora

        try:
            with closing(db.cursor()) as cursor:
                cursor.execute(SQL_SERVER_ID_AWS_AURORA)
                if len(cursor.fetchall()) > 0:
                    self._is_aurora = True
                else:
                    self._is_aurora = False

        except Exception:
            self.warning(
                "Unable to determine if server is Aurora. If this is an Aurora database, some "
                "information may be unavailable: %s",
                traceback.format_exc(),
            )
            return False

        return self._is_aurora

    @classmethod
    def _get_stats_from_status(cls, db):
        with closing(db.cursor()) as cursor:
            cursor.execute("SHOW /*!50002 GLOBAL */ STATUS;")
            results = dict(cursor.fetchall())

            return results

    @classmethod
    def _get_stats_from_variables(cls, db):
        with closing(db.cursor()) as cursor:
            cursor.execute("SHOW GLOBAL VARIABLES;")
            results = dict(cursor.fetchall())

            return results

    def _get_binary_log_stats(self, db):
        try:
            with closing(db.cursor()) as cursor:
                cursor.execute("SHOW BINARY LOGS;")
                cursor_results = cursor.fetchall()
                master_logs = {result[0]: result[1] for result in cursor_results}

                binary_log_space = 0
                for value in itervalues(master_logs):
                    binary_log_space += value

                return binary_log_space
        except (pymysql.err.InternalError, pymysql.err.OperationalError) as e:
            self.warning("Privileges error accessing the BINARY LOGS (must grant REPLICATION CLIENT): %s", e)
            return None

    def _is_innodb_engine_enabled(self, db):
        # Whether InnoDB engine is available or not can be found out either
        # from the output of SHOW ENGINES or from information_schema.ENGINES
        # table. The latter is chosen because that involves no string parsing.
        try:
            with closing(db.cursor()) as cursor:
                cursor.execute(SQL_INNODB_ENGINES)
                return cursor.rowcount > 0

        except (pymysql.err.InternalError, pymysql.err.OperationalError, pymysql.err.NotSupportedError) as e:
            self.warning("Possibly innodb stats unavailable - error querying engines table: %s", e)
            return False

    def _get_replica_stats(self, db, is_mariadb, replication_channel):
        replica_results = defaultdict(dict)
        try:
            with closing(db.cursor(pymysql.cursors.DictCursor)) as cursor:
                if is_mariadb and replication_channel:
                    cursor.execute("SET @@default_master_connection = '{0}';".format(replication_channel))
                cursor.execute(show_replica_status_query(self.version, is_mariadb, replication_channel))

                results = cursor.fetchall()
                self.log.debug("Getting replication status: %s", results)
                for replica_result in results:
                    # MySQL <5.7 does not have Channel_Name.
                    # For MySQL >=5.7 'Channel_Name' is set to an empty string by default
                    channel = replication_channel or replica_result.get('Channel_Name') or 'default'
                    for key, value in iteritems(replica_result):
                        if value is not None:
                            replica_results[key]['channel:{0}'.format(channel)] = value
        except (pymysql.err.InternalError, pymysql.err.OperationalError) as e:
            errno, msg = e.args
            if errno == 1617 and msg == "There is no master connection '{0}'".format(replication_channel):
                # MariaDB complains when you try to get replica status with a
                # connection name on the master; without a connection name it
                # responds with an empty string, as expected.
                # MySQL behaves the same with or without a connection name.
                pass
            else:
                self.warning("Privileges error getting replication status (must grant REPLICATION CLIENT): %s", e)

        try:
            with closing(db.cursor(pymysql.cursors.DictCursor)) as cursor:
                cursor.execute("SHOW MASTER STATUS;")
                binlog_results = cursor.fetchone()
                if binlog_results:
                    replica_results.update({'Binlog_enabled': True})
        except (pymysql.err.InternalError, pymysql.err.OperationalError) as e:
            self.warning("Privileges error getting binlog information (must grant REPLICATION CLIENT): %s", e)

        return replica_results

    def _get_replica_status(self, db, above_560, nonblocking):
        """
        Retrieve the replicas' statuses using:
        1. The `performance_schema.threads` table. Non-blocking, requires version > 5.6.0
        2. The `information_schema.processlist` table. Blocking
        """
        try:
            with closing(db.cursor()) as cursor:
                if above_560 and nonblocking:
                    # Query `performance_schema.threads` instead of `
                    # information_schema.processlist` to avoid mutex impact on performance.
                    cursor.execute(SQL_WORKER_THREADS)
                else:
                    cursor.execute(SQL_PROCESS_LIST)
                replica_results = cursor.fetchall()
                replicas = 0
                for _ in replica_results:
                    replicas += 1

                return {'Replicas_connected': replicas}

        except (pymysql.err.InternalError, pymysql.err.OperationalError) as e:
            self.warning("Privileges error accessing the process tables (must grant PROCESS): %s", e)
            return {}

    @classmethod
    def _are_values_numeric(cls, array):
        return all(v.isdigit() for v in array)

    def _get_variable_enabled(self, results, var):
        enabled = collect_string(var, results)
        return enabled and enabled.lower().strip() == 'on'

    def _get_query_exec_time_95th_us(self, db):
        # Fetches the 95th percentile query execution time and returns the value
        # in microseconds
        try:
            with closing(db.cursor()) as cursor:
                cursor.execute(SQL_95TH_PERCENTILE)

                if cursor.rowcount < 1:
                    self.warning(
                        "Failed to fetch records from the perf schema \
                                 'events_statements_summary_by_digest' table."
                    )
                    return None

                row = cursor.fetchone()
                query_exec_time_95th_per = row[0]

                return query_exec_time_95th_per
        except (pymysql.err.InternalError, pymysql.err.OperationalError) as e:
            self.warning("95th percentile performance metrics unavailable at this time: %s", e)
            return None

    def _query_exec_time_per_schema(self, db):
        # Fetches the avg query execution time per schema and returns the
        # value in microseconds
        try:
            with closing(db.cursor()) as cursor:
                cursor.execute(SQL_AVG_QUERY_RUN_TIME)

                if cursor.rowcount < 1:
                    self.warning(
                        "Failed to fetch records from the perf schema \
                                 'events_statements_summary_by_digest' table."
                    )
                    return None

                schema_query_avg_run_time = {}
                for row in cursor.fetchall():
                    schema_name = str(row[0])
                    avg_us = long(row[1])

                    # set the tag as the dictionary key
                    schema_query_avg_run_time["schema:{0}".format(schema_name)] = avg_us

                return schema_query_avg_run_time
        except (pymysql.err.InternalError, pymysql.err.OperationalError) as e:
            self.warning("Size of schemas metrics unavailable at this time: %s", e)

        return {}

    def _query_size_per_table(self, db, system_tables=False):
        try:
            with closing(db.cursor()) as cursor:
                if system_tables:
                    cursor.execute(SQL_QUERY_SYSTEM_TABLE_SIZE)
                else:
                    cursor.execute(SQL_QUERY_TABLE_SIZE)

                if cursor.rowcount < 1:
                    self.warning("Failed to fetch records from the information schema 'tables' table.")
                    return None

                table_index_size = {}
                table_data_size = {}
                for row in cursor.fetchall():
                    table_schema = str(row[0])
                    table_name = str(row[1])
                    index_size = float(row[2])
                    data_size = float(row[3])

                    # set the tag as the dictionary key
                    table_index_size["schema:{},table:{}".format(table_schema, table_name)] = index_size
                    table_data_size["schema:{},table:{}".format(table_schema, table_name)] = data_size

                return table_index_size, table_data_size
        except (pymysql.err.InternalError, pymysql.err.OperationalError) as e:
            self.warning("Size of tables metrics unavailable at this time: %s", e)

            return None

    def _query_size_per_schema(self, db):
        # Fetches the size of each schema and returns the values keyed by schema tag
        try:
            with closing(db.cursor()) as cursor:
                cursor.execute(SQL_QUERY_SCHEMA_SIZE)

                if cursor.rowcount < 1:
                    self.warning("Failed to fetch records from the information schema 'tables' table.")
                    return None

                schema_size = {}
                for row in cursor.fetchall():
                    schema_name = str(row[0])
                    size = long(row[1])

                    # set the tag as the dictionary key
                    schema_size["schema:{0}".format(schema_name)] = size

                return schema_size
        except (pymysql.err.InternalError, pymysql.err.OperationalError) as e:
            self.warning("Avg exec time performance metrics unavailable at this time: %s", e)

        return {}

    def _compute_synthetic_results(self, results):
        if ('Qcache_hits' in results) and ('Qcache_inserts' in results) and ('Qcache_not_cached' in results):
            if not int(results['Qcache_hits']):
                results['Qcache_utilization'] = 0
            else:
                results['Qcache_utilization'] = (
                    float(results['Qcache_hits'])
                    / (int(results['Qcache_inserts']) + int(results['Qcache_not_cached']) + int(results['Qcache_hits']))
                    * 100
                )

            if all(v is not None for v in (self._qcache_hits, self._qcache_inserts, self._qcache_not_cached)):
                if not (int(results['Qcache_hits']) - self._qcache_hits):
                    results['Qcache_instant_utilization'] = 0
                else:
                    top = float(results['Qcache_hits']) - self._qcache_hits
                    bottom = (
                        (int(results['Qcache_inserts']) - self._qcache_inserts)
                        + (int(results['Qcache_not_cached']) - self._qcache_not_cached)
                        + (int(results['Qcache_hits']) - self._qcache_hits)
                    )
                    results['Qcache_instant_utilization'] = (top / bottom) * 100

            # update all three, or none - for consistent samples.
            self._qcache_hits = int(results['Qcache_hits'])
            self._qcache_inserts = int(results['Qcache_inserts'])
            self._qcache_not_cached = int(results['Qcache_not_cached'])

    def record_warning(self, code, message):
        # type: (DatabaseConfigurationError, str) -> None
        self._warnings_by_code[code] = message

    def _report_warnings(self):
        messages = self._warnings_by_code.values()
        # Reset the warnings for the next check run
        self._warnings_by_code = {}

        for warning in messages:
            self.warning(warning)
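
A note on _get_replica_stats above: it folds the DictCursor rows of SHOW SLAVE/REPLICA STATUS into a single mapping keyed by variable name, with one 'channel:<name>' entry per replication channel, so multi-source setups keep one value per channel. A standalone sketch of that folding, using hypothetical rows:

# Standalone sketch of the channel-keyed folding done in _get_replica_stats above.
# The rows below are hypothetical DictCursor results; only the folding logic is mirrored.
from collections import defaultdict


def fold_replica_rows(rows, replication_channel=None):
    replica_results = defaultdict(dict)
    for row in rows:
        # MySQL >= 5.7 reports Channel_Name (empty string for the default channel)
        channel = replication_channel or row.get('Channel_Name') or 'default'
        for key, value in row.items():
            if value is not None:
                replica_results[key]['channel:{0}'.format(channel)] = value
    return replica_results


rows = [
    {'Channel_Name': '', 'Slave_IO_Running': 'Yes', 'Seconds_Behind_Master': 0},
    {'Channel_Name': 'analytics', 'Slave_IO_Running': 'No', 'Seconds_Behind_Master': None},
]
print(fold_replica_rows(rows)['Slave_IO_Running'])
# {'channel:default': 'Yes', 'channel:analytics': 'No'}
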
Beispiel #16
0
class SnowflakeCheck(AgentCheck):
    """
    Collect Snowflake account usage metrics
    """

    __NAMESPACE__ = 'snowflake'

    SERVICE_CHECK_CONNECT = 'snowflake.can_connect'

    def __init__(self, *args, **kwargs):
        super(SnowflakeCheck, self).__init__(*args, **kwargs)
        self._config = Config(self.instance)
        self._conn = None

        self.proxy_host = self.init_config.get('proxy_host', None)
        self.proxy_port = self.init_config.get('proxy_port', None)
        self.proxy_user = self.init_config.get('proxy_user', None)
        self.proxy_password = self.init_config.get('proxy_password', None)

        # Add default tags like account to all metrics
        self._tags = self._config.tags + [
            'account:{}'.format(self._config.account)
        ]

        if self._config.password:
            self.register_secret(self._config.password)

        if self._config.role == 'ACCOUNTADMIN':
            self.log.info(
                'Snowflake `role` is set as `ACCOUNTADMIN` which should be used cautiously, '
                'refer to docs about custom roles.')

        self.metric_queries = []
        self.errors = []
        for mgroup in self._config.metric_groups:
            try:
                self.metric_queries.extend(METRIC_GROUPS[mgroup])
            except KeyError:
                self.errors.append(mgroup)

        if self.errors:
            self.log.warning(
                'Invalid metric_groups found in snowflake conf.yaml: %s',
                (', '.join(self.errors)))
        if not self.metric_queries:
            raise ConfigurationError(
                'No valid metric_groups configured, please list at least one.')

        self._query_manager = QueryManager(self,
                                           self.execute_query_raw,
                                           queries=self.metric_queries,
                                           tags=self._tags)
        self.check_initializations.append(self._query_manager.compile_queries)

    def check(self, _):
        self.connect()

        if self._conn is not None:
            # Execute queries
            self._query_manager.execute()

            self._collect_version()

            self.log.debug("Closing connection to Snowflake...")
            self._conn.close()

    def execute_query_raw(self, query):
        """
        Executes query with timestamp from parts if comparing start_time field.
        """
        with closing(self._conn.cursor()) as cursor:
            cursor.execute(query)

            if cursor.rowcount is None or cursor.rowcount < 1:
                self.log.debug("Failed to fetch records from query: `%s`",
                               query)
                return []
            return cursor.fetchall()

    def connect(self):
        self.log.debug(
            "Establishing a new connection to Snowflake: account=%s, user=%s, database=%s, schema=%s, warehouse=%s, "
            "role=%s, timeout=%s, authenticator=%s, ocsp_response_cache_filename=%s, proxy_host=%s, proxy_port=%s",
            self._config.account,
            self._config.user,
            self._config.database,
            self._config.schema,
            self._config.warehouse,
            self._config.role,
            self._config.login_timeout,
            self._config.authenticator,
            self._config.ocsp_response_cache_filename,
            self.proxy_host,
            self.proxy_port,
        )

        try:
            conn = sf.connect(
                user=self._config.user,
                password=self._config.password,
                account=self._config.account,
                database=self._config.database,
                schema=self._config.schema,
                warehouse=self._config.warehouse,
                role=self._config.role,
                passcode_in_password=self._config.passcode_in_password,
                passcode=self._config.passcode,
                client_prefetch_threads=self._config.client_prefetch_threads,
                login_timeout=self._config.login_timeout,
                ocsp_response_cache_filename=self._config.ocsp_response_cache_filename,
                authenticator=self._config.authenticator,
                token=self._config.token,
                client_session_keep_alive=self._config.client_keep_alive,
                proxy_host=self.proxy_host,
                proxy_port=self.proxy_port,
                proxy_user=self.proxy_user,
                proxy_password=self.proxy_password,
            )
        except Exception as e:
            msg = "Unable to connect to Snowflake: {}".format(e)
            self.service_check(self.SERVICE_CHECK_CONNECT,
                               self.CRITICAL,
                               message=msg,
                               tags=self._tags)
            self.warning(msg)
        else:
            self.service_check(self.SERVICE_CHECK_CONNECT,
                               self.OK,
                               tags=self._tags)
            self._conn = conn

    @AgentCheck.metadata_entrypoint
    def _collect_version(self):
        try:
            raw_version = self.execute_query_raw("select current_version();")
            version = raw_version[0][0]
        except Exception as e:
            self.log.error("Error collecting version for Snowflake: %s", e)
        else:
            if version:
                self.set_metadata('version', version)
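
A note on the constructor above: configured metric groups are resolved to their queries one by one, unknown group names are collected and logged, and the check only refuses to start when no valid group remains. A minimal sketch of that pattern; the METRIC_GROUPS contents here are hypothetical placeholders and ValueError stands in for ConfigurationError.

# Minimal sketch of the metric-group resolution pattern used above.
# The METRIC_GROUPS contents are hypothetical placeholders; ValueError stands in
# for the ConfigurationError raised by the check.
METRIC_GROUPS = {
    'snowflake.query': ['<query metrics SQL>'],
    'snowflake.billing': ['<billing metrics SQL>'],
}


def resolve_metric_groups(configured_groups):
    queries, unknown = [], []
    for group in configured_groups:
        try:
            queries.extend(METRIC_GROUPS[group])
        except KeyError:
            unknown.append(group)
    if unknown:
        print('Invalid metric_groups found: %s' % ', '.join(unknown))
    if not queries:
        raise ValueError('No valid metric_groups configured, please list at least one.')
    return queries


print(resolve_metric_groups(['snowflake.query', 'snowflake.typo']))
# Invalid metric_groups found: snowflake.typo
# ['<query metrics SQL>']
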
Beispiel #17
0
class SinglestoreCheck(AgentCheck):

    SERVICE_CHECK_NAME = "can_connect"
    __NAMESPACE__ = "singlestore"

    def __init__(self, name, init_config, instances):
        # type: (AnyStr, Dict[AnyStr, Any], List[Dict[AnyStr, Any]]) -> None
        super(SinglestoreCheck, self).__init__(name, init_config, instances)
        self.config = SingleStoreConfig(self.instance)
        self._connection = cast(pymysql.Connection, None)

        manager_queries = []
        manager_queries.extend(DEFAULT_QUERIES)
        if self.config.collect_system_metrics:
            manager_queries.extend(ADDITIONAL_SYSTEM_QUERIES)
        self._query_manager = QueryManager(self,
                                           self.execute_query_raw,
                                           queries=manager_queries,
                                           tags=self.config.tags)
        self.check_initializations.append(self._query_manager.compile_queries)
        self._service_check_tags = [
            'singlestore_endpoint:{}:{}'.format(self.config.host,
                                                self.config.port)
        ] + self.config.tags

    def check(self, _):
        # type: (Any) -> None
        with self.connect() as conn:
            self._connection = conn
            self._query_manager.execute()
            self._connection = cast(pymysql.Connection, None)

    def execute_query_raw(self, query):
        # type: (AnyStr) -> Iterable[Sequence]
        with closing(self._connection.cursor()) as cursor:
            cursor.execute(query)
            if cursor.rowcount < 1:
                self.log.warning("Failed to fetch records from query: `%s`.",
                                 query)
                return

            cleaner_method = get_row_cleaner(query)

            for row in cursor.fetchall():
                try:
                    yield cleaner_method(row)
                except Exception:
                    self.log.debug("Unable to clean row %r.", exc_info=True)
                    yield row

    @contextmanager
    def connect(self):
        # type: () ->  Iterator[pymysql.Connection]
        ssl_context = self.get_tls_context() if self.config.use_tls else None

        conn = cast(pymysql.Connection, None)
        try:
            conn = pymysql.connect(
                host=self.config.host,
                port=self.config.port,
                user=self.config.username,
                password=self.config.password,
                connect_timeout=self.config.connect_timeout,
                read_timeout=self.config.read_timeout,
                ssl=ssl_context,
            )
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.OK,
                               tags=self._service_check_tags)
            self.log.debug("Connected to SingleStore")
            yield conn
        except Exception:
            self.service_check(self.SERVICE_CHECK_NAME,
                               AgentCheck.CRITICAL,
                               tags=self._service_check_tags)
            self.log.exception("Cannot connect to SingleStore")
            raise
        finally:
            if conn:
                conn.close()
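
A note on connect() above: the context manager reports a service check on success or failure and always closes the connection, even when the check body raises while the connection is held. A minimal sketch of the same pattern, with sqlite3 standing in for pymysql and prints standing in for the service checks, so the snippet stays self-contained:

# Minimal sketch of the context-manager connection pattern used above.
# sqlite3 stands in for pymysql and print() for self.service_check().
import sqlite3
from contextlib import closing, contextmanager


@contextmanager
def connect(path=':memory:'):
    conn = None
    try:
        conn = sqlite3.connect(path)
        print('service check: OK')        # reported once the connection is established
        yield conn
    except Exception:
        print('service check: CRITICAL')  # covers connection errors and errors raised in the body
        raise
    finally:
        if conn:
            conn.close()                  # always release the connection


with connect() as conn:
    with closing(conn.cursor()) as cursor:
        cursor.execute('SELECT 1')
        print(cursor.fetchall())          # [(1,)]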