def collect(self, api):
        # Fetch information analogous to Mongo's db.getReplicationInfo()
        localdb = api["local"]

        oplog_data = {}
        try:
            for collection_name in ("oplog.rs", "oplog.$main"):
                ol_options = localdb[collection_name].options()
                if ol_options:
                    break
        except pymongo.errors.OperationFailure as e:
            # In theory this error should only happen when connected to a mongos or an arbiter.
            self.log.debug(
                "Unable to collect oplog metrics from replica set member. Error is: %s",
                e)
            return

        if ol_options:
            try:
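                # For illustration only (hypothetical size, not from a real
                # deployment): an oplog with ol_options['size'] == 1038090240 bytes
                # yields logSizeMB == round_value(1038090240 / 2.0**20, 2) == 990.0.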
                oplog_data['logSizeMB'] = round_value(
                    ol_options['size'] / 2.0**20, 2)

                oplog = localdb[collection_name]

                oplog_data['usedSizeMB'] = round_value(
                    localdb.command("collstats", collection_name)['size'] /
                    2.0**20, 2)

                op_asc_cursor = oplog.find({
                    "ts": {
                        "$exists": 1
                    }
                }).sort("$natural", pymongo.ASCENDING).limit(1)
                op_dsc_cursor = oplog.find({
                    "ts": {
                        "$exists": 1
                    }
                }).sort("$natural", pymongo.DESCENDING).limit(1)

                try:
                    first_timestamp = op_asc_cursor[0]['ts'].as_datetime()
                    last_timestamp = op_dsc_cursor[0]['ts'].as_datetime()
                    time_diff = last_timestamp - first_timestamp
                    oplog_data['timeDiff'] = time_diff.total_seconds()
                except (IndexError, KeyError):
                    # Ignore the case where the oplog collection has no entries,
                    # or where an entry is missing its ts value.
                    pass
            except KeyError:
                # encountered an error trying to access options.size for the oplog collection
                self.log.warning(
                    u"Failed to record `ReplicationInfo` metrics.")

        self._submit_payload({'oplog': oplog_data})
    def collect(self, client):
        # Fetch information analogous to Mongo's db.getReplicationInfo()
        localdb = client["local"]

        oplog_data = {}

        for collection_name in ("oplog.rs", "oplog.$main"):
            ol_options = localdb[collection_name].options()
            if ol_options:
                break

        if ol_options:
            try:
                oplog_data['logSizeMB'] = round_value(
                    ol_options['size'] / 2.0**20, 2)

                oplog = localdb[collection_name]

                oplog_data['usedSizeMB'] = round_value(
                    localdb.command("collstats", collection_name)['size'] /
                    2.0**20, 2)

                op_asc_cursor = oplog.find({
                    "ts": {
                        "$exists": 1
                    }
                }).sort("$natural", pymongo.ASCENDING).limit(1)
                op_dsc_cursor = oplog.find({
                    "ts": {
                        "$exists": 1
                    }
                }).sort("$natural", pymongo.DESCENDING).limit(1)

                try:
                    first_timestamp = op_asc_cursor[0]['ts'].as_datetime()
                    last_timestamp = op_dsc_cursor[0]['ts'].as_datetime()
                    time_diff = last_timestamp - first_timestamp
                    oplog_data['timeDiff'] = time_diff.total_seconds()
                except (IndexError, KeyError):
                    # Ignore the case where the oplog collection has no entries,
                    # or where an entry is missing its ts value.
                    pass
            except KeyError:
                # encountered an error trying to access options.size for the oplog collection
                self.log.warning(
                    u"Failed to record `ReplicationInfo` metrics.")

        self._submit_payload({'oplog': oplog_data})
def calculate_elapsed_time(datestamp,
                           timestamp,
                           qm_timezone,
                           current_time=None):
    """
    Calculate the elapsed time in seconds from an IBM MQ queue status datestamp and timestamp.
    Expected Timestamp format: %H.%M.%S, e.g. 18.45.20
    Expected Datestamp format: %Y-%m-%d, e.g. 2021-09-15
    https://www.ibm.com/docs/en/ibm-mq/9.2?topic=reference-display-qstatus-display-queue-status#q086260___3
    """
    if qm_timezone is not None:
        qm_tz = tz.gettz(qm_timezone)
        if qm_tz is None or type(qm_tz) == str:
            msg = ('Time zone `{}` is not recognized or may be deprecated. '
                   'Please specify a valid time zone in IANA/Olson format.'.
                   format(qm_timezone))
            raise ValueError(msg)
    else:
        qm_tz = tz.UTC

    if current_time is None:
        current_time = get_timestamp()
    """
    1. Construct a datetime object from the IBM MQ timestamp string format
    2. Set the QM time zone on the datetime object.
    3. Calculate the POSIX timestamp in seconds since EPOCH
    """
    if datestamp and timestamp:
        timestamp_str = sanitize_strings(datestamp) + ' ' + sanitize_strings(
            timestamp)
        timestamp_dt = datetime.strptime(timestamp_str, '%Y-%m-%d %H.%M.%S')
        timestamp_tz = timestamp_dt.replace(tzinfo=qm_tz)
        timestamp_posix = (timestamp_tz - EPOCH).total_seconds()
    else:
        return None

    elapsed = round_value(current_time - timestamp_posix)

    return elapsed
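
# A hypothetical usage sketch (illustrative values only, assuming the module's
# sanitize_strings/get_timestamp/EPOCH helpers behave as used above): supplying an
# explicit current_time makes the result deterministic.
#
#     calculate_elapsed_time('2021-09-15', '18.45.20', 'UTC', current_time=1631731520.0)
#     # 2021-09-15 18:45:20 UTC is POSIX 1631731520.0, so this returns 0.0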
    def _check_db(self, instance, custom_tags=None):
        conn = self._get_conn(instance)
        tags = self._get_tags(custom_tags, instance)

        # Ping the database for info, and track the latency.
        # Process the service check: the check passes if we can connect to Redis
        start = time.time()
        try:
            info = conn.info()
            latency_ms = round_value((time.time() - start) * 1000, 2)
            tags = sorted(tags + ["redis_role:%s" % info["role"]])
            self.gauge('redis.info.latency_ms', latency_ms, tags=tags)
            status = AgentCheck.OK
            self.service_check('redis.can_connect', status, tags=tags)
            self._collect_metadata(info)
        except ValueError:
            status = AgentCheck.CRITICAL
            self.service_check('redis.can_connect', status, tags=tags)
            raise
        except Exception:
            status = AgentCheck.CRITICAL
            self.service_check('redis.can_connect', status, tags=tags)
            raise

        # Save the database statistics.
        for key in info.keys():
            if self.db_key_pattern.match(key):
                db_tags = tags + ["redis_db:" + key]
                # allows tracking the percentage of expired keys, as DD does not
                # currently allow arithmetic on metrics for monitoring
                expires_keys = info[key]["expires"]
                total_keys = info[key]["keys"]
                persist_keys = total_keys - expires_keys
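                # Illustrative example (hypothetical counts): keys=10 and expires=4
                # give persist_keys=6, redis.persist.percent=60.0 and
                # redis.expires.percent=40.0 for this redis_db tag.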
                self.gauge("redis.persist", persist_keys, tags=db_tags)
                self.gauge("redis.persist.percent", 100 * persist_keys / total_keys, tags=db_tags)
                self.gauge("redis.expires.percent", 100 * expires_keys / total_keys, tags=db_tags)

                for subkey in self.subkeys:
                    # The old redis module on Ubuntu 10.04 (python-redis 0.6.1) does not
                    # return a dict for these keys but a string: keys=3,expires=0
                    # Try to parse it (see lighthouse #46)
                    try:
                        val = info[key].get(subkey, -1)
                    except AttributeError:
                        val = self._parse_dict_string(info[key], subkey, -1)
                    metric = 'redis.{}'.format(subkey)
                    self.gauge(metric, val, tags=db_tags)

        # Save a subset of db-wide statistics
        for info_name in info:
            if info_name in self.GAUGE_KEYS:
                self.gauge(self.GAUGE_KEYS[info_name], info[info_name], tags=tags)
            elif info_name in self.RATE_KEYS:
                self.rate(self.RATE_KEYS[info_name], info[info_name], tags=tags)

        # Save the number of commands.
        self.rate('redis.net.commands', info['total_commands_processed'], tags=tags)
        if 'instantaneous_ops_per_sec' in info:
            self.gauge('redis.net.instantaneous_ops_per_sec', info['instantaneous_ops_per_sec'], tags=tags)

        # Check some key lengths if asked
        self._check_key_lengths(conn, instance, list(tags))

        # Check replication
        self._check_replication(info, tags)
        if instance.get("command_stats", False):
            self._check_command_stats(conn, tags)
    def check(self, instance):
        """
        Returns a dictionary that looks a lot like what's sent back by
        db.serverStatus()
        """

        def total_seconds(td):
            """
            Returns total seconds of a timedelta in a way that's safe for
            Python < 2.7
            """
            if hasattr(td, 'total_seconds'):
                return td.total_seconds()
            else:
                return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10 ** 6) / 10.0 ** 6

        if 'server' not in instance:
            raise Exception("Missing 'server' in mongo config")

        # x.509 authentication
        ssl_params = {
            'ssl': instance.get('ssl', None),
            'ssl_keyfile': instance.get('ssl_keyfile', None),
            'ssl_certfile': instance.get('ssl_certfile', None),
            'ssl_cert_reqs': instance.get('ssl_cert_reqs', None),
            'ssl_ca_certs': instance.get('ssl_ca_certs', None),
        }

        for key, param in list(iteritems(ssl_params)):
            if param is None:
                del ssl_params[key]

        server = instance['server']
        username, password, db_name, nodelist, clean_server_name, auth_source = self._parse_uri(
            server, sanitize_username=bool(ssl_params)
        )

        additional_metrics = instance.get('additional_metrics', [])

        # Get the list of metrics to collect
        collect_tcmalloc_metrics = 'tcmalloc' in additional_metrics
        metrics_to_collect = self._get_metrics_to_collect(server, additional_metrics)

        # Tagging
        tags = instance.get('tags', [])
        # ...de-dupe tags to avoid a memory leak
        tags = list(set(tags))

        if not db_name:
            self.log.info('No MongoDB database found in URI. Defaulting to admin.')
            db_name = 'admin'

        service_check_tags = ["db:%s" % db_name]
        service_check_tags.extend(tags)

        # ...add the `server` tag to the metrics' tags only
        # (it's added in the backend for service checks)
        tags.append('server:%s' % clean_server_name)

        if nodelist:
            host = nodelist[0][0]
            port = nodelist[0][1]
            service_check_tags = service_check_tags + ["host:%s" % host, "port:%s" % port]

        timeout = float(instance.get('timeout', DEFAULT_TIMEOUT)) * 1000
        try:
            cli = pymongo.mongo_client.MongoClient(
                server,
                socketTimeoutMS=timeout,
                connectTimeoutMS=timeout,
                serverSelectionTimeoutMS=timeout,
                read_preference=pymongo.ReadPreference.PRIMARY_PREFERRED,
                **ssl_params
            )
            # some commands can only go against the admin DB
            admindb = cli['admin']
            db = cli[db_name]
        except Exception:
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=service_check_tags)
            raise

        # Authenticate
        do_auth = True
        use_x509 = ssl_params and not password

        if not username:
            self.log.debug(u"A username is required to authenticate to `%s`", server)
            do_auth = False

        if do_auth:
            if auth_source:
                msg = "authSource was specified in the the server URL: using '%s' as the authentication database"
                self.log.info(msg, auth_source)
                self._authenticate(
                    cli[auth_source], username, password, use_x509, clean_server_name, service_check_tags
                )
            else:
                self._authenticate(db, username, password, use_x509, clean_server_name, service_check_tags)

        try:
            status = db.command('serverStatus', tcmalloc=collect_tcmalloc_metrics)
        except Exception:
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.CRITICAL, tags=service_check_tags)
            raise
        else:
            self.service_check(self.SERVICE_CHECK_NAME, AgentCheck.OK, tags=service_check_tags)

        if status['ok'] == 0:
            raise Exception(str(status['errmsg']))

        ops = db.current_op()
        status['fsyncLocked'] = 1 if ops.get('fsyncLock') else 0

        status['stats'] = db.command('dbstats')
        dbstats = {db_name: {'stats': status['stats']}}

        # Handle replica data, if any
        # See
        # http://www.mongodb.org/display/DOCS/Replica+Set+Commands#ReplicaSetCommands-replSetGetStatus  # noqa
        if is_affirmative(instance.get('replica_check', True)):
            try:
                data = {}

                replSet = admindb.command('replSetGetStatus')
                if replSet:
                    primary = None
                    current = None

                    # need a new connection to deal with replica sets
                    setname = replSet.get('set')
                    cli_rs = pymongo.mongo_client.MongoClient(
                        server,
                        socketTimeoutMS=timeout,
                        connectTimeoutMS=timeout,
                        serverSelectionTimeoutMS=timeout,
                        replicaset=setname,
                        read_preference=pymongo.ReadPreference.NEAREST,
                        **ssl_params
                    )

                    if do_auth:
                        if auth_source:
                            self._authenticate(
                                cli_rs[auth_source], username, password, use_x509, server, service_check_tags
                            )
                        else:
                            self._authenticate(
                                cli_rs[db_name], username, password, use_x509, server, service_check_tags
                            )

                    # Replication set information
                    replset_name = replSet['set']
                    replset_state = self.get_state_name(replSet['myState']).lower()

                    tags.extend([u"replset_name:{0}".format(replset_name), u"replset_state:{0}".format(replset_state)])

                    # Find nodes: master and current node (ourself)
                    for member in replSet.get('members'):
                        if member.get('self'):
                            current = member
                        if int(member.get('state')) == 1:
                            primary = member

                    # Compute a lag time
                    if current is not None and primary is not None:
                        if 'optimeDate' in primary and 'optimeDate' in current:
                            lag = primary['optimeDate'] - current['optimeDate']
                            data['replicationLag'] = total_seconds(lag)

                    if current is not None:
                        data['health'] = current['health']

                    data['state'] = replSet['myState']

                    if current is not None:
                        total = 0.0
                        cfg = cli_rs['local']['system.replset'].find_one()
                        for member in cfg.get('members'):
                            total += member.get('votes', 1)
                            if member['_id'] == current['_id']:
                                data['votes'] = member.get('votes', 1)
                        data['voteFraction'] = data['votes'] / total

                    status['replSet'] = data

                    # Submit events
                    self._report_replica_set_state(data['state'], clean_server_name, replset_name)

            except Exception as e:
                if "OperationFailure" in repr(e) and (
                    "not running with --replSet" in str(e) or "replSetGetStatus" in str(e)
                ):
                    pass
                else:
                    raise e

        # If these keys exist, remove them for now as they cannot be serialized
        try:
            status['backgroundFlushing'].pop('last_finished')
        except KeyError:
            pass
        try:
            status.pop('localTime')
        except KeyError:
            pass

        dbnames = cli.database_names()
        self.gauge('mongodb.dbs', len(dbnames), tags=tags)

        for db_n in dbnames:
            db_aux = cli[db_n]
            dbstats[db_n] = {'stats': db_aux.command('dbstats')}

        # Go through the metrics and save the values
        for metric_name in metrics_to_collect:
            # each metric is of the form: x.y.z with z optional
            # and can be found at status[x][y][z]
            value = status

            if metric_name.startswith('stats'):
                continue
            else:
                try:
                    for c in metric_name.split("."):
                        value = value[c]
                except KeyError:
                    continue

            # value is now status[x][y][z]
            if not isinstance(value, (int, long, float)):
                raise TypeError(
                    u"{0} value is a {1}, it should be an int, a float or a long instead.".format(
                        metric_name, type(value)
                    )
                )

            # Submit the metric
            submit_method, metric_name_alias = self._resolve_metric(metric_name, metrics_to_collect)
            submit_method(self, metric_name_alias, value, tags=tags)

        for st, value in iteritems(dbstats):
            for metric_name in metrics_to_collect:
                if not metric_name.startswith('stats.'):
                    continue

                try:
                    val = value['stats'][metric_name.split('.')[1]]
                except KeyError:
                    continue

                # value is now status[x][y][z]
                if not isinstance(val, (int, long, float)):
                    raise TypeError(
                        u"{0} value is a {1}, it should be an int, a float or a long instead.".format(
                            metric_name, type(val)
                        )
                    )

                # Submit the metric
                metrics_tags = tags + [
                    u"cluster:db:{0}".format(st),  # FIXME 6.0 - keep for backward compatibility
                    u"db:{0}".format(st),
                ]

                submit_method, metric_name_alias = self._resolve_metric(metric_name, metrics_to_collect)
                submit_method(self, metric_name_alias, val, tags=metrics_tags)

        if is_affirmative(instance.get('collections_indexes_stats')):
            mongo_version = cli.server_info().get('version', '0.0')
            if LooseVersion(mongo_version) >= LooseVersion("3.2"):
                self._collect_indexes_stats(instance, db, tags)
            else:
                msg = "'collections_indexes_stats' is only available starting from mongo 3.2: your mongo version is %s"
                self.log.error(msg, mongo_version)

        # Report the usage metrics for dbs/collections
        if 'top' in additional_metrics:
            try:
                dbtop = admindb.command('top')
                for ns, ns_metrics in iteritems(dbtop['totals']):
                    if "." not in ns:
                        continue

                    # configure tags for db name and collection name
                    dbname, collname = ns.split(".", 1)
                    ns_tags = tags + ["db:%s" % dbname, "collection:%s" % collname]

                    # iterate over DBTOP metrics
                    for m in self.TOP_METRICS:
                        # each metric is of the form: x.y.z with z optional
                        # and can be found at ns_metrics[x][y][z]
                        value = ns_metrics
                        try:
                            for c in m.split("."):
                                value = value[c]
                        except Exception:
                            continue

                        # value is now status[x][y][z]
                        if not isinstance(value, (int, long, float)):
                            raise TypeError(
                                u"{0} value is a {1}, it should be an int, a float or a long instead.".format(
                                    m, type(value)
                                )
                            )

                        # Submit the metric
                        submit_method, metric_name_alias = self._resolve_metric(m, metrics_to_collect, prefix="usage")
                        submit_method(self, metric_name_alias, value, tags=ns_tags)
                        # Keep old incorrect metric
                        if metric_name_alias.endswith('countps'):
                            GAUGE(self, metric_name_alias[:-2], value, tags=ns_tags)
            except Exception as e:
                self.log.warning('Failed to record `top` metrics: %s', e)

        if 'local' in dbnames:  # it might not be if we are connecting through mongos
            # Fetch information analogous to Mongo's db.getReplicationInfo()
            localdb = cli['local']

            oplog_data = {}

            for ol_collection_name in ("oplog.rs", "oplog.$main"):
                ol_options = localdb[ol_collection_name].options()
                if ol_options:
                    break

            if ol_options:
                try:
                    oplog_data['logSizeMB'] = round_value(ol_options['size'] / 2.0 ** 20, 2)

                    oplog = localdb[ol_collection_name]

                    oplog_data['usedSizeMB'] = round_value(
                        localdb.command("collstats", ol_collection_name)['size'] / 2.0 ** 20, 2
                    )

                    op_asc_cursor = oplog.find({"ts": {"$exists": 1}}).sort("$natural", pymongo.ASCENDING).limit(1)
                    op_dsc_cursor = oplog.find({"ts": {"$exists": 1}}).sort("$natural", pymongo.DESCENDING).limit(1)

                    try:
                        first_timestamp = op_asc_cursor[0]['ts'].as_datetime()
                        last_timestamp = op_dsc_cursor[0]['ts'].as_datetime()
                        oplog_data['timeDiff'] = total_seconds(last_timestamp - first_timestamp)
                    except (IndexError, KeyError):
                        # Ignore the case where the oplog collection has no entries,
                        # or where an entry is missing its ts value.
                        pass
                except KeyError:
                    # encountered an error trying to access options.size for the oplog collection
                    self.log.warning(u"Failed to record `ReplicationInfo` metrics.")

            for m, value in iteritems(oplog_data):
                submit_method, metric_name_alias = self._resolve_metric('oplog.%s' % m, metrics_to_collect)
                submit_method(self, metric_name_alias, value, tags=tags)

        else:
            self.log.debug('"local" database not in dbnames. Not collecting ReplicationInfo metrics')

        # get collection level stats
        try:
            # Ensure that you're on the right db
            db = cli[db_name]
            # grab the collections from the configuration
            coll_names = instance.get('collections', [])
            # loop through the collections
            for coll_name in coll_names:
                # grab the stats from the collection
                stats = db.command("collstats", coll_name)
                # loop through the metrics
                for m in self.collection_metrics_names:
                    coll_tags = tags + ["db:%s" % db_name, "collection:%s" % coll_name]
                    value = stats.get(m, None)
                    if not value:
                        continue

                    # if it's the index sizes, then it's a dict.
                    if m == 'indexSizes':
                        submit_method, metric_name_alias = self._resolve_metric(
                            'collection.%s' % m, self.COLLECTION_METRICS
                        )
                        # loop through the indexes
                        for idx, val in iteritems(value):
                            # we tag the index
                            idx_tags = coll_tags + ["index:%s" % idx]
                            submit_method(self, metric_name_alias, val, tags=idx_tags)
                    else:
                        submit_method, metric_name_alias = self._resolve_metric(
                            'collection.%s' % m, self.COLLECTION_METRICS
                        )
                        submit_method(self, metric_name_alias, value, tags=coll_tags)
        except Exception as e:
            self.log.warning(u"Failed to record `collection` metrics.")
            self.log.exception(e)

        custom_queries = instance.get("custom_queries", [])
        custom_query_tags = tags + ["db:{}".format(db_name)]
        for raw_query in custom_queries:
            try:
                self._collect_custom_metrics_for_query(db, raw_query, custom_query_tags)
            except Exception as e:
                metric_prefix = raw_query.get('metric_prefix')
                self.log.warning("Errors while collecting custom metrics with prefix %s", metric_prefix, exc_info=e)
    def submit_perf_metrics(self, instance, container_tags, container_id,
                            container_stats):
        try:
            if container_stats is None:
                self.log.debug("Empty stats for container %s", container_id)
                return

            tags = container_tags[container_id]

            # CPU metrics
            cpu_stats = container_stats.get('cpu_stats', {})
            prev_cpu_stats = container_stats.get('precpu_stats', {})

            value_system = cpu_stats.get('system_cpu_usage')
            if value_system is not None:
                self.rate('ecs.fargate.cpu.system', value_system, tags)

            value_total = cpu_stats.get('cpu_usage', {}).get('total_usage')
            if value_total is not None:
                self.rate('ecs.fargate.cpu.user', value_total, tags)

            prevalue_total = prev_cpu_stats.get('cpu_usage',
                                                {}).get('total_usage')
            prevalue_system = prev_cpu_stats.get('system_cpu_usage')

            if prevalue_system is not None and prevalue_total is not None:
                cpu_delta = float(value_total) - float(prevalue_total)
                system_delta = float(value_system) - float(prevalue_system)
            else:
                cpu_delta = 0.0
                system_delta = 0.0

            active_cpus = float(cpu_stats.get('online_cpus', 0.0))

            cpu_percent = 0.0
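            # A worked example with hypothetical values: cpu_delta = 5e8 ns of
            # container CPU over system_delta = 2e9 ns of host CPU with
            # active_cpus = 2 gives (5e8 / 2e9) * 2 * 100.0 == 50.0 percent.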
            if system_delta > 0 and cpu_delta > 0 and active_cpus > 0:
                cpu_percent = (cpu_delta / system_delta) * active_cpus * 100.0
                cpu_percent = round_value(cpu_percent, 2)
                self.gauge('ecs.fargate.cpu.percent', cpu_percent, tags)

            # Memory metrics
            memory_stats = container_stats.get('memory_stats', {})

            for metric in MEMORY_GAUGE_METRICS:
                value = memory_stats.get('stats', {}).get(metric)
                if value is not None and value < CGROUP_NO_VALUE:
                    self.gauge('ecs.fargate.mem.' + metric, value, tags)
            for metric in MEMORY_RATE_METRICS:
                value = memory_stats.get('stats', {}).get(metric)
                if value is not None:
                    self.rate('ecs.fargate.mem.' + metric, value, tags)

            value = memory_stats.get('max_usage')
            if value is not None:
                self.gauge('ecs.fargate.mem.max_usage', value, tags)

            value = memory_stats.get('usage')
            if value is not None:
                self.gauge('ecs.fargate.mem.usage', value, tags)

            value = memory_stats.get('limit')
            if value is not None:
                self.gauge('ecs.fargate.mem.limit', value, tags)

            # I/O metrics
            for blkio_cat, metric_name in iteritems(IO_METRICS):
                read_counter = write_counter = 0
                for blkio_stat in container_stats.get("blkio_stats",
                                                      {}).get(blkio_cat, []):
                    if blkio_stat["op"] == "Read" and "value" in blkio_stat:
                        read_counter += blkio_stat["value"]
                    elif blkio_stat["op"] == "Write" and "value" in blkio_stat:
                        write_counter += blkio_stat["value"]
                self.rate(metric_name + 'read', read_counter, tags)
                self.rate(metric_name + 'write', write_counter, tags)

        except Exception as e:
            self.warning("Cannot retrieve metrics for %s: %s", container_id, e)
    def test_round_modify_sig_digits(self):
        assert round_value(2.555, precision=2) == 2.560
        assert round_value(4.2345, precision=2) == 4.23
        assert round_value(4.2345, precision=3) == 4.235

    def test_round_modify_method(self):
        assert round_value(3.5, rounding_method=ROUND_HALF_DOWN) == 3.0

    def test_round_half_up(self):
        assert round_value(3.5) == 4.0
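
# The tests above pin down round_value's contract: decimal precision, a
# configurable rounding method, and ROUND_HALF_UP by default. A minimal
# stdlib-only sketch of such a helper (an assumption for illustration, not
# necessarily the shipped implementation) could look like this:

from decimal import ROUND_HALF_UP, Decimal


def round_value(value, precision=0, rounding_method=ROUND_HALF_UP):
    # Quantize to `precision` decimal places, e.g. 2.555 -> 2.56 with
    # precision=2 under the default half-up rounding.
    exponent = Decimal(1).scaleb(-precision)  # 10 ** -precision
    return float(Decimal(str(value)).quantize(exponent, rounding=rounding_method))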
    def _check_db(self):
        conn = self._get_conn(self.instance)
        # Ping the database for info, and track the latency.
        # Process the service check: the check passes if we can connect to Redis
        start = time.time()
        try:
            info = conn.info()
            latency_ms = round_value((time.time() - start) * 1000, 2)

            tags = list(self.tags)
            if info.get("role"):
                tags.append("redis_role:{}".format(info["role"]))
            else:
                self.log.debug("Redis role was not found")

            self.gauge('redis.info.latency_ms', latency_ms, tags=tags)
            try:
                config = conn.config_get("maxclients")
            except redis.ResponseError:
                # config_get is disabled on some environments
                self.log.debug("Error querying config")
                config = {}
            status = AgentCheck.OK
            self.service_check('redis.can_connect', status, tags=tags)
            self._collect_metadata(info)
        except ValueError:
            status = AgentCheck.CRITICAL
            self.service_check('redis.can_connect', status, tags=self.tags)
            raise
        except Exception:
            status = AgentCheck.CRITICAL
            self.service_check('redis.can_connect', status, tags=self.tags)
            raise

        # Save the database statistics.
        for key in info.keys():
            if self.db_key_pattern.match(key):
                db_tags = tags + ["redis_db:" + key]
                # allows tracking the percentage of expired keys, as DD does not
                # currently allow arithmetic on metrics for monitoring
                expires_keys = info[key]["expires"]
                total_keys = info[key]["keys"]
                persist_keys = total_keys - expires_keys
                self.gauge("redis.persist", persist_keys, tags=db_tags)
                self.gauge("redis.persist.percent",
                           100 * persist_keys / total_keys,
                           tags=db_tags)
                self.gauge("redis.expires.percent",
                           100 * expires_keys / total_keys,
                           tags=db_tags)

                for subkey in self.subkeys:
                    # The old redis module on Ubuntu 10.04 (python-redis 0.6.1) does not
                    # return a dict for these keys but a string: keys=3,expires=0
                    # Try to parse it (see lighthouse #46)
                    try:
                        val = info[key].get(subkey, -1)
                    except AttributeError:
                        val = self._parse_dict_string(info[key], subkey, -1)
                    metric = 'redis.{}'.format(subkey)
                    self.gauge(metric, val, tags=db_tags)

        # Save a subset of db-wide statistics
        for info_name in info:
            if info_name in self.GAUGE_KEYS:
                self.gauge(self.GAUGE_KEYS[info_name],
                           info[info_name],
                           tags=tags)
            elif info_name in self.RATE_KEYS:
                self.rate(self.RATE_KEYS[info_name],
                          info[info_name],
                          tags=tags)

        for config_key, value in iteritems(config):
            metric_name = self.CONFIG_GAUGE_KEYS.get(config_key)
            if metric_name is not None:
                self.gauge(metric_name, value, tags=tags)

        if self.collect_client_metrics:
            # Save client connections statistics
            clients = conn.client_list()
            clients_by_name = Counter(client["name"] or DEFAULT_CLIENT_NAME
                                      for client in clients)
            for name, count in clients_by_name.items():
                self.gauge("redis.net.connections",
                           count,
                           tags=tags + ['source:' + name])

        # Save the number of commands.
        self.rate('redis.net.commands',
                  info['total_commands_processed'],
                  tags=tags)
        if 'instantaneous_ops_per_sec' in info:
            self.gauge('redis.net.instantaneous_ops_per_sec',
                       info['instantaneous_ops_per_sec'],
                       tags=tags)

        # Check some key lengths if asked
        self._check_key_lengths(conn, list(tags))

        # Check replication
        self._check_replication(info, tags)
        if self.instance.get("command_stats", False):
            self._check_command_stats(conn, tags)
    def submit_perf_metrics(self, container_tags, container_id,
                            container_stats):
        try:
            if container_stats is None:
                self.log.debug("Empty stats for container %s", container_id)
                return

            tags = container_tags[container_id]

            # CPU metrics
            cpu_stats = container_stats.get('cpu_stats', {})
            prev_cpu_stats = container_stats.get('precpu_stats', {})

            value_system = cpu_stats.get('cpu_usage',
                                         {}).get('usage_in_kernelmode')
            if value_system is not None:
                self.rate('ecs.fargate.cpu.system', value_system, tags)

            value_user = cpu_stats.get('cpu_usage',
                                       {}).get('usage_in_usermode')
            if value_user is not None:
                self.rate('ecs.fargate.cpu.user', value_user, tags)

            value_total = cpu_stats.get('cpu_usage', {}).get('total_usage')
            if value_total is not None:
                self.rate('ecs.fargate.cpu.usage', value_total, tags)

            available_cpu = cpu_stats.get('system_cpu_usage')
            preavailable_cpu = prev_cpu_stats.get('system_cpu_usage')
            prevalue_total = prev_cpu_stats.get('cpu_usage',
                                                {}).get('total_usage')

            # This is always false on Windows because the available cpu is not exposed
            if (available_cpu is not None and preavailable_cpu is not None
                    and value_total is not None
                    and prevalue_total is not None):
                cpu_delta = float(value_total) - float(prevalue_total)
                system_delta = float(available_cpu) - float(preavailable_cpu)
            else:
                cpu_delta = 0.0
                system_delta = 0.0

            # Not reported on Windows
            active_cpus = float(cpu_stats.get('online_cpus', 0.0))

            cpu_percent = 0.0
            if system_delta > 0 and cpu_delta > 0 and active_cpus > 0:
                if system_delta > cpu_delta:
                    cpu_percent = (cpu_delta /
                                   system_delta) * active_cpus * 100.0
                    cpu_percent = round_value(cpu_percent, 2)
                    self.gauge('ecs.fargate.cpu.percent', cpu_percent, tags)
                else:
                    # There is a bug where container CPU usage is occasionally reported as greater than system
                    # CPU usage (which, in fact, represents the maximum available CPU time during this timeframe),
                    # leading to a nonsensical CPU percentage being reported. To mitigate this we substitute
                    # system_delta with (t1 - t0)*active_cpus (with a scale factor to convert to nanoseconds)

                    self.log.debug(
                        "Anomalous CPU value for container_id: %s. cpu_percent: %f",
                        container_id,
                        cpu_percent,
                    )
                    self.log.debug(
                        "ECS container_stats for container_id %s: %s",
                        container_id, container_stats)

                    # example format: '2021-09-22T04:55:52.490012924Z',
                    t1 = container_stats.get('read', '')
                    t0 = container_stats.get('preread', '')
                    try:
                        t_delta = int((parser.isoparse(t1) -
                                       parser.isoparse(t0)).total_seconds())
                        # Simplified formula for cpu_percent where system_delta = t_delta * active_cpus * (10 ** 9)
                        cpu_percent = (cpu_delta / (t_delta * (10**9))) * 100.0
                        cpu_percent = round_value(cpu_percent, 2)
                        self.gauge('ecs.fargate.cpu.percent', cpu_percent,
                                   tags)
                    except ValueError:
                        pass

            # Memory metrics
            memory_stats = container_stats.get('memory_stats', {})

            for metric in MEMORY_GAUGE_METRICS:
                value = memory_stats.get('stats', {}).get(metric)
                if value is not None and value < CGROUP_NO_VALUE:
                    self.gauge('ecs.fargate.mem.' + metric, value, tags)
            for metric in MEMORY_RATE_METRICS:
                value = memory_stats.get('stats', {}).get(metric)
                if value is not None:
                    self.rate('ecs.fargate.mem.' + metric, value, tags)

            value = memory_stats.get('max_usage')
            if value is not None:
                self.gauge('ecs.fargate.mem.max_usage', value, tags)

            value = memory_stats.get('usage')
            if value is not None:
                self.gauge('ecs.fargate.mem.usage', value, tags)

            value = memory_stats.get('limit')
            # When there is no hard limit defined, the ECS API returns a value of 8 EiB.
            # It's not exactly 2^63, but a rounded value of it, most probably because of an int->float->int conversion
            if value is not None and value != 9223372036854771712:
                self.gauge('ecs.fargate.mem.limit', value, tags)

            # I/O metrics
            for blkio_cat, metric_name in iteritems(IO_METRICS):
                read_counter = write_counter = 0

                blkio_stats = container_stats.get("blkio_stats",
                                                  {}).get(blkio_cat)
                # On Windows this is always the string "None", so don't report anything
                if blkio_stats == 'None':
                    continue
                elif blkio_stats is None:
                    blkio_stats = []

                for blkio_stat in blkio_stats:
                    if blkio_stat["op"] == "Read" and "value" in blkio_stat:
                        read_counter += blkio_stat["value"]
                    elif blkio_stat["op"] == "Write" and "value" in blkio_stat:
                        write_counter += blkio_stat["value"]
                self.rate(metric_name + 'read', read_counter, tags)
                self.rate(metric_name + 'write', write_counter, tags)

            # Network metrics
            networks = container_stats.get('networks', {})
            for network_interface, network_stats in iteritems(networks):
                network_tags = tags + [
                    "interface:{}".format(network_interface)
                ]
                for field_name, metric_name in iteritems(
                        NETWORK_GAUGE_METRICS):
                    metric_value = network_stats.get(field_name)
                    if metric_value is not None:
                        self.gauge(metric_name, metric_value, network_tags)
                for field_name, metric_name in iteritems(NETWORK_RATE_METRICS):
                    metric_value = network_stats.get(field_name)
                    if metric_value is not None:
                        self.rate(metric_name, metric_value, network_tags)

        except Exception as e:
            self.warning("Cannot retrieve metrics for %s: %s", container_id, e)