Example #1
    def time(self, key, time):
        """ Timer metric
        """
        prometheus_histogram = Histogram(  # pylint: disable=no-value-for-parameter
            key
        )
        prometheus_histogram.observe(time)
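As written, this call would fail at runtime: prometheus_client's Histogram constructor requires both a metric name and a documentation string, which is what the pylint no-value-for-parameter suppression is hiding. A minimal standalone sketch of the same idea, with an illustrative metric name:

from prometheus_client import Histogram

REQUEST_SECONDS = Histogram("request_seconds", "Time spent handling a request")
REQUEST_SECONDS.observe(0.3)  # record one observation, in seconds

In practice the Histogram is created once at module level and observe() is called per event; constructing a new collector with the same name on every call would raise a duplicated-timeseries error from the default registry.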
Example #2
    def test_histogram(self):
        s = Histogram("hh", "A histogram", registry=self.registry)
        s.observe(0.05)
        self.assertEqual(
            b"""# HELP hh A histogram
# TYPE hh histogram
hh_bucket{le="0.005"} 0.0
hh_bucket{le="0.01"} 0.0
hh_bucket{le="0.025"} 0.0
hh_bucket{le="0.05"} 1.0
hh_bucket{le="0.075"} 1.0
hh_bucket{le="0.1"} 1.0
hh_bucket{le="0.25"} 1.0
hh_bucket{le="0.5"} 1.0
hh_bucket{le="0.75"} 1.0
hh_bucket{le="1.0"} 1.0
hh_bucket{le="2.5"} 1.0
hh_bucket{le="5.0"} 1.0
hh_bucket{le="7.5"} 1.0
hh_bucket{le="10.0"} 1.0
hh_bucket{le="+Inf"} 1.0
hh_count 1.0
hh_sum 0.05
""",
            generate_latest(self.registry),
        )
    def test_customize_reducer(self):
        h = Histogram('test_value', 'Testing roller', registry=self.registry)
        roller_max = HistogramRoller(h, registry=self.registry, options={
            'reducer': 'max'
        })
        roller_min = HistogramRoller(h, registry=self.registry, options={
            'reducer': 'sum'
        })

        def always_one(*args, **kwargs):
            return 1
        roller_one = HistogramRoller(h, registry=self.registry, options={
            'reducer': always_one
        })


        for state in [2.6, 4.7, 3.8, 2.8]:
            h.observe(state)
            roller_max.collect()
            roller_min.collect()
            roller_one.collect()

        # Deltas = 1, 1, 1
        nchecks = 0
        for m in self.registry.collect():
            if m.name.endswith('max_rolled'):
                for name, labels, val in m.samples:
                    if labels['le'] == '5.0':
                        nchecks += 1
                        self.assertEqual(val, 1.0)
        self.assertTrue(nchecks > 0)

        nchecks = 0
        for m in self.registry.collect():
            if m.name.endswith('sum_rolled'):
                for name, labels, val in m.samples:
                    if labels['le'] == '5.0':
                        self.assertEqual(val, 3.0)
                        nchecks += 1
        self.assertTrue(nchecks > 0)

        nchecks = 0
        for m in self.registry.collect():
            if m.name.endswith('always_one_rolled'):
                for name, labels, val in m.samples:
                    if labels['le'] == '5.0':
                        self.assertEqual(val, 1.0)
                        nchecks += 1
        self.assertTrue(nchecks > 0)
    def test_collect(self):
        h = Histogram('test_value', 'Testing roller', registry=self.registry)
        roller = HistogramRoller(h, registry=self.registry)

        # Get values
        roller.collect()

        n_buckets = 0
        for _, _, _ in self.get_hist_samples():
            n_buckets += 1

        n_created_gauges = 0
        for _, _, _ in self.get_rolled_samples():
            n_created_gauges += 1

        self.assertTrue(n_buckets > 0)
        self.assertTrue(n_created_gauges > 0)
        self.assertEqual(n_buckets, n_created_gauges)

        # Check that roller values are still 0.0 after initial collection
        for name, labels, value in self.get_rolled_samples():
            self.assertEqual(value, 0.0)

        # Add some samples
        for i in range(100):
            h.observe(pow(2, i/10 - 2))

        # Collect histogram values
        hist_values = dict()
        for name, labels, value in self.get_hist_samples():
            hist_values[labels['le']] = value

        # Make sure they are still equal after collection
        for name, labels, value in self.get_rolled_samples():
            self.assertEqual(value, 0.0)

        roller.collect()

        for name, labels, value in self.get_rolled_samples():
            self.assertEqual(value, hist_values[labels['le']])
Example #5
them manually here.
"""
from enum import Enum

from prometheus_client import Histogram

REQUEST_DURATION_SECONDS = Histogram(
    'request_duration_seconds',
    'request duration for all HTTP requests',
    ['method', 'handler', 'code']
)

SERVER_SPAWN_DURATION_SECONDS = Histogram(
    'server_spawn_duration_seconds',
    'time taken for server spawning operation',
    ['status'],
    # Use custom bucket sizes, since the default bucket ranges
    # are meant for quick running processes. Spawns can take a while!
    buckets=[0.5, 1, 2.5, 5, 10, 15, 30, 60, 120, float("inf")]
)

class ServerSpawnStatus(Enum):
    """
    Possible values for 'status' label of SERVER_SPAWN_DURATION_SECONDS
    """
    success = 'success'
    failure = 'failure'
    already_pending = 'already-pending'
    throttled = 'throttled'
    too_many_users = 'too-many-users'

    def __str__(self):
Example #6
from functools import partial
from time import time  # assumed import: the middleware below calls time(), but the original import was lost in extraction
from prometheus_client import Counter, Histogram

BUCKETS = (0.01, 0.05, 0.1, 0.5, 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 15.0, 20.0,
           30.0)

requests_total = Counter(namespace='aiohttp',
                         subsystem='http',
                         name='requests_total',
                         documentation='Asyncio total Request Count',
                         labelnames=['method', 'handler', 'status'])

request_duration = Histogram(
    namespace='aiohttp',
    subsystem='http',
    name='request_duration_seconds',
    documentation='Request latency',
    labelnames=['method', 'handler'],
    buckets=BUCKETS,
)


class MetricsMiddleware:
    def __init__(self):
        pass

    async def __call__(self, app, handler):
        return partial(self.middleware, handler)

    async def middleware(self, handler, request):
        start_time = time()
        handler_name = handler.__name__
Example #7
from prometheus_client import Counter, Histogram

api_exceptions = Counter("system_baseline_api_exceptions",
                         "count of exceptions raised on public API")

baseline_create_requests = Histogram("baseline_create_requests",
                                     "baseline create request stats")

baseline_fetch_requests = Histogram("baseline_fetch_requests",
                                    "baseline fetch request stats")

baseline_fetch_all_requests = Histogram("baseline_fetch_all_requests",
                                        "baseline fetch all request stats")

baseline_delete_requests = Histogram("baseline_delete_requests",
                                     "baseline delete request stats")

inventory_service_requests = Histogram("drift_inventory_service_requests",
                                       "inventory service call stats")

inventory_service_exceptions = Counter(
    "drift_inventory_service_exceptions",
    "count of exceptions raised by inv service")

systems_compared_no_sysprofile = Histogram(
    "drift_systems_compared_no_sysprofile",
    "count of systems without system profile"
    "compared in each request",
    buckets=[2, 4, 8, 16, 32, 64, 128, 256],
)
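A hedged usage sketch for the metrics above (the handler function and values are illustrative; only the metric objects come from the snippet):

def create_baseline(payload):
    # Histogram.time() works as a decorator or context manager and observes elapsed seconds
    with baseline_create_requests.time():
        ...  # handle the create request here
    # the custom-bucket histogram records a per-request count rather than a duration
    systems_compared_no_sysprofile.observe(12)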
Example #8
##########################
# jinja2 globals
##########################


def version():
    return os.environ.get("VERSION", "dev")[:6]


app.jinja_env.globals.update(version=version)

##########################
# Metrics!
##########################
REQUEST_LATENCY = Histogram("flask_request_latency_seconds", "Request Latency",
                            ['method', 'endpoint'])
REQUEST_COUNT = Counter("flask_request_count", "Request Count",
                        ["method", "endpoint", "status"])


@app.before_request
def start_timer():
    request.stats_start = time()


@app.after_request
def stop_timer(response):
    delta = time() - request.stats_start
    REQUEST_LATENCY.labels(request.method, request.endpoint).observe(delta)  #pylint: disable=no-member
    REQUEST_COUNT.labels(request.method, request.endpoint,
                         response.status_code).inc()  #pylint: disable=no-member
Example #9
def _create_histogram(name, description, value):
    METRICS[name] = METRICS.get(name) or Histogram(name, description)
    histogram = METRICS.get(name)
    histogram.observe(value)
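A minimal sketch of how this get-or-create helper might be wired up; METRICS is assumed to be a module-level dict, as the snippet implies, and the metric name is illustrative:

from prometheus_client import Histogram

METRICS = {}

_create_histogram("db_query_seconds", "Time spent in database queries", 0.12)
_create_histogram("db_query_seconds", "Time spent in database queries", 0.34)  # reuses the cached Histogram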
Example #10
NotificationRecord = collections.namedtuple('NotificationRecord', [
    'subscriptionId',
    'clientState',
    'changeType',
    'resource',
    'dataType',
    'url',
    'id',
])


if PROMETHEUS:
    SUBSCR_COUNT = Counter('kopano_mfr_kopano_total_subscriptions', 'Total number of subscriptions')
    SUBSCR_EXPIRED = Counter('kopano_mfr_kopano_total_expired_subscriptions', 'Total number of subscriptions which expired')
    SUBSCR_ACTIVE = Gauge('kopano_mfr_kopano_active_subscriptions', 'Number of active subscriptions', multiprocess_mode='liveall')
    PROCESSOR_BATCH_HIST = Histogram('kopano_mfr_kopano_webhook_batch_size', 'Number of webhook posts processed in one batch')
    POST_COUNT = Counter('kopano_mfr_kopano_total_webhook_posts', 'Total number of webhook posts')
    POST_ERRORS = Counter('kopano_mfr_kopano_total_webhook_post_errors', 'Total number of webhook post errors')
    POST_HIST = Histogram('kopano_mfr_kopano_webhook_post_duration_seconds', 'Duration of webhook post requests in seconds')
    DANGLING_COUNT = Counter('kopano_mfr_kopano_total_broken_subscription_conns', 'Total number of broken subscription connections')
    QUEUE_SIZE_GAUGE = Gauge('kopano_mfr_kopano_subscription_queue_size', 'Current size of subscriptions processor queue', multiprocess_mode='liveall')
    PROCESSOR_POOL_GAUGE = Gauge('kopano_mfr_kopano_webhook_pools', 'Current number of webhook pools')


class Record:
    """Record binds subscription and conection information per user."""

    def __init__(self, server, user, store, subscriptions):
        """Python built-in method.

        Args:
Example #11
import re

from urllib.parse import urlparse

from ...config import config

from prometheus_client import Summary
from prometheus_client import Histogram
from prometheus_async.aio import time

REQ_TIME = Summary("external_to_internal_req_time",
                   "time spent with external_to_internal endpoint")
REQ_HISTOGRAM_TIME = Histogram("external_to_internal_req_histogram",
                               "Histogram for external_to_internal endpoint")


@time(REQ_TIME)
@time(REQ_HISTOGRAM_TIME)
async def translate(external_to_internal_spec, repo_provider):

    external_url = external_to_internal_spec["external_url"]

    internal_url = await translate_external_to_internal(external_url)

    result = {"external_url": external_url, "internal_url": internal_url}

    return result


async def translate_external_to_internal(external_git_url):
    """ Logic from original maitai code to do this: found in GitUrlParser.java#generateInternalGitRepoName """
Example #12
LOGGER = get_logger(__name__)

VMAAS_HOST = os.getenv('VMAAS_HOST', 'http://vmaas-webapp-1.vmaas-ci.svc:8080')
VMAAS_VULNERABILITIES_API = os.getenv("VMAAS_VULNERABILITIES_API", "/api/v1/vulnerabilities")
vmaas_vulnerabilities_endpoint = "%s%s" % (VMAAS_HOST, VMAAS_VULNERABILITIES_API)  # pylint: disable=invalid-name

kafka_evaluator_topic = os.getenv('EVALUATOR_TOPIC',  # pylint: disable=invalid-name
                                  'vulnerability.evaluator.upload,vulnerability.evaluator.recalc').split(",")
prometheus_port = os.getenv('PROMETHEUS_PORT', '8085')  # pylint: disable=invalid-name
# number of worker threads
WORKER_THREADS = int(os.getenv('WORKER_THREADS', '30'))
MAX_QUEUE_SIZE = int(os.getenv('MAX_QUEUE_SIZE', '30'))

# prometheus probes
# times
VMAAS_EVAL_TIME = Histogram('ve_evaluator_vmaas_evaluation_seconds', 'Time spent checking a system for vmaas hits')
# counts
VMAAS_COUNT = Counter('ve_evaluator_vmaas_calls', 'Number of VMaaS-evaluations attempted')
INV_ID_NOT_FOUND = Counter('ve_evaluator_inventory_not_found', 'Number of times inventory-id not in SystemPlatform')
UNKNOWN_MSG = Counter('ve_evaluator_unknown_msg', 'Number of unrecognized messages delivered from queue')
UNKNOWN_TOPIC = Counter('ve_evaluator_unknown_topic', 'Number of times message delivered from unsupported topic')

CONSUMER_QUEUE = mqueue.MQReader(kafka_evaluator_topic)
WEBHOOKS_QUEUE = mqueue.MQWriter(mqueue.WEBHOOKS_TOPIC)


async def terminate(_, loop):
    """Trigger shutdown."""
    LOGGER.info("Signal received, stopping kafka consumers.")
    await CONSUMER_QUEUE.stop()
    await WEBHOOKS_QUEUE.stop()
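A hedged sketch of how the timing histogram above is typically used; the coroutine, HTTP session and payload are hypothetical, and only the metric objects and endpoint come from the snippet:

async def evaluate_system(session, payload):
    VMAAS_COUNT.inc()
    # Histogram.time() measures wall-clock time around the awaited call
    with VMAAS_EVAL_TIME.time():
        async with session.post(vmaas_vulnerabilities_endpoint, json=payload) as response:
            return await response.json()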
Example #13
import time

from flask import request
from prometheus_client import Counter, Histogram
from prometheus_client import start_http_server, make_wsgi_app
from werkzeug.wsgi import DispatcherMiddleware

FLASK_REQUEST_ENDPOINT_SENTINEL = '-'
FLASK_REQUEST_LATENCY = Histogram('flask_request_latency_seconds',
                                  'Flask Request Latency',
                                  ['method', 'endpoint'])
FLASK_REQUEST_COUNT = Counter('flask_request_count', 'Flask Request Count',
                              ['method', 'endpoint', 'http_status'])


def before_request():
    request.start_time = time.time()


def after_request(response):
    request_latency = time.time() - request.start_time

    endpoint = request.url_rule.rule if request.url_rule else FLASK_REQUEST_ENDPOINT_SENTINEL

    FLASK_REQUEST_LATENCY.labels(request.method,
                                 endpoint).observe(request_latency)
    FLASK_REQUEST_COUNT.labels(request.method, endpoint,
                               response.status_code).inc()

    return response
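The snippet imports make_wsgi_app and DispatcherMiddleware but the application wiring is cut off; a hedged sketch of the usual way the hooks and a /metrics endpoint are attached (in newer Werkzeug releases DispatcherMiddleware lives in werkzeug.middleware.dispatcher):

from flask import Flask
from prometheus_client import make_wsgi_app
from werkzeug.wsgi import DispatcherMiddleware

app = Flask(__name__)
app.before_request(before_request)
app.after_request(after_request)
# mount prometheus_client's WSGI app so Prometheus can scrape /metrics
app.wsgi_app = DispatcherMiddleware(app.wsgi_app, {"/metrics": make_wsgi_app()})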
Example #14
metrics_registry = CollectorRegistry()
multiprocess.MultiProcessCollector(metrics_registry)

APP_INFO = Info("app_info",
                "Application information",
                registry=metrics_registry)
REQUESTS_TOTAL = Counter(
    "http_requests_total",
    "Service Request Count",
    ["method", "endpoint", "http_status"],
    registry=metrics_registry,
)
REQUEST_LATENCY = Histogram(
    "request_latency_ms",
    "Request latency in milliseconds",
    ["method", "endpoint"],
    registry=metrics_registry,
)

RPCS_TOTAL = Counter(
    "rpc_requests_total",
    "Remote procedure call count",
    ["method", "endpoint", "http_status"],
    registry=metrics_registry,
)
RPC_LATENCY = Histogram(
    "rpc_request_latency_ms",
    "Remote procedure call latency in milliseconds",
    ["method", "endpoint", "http_status"],
    registry=metrics_registry,
)
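A hedged sketch of how such a multiprocess registry is usually exposed; the endpoint function is hypothetical, and the multiprocess collector additionally requires the PROMETHEUS_MULTIPROC_DIR (formerly prometheus_multiproc_dir) environment variable to point at a writable directory shared by all workers:

from prometheus_client import generate_latest, CONTENT_TYPE_LATEST

def metrics_endpoint():
    # generate_latest() gathers samples from every worker via the MultiProcessCollector attached above
    return generate_latest(metrics_registry), 200, {"Content-Type": CONTENT_TYPE_LATEST}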
Example #15
    'trace_traceback_number',
    'number of traceback produced by function',
    ['app_name', 'endpoint', 'func'],
    registry=registry,
)

TRACEBACK_COUNTER_FUNC1 = TRACEBACK_COUNTER.labels(app_name='poc_app',
                                                   endpoint='/trace',
                                                   func='tracefail')
TRACEBACK_COUNTER_FUNC2 = TRACEBACK_COUNTER.labels(app_name='poc_app',
                                                   endpoint='/trace2',
                                                   func='tracefail2')

REQUEST_DECORATED = Histogram(
    'root_request_processing_seconds',
    'Time spent processing request',
    ['app_name', 'endpoint'],
    registry=registry,
)

REQUEST_DECORATED_TIME = REQUEST_DECORATED.labels(app_name='poc_app',
                                                  endpoint='/')
REQUEST_DECORATED_TIME2 = REQUEST_DECORATED.labels(app_name='poc_app',
                                                   endpoint='/counter')
REQUEST_DECORATED_TIME3 = REQUEST_DECORATED.labels(app_name='poc_app',
                                                   endpoint='/gauge')
REQUEST_DECORATED_TIME4 = REQUEST_DECORATED.labels(app_name='poc_app',
                                                   endpoint='/trace')
REQUEST_DECORATED_TIME5 = REQUEST_DECORATED.labels(app_name='poc_app',
                                                   endpoint='/trace2')

INFO_TYPE = Info(
Example #16
    def got_histogram_observe(self, name, value):
        if self.check_enabled():
            histogram = self._monitoring_items[self.HISTOGRAM]
            if not histogram.get(name):
                histogram[name] = Histogram(name, name)
            return histogram[name].observe(value)
Example #17
class ZombieCollector(Collector):
    logs_histogram = Histogram(
        "cmd_docker_logs_latency_seconds",
        "Command call latency for docker logs (seconds)")
    logs_timeout = 1  # 99th latency is 0.04s

    zombie_container_count = Gauge(
        "zombie_container_count",
        "number of zombie container found for this node", ["type"])

    class ZombieRecorder(object):
        def __init__(self, type):
            self.type = type
            self.zombies = {
            }  # key is container id, value is enter zombie time

            # When we first see a zombie container we only record the time of that
            # sighting and wait an extra decay_time before reporting it as a zombie.
            # Right after being recorded the container may simply not have been
            # recycled yet, so waiting 5 minutes avoids flagging normal cases.
            self.decay_time = datetime.timedelta(minutes=5)

        def update(self, zombie_ids, now):
            """ feed in new zombie ids and get id of decayed zombie """
            # remove all records that no longer exist
            for z_id in list(self.zombies.keys()):
                if z_id not in zombie_ids:
                    logger.debug("pop zombie %s that not exist anymore", z_id)
                    self.zombies.pop(z_id)

            result = set()
            for current in zombie_ids:
                if current in self.zombies:
                    enter_zombie_time = self.zombies[current]
                    if now - enter_zombie_time > self.decay_time:
                        result.add(current)
                else:
                    logger.debug("new zombie %s", current)
                    self.zombies[current] = now

            ZombieCollector.zombie_container_count.labels(self.type).set(
                len(result))
            return result

        def __len__(self):
            return len(self.zombies)

    def __init__(self, name, sleep_time, atomic_ref, iteration_counter,
                 stats_info_ref, zombie_ids_ref):
        Collector.__init__(self, name, sleep_time, atomic_ref,
                           iteration_counter)
        self.stats_info_ref = stats_info_ref
        self.zombie_ids_ref = zombie_ids_ref

        self.type1_zombies = ZombieCollector.ZombieRecorder("job_exit_hangs")
        self.type2_zombies = ZombieCollector.ZombieRecorder("residual_job")

        self.yarn_pattern = r"container_\w{3}_[0-9]{13}_[0-9]{4}_[0-9]{2}_[0-9]{6}"
        self.yarn_container_reg = re.compile(r"^" + self.yarn_pattern + "$")
        self.job_container_reg = re.compile(r"^.+(" + self.yarn_pattern +
                                            r")$")

    def update_zombie_count_type1(self, exited_containers, now):
        """ this fn will generate zombie container count for the first type,
        exited_containers is container id set of which we believe exited """
        return self.type1_zombies.update(exited_containers, now)

    def update_zombie_count_type2(self, stats, now):
        """ this fn will generate zombie container count for the second type """
        name_to_id = {}
        for info in stats.values():
            name_to_id[info["name"]] = info["id"]

        # key is job name, value is tuple of corresponding
        # yarn_container name and job container id
        job_containers = {}

        yarn_containers = set()

        zombie_ids = set()

        for name, id in name_to_id.items():
            if re.match(self.yarn_container_reg, name) is not None:
                yarn_containers.add(name)
            elif re.match(self.job_container_reg, name) is not None:
                match = re.match(self.job_container_reg, name)
                value = match.groups()[0]
                job_containers[name] = (value, id)
            else:
                pass  # ignore

        for _, val in job_containers.items():
            yarn_name, job_id = val
            if yarn_name not in yarn_containers:
                zombie_ids.add(job_id)

        return self.type2_zombies.update(zombie_ids, now)

    def docker_logs(self, container_id, tail="all"):
        try:
            return utils.exec_cmd(
                ["docker", "logs", "--tail",
                 str(tail),
                 str(container_id)],
                histogram=ZombieCollector.logs_histogram,
                stderr=subprocess.STDOUT,  # also capture stderr output
                timeout=ZombieCollector.logs_timeout)
        except subprocess.TimeoutExpired as e:
            logger.warning("docker log timeout")
        except subprocess.CalledProcessError as e:
            logger.warning("docker logs returns %d, output %s", e.returncode,
                           e.output)
        except Exception:
            logger.exception("exec docker logs error")

        return ""

    def is_container_exited(self, container_id):
        logs = self.docker_logs(container_id, tail=50)
        if re.search(u"USER COMMAND END", logs):
            return True
        return False

    def update_zombie_count(self, stats):
        """
        There are two types of zombie:
            1. container which outputted "USER COMMAND END" but did not exist for a long period of time
            2. yarn container exited but job container didn't
        return set of container id that deemed as zombie
        """
        if stats is None:
            logger.warning("docker stats is None")
            return

        exited_containers = set(filter(self.is_container_exited, stats.keys()))

        now = datetime.datetime.now()
        type1_zombies = self.update_zombie_count_type1(exited_containers, now)
        type2_zombies = self.update_zombie_count_type2(stats, now)
        return type1_zombies.union(type2_zombies)

    def collect_impl(self):
        # set it to None so if docker-stats hangs till next time we get,
        # we will get None
        stats_info = self.stats_info_ref.get(datetime.datetime.now())
        all_zombies = self.update_zombie_count(stats_info)
        self.zombie_ids_ref.set(all_zombies, datetime.datetime.now())
Example #18
class ContainerCollector(Collector):
    stats_histogram = Histogram(
        "cmd_docker_stats_latency_seconds",
        "Command call latency for docker stats (seconds)")
    stats_timeout = 20
    # The 99th-percentile latency may be larger than 10s, but because the largest
    # default prometheus bucket for this histogram is 10s, we cannot record values
    # higher than 10s.

    inspect_histogram = Histogram(
        "cmd_docker_inspect_latency_seconds",
        "Command call latency for docker inspect (seconds)")
    inspect_timeout = 1  # 99th latency is 0.042s

    iftop_histogram = Histogram("cmd_iftop_latency_seconds",
                                "Command call latency for iftop (seconds)")
    iftop_timeout = 10  # 99th latency is 7.4s

    lsof_histogram = Histogram("cmd_lsof_latency_seconds",
                               "Command call latency for lsof (seconds)")
    lsof_timeout = 2  # 99th latency is 0.5s

    pai_services = list(
        map(
            lambda s: "k8s_" + s,
            [
                # Run in master node
                "rest-server",
                "pylon",
                "webportal",
                "grafana",
                "prometheus",
                "alertmanager",
                "watchdog",
                "frameworkcontroller",
                "hivedscheduler",
                "framework-watcher_database-controller",
                "write-merger_database-controller",
                "poller_database-controller",
                "dshuttle-master",
                "dshuttle-job-master",
                "fluentd",
                "postgresql_postgresql",

                # Run as daemon set
                "node-exporter",
                "job-exporter",
                "log-manager-nginx",
                "log-cleaner",
                "dshuttle-worker",
                "dshuttle-job-worker",
                "dshuttle-csi-daemon",
                "weave",
                "weave-npc",
                "nvidia-device-plugin-ctr",
                "k8s-host-device",
                "amdgpu",
                "k8s-rdma",
            ]))

    def __init__(self, name, sleep_time, atomic_ref, iteration_counter,
                 gpu_info_ref, stats_info_ref, interface):
        Collector.__init__(self, name, sleep_time, atomic_ref,
                           iteration_counter)
        self.gpu_info_ref = gpu_info_ref
        self.stats_info_ref = stats_info_ref

        self.network_interface = network.try_to_get_right_interface(interface)
        logger.info(
            "found %s as potential network interface to listen network traffic",
            self.network_interface)

        self.gpu_vendor = utils.get_gpu_vendor()

        # k8s prepends "k8s_" to pod names. There is also a container whose name is
        # prefixed with "k8s_POD_": a docker container used to construct the network
        # & pid namespaces for the real container. Containers prefixed with
        # "k8s_POD" consume nothing.

    def collect_impl(self):
        all_conns = network.iftop(self.network_interface,
                                  ContainerCollector.iftop_histogram,
                                  ContainerCollector.iftop_timeout)

        stats_obj = docker_stats.stats(ContainerCollector.stats_histogram,
                                       ContainerCollector.stats_timeout)

        now = datetime.datetime.now()
        gpu_infos = self.gpu_info_ref.get(now)
        self.stats_info_ref.set(stats_obj, now)

        logger.debug("all_conns is %s", all_conns)
        logger.debug("gpu_info is %s", gpu_infos)
        logger.debug("stats_obj is %s", stats_obj)

        return self.collect_container_metrics(stats_obj, gpu_infos, all_conns)

    @staticmethod
    def parse_from_labels(inspect_info, gpu_infos):
        gpu_ids = []
        result_labels = {}

        result_labels["username"] = inspect_info.username or "unknown"
        result_labels["job_name"] = inspect_info.job_name or "unknown"
        result_labels["role_name"] = inspect_info.role_name or "unknown"
        result_labels["task_index"] = inspect_info.task_index or "unknown"
        result_labels[
            "job_instance_id"] = inspect_info.job_instance_id or "unknown"
        result_labels[
            "virtual_cluster"] = inspect_info.virtual_cluster or "unknown"

        if inspect_info.gpu_ids:
            ids = inspect_info.gpu_ids.replace("\"", "").split(",")
            for id in ids:
                # If the container was scheduled by yarn, we get its GPU usage
                # info from label GPU_ID, value of the label is minor_number, and
                # will be digits.
                # If the container was scheduled by kube launcher, we get its GPU
                # usage info from environment NVIDIA_VISIBLE_DEVICES, the value
                # is like GPU-dc0671b0-61a4-443e-f456-f8fa6359b788. The mapping
                # from uuid to minor_number is obtained via nvidia-smi, and gpu_infos
                # should have this uuid as a key.
                if id.isdigit():
                    gpu_ids.append(id)
                elif id and gpu_infos is not None:
                    # id is in form of UUID like
                    if gpu_infos.get(id) is not None:
                        gpu_ids.append(gpu_infos[id].minor)
                    else:
                        logger.warning(
                            "gpu uuid %s can not be found in map %s", id,
                            gpu_infos)
                else:
                    logger.warning("unknown gpu id %s, gpu_infos is %s", id,
                                   gpu_infos)

        return gpu_ids, result_labels

    @classmethod
    def infer_service_name(cls, container_name):
        """ try to infer service name from container_name, if it's container not belongs
        to pai service, will return None """
        if container_name.startswith("k8s_POD_"):
            # this is empty container created by k8s for pod
            return None

        # TODO speed this up, since this is O(n^2)
        for service_name in cls.pai_services:
            if container_name.startswith(service_name):
                return service_name[4:]  # remove "k8s_" prefix

        return None

    def process_one_container(self, container_id, stats, gpu_infos, all_conns,
                              gauges):
        container_name = utils.walk_json_field_safe(stats, "name")
        pai_service_name = ContainerCollector.infer_service_name(
            container_name)

        inspect_info = docker_inspect.inspect(
            container_id, ContainerCollector.inspect_histogram,
            ContainerCollector.inspect_timeout, self.gpu_vendor)

        pid = inspect_info.pid
        job_name = inspect_info.job_name

        logger.debug("%s has inspect result %s, service_name %s",
                     container_name, inspect_info, pai_service_name)

        if job_name is None and pai_service_name is None:
            logger.debug("%s is ignored", container_name)
            return  # other container, maybe kubelet or api-server

        # Get network consumption. Since all our services/jobs run in the host
        # network, docker's network statistics are not specific to this container,
        # so we have to gather the network statistics ourselves.
        lsof_result = network.lsof(pid, ContainerCollector.lsof_histogram,
                                   ContainerCollector.lsof_timeout)

        net_in, net_out = network.get_container_network_metrics(
            all_conns, lsof_result)
        if logger.isEnabledFor(logging.DEBUG):
            debug_info = utils.exec_cmd(
                "ps -o cmd fp {0} | tail -n 1".format(pid), shell=True)

            logger.debug(
                "pid %s with cmd `%s` has lsof result %s, in %d, out %d", pid,
                debug_info.strip(), lsof_result, net_in, net_out)

        if pai_service_name is None:
            gpu_ids, container_labels = ContainerCollector.parse_from_labels(
                inspect_info, gpu_infos)

            if gpu_infos:
                for id in gpu_ids:
                    if gpu_infos.get(id) is None:
                        continue

                    nvidia_gpu_status = gpu_infos[id]
                    labels = copy.deepcopy(container_labels)
                    labels["minor_number"] = id

                    gauges.add_value("task_gpu_percent", labels,
                                     nvidia_gpu_status.gpu_util)
                    gauges.add_value("task_gpu_mem_percent", labels,
                                     nvidia_gpu_status.gpu_mem_util)

            gauges.add_value("task_cpu_percent", container_labels,
                             stats["CPUPerc"])
            gauges.add_value("task_mem_usage_byte", container_labels,
                             stats["MemUsage_Limit"]["usage"])
            gauges.add_value("task_mem_limit_byte", container_labels,
                             stats["MemUsage_Limit"]["limit"])
            gauges.add_value("task_net_in_byte", container_labels, net_in)
            gauges.add_value("task_net_out_byte", container_labels, net_out)
            gauges.add_value("task_block_in_byte", container_labels,
                             stats["BlockIO"]["in"])
            gauges.add_value("task_block_out_byte", container_labels,
                             stats["BlockIO"]["out"])
            gauges.add_value("task_mem_usage_percent", container_labels,
                             stats["MemPerc"])
        else:
            labels = {"name": pai_service_name}
            gauges.add_value("service_cpu_percent", labels, stats["CPUPerc"])
            gauges.add_value("service_mem_usage_byte", labels,
                             stats["MemUsage_Limit"]["usage"])
            gauges.add_value("service_mem_limit_byte", labels,
                             stats["MemUsage_Limit"]["limit"])
            gauges.add_value("service_mem_usage_percent", labels,
                             stats["MemPerc"])
            gauges.add_value("service_net_in_byte", labels, net_in)
            gauges.add_value("service_net_out_byte", labels, net_out)
            gauges.add_value("service_block_in_byte", labels,
                             stats["BlockIO"]["in"])
            gauges.add_value("service_block_out_byte", labels,
                             stats["BlockIO"]["out"])

    def collect_container_metrics(self, stats_obj, gpu_infos, all_conns):
        if stats_obj is None:
            logger.warning("docker stats returns None")
            return None

        gauges = ResourceGauges()

        for container_id, stats in stats_obj.items():
            try:
                self.process_one_container(container_id, stats, gpu_infos,
                                           all_conns, gauges)
            except Exception:
                logger.exception(
                    "error when trying to process container %s with name %s",
                    container_id, utils.walk_json_field_safe(stats, "name"))

        return gauges.as_array()
Example #19
class GpuCollector(Collector):
    nvidia_cmd_histogram = Histogram(
        "cmd_nvidia_smi_latency_seconds",
        "Command call latency for nvidia-smi (seconds)")
    amd_cmd_histogram = Histogram(
        "cmd_rocm_smi_latency_seconds",
        "Command call latency for rocm-smi (seconds)")

    cmd_timeout = 60  # 99th latency is 0.97s

    def __init__(self, name, sleep_time, atomic_ref, iteration_counter,
                 gpu_info_ref, zombie_info_ref, mem_leak_thrashold):
        Collector.__init__(self, name, sleep_time, atomic_ref,
                           iteration_counter)
        self.gpu_info_ref = gpu_info_ref
        self.zombie_info_ref = zombie_info_ref
        self.mem_leak_thrashold = mem_leak_thrashold
        self.gpu_vendor = utils.get_gpu_vendor()

    @staticmethod
    def get_container_id(pid):
        """ return two values, the first one is if we found the corresponding
        container_id, the second one is the container_id if found """
        path = "/proc/%d/cgroup" % (pid)
        if not os.path.isfile(path):
            return False, ""

        with open(path) as f:
            content = f.read()

        for line in content.split("\n"):
            line = line.strip()
            if "pids" in line:
                if "/docker/" in line:
                    parts = line.split("/docker/")
                    if len(parts) == 2 and re.match(u"[0-9a-f]+", parts[1]):
                        return True, parts[1]
                elif "/kubepods/" in line:
                    parts = line.split("/kubepods/")
                    if len(parts) == 2 and re.match(u"pod[0-9a-f-]+",
                                                    parts[1]):
                        return True, parts[1]
                else:
                    logger.info("unknown format in pid cgroup %s", line)

        return False, ""

    @staticmethod
    def gen_common_gpu_gauge():
        return gen_gpu_util_gauge(), gen_gpu_mem_util_gauge()

    @staticmethod
    def convert_nvidia_gpu_info_to_metrics(
        gpu_info,
        zombie_info,
        pid_to_cid_fn,
        mem_leak_thrashold,
        node_name=os.environ.get("NODE_NAME")):
        """ This fn used to convert gpu_info & zombie_info into metrics, used to make
        it easier to do unit test """
        # common gpu metrics
        gpu_core_util, gpu_mem_util = GpuCollector.gen_common_gpu_gauge()
        # nvidia metrics
        nvidia_core_utils = gen_nvidia_gpu_util_gauge()
        nvidia_mem_utils = gen_nvidia_gpu_mem_util_gauge()
        nvidia_gpu_temp = gen_nvidia_gpu_temperature_gauge()
        nvidia_ecc_errors = gen_nvidia_gpu_ecc_counter()
        nvidia_mem_leak = gen_nvidia_gpu_memory_leak_counter()
        external_process = gen_gpu_used_by_external_process_counter()
        zombie_container = gen_gpu_used_by_zombie_container_counter()

        pids_use_gpu = {}  # key is gpu minor, value is an array of pid

        for minor, info in gpu_info.items():
            if not minor.isdigit():
                continue  # ignore UUID

            gpu_core_util.add_metric([minor, GpuVendor.NVIDIA.value],
                                     info.gpu_util)
            gpu_mem_util.add_metric([minor, GpuVendor.NVIDIA.value],
                                    info.gpu_mem_util)
            nvidia_core_utils.add_metric([minor], info.gpu_util)
            nvidia_mem_utils.add_metric([minor], info.gpu_mem_util)
            if info.temperature is not None:
                nvidia_gpu_temp.add_metric([minor], info.temperature)
            nvidia_ecc_errors.add_metric([node_name, minor, "single"],
                                         info.ecc_errors.single)
            nvidia_ecc_errors.add_metric([node_name, minor, "double"],
                                         info.ecc_errors.double)

            # TODO: this piece of code seems incorrect: gpu_mem_util is
            # a percentage, but mem_leak_thrashold is a memory size. Need to fix it.
            if info.gpu_mem_util > mem_leak_thrashold and len(info.pids) == 0:
                # we found that memory leaks smaller than 20M can be mitigated automatically
                nvidia_mem_leak.add_metric([minor], 1)

            if len(info.pids) > 0:
                pids_use_gpu[minor] = info.pids

        logger.debug("pids_use_gpu is %s, zombie_info is %s", pids_use_gpu,
                     zombie_info)
        if len(pids_use_gpu) > 0:
            if zombie_info is None:
                zombie_info = []

            for minor, pids in pids_use_gpu.items():
                for pid in pids:
                    found, z_id = pid_to_cid_fn(pid)
                    logger.debug("pid %s has found %s, z_id %s", pid, found,
                                 z_id)
                    if found:
                        # NOTE: zombie_info is a set of short docker container id, but
                        # z_id is full id.
                        for zombie_id in zombie_info:
                            if z_id.startswith(zombie_id):
                                # found corresponding container
                                zombie_container.add_metric([minor, zombie_id],
                                                            1)
                    else:
                        external_process.add_metric([minor, str(pid)], 1)
            if len(zombie_container.samples) > 0 or len(
                    external_process.samples) > 0:
                logger.warning(
                    "found gpu used by external %s, zombie container %s",
                    external_process, zombie_container)

        return [
            nvidia_core_utils, nvidia_mem_utils, nvidia_ecc_errors,
            nvidia_mem_leak, external_process, zombie_container,
            nvidia_gpu_temp, gpu_core_util, gpu_mem_util
        ]

    @staticmethod
    def convert_amd_gpu_info_to_metrics(gpu_info):
        # common gpu metrics
        gpu_core_util, gpu_mem_util = GpuCollector.gen_common_gpu_gauge()

        # amd metrics
        amd_core_utils = gen_amd_gpu_util_gauge()
        amd_mem_utils = gen_amd_gpu_mem_util_gauge()
        amd_gpu_temp = gen_amd_gpu_temperature_gauge()

        for minor, info in gpu_info.items():
            gpu_core_util.add_metric([minor, GpuVendor.AMD.value],
                                     info.gpu_util)
            gpu_mem_util.add_metric([minor, GpuVendor.AMD.value],
                                    info.gpu_mem_util)
            amd_core_utils.add_metric([minor], info.gpu_util)
            amd_mem_utils.add_metric([minor], info.gpu_mem_util)
            amd_gpu_temp.add_metric([minor], info.temperature)
        return [
            amd_core_utils, amd_mem_utils, amd_gpu_temp, gpu_core_util,
            gpu_mem_util
        ]

    def collect_impl(self):
        if self.gpu_vendor == GpuVendor.UNKNOWN:
            logger.warning(
                "Couldn't identify the GPU vendor, please make sure the GPU driver installed correctly"
            )
            return None
        if self.gpu_vendor == GpuVendor.NVIDIA:
            gpu_info = nvidia.nvidia_smi(GpuCollector.nvidia_cmd_histogram,
                                         GpuCollector.cmd_timeout)

            logger.debug("get nvidia gpu_info %s", gpu_info)

            now = datetime.datetime.now()
            self.gpu_info_ref.set(gpu_info, now)
            zombie_info = self.zombie_info_ref.get(now)

            if gpu_info:
                return GpuCollector.convert_nvidia_gpu_info_to_metrics(
                    gpu_info, zombie_info, GpuCollector.get_container_id,
                    self.mem_leak_thrashold)
            return None
        if self.gpu_vendor == GpuVendor.AMD:
            gpu_info = amd.rocm_smi(GpuCollector.amd_cmd_histogram,
                                    GpuCollector.cmd_timeout)
            logger.debug("get amd gpu info %s", gpu_info)

            self.gpu_info_ref.set(gpu_info, datetime.datetime.now())
            if gpu_info:
                return GpuCollector.convert_amd_gpu_info_to_metrics(gpu_info)
            return None
        return None
Example #20
from synapse.events.snapshot import EventContext
from synapse.logging.utils import log_function
from synapse.state import v1, v2
from synapse.storage.data_stores.main.events_worker import EventRedactBehaviour
from synapse.types import StateMap
from synapse.util.async_helpers import Linearizer
from synapse.util.caches import get_cache_factor_for
from synapse.util.caches.expiringcache import ExpiringCache
from synapse.util.metrics import Measure, measure_func

logger = logging.getLogger(__name__)

# Metrics for number of state groups involved in a resolution.
state_groups_histogram = Histogram(
    "synapse_state_number_state_groups_in_resolution",
    "Number of state groups used when performing a state resolution",
    buckets=(1, 2, 3, 5, 7, 10, 15, 20, 50, 100, 200, 500, "+Inf"),
)

KeyStateTuple = namedtuple("KeyStateTuple", ("context", "type", "state_key"))

SIZE_OF_CACHE = 100000 * get_cache_factor_for("state_cache")
EVICTION_TIMEOUT_SECONDS = 60 * 60

_NEXT_STATE_ID = 1

POWER_KEY = (EventTypes.PowerLevels, "")


def _gen_state_id():
    global _NEXT_STATE_ID
Example #21
import argparse
from flask import Flask, render_template_string, abort
from prometheus_client import generate_latest, REGISTRY, Counter, Gauge, Histogram

app = Flask(__name__)

# A counter to count the total number of HTTP requests
REQUESTS = Counter('http_requests_total', 'Total HTTP Requests (count)',
                   ['method', 'endpoint', 'status_code'])

# A gauge (i.e. goes up and down) to monitor the total number of in progress requests
IN_PROGRESS = Gauge('http_requests_inprogress',
                    'Number of in progress HTTP requests')

# A histogram to measure the latency of the HTTP requests
TIMINGS = Histogram('http_request_duration_seconds',
                    'HTTP request latency (seconds)')

# A gauge to count the number of packages newly added
PACKAGES_NEW = Gauge('packages_newly_added', 'Packages newly added')


# Standard Flask route stuff.
@app.route('/')
# Helper annotation to measure how long a method takes and save as a histogram metric.
@TIMINGS.time()
# Helper annotation to increment a gauge when entering the method and decrementing when leaving.
@IN_PROGRESS.track_inprogress()
def hello_world():
    REQUESTS.labels(method='GET', endpoint="/",
                    status_code=200).inc()  # Increment the counter
    return 'Hello, World!'
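The snippet imports generate_latest and REGISTRY but the metrics route is cut off; a hedged sketch of how such an endpoint is commonly wired up:

from flask import Response

@app.route('/metrics')
def metrics():
    # expose the default registry in the Prometheus text exposition format
    return Response(generate_latest(REGISTRY), mimetype='text/plain')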
Example #22
    def _histogram(self, var, var_help, labels, buckets):
        return Histogram(var,
                         var_help,
                         labels,
                         buckets=buckets,
                         registry=self._reg)  # pylint: disable=unexpected-keyword-arg
Example #23
import contextlib

from prometheus_client import Counter, Gauge, Histogram


""" The logic of the Prometheus metrics is defined in this module """


IDUNN_WIKI_REQUEST_DURATION = Histogram(
    "idunn_wiki_request_duration_seconds",
    "Time spent processing a Wiki request.",
    ["target", "handler"],
)

IDUNN_WIKI_EXCEPTIONS_COUNT = Counter(
    "idunn_wiki_exceptions_count",
    "Number of exceptions caught in Idunn WikipediaBlock.",
    ["exception_type"]
)


@contextlib.contextmanager
def wiki_request_duration(target, handler):
    with IDUNN_WIKI_REQUEST_DURATION.labels(target, handler).time():
        yield

def exception(exception_type):
    IDUNN_WIKI_EXCEPTIONS_COUNT.labels(exception_type).inc()
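A hedged usage sketch for the helpers above; the function, label values and request body are illustrative:

def fetch_wiki_summary(lang, title):
    with wiki_request_duration(target=lang, handler="get_summary"):
        try:
            ...  # perform the actual Wikipedia request here
        except TimeoutError as exc:
            exception(type(exc).__name__)
            raise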
Example #24
from models import sql

j2env = jinja2.Environment(
    loader=jinja2.FileSystemLoader("templates"),
    autoescape=jinja2.select_autoescape(["html", "xml"]),
)

# These things are either misconfigured to not send a static device_id
# or they're maliciously inflating their values. As such, we reject stats
# coming from them.
BLACKLIST = {"device_version": {"13.0-20180304-UNOFFICIAL-ht16": True}}


REQUEST_LATENCY = Histogram(
    "falcon_request_latency_seconds", "Request Latency", ["method", "endpoint"]
)
REQUEST_COUNT = Counter(
    "falcon_request_count", "Request Count", ["method", "endpoint", "status"]
)


class PrometheusComponent(object):
    def process_request(self, req, resp):
        req.context["start_time"] = time()

    def process_response(self, req, resp, resource, req_succeeded):
        delta = time() - req.context["start_time"]
        if req.relative_uri in ["/api/v1/stats", "/"]:
            REQUEST_LATENCY.labels(req.method, req.relative_uri).observe(delta)
            REQUEST_COUNT.labels(req.method, req.relative_uri, resp.status).inc()
Example #25
    version,
    "min_version":
    util.version_string(wallet_server_version.PROTOCOL_MIN),
    "cpu_count":
    CPU_COUNT
})
SESSIONS_COUNT = Gauge("session_count",
                       "Number of connected client sessions",
                       namespace=NAMESPACE,
                       labelnames=("version", ))
REQUESTS_COUNT = Counter("requests_count",
                         "Number of requests received",
                         namespace=NAMESPACE,
                         labelnames=("method", "version"))
RESPONSE_TIMES = Histogram("response_time",
                           "Response times",
                           namespace=NAMESPACE,
                           labelnames=("method", "version"))
NOTIFICATION_COUNT = Counter(
    "notification",
    "Number of notifications sent (for subscriptions)",
    namespace=NAMESPACE,
    labelnames=("method", "version"))
REQUEST_ERRORS_COUNT = Counter("request_error",
                               "Number of requests that returned errors",
                               namespace=NAMESPACE,
                               labelnames=("method", "version"))
SQLITE_INTERRUPT_COUNT = Counter("interrupt",
                                 "Number of interrupted queries",
                                 namespace=NAMESPACE)
SQLITE_OPERATIONAL_ERROR_COUNT = Counter(
    "operational_error",
Example #26
    def export_defaults(self,
                        buckets=None,
                        group_by='path',
                        latency_as_histogram=True,
                        prefix='flask',
                        app=None,
                        **kwargs):
        """
        Export the default metrics:
            - HTTP request latencies
            - HTTP request exceptions
            - Number of HTTP requests

        :param buckets: the time buckets for request latencies
            (will use the default when `None`)
        :param group_by: group default HTTP metrics by
            this request property, like `path`, `endpoint`, `rule`, etc.
            (defaults to `path`)
        :param latency_as_histogram: export request latencies
            as a Histogram, otherwise use a Summary instead
            (defaults to `True` to export as a Histogram)
        :param prefix: prefix to start the default metrics names with
            or `NO_PREFIX` (to skip prefix)
        :param app: the Flask application
        """

        if app is None:
            app = self.app or current_app

        if not prefix:
            prefix = self._defaults_prefix or 'flask'

        if kwargs.get('group_by_endpoint') is True:
            warnings.warn(
                'The `group_by_endpoint` argument of '
                '`PrometheusMetrics.export_defaults` is deprecated since 0.4.0, '
                'please use the new `group_by` argument.', DeprecationWarning)

            duration_group = 'endpoint'

        elif group_by:
            duration_group = group_by

        else:
            duration_group = 'path'

        if callable(duration_group):
            duration_group_name = duration_group.__name__

        else:
            duration_group_name = duration_group

        if prefix == NO_PREFIX:
            prefix = ""
        else:
            prefix = prefix + "_"

        try:
            self.info('%sexporter_info' % prefix,
                      'Information about the Prometheus Flask exporter',
                      version=self.version)
        except ValueError:
            return  # looks like we have already exported the default metrics

        labels = self._get_combined_labels(None)

        if latency_as_histogram:
            # use the default buckets from prometheus_client if not given here
            buckets_as_kwargs = {}
            if buckets is not None:
                buckets_as_kwargs['buckets'] = buckets

            request_duration_metric = Histogram(
                '%shttp_request_duration_seconds' % prefix,
                'Flask HTTP request duration in seconds',
                ('method', duration_group_name, 'status') + labels.keys(),
                registry=self.registry,
                **buckets_as_kwargs)

        else:
            # export as Summary instead
            request_duration_metric = Summary(
                '%shttp_request_duration_seconds' % prefix,
                'Flask HTTP request duration in seconds',
                ('method', duration_group_name, 'status') + labels.keys(),
                registry=self.registry)

        counter_labels = ('method', 'status') + labels.keys()
        request_total_metric = Counter('%shttp_request_total' % prefix,
                                       'Total number of HTTP requests',
                                       counter_labels,
                                       registry=self.registry)

        request_exceptions_metric = Counter(
            '%shttp_request_exceptions_total' % prefix,
            'Total number of HTTP requests which resulted in an exception',
            counter_labels,
            registry=self.registry)

        def before_request():
            request.prom_start_time = default_timer()

        def after_request(response):
            if hasattr(request, 'prom_do_not_track') or hasattr(
                    request, 'prom_exclude_all'):
                return response

            if self.excluded_paths:
                if any(
                        pattern.match(request.path)
                        for pattern in self.excluded_paths):
                    return response

            if hasattr(request, 'prom_start_time'):
                total_time = max(default_timer() - request.prom_start_time, 0)

                if callable(duration_group):
                    group = duration_group(request)
                else:
                    group = getattr(request, duration_group)

                request_duration_labels = {
                    'method': request.method,
                    'status': _to_status_code(response.status_code),
                    duration_group_name: group
                }
                request_duration_labels.update(labels.values_for(response))

                request_duration_metric.labels(
                    **request_duration_labels).observe(total_time)

            request_total_metric.labels(method=request.method,
                                        status=_to_status_code(
                                            response.status_code),
                                        **labels.values_for(response)).inc()

            return response

        def teardown_request(exception=None):
            if not exception or hasattr(request,
                                        'prom_do_not_track') or hasattr(
                                            request, 'prom_exclude_all'):
                return

            if self.excluded_paths:
                if any(
                        pattern.match(request.path)
                        for pattern in self.excluded_paths):
                    return

            response = make_response('Exception: %s' % exception, 500)

            if callable(duration_group):
                group = duration_group(request)
            else:
                group = getattr(request, duration_group)

            request_exceptions_metric.labels(
                method=request.method,
                status=500,
                **labels.values_for(response)).inc()

            if hasattr(request, 'prom_start_time'):
                total_time = max(default_timer() - request.prom_start_time, 0)

                request_duration_labels = {
                    'method': request.method,
                    'status': 500,
                    duration_group_name: group
                }
                request_duration_labels.update(labels.values_for(response))

                request_duration_metric.labels(
                    **request_duration_labels).observe(total_time)

            request_total_metric.labels(method=request.method,
                                        status=500,
                                        **labels.values_for(response)).inc()

            return

        app.before_request(before_request)
        app.after_request(after_request)
        app.teardown_request(teardown_request)
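This method appears to come from prometheus_flask_exporter's PrometheusMetrics; a hedged usage sketch under that assumption (consult that library's documentation for the authoritative API):

from flask import Flask
from prometheus_flask_exporter import PrometheusMetrics

app = Flask(__name__)
metrics = PrometheusMetrics(app, export_defaults=False)  # assumed flag to defer the default metrics
metrics.export_defaults(
    buckets=(0.05, 0.1, 0.5, 1.0, 5.0),  # custom latency buckets
    group_by='endpoint',                 # group request metrics by Flask endpoint rather than raw path
)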
Example #27
import time
import logging
from typing import Text

from flask import Blueprint, Response, request
from prometheus_client import multiprocess, Counter, Histogram, generate_latest, CollectorRegistry, REGISTRY
from pyms.flask.services.driver import DriverService

# Based on https://github.com/sbarratt/flask-prometheus
# and https://github.com/korfuri/python-logging-prometheus/

FLASK_REQUEST_LATENCY = Histogram(
    "http_server_requests_seconds", "Flask Request Latency", ["service", "method", "uri", "status"]
)
FLASK_REQUEST_COUNT = Counter(
    "http_server_requests_count", "Flask Request Count", ["service", "method", "uri", "status"]
)

LOGGER_TOTAL_MESSAGES = Counter(
    "logger_messages_total",
    "Count of log entries by service and level.",
    ["service", "level"],
)


class FlaskMetricsWrapper():
    def __init__(self, app_name):
        self.app_name = app_name

    def before_request(self):  # pylint: disable=R0201
        request.start_time = time.time()
Example #28
    'hpfeeds_broker_subscriptions',
    'Number of subscriptions to a channel',
    ['ident', 'chan'],
)

RECEIVE_PUBLISH_COUNT = Counter(
    'hpfeeds_broker_receive_publish_count',
    'Number of events received by broker for a channel',
    ['ident', 'chan'],
)

RECEIVE_PUBLISH_SIZE = Histogram(
    'hpfeeds_broker_receive_publish_size',
    'Sizes of messages received by broker for a channel',
    ['ident', 'chan'],
    buckets=[
        1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288,
        1048576, 2097152, 4194304
    ],
)


def reset():
    ''' Reset the metrics to 0. This is intended for tests **only**. '''
    CLIENT_CONNECTIONS._value.set(0)
    SUBSCRIPTIONS._metrics = {}
    RECEIVE_PUBLISH_SIZE._metrics = {}
    RECEIVE_PUBLISH_COUNT._metrics = {}
    CLIENT_RECEIVE_BUFFER_FILL._metrics = {}
    CLIENT_SEND_BUFFER_FILL._metrics = {}
    CLIENT_SEND_BUFFER_DRAIN._metrics = {}
Example #29
from buildman.manager.basemanager import BaseManager
from buildman.manager.executor import PopenExecutor, EC2Executor, KubernetesExecutor
from buildman.component.buildcomponent import BuildComponent
from buildman.jobutil.buildjob import BuildJob
from buildman.server import BuildJobResult
from util import slash_join
from util.morecollections import AttrDict

logger = logging.getLogger(__name__)

build_fallback = Counter("quay_build_fallback_total",
                         "number of times a build has been retried",
                         labelnames=["executor"])
build_ack_duration = Histogram(
    "quay_build_ack_duration_seconds",
    "seconds taken for the builder to acknowledge a queued build",
    labelnames=["executor"],
)
build_duration = Histogram(
    "quay_build_duration_seconds",
    "seconds taken for a build's execution",
    labelnames=["executor", "job_status"],
)

JOB_PREFIX = "building/"
LOCK_PREFIX = "lock/"
REALM_PREFIX = "realm/"
CANCEL_PREFIX = "cancel/"
METRIC_PREFIX = "metric/"

CANCELED_LOCK_PREFIX = slash_join(LOCK_PREFIX, "job-cancelled")
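
# Usage sketch (hypothetical, not part of the source): the ack latency would
# typically be recorded with the histogram's time() context manager, e.g.:
def record_ack_example(executor_name, wait_for_ack):
    with build_ack_duration.labels(executor=executor_name).time():
        wait_for_ack()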
Example #30
0
    "Total number of RPCs started on the server.",
    ["doge_service", "doge_method"],
)

DOGE_SERVER_HANDLED_TOTAL_COUNTER = Counter(
    "doge_server_handled_total",
    (
        "Total number of RPCs completed on the server, "
        "regardless of success or failure."
    ),
    ["doge_service", "doge_method", "code"],
)

DOGE_SERVER_HANDLED_LATENCY_SECONDS = Histogram(
    "doge_server_handled_latency_seconds",
    "Histogram of response latency (seconds) of gRPC that had been "
    "application-level handled by the server",
    ["doge_service", "doge_method"],
)


class MetricsServerFilter(BaseFilter):
    def execute(self, req: Request) -> Response:
        doge_service = req.service
        doge_method = req.method

        DOGE_SERVER_STARTED_TOTAL_COUNTER.labels(
            doge_service=doge_service, doge_method=doge_method
        ).inc()

        with DOGE_SERVER_HANDLED_LATENCY_SECONDS.labels(
            doge_service=doge_service, doge_method=doge_method
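            # NOTE: the source is truncated here. Presumably the histogram's
            # time() context manager wraps the downstream call and the handled
            # counter records the response code; self.next(req) and resp.code
            # are assumptions about the filter API, not taken from the source.
        ).time():
            resp = self.next(req)

        DOGE_SERVER_HANDLED_TOTAL_COUNTER.labels(
            doge_service=doge_service, doge_method=doge_method, code=resp.code
        ).inc()
        return resp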
Example #31
0
class TestHistogram(unittest.TestCase):
    def setUp(self):
        self.registry = CollectorRegistry()
        self.histogram = Histogram('h', 'help', registry=self.registry)
        self.labels = Histogram('hl', 'help', ['l'], registry=self.registry)

    def test_histogram(self):
        self.assertEqual(0, self.registry.get_sample_value('h_bucket', {'le': '1.0'}))
        self.assertEqual(0, self.registry.get_sample_value('h_bucket', {'le': '2.5'}))
        self.assertEqual(0, self.registry.get_sample_value('h_bucket', {'le': '5.0'}))
        self.assertEqual(0, self.registry.get_sample_value('h_bucket', {'le': '+Inf'}))
        self.assertEqual(0, self.registry.get_sample_value('h_count'))
        self.assertEqual(0, self.registry.get_sample_value('h_sum'))

        self.histogram.observe(2)
        self.assertEqual(0, self.registry.get_sample_value('h_bucket', {'le': '1.0'}))
        self.assertEqual(1, self.registry.get_sample_value('h_bucket', {'le': '2.5'}))
        self.assertEqual(1, self.registry.get_sample_value('h_bucket', {'le': '5.0'}))
        self.assertEqual(1, self.registry.get_sample_value('h_bucket', {'le': '+Inf'}))
        self.assertEqual(1, self.registry.get_sample_value('h_count'))
        self.assertEqual(2, self.registry.get_sample_value('h_sum'))

        self.histogram.observe(2.5)
        self.assertEqual(0, self.registry.get_sample_value('h_bucket', {'le': '1.0'}))
        self.assertEqual(2, self.registry.get_sample_value('h_bucket', {'le': '2.5'}))
        self.assertEqual(2, self.registry.get_sample_value('h_bucket', {'le': '5.0'}))
        self.assertEqual(2, self.registry.get_sample_value('h_bucket', {'le': '+Inf'}))
        self.assertEqual(2, self.registry.get_sample_value('h_count'))
        self.assertEqual(4.5, self.registry.get_sample_value('h_sum'))

        self.histogram.observe(float("inf"))
        self.assertEqual(0, self.registry.get_sample_value('h_bucket', {'le': '1.0'}))
        self.assertEqual(2, self.registry.get_sample_value('h_bucket', {'le': '2.5'}))
        self.assertEqual(2, self.registry.get_sample_value('h_bucket', {'le': '5.0'}))
        self.assertEqual(3, self.registry.get_sample_value('h_bucket', {'le': '+Inf'}))
        self.assertEqual(3, self.registry.get_sample_value('h_count'))
        self.assertEqual(float("inf"), self.registry.get_sample_value('h_sum'))

    def test_setting_buckets(self):
        h = Histogram('h', 'help', registry=None, buckets=[0, 1, 2])
        self.assertEqual([0.0, 1.0, 2.0, float("inf")], h._upper_bounds)

        h = Histogram('h', 'help', registry=None, buckets=[0, 1, 2, float("inf")])
        self.assertEqual([0.0, 1.0, 2.0, float("inf")], h._upper_bounds)

        self.assertRaises(ValueError, Histogram, 'h', 'help', registry=None, buckets=[])
        self.assertRaises(ValueError, Histogram, 'h', 'help', registry=None, buckets=[float("inf")])
        self.assertRaises(ValueError, Histogram, 'h', 'help', registry=None, buckets=[3, 1])

    def test_labels(self):
        self.labels.labels('a').observe(2)
        self.assertEqual(0, self.registry.get_sample_value('hl_bucket', {'le': '1.0', 'l': 'a'}))
        self.assertEqual(1, self.registry.get_sample_value('hl_bucket', {'le': '2.5', 'l': 'a'}))
        self.assertEqual(1, self.registry.get_sample_value('hl_bucket', {'le': '5.0', 'l': 'a'}))
        self.assertEqual(1, self.registry.get_sample_value('hl_bucket', {'le': '+Inf', 'l': 'a'}))
        self.assertEqual(1, self.registry.get_sample_value('hl_count', {'l': 'a'}))
        self.assertEqual(2, self.registry.get_sample_value('hl_sum', {'l': 'a'}))

    def test_function_decorator(self):
        self.assertEqual(0, self.registry.get_sample_value('h_count'))
        self.assertEqual(0, self.registry.get_sample_value('h_bucket', {'le': '+Inf'}))

        @self.histogram.time()
        def f():
            pass

        f()
        self.assertEqual(1, self.registry.get_sample_value('h_count'))
        self.assertEqual(1, self.registry.get_sample_value('h_bucket', {'le': '+Inf'}))

    def test_block_decorator(self):
        self.assertEqual(0, self.registry.get_sample_value('h_count'))
        self.assertEqual(0, self.registry.get_sample_value('h_bucket', {'le': '+Inf'}))
        with self.histogram.time():
            pass
        self.assertEqual(1, self.registry.get_sample_value('h_count'))
        self.assertEqual(1, self.registry.get_sample_value('h_bucket', {'le': '+Inf'}))
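
    # A labelled histogram child supports the same timing helpers; a sketch
    # (not part of the original test file) of the equivalent check:
    def test_labels_block_decorator_sketch(self):
        with self.labels.labels('a').time():
            pass
        self.assertEqual(1, self.registry.get_sample_value('hl_count', {'l': 'a'}))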
Example #32
0
 def update_seconds_not_divisible_by_1_exception(self):
     h = Histogram('test_value', 'Testing roller', registry=self.registry)
     roller = HistogramRoller(h, registry=self.registry, options={
         'update_seconds': 2.5
     })
Example #33
0
define('PIO_MODEL_NAMESPACE', default='', help='prediction model namespace', type=str)
define('PIO_MODEL_NAME', default='', help='prediction model name', type=str)
define('PIO_MODEL_VERSION', default='', help='prediction model version', type=str)
define('PIO_MODEL_SERVER_PORT', default=9876, help='tornado http server listen port', type=int)
define('PIO_MODEL_SERVER_PROMETHEUS_PORT', default=8080, help='port to run the prometheus http metrics server on', type=int)

MODEL_MODULE_NAME = 'pio_bundle'
# Create a metric to track time spent and requests made.
REQUEST_TIME = Summary('request_processing_seconds', 'Model Server: Time spent processing request')
REQUEST_TIME.observe(1.0)    # Observe 1.0 (seconds in this case)
REQUESTS_IN_PROGRESS = Gauge('inprogress_requests', 'model server: requests currently in progress')
REQUESTS_COUNT = Counter('http_requests_total',
                         'model server: total http request count since the last time the process was restarted',
                         ['method', 'model_type', 'model_namespace', 'model_name', 'model_version'])
EX_COUNT = Counter('exceptions_total', 'model server: total exception count since the last time the process was restarted')
REQUEST_LATENCY = Histogram('http_request_processing_seconds', 'model server: time in seconds spent processing requests.')
REQUEST_LATENCY_BUCKETS = Histogram('http_request_duration_microseconds',
                                    'model server: time in microseconds spent processing requests.',
                                    ['method', 'model_type', 'model_namespace', 'model_name', 'model_version'])
REGISTRY = CollectorRegistry()
REGISTRY.register(REQUEST_TIME)
REGISTRY.register(REQUESTS_IN_PROGRESS)
REGISTRY.register(REQUESTS_COUNT)
REGISTRY.register(EX_COUNT)
REGISTRY.register(REQUEST_LATENCY)
REGISTRY.register(REQUEST_LATENCY_BUCKETS)
LOGGER = logging.getLogger(__name__)
LOGGER.setLevel(logging.DEBUG)
CH = logging.StreamHandler()
CH.setLevel(logging.DEBUG)
LOGGER.addHandler(CH)
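
# Usage sketch (hypothetical, not part of the source): the Summary and Gauge
# above would normally wrap each prediction call, roughly like this:
@REQUEST_TIME.time()
def handle_prediction_example(payload):
    with REQUESTS_IN_PROGRESS.track_inprogress():
        return payload  # placeholder for the real model invocation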
Example #34
0
from django_prometheus.utils import Time, TimeSince, PowersOf
import django

if django.VERSION >= (1, 10, 0):
    from django.utils.deprecation import MiddlewareMixin
else:
    MiddlewareMixin = object

requests_total = Counter(
    'django_http_requests_before_middlewares_total',
    'Total count of requests before middlewares run.')
responses_total = Counter(
    'django_http_responses_before_middlewares_total',
    'Total count of responses before middlewares run.')
requests_latency_before = Histogram(
    'django_http_requests_latency_including_middlewares_seconds',
    ('Histogram of requests processing time (including middleware '
     'processing time).'))
requests_unknown_latency_before = Counter(
    'django_http_requests_unknown_latency_including_middlewares_total',
    ('Count of requests for which the latency was unknown (when computing '
     'django_http_requests_latency_including_middlewares_seconds).'))


class PrometheusBeforeMiddleware(MiddlewareMixin):
    """Monitoring middleware that should run before other middlewares."""
    def process_request(self, request):
        requests_total.inc()
        request.prometheus_before_middleware_event = Time()

    def process_response(self, request, response):
        responses_total.inc()
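        # NOTE: the source is truncated here. The remainder presumably records
        # the latency with the Time()/TimeSince() helpers imported above and
        # falls back to the "unknown latency" counter when the start marker is
        # missing (sketch inferred from the metric names, not verbatim source).
        if hasattr(request, 'prometheus_before_middleware_event'):
            requests_latency_before.observe(
                TimeSince(request.prometheus_before_middleware_event))
        else:
            requests_unknown_latency_before.inc()
        return response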
Example #35
0
    class _MarshalService(cls):
        def __init__(self, *args, **kwargs):
            from prometheus_client import Histogram, Counter, Gauge

            super(_MarshalService, self).__init__(*args, **kwargs)
            namespace = config('instrument').get(
                'default_namespace')  # its own namespace?
            service_name = self.bento_service_metadata_pb.name

            self.metrics_request_batch_size = Histogram(
                name=service_name + '_mb_batch_size',
                documentation=service_name + "microbatch request batch size",
                namespace=namespace,
                labelnames=['endpoint'],
            )
            self.metrics_request_duration = Histogram(
                name=service_name + '_mb_requestmb_duration_seconds',
                documentation=service_name +
                "API HTTP request duration in seconds",
                namespace=namespace,
                labelnames=['endpoint', 'http_response_code'],
            )
            self.metrics_request_in_progress = Gauge(
                name=service_name + "_mb_request_in_progress",
                documentation='Total number of HTTP requests currently in progress',
                namespace=namespace,
                labelnames=['endpoint', 'http_method'],
            )
            self.metrics_request_exception = Counter(
                name=service_name + "_mb_request_exception",
                documentation='Total number of service exceptions',
                namespace=namespace,
                labelnames=['endpoint', 'exception_class'],
            )
            self.metrics_request_total = Counter(
                name=service_name + "_mb_request_total",
                documentation='Total number of HTTP requests',
                namespace=namespace,
                labelnames=['endpoint', 'http_response_code'],
            )

        async def request_dispatcher(self, request):
            func = super(_MarshalService, self).request_dispatcher
            api_name = request.match_info.get("name", "/")
            _metrics_request_in_progress = self.metrics_request_in_progress.labels(
                endpoint=api_name,
                http_method=request.method,
            )
            _metrics_request_in_progress.inc()
            time_st = time.time()
            try:
                resp = await func(request)
            except Exception as e:  # pylint: disable=broad-except
                self.metrics_request_exception.labels(
                    endpoint=api_name,
                    exception_class=e.__class__.__name__).inc()
                logger.error(traceback.format_exc())
                resp = aiohttp.web.Response(status=500)
            self.metrics_request_total.labels(
                endpoint=api_name, http_response_code=resp.status).inc()
            self.metrics_request_duration.labels(
                endpoint=api_name,
                http_response_code=resp.status).observe(time.time() - time_st)
            _metrics_request_in_progress.dec()
            return resp

        async def _batch_handler_template(self, requests, api_name):
            func = super(_MarshalService, self)._batch_handler_template
            self.metrics_request_batch_size.labels(endpoint=api_name).observe(
                len(requests))
            return await func(requests, api_name)
Example #36
0
'''middleware.py'''

import time
import falcon
from prometheus_client import Counter, Histogram
from prometheus_client import multiprocess, CollectorRegistry
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST

REQUEST_COUNT = Counter('request_count', 'App Request Count',
                        ['app_name', 'method', 'endpoint', 'http_status'])
REQUEST_LATENCY = Histogram('request_latency_seconds', 'Request latency',
                            ['app_name', 'endpoint'])
API_PATHS = [
    '/misc/angdia', '/ct/lbb6/star', '/mt/wbh/star', '/t5/cargogen',
    '/ct/lbb2/cargogen/purchase', '/ct/lbb2/cargogen/sale', '/t5/orbit',
    '/misc/starcolor', '/metrics', '/ping'
]


class PrometheusMetrics(object):
    '''Prometheus metrics middleware'''
    @staticmethod
    def start_timer(request):
        '''Start request timer'''
        request.start_time = time.time()

    def stop_timer(self, request, response):
        '''Stop request timer'''
        metric_path = self.trim_path(request.path)
        resp_time = time.time() - request.start_time
        if metric_path: