Example #1
import time

from prometheus_client import Gauge, start_http_server


def main():
    g = Gauge('bcr_gauge_example', 'Testing how Prometheus Gauge works')
    start_http_server(8000)
    while True:
        g.inc(3)          # raise the gauge by 3 ...
        time.sleep(5)
        g.dec(2)          # ... then lower it by 2: net +1 every 5 seconds
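
With the exporter above running, the gauge can be spot-checked by scraping the embedded HTTP server. A minimal sketch, assuming Example #1 is running locally on port 8000:

import urllib.request

# Fetch the plain-text exposition output and print our gauge's sample line.
body = urllib.request.urlopen('http://localhost:8000/metrics').read().decode()
for line in body.splitlines():
    if line.startswith('bcr_gauge_example'):
        print(line)
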
Example #2
def dec_counter(self, key, amount=1):
    """Decrement metric."""
    # Gauge requires both a metric name and a documentation string;
    # the help text here is an illustrative assumption.
    prometheus_counter = Gauge(key, 'Gauge for {}'.format(key))
    prometheus_counter.dec(amount)
Example #3
def send_gauge(cls, metrics_name, help_info, value, inc=None):
    g = Gauge(metrics_name, help_info)

    if inc is None:
        g.set(value)
    else:
        if inc:
            g.inc(value)
        else:
            g.dec(value)
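
Both snippets above construct a new Gauge on every call. With prometheus_client's default registry this fails the second time, since registering two collectors under the same metric name raises ValueError (duplicated timeseries). A minimal sketch of the usual fix, caching instances in a hypothetical module-level helper:

from prometheus_client import Gauge

_gauges = {}  # hypothetical cache: metric name -> Gauge


def get_gauge(name, documentation):
    """Create each Gauge once and reuse it on later calls."""
    if name not in _gauges:
        _gauges[name] = Gauge(name, documentation)
    return _gauges[name]


get_gauge('queue_depth', 'Items waiting to be processed').inc(3)
get_gauge('queue_depth', 'Items waiting to be processed').dec()  # same Gauge
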
Example #4
import contextlib
import time
from typing import AsyncIterator, List

from prometheus_client import Counter, Gauge, Histogram


class OperationMetricSet:
    """Collection of Prometheus metrics representing a logical operation"""

    requests: Counter
    requests_duration: Histogram
    exceptions: Counter
    requests_in_progress: Gauge

    def __init__(self, operation_name: str, labels: List[str]):
        self.requests = Counter(
            f"pyncette_{operation_name}_total",
            f"Total count of {operation_name} operations",
            labels,
        )
        self.requests_duration = Histogram(
            f"pyncette_{operation_name}_duration_seconds",
            f"Histogram of {operation_name} processing time",
            labels,
        )
        self.exceptions = Counter(
            f"pyncette_{operation_name}_failures_total",
            f"Total count of failed {operation_name} failures",
            [*labels, "exception_type"],
        )
        self.requests_in_progress = Gauge(
            f"pyncette_{operation_name}_in_progress",
            f"Gauge of {operation_name} operations currently being processed",
            labels,
        )

    @contextlib.asynccontextmanager
    async def measure(self, **labels: str) -> AsyncIterator[None]:
        """An async context manager that measures the execution of the wrapped code"""
        if labels:
            self.requests_in_progress.labels(**labels).inc()
            self.requests.labels(**labels).inc()
        else:
            self.requests_in_progress.inc()
            self.requests.inc()

        before_time = time.perf_counter()
        try:
            yield
        except Exception as e:
            self.exceptions.labels(**labels, exception_type=type(e).__name__).inc()
            raise e from None
        finally:
            if labels:
                self.requests_duration.labels(**labels).observe(
                    time.perf_counter() - before_time
                )
                self.requests_in_progress.labels(**labels).dec()
            else:
                self.requests_duration.observe(time.perf_counter() - before_time)
                self.requests_in_progress.dec()
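
A sketch of how the metric set above might be used; the operation name and label are illustrative assumptions:

metrics = OperationMetricSet("poll_task", ["task_name"])


async def poll(task_name: str) -> None:
    # Counts the request, marks it in progress, times it, and records
    # the exception type if the wrapped block raises.
    async with metrics.measure(task_name=task_name):
        ...  # the operation being measured
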
Example #5
import time
import unittest

from prometheus_client import CollectorRegistry, Gauge


class TestGauge(unittest.TestCase):
    def setUp(self):
        self.registry = CollectorRegistry()
        self.gauge = Gauge('g', 'help', registry=self.registry)

    def test_gauge(self):
        self.assertEqual(0, self.registry.get_sample_value('g'))
        self.gauge.inc()
        self.assertEqual(1, self.registry.get_sample_value('g'))
        self.gauge.dec(3)
        self.assertEqual(-2, self.registry.get_sample_value('g'))
        self.gauge.set(9)
        self.assertEqual(9, self.registry.get_sample_value('g'))

    def test_function_decorator(self):
        self.assertEqual(0, self.registry.get_sample_value('g'))

        @self.gauge.track_inprogress()
        def f():
            self.assertEqual(1, self.registry.get_sample_value('g'))

        f()
        self.assertEqual(0, self.registry.get_sample_value('g'))

    def test_block_decorator(self):
        self.assertEqual(0, self.registry.get_sample_value('g'))
        with self.gauge.track_inprogress():
            self.assertEqual(1, self.registry.get_sample_value('g'))
        self.assertEqual(0, self.registry.get_sample_value('g'))

    def test_gauge_function(self):
        x = {}
        self.gauge.set_function(lambda: len(x))
        self.assertEqual(0, self.registry.get_sample_value('g'))
        self.gauge.inc()
        self.assertEqual(0, self.registry.get_sample_value('g'))
        x['a'] = None
        self.assertEqual(1, self.registry.get_sample_value('g'))

    def test_time_function_decorator(self):
        self.assertEqual(0, self.registry.get_sample_value('g'))

        @self.gauge.time()
        def f():
            time.sleep(.001)

        f()
        self.assertNotEqual(0, self.registry.get_sample_value('g'))

    def test_time_block_decorator(self):
        self.assertEqual(0, self.registry.get_sample_value('g'))
        with self.gauge.time():
            time.sleep(.001)
        self.assertNotEqual(0, self.registry.get_sample_value('g'))
Example #7
class TelemetryClient(object):
    @Inject
    def __init__(self, environment: SystemEnvironmentProperties):
        self.endpoint = environment.get("PROMETHEUS_GATEWAY_ENDPOINT")
        self.registry = CollectorRegistry()
        self.get_request_counter = Counter("invertpdf_get_request_count",
                                           "Number of successful GET requests",
                                           registry=self.registry)
        self.post_request_counter = Counter(
            "invertpdf_post_request_count",
            "Number of successful POST requests",
            registry=self.registry)
        self.duration_histogram = Histogram("invertpdf_request_duration_ms",
                                            "Request duration",
                                            registry=self.registry,
                                            buckets=[
                                                0, 50, 100, 200, 500, 1000,
                                                2000, 5000, 10000, 30000,
                                                60000, 1800000, 3600000
                                            ])
        self.failure_counter = Counter("invertpdf_failed_requests",
                                       "Number of failed requests",
                                       registry=self.registry)

        self.requests_in_progress = Gauge("invertpdf_requests_in_progress",
                                          "Number of pending requests",
                                          registry=self.registry)
        self.free_disk = Gauge("invertpdf_free_disk_space",
                               "Free disk space on tmpfs",
                               registry=self.registry)
        self.logger = logging.getLogger(self.__class__.__name__)

    def track_request(self, method: str, duration: int):
        self.logger.info(f"Request took {duration}ms.")
        self.duration_histogram.observe(duration)
        if method == "GET":
            self.get_request_counter.inc()
        elif method == "POST":
            self.post_request_counter.inc()

    def track_failure(self, method: str, duration: int):
        self.failure_counter.inc()
        self.duration_histogram.observe(duration)

    def track_start(self):
        self.requests_in_progress.inc()

    def track_end(self):
        self.requests_in_progress.dec()

    def submit(self):
        push_to_gateway(self.endpoint, "invertpdf", self.registry)
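
A sketch of driving this client around a single request; `environment` is whatever SystemEnvironmentProperties instance the injector supplies, and `handle()` is a hypothetical handler. Durations are converted to milliseconds to match the invertpdf_request_duration_ms buckets above:

import time

client = TelemetryClient(environment)

client.track_start()
start = time.monotonic()
try:
    handle()  # hypothetical request handling
    client.track_request("GET", int((time.monotonic() - start) * 1000))
except Exception:
    client.track_failure("GET", int((time.monotonic() - start) * 1000))
    raise
finally:
    client.track_end()
    client.submit()  # push the registry to the configured Pushgateway
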
Example #8
# Assumes a Flask view: `request` comes from flask, and `metrics` is a
# module-level dict mapping metric names to Gauge instances.
from flask import request
from prometheus_client import Gauge

metrics = {}


def update_stats(name):
    metric = metrics.get(name, None)
    value = int(request.args.get("value", 1))
    if metric is None:
        metric = Gauge(name, name)
        metrics[name] = metric
    if request.method == "DELETE":
        metric.dec(value)
    elif request.method == "POST":
        metric.inc(value)
    elif request.method == "PATCH":
        metric.set(value)
    return ""
Example #9
import random
import time

from prometheus_client import (Counter, Gauge, Histogram, Summary,
                               start_http_server)

# Create a metric to track time spent and requests made.
counter = Counter('sobi3ch_counter', 'Description of a counter')
gauge = Gauge('sobi3ch_gauge', 'Description of gauge')
gauge.set(50)
SUMMARY = Summary('sobi3ch_summary_request_processing_seconds',
                  'Time spent processing request')
histogram = Histogram('sobi3ch_histogram_request_latency_seconds',
                      'Description of histogram')


# Decorate function with metric.
@SUMMARY.time()
def process_request(t):
    """A dummy function that takes some time."""
    time.sleep(t)


if __name__ == '__main__':
    # Start up the server to expose the metrics.
    start_http_server(8000)
    # Generate some requests.
    while True:
        r = random.random()
        process_request(r)
        if r > 0.8:
            counter.inc()
        if r < 0.5:
            gauge.inc()  # Increment by 1
        else:
            gauge.dec()
        histogram.observe(4.7)  # Observe 4.7 (seconds in this case)
Example #10
class Prometheus(service.BuildbotService):
    '''
    This service exposes buildbot metrics to Prometheus.

    Metrics state is initialised at service start and is (mostly) retained
    through a reconfiguration. Instance attributes holding a Prometheus
    metrics item are prefixed with a symbol indicating the kind of metric
    they are. For example:

    - Counters: c_<attr_label>
    - Gauges: g_<attr_label>
    - Histogram: h_<attr_label>
    - Summary: s_<attr_label>

    '''

    name = "Prometheus"
    namespace = 'buildbot'

    def __init__(self, port=9101, **kwargs):
        service.BuildbotService.__init__(self, **kwargs)
        self.port = port
        self.server = None
        self.consumers = []
        self.registry = None
        self.create_metrics()

    @defer.inlineCallbacks
    def reconfigService(self, builders=None, **kwargs):
        '''
        Accumulated metrics are maintained through a reconfigure.
        '''
        log.msg("Reconfiguring Prometheus reporter")
        yield service.BuildbotService.reconfigService(self)
        self.registerConsumers()

    @defer.inlineCallbacks
    def startService(self):
        log.msg("Starting Prometheus reporter")
        yield service.BuildbotService.startService(self)
        root = Resource()
        root.putChild(b'metrics', MetricsResource(registry=self.registry))
        self.server = reactor.listenTCP(self.port, Site(root))
        log.msg("Prometheus service starting on {}".format(self.server.port))

    @defer.inlineCallbacks
    def stopService(self):
        log.msg("Stopping Prometheus reporter")
        yield self.server.stopListening()
        yield service.BuildbotService.stopService(self)
        self.removeConsumers()

    def create_metrics(self):
        '''
        Create the Prometheus metrics that will be exposed.
        '''
        log.msg("Creating Prometheus metrics")
        self.registry = CollectorRegistry()

        # build metrics
        builds_labels = ['builder_id', 'worker_id']
        self.g_builds_duration = Gauge(
            'builds_duration_seconds',
            'Number of seconds spent performing builds',
            labelnames=builds_labels,
            namespace=self.namespace,
            registry=self.registry)
        self.c_builds_success = Counter('builds_success',
                                        'Number of builds reporting success',
                                        labelnames=builds_labels,
                                        namespace=self.namespace,
                                        registry=self.registry)
        self.c_builds_failure = Counter('builds_failure',
                                        'Number of builds reporting failure',
                                        labelnames=builds_labels,
                                        namespace=self.namespace,
                                        registry=self.registry)
        self.c_builds_error = Counter('builds_error',
                                      'Number of builds reporting error',
                                      labelnames=builds_labels,
                                      namespace=self.namespace,
                                      registry=self.registry)

        # builders metrics
        builders_labels = ['builder_id', 'builder_name']
        self.g_builders_running_total = Gauge(
            'builders_running_total',
            'Total number of builders running',
            namespace=self.namespace,
            registry=self.registry)
        self.g_builders_running = Gauge('builders_running',
                                        'Number of builders running',
                                        labelnames=builders_labels,
                                        namespace=self.namespace,
                                        registry=self.registry)

        # buildsets metrics
        buildsets_labels = ['buildset_id']
        self.g_buildsets_duration = Gauge(
            'buildsets_duration_seconds',
            'Number of seconds spent performing buildsets',
            labelnames=buildsets_labels,
            namespace=self.namespace,
            registry=self.registry)
        self.c_buildsets_success = Counter(
            'buildsets_success',
            'Number of buildsets reporting success',
            labelnames=buildsets_labels,
            namespace=self.namespace,
            registry=self.registry)
        self.c_buildsets_failure = Counter(
            'buildsets_failure',
            'Number of buildsets reporting failure',
            labelnames=buildsets_labels,
            namespace=self.namespace,
            registry=self.registry)
        self.c_buildsets_error = Counter('buildsets_error',
                                         'Number of buildsets reporting error',
                                         labelnames=buildsets_labels,
                                         namespace=self.namespace,
                                         registry=self.registry)

        # build requests metrics
        build_requests_labels = ['builder_id']
        self.g_build_requests_duration = Gauge(
            'build_requests_duration_seconds',
            'Number of seconds spent performing build requests',
            labelnames=build_requests_labels,
            namespace=self.namespace,
            registry=self.registry)
        self.c_build_requests_success = Counter(
            'build_requests_success',
            'Number of build requests reporting success',
            labelnames=build_requests_labels,
            namespace=self.namespace,
            registry=self.registry)
        self.c_build_requests_failure = Counter(
            'build_requests_failure',
            'Number of build requests reporting failure',
            labelnames=build_requests_labels,
            namespace=self.namespace,
            registry=self.registry)
        self.c_build_requests_error = Counter(
            'build_requests_error',
            'Number of build requests reporting error',
            labelnames=build_requests_labels,
            namespace=self.namespace,
            registry=self.registry)

        # steps metrics
        steps_labels = ['step_number', 'step_name', 'builder_id', 'worker_id']
        self.g_steps_duration = Gauge(
            'steps_duration_seconds',
            'Number of seconds spent performing build steps',
            labelnames=steps_labels,
            namespace=self.namespace,
            registry=self.registry)
        self.c_steps_success = Counter('steps_success',
                                       'Number of steps reporting success',
                                       labelnames=steps_labels,
                                       namespace=self.namespace,
                                       registry=self.registry)
        self.c_steps_failure = Counter('steps_failure',
                                       'Number of steps reporting failure',
                                       labelnames=steps_labels,
                                       namespace=self.namespace,
                                       registry=self.registry)
        self.c_steps_error = Counter('steps_error',
                                     'Number of steps reporting error',
                                     labelnames=steps_labels,
                                     namespace=self.namespace,
                                     registry=self.registry)

        # workers metrics
        workers_labels = ['worker_id', 'worker_name']
        self.g_workers_running_total = Gauge('workers_running_total',
                                             'Total number of workers running',
                                             namespace=self.namespace,
                                             registry=self.registry)
        self.g_workers_running = Gauge('workers_running',
                                       'Number of workers running',
                                       labelnames=workers_labels,
                                       namespace=self.namespace,
                                       registry=self.registry)

    @defer.inlineCallbacks
    def registerConsumers(self):
        self.removeConsumers()
        startConsuming = self.master.mq.startConsuming

        handlers = (
            (('builds', None, None), self.buildsConsumer),
            (('builders', None, None), self.buildersConsumer),
            (('buildsets', None, None), self.buildSetsConsumer),
            (('buildrequests', None, None), self.buildRequestsConsumer),
            (('steps', None, None), self.stepsConsumer),
            (('workers', None, None), self.workersConsumer),
        )
        for routingKey, handler in handlers:
            consumer = yield startConsuming(handler, routingKey)
            self.consumers.append(consumer)

    @defer.inlineCallbacks
    def removeConsumers(self):
        for consumer in self.consumers:
            yield consumer.stopConsuming()
        self.consumers = []

    # @defer.inlineCallbacks
    def buildsConsumer(self, key, msg):
        '''
        This method is responsible for updating build related metrics. There
        are four build metrics:

        - buildbot_builds_duration_seconds,
        - buildbot_builds_success,
        - buildbot_builds_failure,
        - buildbot_builds_error

        buildbot_builds_duration_seconds is a gauge metric used to
        track the duration of individual builds by making use of Prometheus
        multi dimensional labels. As builds complete, an instance of this
        metric is created by passing builder_id and worker_id labels and
        then setting the value. This allows visualisation tools to query and
        filter metrics for specific builder combinations.

        Similarly, the other counter metrics record success, failure and
        error states for each build.
        '''
        action = key[2]
        labels = dict(builder_id=msg['builderid'], worker_id=msg['workerid'])
        # build_info = yield self.master.data.get(("builds", msg['buildid']))

        if action == 'finished':

            assert msg['complete']
            build_started = msg['started_at']
            build_finished = msg['complete_at']
            build_duration = build_finished - build_started
            duration_seconds = build_duration
            self.g_builds_duration.labels(**labels).set(duration_seconds)

            build_status = resolve_results_status(msg['results'])
            if build_status == 'success':
                self.c_builds_success.labels(**labels).inc()
            elif build_status == 'failure':
                self.c_builds_failure.labels(**labels).inc()
            elif build_status == 'error':
                self.c_builds_error.labels(**labels).inc()

    def buildersConsumer(self, key, msg):
        '''
        The Buildmaster runs a collection of Builders, each of which handles a
        single type of build (e.g. full versus quick), on one or more workers.
        Builders serve as a kind of queue for a particular type of build. Each
        Builder gets a separate column in the waterfall display. In general,
        each Builder runs independently.

        Each builder is a long-lived object which controls a sequence of Builds.
        Each Builder is created when the config file is first parsed, and lives
        forever (or rather until it is removed from the config file). It mediates
        the connections to the workers that do all the work, and is responsible
        for creating the Build objects - Builds.

        This method is responsible for updating builder related metrics. There
        are two builder metrics ``buildbot_builders_running_total`` and
        ``buildbot_builders_running``.

        ``buildbot_builders_running_total`` is a gauge metric used to track the
        total number of running builders. As builders start the metric is
        increased and as they stop the metric is decreased. No extra labels are
        used with this metric.

        ``buildbot_builders_running`` is a gauge metric used to track the
        running state of individual builders by making use of Prometheus multi
        dimensional labels. As builders start, an instance of this metric is
        created by passing ``builder_id`` and ``builder_name`` labels and then
        incremented. When the builder stops, the same gauge metric is
        decremented. This means that a gauge value of 1 indicates started while
        a gauge value of 0 indicates stopped.
        '''
        action = key[2]
        labels = dict(builder_id=msg['builderid'], builder_name=msg['name'])

        if action == 'started':
            self.g_builders_running_total.inc()
            self.g_builders_running.labels(**labels).inc()
        elif action == 'stopped':
            self.g_builders_running_total.dec()
            self.g_builders_running.labels(**labels).dec()

    # @defer.inlineCallbacks
    def buildSetsConsumer(self, key, msg):
        '''
        A BuildSet is the name given to a set of Builds that all compile/test
        the same version of the tree on multiple Builders. In general, all these
        component Builds will perform the same sequence of Steps, using the same
        source code, but on different platforms or against a different set of
        libraries.

        Each scheduler creates and submits BuildSet objects to the BuildMaster.
        The buildmaster is responsible for turning the BuildSet into a set of
        BuildRequest objects and queueing them on the appropriate Builders.

        This method is responsible for updating build set related metrics.
        There are four build set metrics:

        - buildbot_buildsets_duration_seconds,
        - buildbot_buildsets_success,
        - buildbot_buildsets_failure,
        - buildbot_buildsets_error

        buildbot_buildsets_duration_seconds is a gauge metric used to
        track the duration of individual build sets by making use of
        Prometheus multi dimensional labels. As build sets complete, an
        instance of this metric is created by passing buildset_id labels and
        then setting the value. This allows visualisation tools to query and
        filter metrics for specific builder combinations.

        Similarly, the other counter metrics record success, failure and
        error states for each build set.

        '''
        action = key[2]
        # TODO: substitute bsid for something more useful. bsid is just
        # a number that increments. A better choice would be something
        # like the repo, project, etc
        labels = dict(buildset_id=msg['bsid'])

        # buildset_info = yield self.master.data.get(("buildsets", msg['bsid']))

        if action == 'complete':

            assert msg['complete']
            buildset_started = msg['submitted_at']
            buildset_finished = msg['complete_at']
            buildset_duration = buildset_finished - buildset_started
            duration_seconds = buildset_duration
            self.g_buildsets_duration.labels(**labels).set(duration_seconds)

            bs_success = resolve_results_status(msg['results'])
            if bs_success == 'success':
                self.c_buildsets_success.labels(**labels).inc()
            elif bs_success == 'failure':
                self.c_buildsets_failure.labels(**labels).inc()
            elif bs_success == 'error':
                self.c_buildsets_error.labels(**labels).inc()

    def buildRequestsConsumer(self, key, msg):
        '''
        A BuildRequest is a request to build a specific set of source code
        on a single Builder. Each Builder runs the BuildRequest as soon as
        it can (i.e. when an associated worker becomes free).

        This method is responsible for updating build request related metrics.
        There are four build request metrics:

        - buildbot_build_requests_duration_seconds
        - buildbot_build_requests_success
        - buildbot_build_requests_failure
        - buildbot_build_requests_error

        buildbot_build_requests_duration_seconds is a gauge metric used to
        track the duration of individual build requests by making use of
        Prometheus multi dimensional labels. As build requests complete, an
        instance of this metric is created by passing builder_id labels and
        then setting the value. This allows visualisation tools to query and
        filter metrics for specific builder combinations.

        Similarly, the other counter metrics record success, failure and
        error states for each build request.
        '''
        action = key[2]
        labels = dict(builder_id=msg['builderid'])

        if action == 'complete':
            assert msg['complete']
            br_started = msg['submitted_at']
            br_finished = msg['complete_at']
            br_duration = br_finished - br_started
            duration_seconds = br_duration
            self.g_build_requests_duration.labels(
                **labels).set(duration_seconds)

            br_success = resolve_results_status(msg['results'])
            if br_success == 'success':
                self.c_build_requests_success.labels(**labels).inc()
            elif br_success == 'failure':
                self.c_build_requests_failure.labels(**labels).inc()
            elif br_success == 'error':
                self.c_build_requests_error.labels(**labels).inc()

    @defer.inlineCallbacks
    def stepsConsumer(self, key, msg):
        '''
        This method is responsible for updating step related metrics. There
        are four steps metrics:

        - buildbot_steps_duration_seconds,
        - buildbot_steps_success
        - buildbot_steps_failure
        - buildbot_steps_error

        buildbot_steps_duration_seconds is a gauge metric used to track
        the duration of individual steps by making use of Prometheus multi
        dimensional labels. As steps complete, an instance of this metric is
        created by passing step_number, step_name, builder_id and worker_id
        labels and then setting the value. This allows visualisation tools
        to query and filter metrics for specific step, builder and worker
        combinations.

        Similarly, the other counter metrics record success, failure and
        error states for each step.
        '''
        action = key[2]

        build_info = yield self.master.data.get(("builds", msg['buildid']))

        labels = dict(step_number=msg['number'],
                      step_name=msg['name'],
                      builder_id=build_info['builderid'],
                      worker_id=build_info['workerid'])

        if action == 'finished':
            assert msg['complete']
            step_started = msg['started_at']
            step_finished = msg['complete_at']
            step_duration = step_finished - step_started
            duration_seconds = step_duration
            self.g_steps_duration.labels(**labels).set(duration_seconds)

            step_success = resolve_results_status(msg['results'])
            if step_success == 'success':
                self.c_steps_success.labels(**labels).inc()
            elif step_success == 'failure':
                self.c_steps_failure.labels(**labels).inc()
            elif step_success == 'error':
                self.c_steps_error.labels(**labels).inc()

    def workersConsumer(self, key, msg):
        '''
        This method is responsible for updating worker related metrics. There
        are two worker metrics ``buildbot_workers_running_total`` and
        ``buildbot_workers_running``.

        ``buildbot_workers_running_total`` is a gauge metric used to track the
        total number of running workers. As workers connect the metric is
        increased and as they disconnect the metric is decreased. No extra
        labels are used with this metric.

        ``buildbot_workers_running`` is a gauge metric used to track the
        running state of individual workers by making use of Prometheus multi
        dimensional labels. As workers connect, an instance of this metric is
        created by passing ``worker_id`` and ``worker_name`` labels and then
        incremented. When the worker disconnects the same gauge metric is
        decreased. This means that a gauge value of 1 indicates connected while
        a gauge value of 0 indicates disconnected.
        '''
        action = key[2]

        labels = dict(worker_id=msg['workerid'], worker_name=msg['name'])

        if action == 'connected':
            self.g_workers_running_total.inc()
            self.g_workers_running.labels(**labels).inc()
        elif action == 'disconnected':
            self.g_workers_running_total.dec()
            self.g_workers_running.labels(**labels).dec()
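
A sketch of enabling the service from a buildmaster config (master.cfg); the import path is a hypothetical assumption:

from myreporters.prometheus import Prometheus  # hypothetical module path

c['services'].append(Prometheus(port=9101))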


Example #11
# Within your code (assumes a Counter, e.g. c = Counter('my_failures', 'Description of counter'))
with c.count_exceptions():
    pass

# Count only one type of exception
with c.count_exceptions(ValueError):
    pass

# Gauges: Used to track any value, anything that's not counting will be here (e.g. temperature, cpu usage, ...)
# Can inc, dec, and set
g = Gauge('my_inprogress_requests', 'Description of gauge')
g.inc()  # Increment by 1
g.dec(10)  # Decrement by given value
g.set(4.2)  # Set to a given value

g.set_to_current_time()  # Set to current unixtime


# Another use case: Increment when entered, decrement when exited.
@g.track_inprogress()
def f():
    pass


with g.track_inprogress():
    pass

# A gauge can also take its value from a callback
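A minimal sketch completing the comment above; the callable is evaluated each time the gauge is collected:

x = {}
g.set_function(lambda: len(x))  # the gauge reports len(x) at scrape time
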
Example #12
class PrometheusMonitor(Monitor):
    """
    Prometheus Faust Sensor.

    This sensor records statistics using prometheus_client and exposes
    them via the aiohttp server, under /metrics by default.

    Usage:
        import faust
        from faust.sensors.prometheus import PrometheusMonitor

        app = faust.App('example', broker='kafka://')
        app.monitor = PrometheusMonitor(app, pattern='/metrics')
    """

    ERROR = 'error'
    COMPLETED = 'completed'
    KEYS_RETRIEVED = 'keys_retrieved'
    KEYS_UPDATED = 'keys_updated'
    KEYS_DELETED = 'keys_deleted'

    def __init__(self,
                 app: AppT,
                 pattern: str = '/metrics',
                 **kwargs: Any) -> None:
        self.app = app
        self.pattern = pattern

        if prometheus_client is None:
            raise ImproperlyConfigured(
                'prometheus_client requires `pip install prometheus_client`.')

        self._initialize_metrics()
        self.expose_metrics()
        super().__init__(**kwargs)

    def _initialize_metrics(self) -> None:
        """
        Initialize Prometheus metrics
        """
        # On message received
        self.messages_received = Counter('messages_received',
                                         'Total messages received')
        self.active_messages = Gauge('active_messages',
                                     'Total active messages')
        self.messages_received_per_topics = Counter(
            'messages_received_per_topic', 'Messages received per topic',
            ['topic'])
        self.messages_received_per_topics_partition = Gauge(
            'messages_received_per_topics_partition',
            'Messages received per topic/partition', ['topic', 'partition'])
        self.events_runtime_latency = Histogram('events_runtime_ms',
                                                'Events runtime in ms')

        # On Event Stream in
        self.total_events = Counter('total_events', 'Total events received')
        self.total_active_events = Gauge('total_active_events',
                                         'Total active events')
        self.total_events_per_stream = Counter('total_events_per_stream',
                                               'Events received per Stream',
                                               ['stream'])

        # On table changes get/set/del keys
        self.table_operations = Counter('table_operations',
                                        'Total table operations',
                                        ['table', 'operation'])

        # On message send
        self.topic_messages_sent = Counter('topic_messages_sent',
                                           'Total messages sent per topic',
                                           ['topic'])
        self.total_sent_messages = Counter('total_sent_messages',
                                           'Total messages sent')
        self.producer_send_latency = Histogram('producer_send_latency',
                                               'Producer send latency in ms')
        self.total_error_messages_sent = Counter('total_error_messages_sent',
                                                 'Total error messages sent')
        self.producer_error_send_latency = Histogram(
            'producer_error_send_latency', 'Producer error send latency in ms')

        # Assignment
        self.assignment_operations = Counter(
            'assignment_operations',
            'Total assignment operations (completed/error)', ['operation'])
        self.assign_latency = Histogram('assign_latency',
                                        'Assignment latency in ms')

        # Rebalances
        self.total_rebalances = Gauge('total_rebalances', 'Total rebalances')
        self.total_rebalances_recovering = Gauge(
            'total_rebalances_recovering', 'Total rebalances recovering')
        self.revalance_done_consumer_latency = Histogram(
            'revalance_done_consumer_latency',
            'Consumer replying that rebalance is done to broker in ms')
        self.revalance_done_latency = Histogram(
            'revalance_done_latency', 'Rebalance finished latency in ms')

        # Count Metrics by name
        self.count_metrics_by_name = Gauge('metrics_by_name',
                                           'Total metrics by name', ['metric'])

        # Web
        self.http_status_codes = Counter('http_status_codes',
                                         'Total http_status code',
                                         ['status_code'])
        self.http_latency = Histogram('http_latency',
                                      'Http response latency in ms')

        # Topic/Partition Offsets
        self.topic_partition_end_offset = Gauge(
            'topic_partition_end_offset', 'Offset ends per topic/partition',
            ['topic', 'partition'])
        self.topic_partition_offset_commited = Gauge(
            'topic_partition_offset_commited',
            'Offset commited per topic/partition', ['topic', 'partition'])
        self.consumer_commit_latency = Histogram(
            'consumer_commit_latency', 'Consumer commit latency in ms')

    def on_message_in(self, tp: TP, offset: int, message: Message) -> None:
        """Call before message is delegated to streams."""
        super().on_message_in(tp, offset, message)

        self.messages_received.inc()
        self.active_messages.inc()
        self.messages_received_per_topics.labels(topic=tp.topic).inc()
        self.messages_received_per_topics_partition.labels(
            topic=tp.topic, partition=tp.partition).set(offset)

    def on_stream_event_in(self, tp: TP, offset: int, stream: StreamT,
                           event: EventT) -> typing.Optional[typing.Dict]:
        """Call when stream starts processing an event."""
        state = super().on_stream_event_in(tp, offset, stream, event)
        self.total_events.inc()
        self.total_active_events.inc()
        self.total_events_per_stream.labels(
            stream=f'stream.{self._stream_label(stream)}.events').inc()

        return state

    def _stream_label(self, stream: StreamT) -> str:
        return self._normalize(
            stream.shortlabel.lstrip('Stream:'), ).strip('_').lower()

    def on_stream_event_out(self,
                            tp: TP,
                            offset: int,
                            stream: StreamT,
                            event: EventT,
                            state: typing.Dict = None) -> None:
        """Call when stream is done processing an event."""
        super().on_stream_event_out(tp, offset, stream, event, state)
        self.total_active_events.dec()
        self.events_runtime_latency.observe(
            self.secs_to_ms(self.events_runtime[-1]))

    def on_message_out(self, tp: TP, offset: int, message: Message) -> None:
        """Call when message is fully acknowledged and can be committed."""
        super().on_message_out(tp, offset, message)
        self.active_messages.dec()

    def on_table_get(self, table: CollectionT, key: typing.Any) -> None:
        """Call when value in table is retrieved."""
        super().on_table_get(table, key)
        self.table_operations.labels(table=f'table.{table.name}',
                                     operation=self.KEYS_RETRIEVED).inc()

    def on_table_set(self, table: CollectionT, key: typing.Any,
                     value: typing.Any) -> None:
        """Call when new value for key in table is set."""
        super().on_table_set(table, key, value)
        self.table_operations.labels(table=f'table.{table.name}',
                                     operation=self.KEYS_UPDATED).inc()

    def on_table_del(self, table: CollectionT, key: typing.Any) -> None:
        """Call when key in a table is deleted."""
        super().on_table_del(table, key)
        self.table_operations.labels(table=f'table.{table.name}',
                                     operation=self.KEYS_DELETED).inc()

    def on_commit_completed(self, consumer: ConsumerT,
                            state: typing.Any) -> None:
        """Call when consumer commit offset operation completed."""
        super().on_commit_completed(consumer, state)
        self.consumer_commit_latency.observe(
            self.ms_since(typing.cast(float, state)))

    def on_send_initiated(self, producer: ProducerT, topic: str,
                          message: PendingMessage, keysize: int,
                          valsize: int) -> typing.Any:
        """Call when message added to producer buffer."""
        self.topic_messages_sent.labels(topic=f'topic.{topic}').inc()

        return super().on_send_initiated(producer, topic, message, keysize,
                                         valsize)

    def on_send_completed(self, producer: ProducerT, state: typing.Any,
                          metadata: RecordMetadata) -> None:
        """Call when producer finished sending message."""
        super().on_send_completed(producer, state, metadata)
        self.total_sent_messages.inc()
        self.producer_send_latency.observe(
            self.ms_since(typing.cast(float, state)))

    def on_send_error(self, producer: ProducerT, exc: BaseException,
                      state: typing.Any) -> None:
        """Call when producer was unable to publish message."""
        super().on_send_error(producer, exc, state)
        self.total_error_messages_sent.inc()
        self.producer_error_send_latency.observe(
            self.ms_since(typing.cast(float, state)))

    def on_assignment_error(self, assignor: PartitionAssignorT,
                            state: typing.Dict, exc: BaseException) -> None:
        """Partition assignor did not complete assignor due to error."""
        super().on_assignment_error(assignor, state, exc)
        self.assignment_operations.labels(operation=self.ERROR).inc()
        self.assign_latency.observe(self.ms_since(state['time_start']))

    def on_assignment_completed(self, assignor: PartitionAssignorT,
                                state: typing.Dict) -> None:
        """Partition assignor completed assignment."""
        super().on_assignment_completed(assignor, state)
        self.assignment_operations.labels(operation=self.COMPLETED).inc()
        self.assign_latency.observe(self.ms_since(state['time_start']))

    def on_rebalance_start(self, app: AppT) -> typing.Dict:
        """Cluster rebalance in progress."""
        state = super().on_rebalance_start(app)
        self.total_rebalances.inc()

        return state

    def on_rebalance_return(self, app: AppT, state: typing.Dict) -> None:
        """Consumer replied assignment is done to broker."""
        super().on_rebalance_return(app, state)
        self.total_rebalances.dec()
        self.total_rebalances_recovering.inc()
        self.revalance_done_consumer_latency.observe(
            self.ms_since(state['time_return']))

    def on_rebalance_end(self, app: AppT, state: typing.Dict) -> None:
        """Cluster rebalance fully completed (including recovery)."""
        super().on_rebalance_end(app, state)
        self.total_rebalances_recovering.dec()
        self.revalance_done_latency.observe(self.ms_since(state['time_end']))

    def count(self, metric_name: str, count: int = 1) -> None:
        """Count metric by name."""
        super().count(metric_name, count=count)
        self.count_metrics_by_name.labels(metric=metric_name).inc(count)

    def on_tp_commit(self, tp_offsets: TPOffsetMapping) -> None:
        """Call when offset in topic partition is committed."""
        super().on_tp_commit(tp_offsets)
        for tp, offset in tp_offsets.items():
            self.topic_partition_offset_commited.labels(
                topic=tp.topic, partition=tp.partition).set(offset)

    def track_tp_end_offset(self, tp: TP, offset: int) -> None:
        """Track new topic partition end offset for monitoring lags."""
        super().track_tp_end_offset(tp, offset)
        self.topic_partition_end_offset.labels(
            topic=tp.topic, partition=tp.partition).set(offset)

    def on_web_request_end(self,
                           app: AppT,
                           request: web.Request,
                           response: typing.Optional[web.Response],
                           state: typing.Dict,
                           *,
                           view: web.View = None) -> None:
        """Web server finished working on request."""
        super().on_web_request_end(app, request, response, state, view=view)
        status_code = int(state['status_code'])
        self.http_status_codes.labels(status_code=status_code).inc()
        self.http_latency.observe(self.ms_since(state['time_end']))

    def expose_metrics(self) -> None:
        """Expose promethues metrics using the current aiohttp application."""
        @self.app.page(self.pattern)
        async def metrics_handler(self: _web.View,
                                  request: _web.Request) -> _web.Response:
            headers = {
                'Content-Type': 'text/plain; version=0.0.4; charset=utf-8',
            }

            return cast(
                _web.Response,
                Response(body=generate_latest(REGISTRY),
                         headers=headers,
                         status=200))
Example #13
from prometheus_client import Counter
from prometheus_client import Gauge

c = Counter('my_failures', 'Description of counter')
print(c)
c.inc()  # Increment by 1
print(c)
c.inc(1.6)  # Increment by given value
print(c)
print(c.collect())

g = Gauge("my_gauge", "my description of gauge")
g.set(1)
g.inc(1)
g.dec(2)
# track_inprogress() increments the gauge on entry and decrements it on
# exit; it must be used as a decorator or context manager, so this bare
# call has no effect by itself.
g.track_inprogress()


@g.track_inprogress()
def f():
    pass
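
Calling the decorated function exercises the gauge; a short usage sketch:

f()  # g is incremented while f runs and decremented when it returns
print(g.collect())  # inspect the resulting samples, as with the counter above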
Example #14
class RequestHandler:
    """
    Class that handles the requests arriving at the gateway and the results extracted from the request futures.

    :param metrics_registry: optional metrics registry for prometheus used if we need to expose metrics from the executor or from the data request handler
    :param runtime_name: optional runtime_name that will be registered during monitoring
    """

    def __init__(
        self,
        metrics_registry: Optional['CollectorRegistry'] = None,
        runtime_name: Optional[str] = None,
    ):
        self._request_init_time = {} if metrics_registry else None
        self._executor_endpoint_mapping = None

        if metrics_registry:
            with ImportExtensions(
                required=True,
                help_text='You need to install the `prometheus_client` to use the monitoring functionality of jina',
            ):
                from prometheus_client import Gauge, Summary

            self._receiving_request_metrics = Summary(
                'receiving_request_seconds',
                'Time spent processing request',
                registry=metrics_registry,
                namespace='jina',
                labelnames=('runtime_name',),
            ).labels(runtime_name)

            self._pending_requests_metrics = Gauge(
                'number_of_pending_requests',
                'Number of pending requests',
                registry=metrics_registry,
                namespace='jina',
                labelnames=('runtime_name',),
            ).labels(runtime_name)

        else:
            self._receiving_request_metrics = None
            self._pending_requests_metrics = None

    def handle_request(
        self, graph: 'TopologyGraph', connection_pool: 'GrpcConnectionPool'
    ) -> Callable[['Request'], 'asyncio.Future']:
        """
        Function that handles the requests arriving at the gateway. This will be passed to the streamer.

        :param graph: The TopologyGraph of the Flow.
        :param connection_pool: The connection pool to be used to send messages to specific nodes of the graph
        :return: Return a Function that given a Request will return a Future from where to extract the response
        """

        async def gather_endpoints(request_graph):
            nodes = request_graph.all_nodes
            try:
                tasks_to_get_endpoints = [
                    node.get_endpoints(connection_pool) for node in nodes
                ]
                endpoints = await asyncio.gather(*tasks_to_get_endpoints)
            except InternalNetworkError as err:
                err_code = err.code()
                if err_code == grpc.StatusCode.UNAVAILABLE:
                    err._details = (
                        err.details()
                        + f' |Gateway: Communication error with deployment at address(es) {err.dest_addr}. Head or worker(s) may be down.'
                    )
                    raise err
                else:
                    raise

            self._executor_endpoint_mapping = {}
            for node, (endp, _) in zip(nodes, endpoints):
                self._executor_endpoint_mapping[node.name] = endp.endpoints

        def _handle_request(request: 'Request') -> 'asyncio.Future':
            if self._receiving_request_metrics:
                self._request_init_time[request.request_id] = time.time()
            if self._pending_requests_metrics:
                self._pending_requests_metrics.inc()
            # important that the gateway needs to have an instance of the graph per request
            request_graph = copy.deepcopy(graph)

            if graph.has_filter_conditions:
                request_doc_ids = request.data.docs[
                    :, 'id'
                ]  # used to maintain order of docs that are filtered by executors
            tasks_to_respond = []
            tasks_to_ignore = []
            endpoint = request.header.exec_endpoint
            r = request.routes.add()
            r.executor = 'gateway'
            r.start_time.GetCurrentTime()
            # If the request is targeting a specific deployment, we can send directly to the deployment instead of
            # querying the graph
            for origin_node in request_graph.origin_nodes:
                leaf_tasks = origin_node.get_leaf_tasks(
                    connection_pool,
                    request,
                    None,
                    endpoint=endpoint,
                    executor_endpoint_mapping=self._executor_endpoint_mapping,
                    target_executor_pattern=request.header.target_executor,
                )
                # Every origin node returns a set of tasks that are the ones corresponding to the leafs of each of their
                # subtrees that unwrap all the previous tasks. It starts like a chain of waiting for tasks from previous
                # nodes
                tasks_to_respond.extend([task for ret, task in leaf_tasks if ret])
                tasks_to_ignore.extend([task for ret, task in leaf_tasks if not ret])

            def _sort_response_docs(response):
                # sort response docs according to their order in the initial request
                def sort_by_request_order(doc):
                    if doc.id in request_doc_ids:
                        return request_doc_ids.index(doc.id)
                    else:
                        return len(request_doc_ids)  # put new/unknown docs at the end

                sorted_docs = sorted(response.data.docs, key=sort_by_request_order)
                response.data.docs = DocumentArray(sorted_docs)

            async def _process_results_at_end_gateway(
                tasks: List[asyncio.Task], request_graph: TopologyGraph
            ) -> asyncio.Future:
                if self._executor_endpoint_mapping is None:
                    await asyncio.gather(gather_endpoints(request_graph))

                partial_responses = await asyncio.gather(*tasks)
                partial_responses, metadatas = zip(*partial_responses)
                filtered_partial_responses = list(
                    filter(lambda x: x is not None, partial_responses)
                )

                response = filtered_partial_responses[0]
                request_graph.add_routes(response)

                if graph.has_filter_conditions:
                    _sort_response_docs(response)

                return response

            # In case of empty topologies
            if not tasks_to_respond:
                r.end_time.GetCurrentTime()
                future = asyncio.Future()
                future.set_result((request, {}))
                tasks_to_respond.append(future)
            return asyncio.ensure_future(
                _process_results_at_end_gateway(tasks_to_respond, request_graph)
            )

        return _handle_request

    def handle_result(self) -> Callable[['Request'], 'asyncio.Future']:
        """
        Function that handles the result when extracted from the request future

        :return: Return a Function that returns a request to be returned to the client
        """

        def _handle_result(result: 'Request'):
            """
            Function that handles the result when extracted from the request future

            :param result: The result returned to the gateway. It extracts the request to be returned to the client
            :return: Returns a request to be returned to the client
            """

            for route in result.routes:
                if route.executor == 'gateway':
                    route.end_time.GetCurrentTime()

            if self._receiving_request_metrics:
                init_time = self._request_init_time.pop(
                    result.request_id
                )  # need to pop otherwise it stays in memory forever
                self._receiving_request_metrics.observe(time.time() - init_time)

            if self._pending_requests_metrics:
                self._pending_requests_metrics.dec()

            return result

        return _handle_result
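
A sketch of constructing the handler with monitoring enabled; the runtime name is illustrative. handle_request() increments the pending-requests gauge per request, and handle_result() decrements it and observes the elapsed time once the response future resolves:

from prometheus_client import CollectorRegistry

registry = CollectorRegistry()
handler = RequestHandler(metrics_registry=registry, runtime_name='gateway/rep-0')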