class PrometheusMiddleware(object):
    """Falcon middleware that records per-request Prometheus metrics.

    Registers a request counter and a latency histogram on the supplied
    registry, and also doubles as a resource exposing the metrics on GET.
    """

    def __init__(self, register: CollectorRegistry):
        self.registry = register
        self.requests = Counter(
            'http_total_request',
            'Counter of total HTTP requests',
            ['method', 'path', 'status'],
            registry=self.registry)
        self.request_histogram = Histogram(
            'request_latency_seconds',
            'Histogram of request latency',
            ['method', 'path', 'status'],
            registry=self.registry)

    def process_request(self, req: Request, resp: Response) -> None:
        # Stamp the start time so process_response can compute latency.
        req.start_time = time.time()

    def process_response(self, req: Request, resp: Response, resource,
                         req_succeeded: bool) -> None:
        # Guard: process_response may run for requests that never passed
        # through process_request (e.g. an earlier middleware raised), in
        # which case there is no start_time to measure against.
        if not hasattr(req, 'start_time'):
            return
        resp_time = time.time() - req.start_time
        self.requests.labels(
            method=req.method, path=req.path, status=resp.status).inc()
        self.request_histogram.labels(
            method=req.method, path=req.path,
            status=resp.status).observe(resp_time)

    def on_get(self, req: Request, resp: Response) -> None:
        # Expose registry contents in the Prometheus text exposition format.
        data = generate_latest(self.registry)
        resp.content_type = 'text/plain; version=0.0.4; charset=utf-8'
        resp.body = data.decode('utf-8')  # decode() already returns str
class Prometheus:
    """Container for application-level Prometheus metrics plus
    context-manager helpers for tracking in-flight requests and latency."""

    def __init__(self):
        self.request_count = Counter(
            'requests_total', 'Total Request Count',
            ['app_name', 'method', 'endpoint', 'http_status'])
        self.request_latency = Histogram(
            'request_latency_seconds', 'Request Latency',
            ['app_name', 'endpoint'])
        self.request_in_progress = Gauge(
            'requests_in_progress_total', 'Requests in progress',
            ['app_name', 'endpoint', 'method'])
        self.graph_stats = Counter('graph_stats', 'Graph Stats', [
            'constraints_added', 'constraints_removed', 'contains_updates',
            'indexes_added', 'indexes_removed', 'labels_added',
            'labels_removed', 'nodes_created', 'nodes_deleted',
            'properties_set', 'relationships_created',
            'relationships_deleted', 'query_count'
        ])

    @contextmanager
    def in_flight(self, name, path, method):
        """Track an in-progress request.

        The gauge is decremented in a finally block so it cannot leak when
        the wrapped body raises (the original skipped dec() on exception,
        permanently inflating the gauge).
        """
        self.request_in_progress.labels(name, path, method).inc()
        try:
            yield
        finally:
            self.request_in_progress.labels(name, path, method).dec()

    @contextmanager
    def latency(self, name, path):
        """Observe the wall-clock duration of the wrapped block.

        Observed in a finally block so failed requests are measured too.
        """
        start = time.time()
        try:
            yield
        finally:
            self.request_latency.labels(name, path).observe(
                time.time() - start)
class InstrumentMiddleware:
    """WSGI middleware instrumenting request count, duration and an
    in-progress gauge for a BentoML service."""

    def __init__(self, app, bento_service):
        self.app = app
        self.bento_service = bento_service

        from prometheus_client import Histogram, Counter, Gauge

        service_name = self.bento_service.name
        namespace = config('instrument').get('default_namespace')

        self.metrics_request_duration = Histogram(
            name=service_name + '_request_duration_seconds',
            documentation=service_name + " API HTTP request duration in seconds",
            namespace=namespace,
            labelnames=['endpoint', 'service_version', 'http_response_code'],
        )
        self.metrics_request_total = Counter(
            name=service_name + "_request_total",
            # fixed typo in help text ('Totoal')
            documentation='Total number of HTTP requests',
            namespace=namespace,
            labelnames=['endpoint', 'service_version', 'http_response_code'],
        )
        self.metrics_request_in_progress = Gauge(
            name=service_name + "_request_in_progress",
            documentation='Total number of HTTP requests in progress now',
            namespace=namespace,
            labelnames=['endpoint', 'service_version'],
        )

    def __call__(self, environ, start_response):
        req = Request(environ)
        endpoint = req.path
        start_time = default_timer()

        def start_response_wrapper(status, headers, *args):
            # PEP 3333: start_response takes an optional third exc_info
            # argument. The original wrapper only accepted two parameters
            # and raised TypeError whenever the server reported an error;
            # pass everything through.
            ret = start_response(status, headers, *args)
            status_code = int(status.split()[0])

            # instrument request total count
            self.metrics_request_total.labels(
                endpoint=endpoint,
                service_version=self.bento_service.version,
                http_response_code=status_code,
            ).inc()

            # instrument request duration (clamped to >= 0)
            total_time = max(default_timer() - start_time, 0)
            self.metrics_request_duration.labels(
                endpoint=endpoint,
                service_version=self.bento_service.version,
                http_response_code=status_code,
            ).observe(total_time)
            return ret

        with self.metrics_request_in_progress.labels(
                endpoint=endpoint,
                service_version=self.bento_service.version).track_inprogress():
            return self.app(environ, start_response_wrapper)
class PrometheusReporter(NullReporter):
    """Span reporter that feeds Prometheus metrics.

    Server-side HTTP spans are routed to HTTPMetrics; all other spans are
    recorded into a per-operation-name duration histogram.
    """

    def __init__(self, namespace='', normalize=default_normalize):
        self.histograms = {}
        self.lock = Lock()
        self.namespace = namespace
        self.normalize = normalize or default_normalize
        self._http_metrics = HTTPMetrics(namespace=namespace,
                                         normalize=normalize)
        self._operation_metrics = Histogram(
            self.metric_name(METRICS_NAME_OPERATION),
            'Duration of operations in microsecond', ['name'])

    def report_span(self, span):
        """Route one finished span to the appropriate metric set."""
        kind = self.get_tag(span, 'span.kind')
        url = self.get_tag(span, 'http.url')
        method = self.get_tag(span, 'http.method')
        # Server-side HTTP spans get the dedicated HTTP metric treatment.
        if kind == 'server' and (url or method):
            self._http_metrics.record(span)
            return
        duration = span.end_time - span.start_time
        self._operation_metrics.labels(
            self.normalize(span.operation_name)).observe(duration)

    def get_tag(self, span, key):
        """Return the string value of the first tag named ``key``, or ''."""
        for tag in span.tags:
            if tag.key != key:
                continue
            return str(tag.value) if hasattr(tag, 'value') else ''
        return ''

    def metric_name(self, name):
        """Apply the configured namespace to a bare metric name."""
        return metric_name(name, namespace=self.namespace)
class ProtonPrometheus(object):
    """Falcon middleware + resource that collects request count/latency
    metrics into its own private registry and serves them on GET."""

    def __init__(self):
        super(ProtonPrometheus, self).__init__()
        self.registry = CollectorRegistry()
        self.requests = Counter(
            'http_total_request',
            'Counter of total HTTP requests',
            ['method', 'path', 'status'],
            registry=self.registry)
        self.request_historygram = Histogram(
            'request_latency_seconds',
            'Histogram of request latency',
            ['method', 'path', 'status'],
            registry=self.registry)

    def process_request(self, req, resp):
        # Stamp the request context; read back in process_response.
        req.context.start_time = time.time()

    def process_response(self, req, resp, resource, req_succeeded):
        # Skip requests that never went through process_request.
        if 'start_time' not in req.context:
            return
        elapsed = time.time() - req.context.start_time
        label_values = {
            'method': req.method,
            'path': req.path,
            'status': resp.status,
        }
        self.requests.labels(**label_values).inc()
        self.request_historygram.labels(**label_values).observe(elapsed)

    def on_get(self, req, resp):
        # Serve the registry in the Prometheus text exposition format.
        payload = generate_latest(self.registry)
        resp.content_type = 'text/plain; version=0.0.4; charset=utf-8'
        resp.body = str(payload.decode('utf-8'))
def test_histogram(self): """Test that we can track histogram in Service303""" # Add a histogram with a label to the regisry c = Histogram('process_max_fds', 'A summary', ['result'], registry=self.registry, buckets=[0, 2, float('inf')]) c.labels('success').observe(1.23) c.labels('failure').observe(2.34) # Build proto outputs histogram1 = metrics_pb2.Histogram(sample_count=1, sample_sum=1.23) histogram1.bucket.add(upper_bound=0, cumulative_count=0) histogram1.bucket.add(upper_bound=2, cumulative_count=1) histogram1.bucket.add(upper_bound=float('inf'), cumulative_count=1) histogram2 = metrics_pb2.Histogram(sample_count=1, sample_sum=2.34) histogram2.bucket.add(upper_bound=0, cumulative_count=0) histogram2.bucket.add(upper_bound=2, cumulative_count=0) histogram2.bucket.add(upper_bound=float('inf'), cumulative_count=1) metric1 = metrics_pb2.Metric(histogram=histogram1, timestamp_ms=1234000) metric2 = metrics_pb2.Metric(histogram=histogram2, timestamp_ms=1234000) family = metrics_pb2.MetricFamily(name=str( metricsd_pb2.process_max_fds), type=metrics_pb2.HISTOGRAM) metric1.label.add(name=str(metricsd_pb2.result), value='success') metric2.label.add(name=str(metricsd_pb2.result), value='failure') family.metric.extend([metric1, metric2]) with unittest.mock.patch('time.time') as mock_time: mock_time.side_effect = lambda: 1234 self.assertCountEqual( list(metrics_export.get_metrics(self.registry))[0].metric, family.metric)
class HTTPMetrics(object):
    """Prometheus metrics for HTTP server spans: request counts, latency,
    and status-code class buckets (2xx..5xx)."""

    def __init__(self, namespace='', normalize=default_normalize):
        self.namespace = namespace
        self.normalize = normalize or default_normalize
        self.requests = Counter(
            self.metric_name(METRICS_NAME_HTTP_REQUESTS),
            'Counts the number of requests made distinguished by their endpoint and error status',
            ['endpoint', 'error'])
        self.latency = Histogram(
            self.metric_name(METRICS_NAME_HTTP_REQUEST_LATENCY),
            'Duration of HTTP requests in second distinguished by their endpoint and error status',
            ['endpoint', 'error'])
        self.status_codes = Counter(
            self.metric_name(METRICS_NAME_HTTP_STATUS_CODES),
            'Counts the responses distinguished by endpoint and status code bucket',
            ['endpoint', 'status_code'])

    def record(self, span):
        """Record one finished server span into the metric set."""
        status_code = self.get_int_tag(span, 'http.status_code')
        # Floor division: under Python 3 the original '/' produced a float
        # (e.g. 2.0), so the status_code label came out as '2.0xx'.
        sc = status_code // 100
        endpoint = self.normalize(span.operation_name)
        if not endpoint:
            endpoint = "other"
        # The 'error' tag is normalized to the strings 'true'/'false'.
        error = self.get_tag(span, 'error')
        if not error or error.lower() == 'false':
            error = 'false'
        else:
            error = 'true'
        self.requests.labels(endpoint, error).inc(1)
        self.latency.labels(endpoint, error).observe(
            span.end_time - span.start_time)
        # Only count well-formed HTTP status classes (2xx through 5xx).
        if 2 <= sc <= 5:
            self.status_codes.labels(endpoint, str(sc) + 'xx').inc(1)

    def get_int_tag(self, span, key):
        """Return the integer value of tag ``key``, or 0 when absent."""
        tg = self.get_tag(span, key)
        if not tg:
            return 0
        return int(tg)

    def get_tag(self, span, key):
        """Return the string value of the first tag named ``key``, or ''."""
        for tag in span.tags:
            if tag.key == key:
                if hasattr(tag, 'value'):
                    return str(tag.value)
                break
        return ''

    def metric_name(self, name):
        """Apply the configured namespace to a bare metric name."""
        return metric_name(name, namespace=self.namespace)
class OperationMetricSet:
    """Collection of Prometheus metrics representing a logical operation"""

    requests: Counter
    requests_duration: Histogram
    exceptions: Counter
    requests_in_progress: Gauge

    def __init__(self, operation_name: str, labels: List[str]):
        self.requests = Counter(
            f"pyncette_{operation_name}_total",
            f"Total count of {operation_name} operations",
            labels,
        )
        self.requests_duration = Histogram(
            f"pyncette_{operation_name}_duration_seconds",
            f"Histogram of {operation_name} processing time",
            labels,
        )
        self.exceptions = Counter(
            f"pyncette_{operation_name}_failures_total",
            # was "Total count of failed ... failures" (redundant)
            f"Total count of failed {operation_name} operations",
            [*labels, "exception_type"],
        )
        self.requests_in_progress = Gauge(
            f"pyncette_{operation_name}_in_progress",
            f"Gauge of {operation_name} operations currently being processed",
            labels,
        )

    @contextlib.asynccontextmanager
    async def measure(self, **labels: Dict[str, str]) -> AsyncIterator[None]:
        """An async context manager that measures the execution of the
        wrapped code.

        Increments the request and in-progress metrics on entry; on exit
        the duration is observed and in-progress decremented regardless of
        outcome. Exceptions are counted by type and re-raised.
        """
        if labels:
            self.requests_in_progress.labels(**labels).inc()
            self.requests.labels(**labels).inc()
        else:
            self.requests_in_progress.inc()
            self.requests.inc()
        before_time = time.perf_counter()
        try:
            yield
        except Exception as e:
            self.exceptions.labels(**labels, exception_type=type(e).__name__).inc()
            # Bare re-raise keeps the original traceback and context;
            # the previous `raise e from None` suppressed __context__.
            raise
        finally:
            if labels:
                self.requests_duration.labels(**labels).observe(
                    time.perf_counter() - before_time
                )
                self.requests_in_progress.labels(**labels).dec()
            else:
                self.requests_duration.observe(time.perf_counter() - before_time)
                self.requests_in_progress.dec()
class PrometheusMixin(_Base):
    """Mixin for tornado.web.Application"""

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.prometheus_registry = kwargs.get("prometheus_registry", REGISTRY)
        # Request counter, labelled by handler class, method and status.
        self._requests_total_counter = Counter(
            registry=self.prometheus_registry,
            namespace=_NAMESPACE,
            subsystem=_SUB_SYSTEM,
            name="requests_total",
            documentation="Counter of HTTP requests.",
            labelnames=("handler", "method", "code"),
        )
        # Latency histogram with sub-millisecond through one-minute buckets.
        self._requests_duration_seconds_histogram = Histogram(
            registry=self.prometheus_registry,
            namespace=_NAMESPACE,
            subsystem=_SUB_SYSTEM,
            name="request_duration_seconds",
            documentation="Histogram of latencies for HTTP requests.",
            buckets=(0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 3, 8, 20, 60),
            labelnames=("handler", "method"),
        )
        # Response body size histogram (bytes), decade buckets.
        self._response_size_bytes_histogram = Histogram(
            registry=self.prometheus_registry,
            namespace=_NAMESPACE,
            subsystem=_SUB_SYSTEM,
            name="response_size_bytes",
            documentation="Histogram of response size for HTTP requests.",
            buckets=(10, 100, 1_000, 10_000, 100_000, 1_000_000, 10_000_000),
            labelnames=("handler", "method"),
        )

    def log_request(self, handler: RequestHandler) -> None:
        """Tornado hook invoked once per finished request."""
        super().log_request(handler)
        self._update_metrics(handler)

    def _update_metrics(self, handler: RequestHandler) -> None:
        """Record duration, count and response size for one request."""
        handler_name = type(handler).__name__
        method = handler.request.method
        content_length = self._parse_content_length(handler)
        self._requests_duration_seconds_histogram.labels(
            handler_name, method).observe(handler.request.request_time())
        self._requests_total_counter.labels(
            handler_name, method, handler.get_status()).inc()
        if isinstance(content_length, int):
            self._response_size_bytes_histogram.labels(
                handler_name, method).observe(content_length)

    @staticmethod
    def _parse_content_length(handler: RequestHandler) -> Optional[int]:
        """Parse the Content-Length response header; None if absent/invalid."""
        raw = handler._headers.get("Content-Length")
        if not isinstance(raw, str):
            return None
        try:
            return int(raw)
        except ValueError:
            return None
class PrometheusExporter(object):
    """Singleton exporting RBD prober metrics (latency, bandwidth, ops)."""

    __instance = None
    # Label set shared by all three metrics.
    LABELS = [
        'name',
        'object_size',
        'type',
        'pool',
        'image',
        'status',
    ]
    NAMESPACE = 'rbd_prober'

    @staticmethod
    def getInstance():
        """Return the singleton, creating it on first use."""
        if PrometheusExporter.__instance is None:
            PrometheusExporter()
        return PrometheusExporter.__instance

    def __init__(self, *args, **kwargs):
        if PrometheusExporter.__instance is not None:
            raise Exception("This class is a singleton!")
        PrometheusExporter.__instance = self

    def init_metrics(self, histogram_buckets):
        """Create the metric objects using the given latency buckets.

        Builds a local bucket list instead of appending +Inf to the
        caller's list in place — the original mutated the argument, so a
        second call with the same list produced a duplicate +Inf bucket.
        """
        buckets = [*histogram_buckets, INF]
        self.response_time = Histogram(
            name='response_time',
            documentation='Prober response time in seconds',
            labelnames=self.LABELS,
            namespace=self.NAMESPACE,
            buckets=buckets,
        )
        self.bandwidth = Counter(
            name='bandwidth',
            documentation='Bytes has be written or read from RBD',
            labelnames=self.LABELS,
            namespace=self.NAMESPACE,
        )
        self.prober_ops = Counter(
            name='ops',
            documentation='Total ops count',
            labelnames=self.LABELS,
            namespace=self.NAMESPACE,
        )

    def observe(self, response_time, bytes_size, label_values):
        """Record one probe result; response_time == -1 marks a failure.

        Note: sets label_values['status'] in place before labelling.
        """
        if response_time != -1:
            label_values['status'] = 'success'
        else:
            label_values['status'] = 'fail'
        self.response_time.labels(**label_values).observe(response_time)
        self.bandwidth.labels(**label_values).inc(bytes_size)
        self.prober_ops.labels(**label_values).inc()
class MonitorMiddleware(object):
    """Flask/WSGI middleware exposing request/error/latency metrics,
    labelled with the EC2 instance id and the worker process id."""

    def __init__(self, flask_app, metric_url):
        self.metric_url = metric_url
        self.process_id = str(os.getpid())
        # EC2 instance-metadata lookup. Without a timeout this call hangs
        # indefinitely when running outside EC2 (the link-local address
        # blackholes), so bound it explicitly.
        self.instance_id = requests.get(
            'http://169.254.169.254/latest/meta-data/instance-id',
            timeout=2).text
        flask_app.add_url_rule(metric_url, view_func=metrics, methods=['GET'])
        self.wsgi_app = ProxyFix(flask_app.wsgi_app)
        self.req_counter = Counter(
            'recommend_requests_total', 'Total request counts',
            ['method', 'endpoint', 'instance', 'process'])
        self.err_counter = Counter(
            'recommend_error_total', 'Total error counts',
            ['method', 'endpoint', 'instance', 'process'])
        self.resp_latency = Histogram(
            'recommend_response_latency_millisecond',
            'Response latency (millisecond)',
            ['method', 'endpoint', 'instance', 'process'],
            buckets=(10, 20, 30, 50, 80, 100, 200, 300, 500, 1000, 2000, 3000))

    def _label(self):
        """Common label set for the current request context."""
        return {
            'method': request.method,
            'endpoint': request.url_rule.rule,
            'instance': self.instance_id,
            'process': self.process_id,
        }

    def log_response(self, response):
        """Record count and latency for a finished request.

        Requests to the metrics endpoint itself are not counted.
        """
        label = self._label()
        if label['endpoint'] == self.metric_url:
            return
        # g.start_time is set elsewhere at request start (Flask before_request).
        time_used = int((time.time() - g.start_time) * 1000)
        logger.info('{} {} {}'.format(response.status_code, label['endpoint'],
                                      time_used))
        self.req_counter.labels(**label).inc()
        self.resp_latency.labels(**label).observe(time_used)

    def log_exception(self, e):
        """Log and count an unhandled exception."""
        logger.exception(e)
        self.err_counter.labels(**self._label()).inc()
class MetricDecoration:
    """Singleton that wraps every function in the given modules with a
    Prometheus duration histogram."""

    __instance = None

    def __init__(self, modules, service_name, whitelist=None):
        if MetricDecoration.__instance:
            raise Exception("MetricDecoration instance exists: Singleton")
        else:
            MetricDecoration.__instance = self
        self.modules = modules
        self.H = Histogram(f"{service_name}_call_duration_seconds",
                           "API call duration (s)", ["call"])
        self.whitelist = whitelist or []

    def decorate_all_in_modules(self):
        """
        Decorate all functions in a module with the specified decorator
        """
        for module_ in self.modules:
            for name in dir(module_):
                # Whitelisted names are left untouched.
                if name in self.whitelist:
                    continue
                obj = getattr(module_, name)
                if not isinstance(obj, FunctionType):
                    logger.debug(
                        f"No metrics on {module_}:{name} because it is not a coroutine or "
                        f"function")
                    continue
                # Only instrument functions actually defined in this module;
                # names imported from elsewhere are skipped.
                if obj.__module__ != module_.__name__:
                    logger.debug(
                        f"No metrics on {module_}:{name} because it belongs to another "
                        f"module")
                    continue
                logger.debug(f"Adding metrics to {module_}:{name}")
                setattr(module_, name,
                        self._prometheus_module_metric_decorator(obj))

    def _prometheus_module_metric_decorator(self, f: FunctionType):
        """
        A Prometheus decorator adding timing metrics to a function.
        This decorator will work on both asynchronous and synchronous functions.
        Note, however, that this function will turn synchronous functions into
        asynchronous ones when used as a decorator.
        :param f: The function for which to capture metrics
        """
        module_ = f.__module__.split(".")[-1]
        call_key = "{}_{}".format(module_, f.__name__)

        @functools.wraps(f)
        def wrapper(*args, **kwargs):
            # Time the call under the per-function label.
            with self.H.labels(call=call_key).time():
                return f(*args, **kwargs)

        return wrapper
class MetricsMiddleware():
    """Falcon-style middleware recording request count and latency into
    the default Prometheus registry."""

    def __init__(self):
        self.requests = Counter(
            'http_total_request',
            'Counter of total HTTP requests',
            ['method', 'path', 'status'])
        self.request_historygram = Histogram(
            'request_latency_seconds',
            'Histogram of request latency',
            ['method', 'path', 'status'])

    def process_request(self, req, resp):
        # Stamp request start; read back in process_response.
        req.start_time = time.time()

    def process_response(self, req, resp, resource, req_succeeded):
        # process_response can run for requests that never passed through
        # process_request (e.g. an earlier middleware raised); skip those
        # rather than crash with AttributeError. Matches the guard used by
        # ProtonPrometheus elsewhere in this codebase.
        if not hasattr(req, 'start_time'):
            return
        resp_time = time.time() - req.start_time
        self.requests.labels(method=req.method, path=req.path,
                             status=resp.status).inc()
        self.request_historygram.labels(
            method=req.method, path=req.path,
            status=resp.status).observe(resp_time)
class DatabaseMonitoring:
    """Singleton holding Prometheus metrics for database transactions."""

    # Shared singleton instance; __new__ always hands back the same object.
    _instance = None
    _init = False

    def __new__(cls, *args, **kwargs):
        if cls._instance:
            return cls._instance
        o = object.__new__(cls)
        cls._instance = o
        return o

    @once
    def __init__(self):
        # @once presumably prevents re-running __init__ on repeated
        # instantiation, which would re-register the collectors — TODO confirm.
        self.request_latency = Histogram("enjoliver_db_request_duration_seconds",
                                         "Database request latency",
                                         ["caller"])
        self.request_count = Counter("enjoliver_db_request_total",
                                     "Database request count",
                                     ["caller"])
        self.cockroach_retry_count = Counter("enjoliver_cockroachdb_txn_retry_total",
                                             "CockroachDB transaction retry count",
                                             ['caller'])
        self.exception_count = Counter("enjoliver_db_exception_total",
                                       "Counter of number error during session",
                                       ["caller", "exception"])

    @contextmanager
    def observe_transaction(self, caller: str):
        """
        Wrapper to call around transaction against a database
        :param caller:
        :return:
        """
        start = time.time()
        try:
            yield
        except Exception as e:
            # Count the exception by type, then re-raise for the caller.
            self.exception_count.labels(caller, type(e).__name__).inc()
            raise
        finally:
            # Latency and count are recorded whether or not the body failed.
            latency = time.time() - start
            self.request_latency.labels(caller).observe(latency)
            self.request_count.labels(caller).inc()
class Application(web.Application):
    """Tornado application wiring URL handlers and Prometheus metrics."""

    def __init__(self, *args, **kwargs):
        routes = [
            (r'/', HealthcheckHandler),
            (r'/code', BarcodeHandler),
            (r'/code.html', HTMLBarcodeHandler),
            (r'/metrics', MetricsHandler),
        ]
        super().__init__(routes, middlewares=[], *args, **settings, **kwargs)
        self.request_count = Counter(
            'requests_total',
            'Total requests count',
            ['method', 'endpoint', 'http_status'],
        )
        self.redis_request_time = Histogram(
            'redis_request_latency',
            'Redis request total time',
            ['endpoint'],
        )
        self.mongodb_collision_count = Counter(
            'collision_total',
            'Total collision omitted',
        )

    def log_request(self, handler):
        """Count every finished request; record redis latency when the
        handler attached one to the request."""
        super(Application, self).log_request(handler)
        endpoint = type(handler).__name__.lower()
        self.request_count.labels(
            method=handler.request.method.lower(),
            endpoint=endpoint,
            http_status=int(handler.get_status()),
        ).inc()
        if hasattr(handler.request, 'redis_request_time'):
            self.redis_request_time.labels(endpoint=endpoint).observe(
                handler.request.redis_request_time)
class PrometheusConfig:
    """Starts a metrics HTTP server and continuously samples bitcoin
    prices into a histogram labelled by currency."""

    def __init__(self):
        self.bitcoin_cost = Histogram(
            name='bitcoin_cost',
            documentation='bitcoin cost over time',
            buckets=[
                10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000,
                50000, 55000, 60000, 65000, 70000, 75000, 80000, 85000,
                90000, 95000, 100000
            ],
            labelnames=['currency'],
        )
        start_http_server(8000)
        # NOTE: never returns — gather_metrics loops forever.
        self.gather_metrics()

    def gather_metrics(self):
        """Poll the price API every 5 seconds and observe each currency.

        Prices arrive as strings with thousands separators
        (e.g. '23,456.7'), so separators are stripped before parsing.
        """
        while True:
            json_response = requests.get(API_URL).json()
            for currency, price in json_response.items():
                # replace() is the direct idiom for stripping separators
                # (was: float(''.join(price.split(',')))).
                self.bitcoin_cost.labels(currency).observe(
                    float(price.replace(',', '')))
            time.sleep(5)
mk = args.memcachekey mv = args.memcachevalue b = args.buckets # convert buckets config string to float list buckets = [float(i) for i in b.split(', ')] # metrics REQUEST_TIME = Histogram('memcachemon_request_duration_seconds', 'Time in seconds a memcache ' 'operation takes', ['operation', 'memcache'], buckets=buckets) REQUEST_FAIL = Counter('memcachemon_request_failures', 'Counter for failed operations', ['operation', 'memcache']) # labeled instances request_time_get = REQUEST_TIME.labels(operation="get", memcache=mc) request_time_set = REQUEST_TIME.labels(operation="set", memcache=mc) # get func with decorator @request_time_get.time() def memc_get(key): try: client.get(key) except Exception as error: REQUEST_FAIL.labels(operation="get", memcache=mc).inc() logger.warning("Error on mc get: %s", error) # set func with decorator @request_time_set.time() def memc_set(key, value): try:
class _MarshalService(cls):
    """Marshal service subclass instrumented with Prometheus metrics for
    micro-batch request handling."""

    def __init__(self, *args, **kwargs):
        # Mirror functools.wraps semantics on the class so it keeps the
        # wrapped class's metadata (name, docstring, etc.).
        for attr_name in functools.WRAPPER_ASSIGNMENTS:
            try:
                setattr(self.__class__, attr_name, getattr(cls, attr_name))
            except AttributeError:
                pass

        from prometheus_client import Counter, Gauge, Histogram

        super(_MarshalService, self).__init__(*args, **kwargs)
        namespace = config('instrument').get(
            'default_namespace')  # its own namespace?
        service_name = self.bento_service_metadata_pb.name
        self.metrics_request_batch_size = Histogram(
            name=service_name + '_mb_batch_size',
            # added the missing space between service name and description
            documentation=service_name + " microbatch request batch size",
            namespace=namespace,
            labelnames=['endpoint'],
        )
        self.metrics_request_duration = Histogram(
            name=service_name + '_mb_requestmb_duration_seconds',
            documentation=service_name + " API HTTP request duration in seconds",
            namespace=namespace,
            labelnames=['endpoint', 'http_response_code'],
        )
        self.metrics_request_in_progress = Gauge(
            name=service_name + "_mb_request_in_progress",
            documentation='Total number of HTTP requests in progress now',
            namespace=namespace,
            labelnames=['endpoint', 'http_method'],
        )
        self.metrics_request_exception = Counter(
            name=service_name + "_mb_request_exception",
            documentation='Total number of service exceptions',
            namespace=namespace,
            labelnames=['endpoint', 'exception_class'],
        )
        self.metrics_request_total = Counter(
            name=service_name + "_mb_request_total",
            # was a copy-paste of the exception counter's help text
            documentation='Total number of HTTP requests',
            namespace=namespace,
            labelnames=['endpoint', 'http_response_code'],
        )

    async def request_dispatcher(self, request):
        """Dispatch one request via the parent implementation, recording
        total/duration/in-progress/exception metrics around it."""
        func = super(_MarshalService, self).request_dispatcher
        api_name = request.match_info.get("name", "/")
        _metrics_request_in_progress = self.metrics_request_in_progress.labels(
            endpoint=api_name,
            http_method=request.method,
        )
        _metrics_request_in_progress.inc()
        time_st = time.time()
        try:
            resp = await func(request)
        except Exception as e:  # pylint: disable=broad-except
            # Count and log, then degrade to a plain 500 response.
            self.metrics_request_exception.labels(
                endpoint=api_name,
                exception_class=e.__class__.__name__).inc()
            logger.error(traceback.format_exc())
            resp = aiohttp.web.Response(status=500)
        self.metrics_request_total.labels(
            endpoint=api_name, http_response_code=resp.status).inc()
        self.metrics_request_duration.labels(
            endpoint=api_name,
            http_response_code=resp.status).observe(time.time() - time_st)
        _metrics_request_in_progress.dec()
        return resp

    async def _batch_handler_template(self, requests, api_name):
        """Observe the batch size, then defer to the parent handler."""
        func = super(_MarshalService, self)._batch_handler_template
        self.metrics_request_batch_size.labels(endpoint=api_name).observe(
            len(requests))
        return await func(requests, api_name)
class Prometheus(commands.Cog):
    """Collects prometheus metrics"""

    def __init__(self, bot):
        self.bot = bot
        # Process / system resource gauges.
        self.ram_gauge = Gauge(
            "miso_memory_usage_bytes",
            "Memory usage of the bot process in bytes.",
        )
        self.cpu_gauge = Gauge(
            "system_cpu_usage_percent",
            "CPU usage of the system in percent.",
            ["core"],
        )
        # Discord gateway / command metrics.
        self.event_counter = Counter(
            "miso_gateway_events_total",
            "Total number of gateway events.",
            ["event_type"],
        )
        self.command_histogram = Histogram(
            "miso_command_response_time_seconds",
            "Command end-to-end response time in seconds.",
            ["command"],
            buckets=(0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0, 3.0, 5.0),
        )
        self.shard_latency_summary = Summary(
            "miso_shard_latency_seconds",
            "Latency of a shard in seconds.",
            ["shard"],
        )
        # Cache size gauges, refreshed by log_cache_contents.
        self.guild_count = Gauge(
            "miso_cached_guild_count",
            "Total amount of guilds cached.",
        )
        self.member_count = Gauge(
            "miso_cached_member_count",
            "Total amount of members cached.",
        )

    async def cog_load(self):
        # Start the periodic sampling loops when the cog is loaded.
        self.log_system_metrics.start()
        self.log_shard_latencies.start()
        self.log_cache_contents.start()

    def cog_unload(self):
        # Stop all loops so they don't keep running after unload.
        self.log_system_metrics.cancel()
        self.log_shard_latencies.cancel()
        self.log_cache_contents.cancel()

    @commands.Cog.listener()
    async def on_socket_event_type(self, event_type):
        # One increment per raw gateway event, labelled by event type.
        self.event_counter.labels(event_type).inc()

    @tasks.loop(seconds=10)
    async def log_shard_latencies(self):
        # Sample the latency of every shard.
        for shard in self.bot.shards.values():
            self.shard_latency_summary.labels(shard.id).observe(shard.latency)

    @tasks.loop(minutes=1)
    async def log_cache_contents(self):
        # Refresh cached guild/member totals.
        guild_count = len(self.bot.guilds)
        member_count = len(self.bot.users)
        self.guild_count.set(guild_count)
        self.member_count.set(member_count)

    @tasks.loop(seconds=10)
    async def log_system_metrics(self):
        # RSS of this process plus per-core CPU usage of the host.
        ram = psutil.Process().memory_info().rss
        self.ram_gauge.set(ram)
        for core, usage in enumerate(
                psutil.cpu_percent(interval=None, percpu=True)):
            self.cpu_gauge.labels(core).set(usage)

    @log_shard_latencies.before_loop
    @log_cache_contents.before_loop
    async def task_waiter(self):
        # Don't sample shard/cache data until the bot is fully connected.
        await self.bot.wait_until_ready()

    @commands.Cog.listener()
    async def on_command_completion(self, ctx: commands.Context):
        # ctx.timer is presumably stamped at command invocation — TODO confirm.
        # Subcommands are skipped so a group invocation is only timed once.
        if ctx.invoked_subcommand is None:
            took = time() - ctx.timer
            command = str(ctx.command)
            self.command_histogram.labels(command).observe(took)
class ServerSpawnStatus(Enum): """ Possible values for 'status' label of SERVER_SPAWN_DURATION_SECONDS """ success = 'success' failure = 'failure' already_pending = 'already-pending' throttled = 'throttled' too_many_users = 'too-many-users' def __str__(self): return self.value for s in ServerSpawnStatus: # Create empty metrics with the given status SERVER_SPAWN_DURATION_SECONDS.labels(status=s) PROXY_ADD_DURATION_SECONDS = Histogram( 'proxy_add_duration_seconds', 'duration for adding user routes to proxy', ['status'] ) class ProxyAddStatus(Enum): """ Possible values for 'status' label of PROXY_ADD_DURATION_SECONDS """ success = 'success' failure = 'failure'
got_request_exception, abort, request from flask_talisman import Talisman, DENY from prometheus_client import generate_latest, CONTENT_TYPE_LATEST, Counter, Histogram from .transformer import Transformer from ..specs.factory import InvalidConfiguration """Web app that provides default values for fiaas config, an endpoint to transform between available fiaas config versions and prometheus metrics.""" LOG = logging.getLogger(__name__) web = Blueprint("web", __name__, template_folder="templates") request_histogram = Histogram("web_request_latency", "Request latency in seconds", ["page"]) defaults_histogram = request_histogram.labels("defaults") defaults_versioned_histogram = request_histogram.labels("defaults_versioned") frontpage_histogram = request_histogram.labels("frontpage") metrics_histogram = request_histogram.labels("metrics") transform_histogram = request_histogram.labels("transform") healthz_histogram = request_histogram.labels("healthz") @web.route("/") @frontpage_histogram.time() def frontpage(): return render_template("frontpage.html") @web.route("/internal-backstage/prometheus") @metrics_histogram.time()
def push_job_information(self):
    '''
    Process Bareos job data and send it to the prometheus pushgateway
    '''
    # Fresh registry per push so only this job's metrics are submitted.
    registry = CollectorRegistry()
    TIME_BUCKETS=(6, 60, 600, 1800, 3600, 10800, 18000, 28800, 86400)
    bareos_job_status = Enum('bareos_job_status', 'Backup Status',
                             states=self.job_status.values(),
                             labelnames=['instance', 'jobid'],
                             registry=registry)
    # see https://github.com/bareos/bareos/blob/master/core/src/include/job_level.h
    bareos_job_level = Enum('bareos_job_level', 'Backup Level',
                            states=self.job_levels.values(),
                            labelnames=['instance', 'jobid'],
                            registry=registry)
    bareos_job_running_time = Histogram('bareos_job_running_time',
                                        'Job running time',
                                        labelnames=['instance', 'jobid'],
                                        registry=registry,
                                        buckets=TIME_BUCKETS)
    bareos_job_files = Gauge('bareos_job_files', 'Backed up files',
                             labelnames=['instance', 'jobid'],
                             registry=registry)
    bareos_job_bytes = Gauge('bareos_job_bytes', 'Backed up bytes',
                             labelnames=['instance', 'jobid'],
                             registry=registry)
    # NOTE(review): 'throughtput' is misspelled, but it is the exposed
    # metric name — renaming it would break existing dashboards/alerts.
    bareos_job_throughput = Gauge('bareos_job_throughtput',
                                  'Backup throughtput',
                                  registry=registry,
                                  labelnames=['instance', 'jobid'])
    # see https://github.com/bareos/bareos/blob/master/core/src/include/job_types.h
    bareos_job_type = Enum('bareos_job_type', 'Job Type',
                           states=self.job_types.values(),
                           registry=registry,
                           labelnames=['instance', 'jobid'])
    bareos_job_client = Info('bareos_job_client', 'Client',
                             registry=registry,
                             labelnames=['instance', 'jobid'])
    bareos_job_priority = Gauge('bareos_job_priority', 'Job Priority',
                                registry=registry,
                                labelnames=['instance', 'jobid'])

    # Job name label: drop the trailing three dot-separated components
    # (presumably timestamp parts appended by Bareos — TODO confirm).
    bareos_job_name = '_'.join(self.jobName.split('.')[:-3])
    bareos_job_id = self.jobId

    # Skip error/fatal/aborted jobs unless report_failed is enabled.
    if (self.jobStatus == 'E' or self.jobStatus == 'f' or self.jobStatus == 'A') and self.report_failed == False:
        return

    bareos_job_status.labels(instance=bareos_job_name, jobid=bareos_job_id).state(self.job_status[self.jobStatus])
    bareos_job_running_time.labels(instance=bareos_job_name, jobid=bareos_job_id).observe(self.jobRunningTime)
    bareos_job_files.labels(instance=bareos_job_name, jobid=bareos_job_id).set(self.jobFiles)
    bareos_job_bytes.labels(instance=bareos_job_name, jobid=bareos_job_id).set(self.jobBytes)
    bareos_job_throughput.labels(instance=bareos_job_name, jobid=bareos_job_id).set(self.throughput)
    bareos_job_priority.labels(instance=bareos_job_name, jobid=bareos_job_id).set(self.Priority)
    bareos_job_level.labels(instance=bareos_job_name, jobid=bareos_job_id).state(self.job_levels[self.jobLevel])
    # jobType arrives as an int code; chr() maps it to the letter key.
    bareos_job_type.labels(instance=bareos_job_name, jobid=bareos_job_id).state(self.job_types[chr(self.jobType)])
    bareos_job_client.labels(instance=bareos_job_name, jobid=bareos_job_id).info({'client': self.jobClient})

    # Choose the pushgateway target scheme based on the TLS setting.
    if self.use_tls == True or self.use_tls == 'yes':
        gateway = "https://{}:{}".format(self.gateway_host,self.gateway_port)
    else:
        gateway = "{}:{}".format(self.gateway_host,self.gateway_port)

    bareosdir.DebugMessage(100, "Submitting metrics to {}\n".format(gateway))
    try:
        if self.use_basic_auth:
            push_to_gateway('{}'.format(gateway), job='bareos', registry=registry, handler=self.authentication_handler)
        else:
            push_to_gateway('{}'.format(gateway), job='bareos', registry=registry)
    except Exception as excp:
        # Best-effort: log the failure but never fail the backup job itself.
        bareosdir.DebugMessage(100, "Error: Submitting metrics to pushgateway '{}' failed.\n".format(gateway))
        bareosdir.DebugMessage(100, "python error was: {}\n".format(excp))
        bareosdir.JobMessage(bareosdir.M_INFO, "Failed to submit metrics to pushgateway\n")
Possible values for 'status' label of SERVER_SPAWN_DURATION_SECONDS """ success = 'success' failure = 'failure' already_pending = 'already-pending' throttled = 'throttled' too_many_users = 'too-many-users' def __str__(self): return self.value for s in ServerSpawnStatus: # Create empty metrics with the given status SERVER_SPAWN_DURATION_SECONDS.labels(status=s) PROXY_ADD_DURATION_SECONDS = Histogram( 'proxy_add_duration_seconds', 'duration for adding user routes to proxy', ['status']) class ProxyAddStatus(Enum): """ Possible values for 'status' label of PROXY_ADD_DURATION_SECONDS """ success = 'success' failure = 'failure' def __str__(self):
class TestHistogram(unittest.TestCase):
    """Unit tests for the Histogram metric type: bucket accounting, custom
    bucket validation, labelled children, and the timing decorators."""

    def setUp(self):
        self.registry = CollectorRegistry()
        self.histogram = Histogram('h', 'help', registry=self.registry)
        self.labels = Histogram('hl', 'help', ['l'], registry=self.registry)

    def _assert_h_observations(self, expected):
        # An observation bumps both the count and the +Inf bucket.
        self.assertEqual(expected, self.registry.get_sample_value('h_count'))
        self.assertEqual(expected, self.registry.get_sample_value('h_bucket', {'le': '+Inf'}))

    def test_histogram(self):
        def check(buckets, count, total):
            # Assert cumulative bucket counters in ascending le order,
            # then the observation count and running sum.
            for le, want in zip(('1.0', '2.5', '5.0', '+Inf'), buckets):
                self.assertEqual(want, self.registry.get_sample_value('h_bucket', {'le': le}))
            self.assertEqual(count, self.registry.get_sample_value('h_count'))
            self.assertEqual(total, self.registry.get_sample_value('h_sum'))

        check((0, 0, 0, 0), 0, 0)
        self.histogram.observe(2)
        check((0, 1, 1, 1), 1, 2)
        self.histogram.observe(2.5)
        check((0, 2, 2, 2), 2, 4.5)
        self.histogram.observe(float("inf"))
        check((0, 2, 2, 3), 3, float("inf"))

    def test_setting_buckets(self):
        # A trailing +Inf bound is appended automatically when missing.
        expected = [0.0, 1.0, 2.0, float("inf")]
        self.assertEqual(
            expected,
            Histogram('h', 'help', registry=None, buckets=[0, 1, 2])._upper_bounds)
        self.assertEqual(
            expected,
            Histogram('h', 'help', registry=None, buckets=[0, 1, 2, float("inf")])._upper_bounds)
        # Empty, +Inf-only, and unsorted bucket lists are rejected.
        for bad in ([], [float("inf")], [3, 1]):
            self.assertRaises(ValueError, Histogram, 'h', 'help', registry=None, buckets=bad)

    def test_labels(self):
        self.labels.labels('a').observe(2)
        for le, want in (('1.0', 0), ('2.5', 1), ('5.0', 1), ('+Inf', 1)):
            self.assertEqual(want, self.registry.get_sample_value('hl_bucket', {'le': le, 'l': 'a'}))
        self.assertEqual(1, self.registry.get_sample_value('hl_count', {'l': 'a'}))
        self.assertEqual(2, self.registry.get_sample_value('hl_sum', {'l': 'a'}))

    def test_function_decorator(self):
        self._assert_h_observations(0)

        @self.histogram.time()
        def f():
            pass

        f()
        self._assert_h_observations(1)

    def test_block_decorator(self):
        self._assert_h_observations(0)
        with self.histogram.time():
            pass
        self._assert_h_observations(1)
class PrometheusMetrics(DependencyProvider):
    """
    Dependency provider which measures RPC, event handler and HTTP endpoint
    latency.

    On service start, a few default metrics are declared. These are:

    - ``<prefix>_http_requests_total``
    - ``<prefix>_http_request_latency_seconds``
    - ``<prefix>_rpc_requests_total``
    - ``<prefix>_rpc_request_latency_seconds``
    - ``<prefix>_events_total``
    - ``<prefix>_events_latency_seconds``

    where ``prefix`` is either derived from ``name`` attribute of the service
    class, or :ref:`configured manually <configuration>`.
    """

    def __init__(self):
        # Maps each in-flight worker context to its start timestamp
        # (time.perf_counter()).  A WeakKeyDictionary so contexts that never
        # reach worker_result do not leak entries.
        self.worker_starts: MutableMapping[WorkerContext, float] = WeakKeyDictionary()

    def setup(self) -> None:
        """
        Configures the dependency provider and declares default metrics.
        """
        # read config from container, use service name as default prefix
        service_name = self.container.service_name
        config = self.container.config.get("PROMETHEUS", {})
        service_config = config.get(service_name, {})
        prefix = service_config.get("prefix", service_name)
        # initialize default metrics exposed for every service
        self.http_request_total_counter = Counter(
            f"{prefix}_http_requests_total",
            "Total number of HTTP requests",
            ["http_method", "endpoint", "status_code"],
        )
        self.http_request_latency_histogram = Histogram(
            f"{prefix}_http_request_latency_seconds",
            "HTTP request duration in seconds",
            ["http_method", "endpoint", "status_code"],
        )
        self.rpc_request_total_counter = Counter(
            f"{prefix}_rpc_requests_total",
            "Total number of RPC requests",
            ["method_name"],
        )
        self.rpc_request_latency_histogram = Histogram(
            f"{prefix}_rpc_request_latency_seconds",
            "RPC request duration in seconds",
            ["method_name"],
        )
        self.events_total_counter = Counter(
            f"{prefix}_events_total",
            "Total number of handled events",
            ["source_service", "event_type"],
        )
        self.events_latency_histogram = Histogram(
            f"{prefix}_events_latency_seconds",
            "Event handler duration in seconds",
            ["source_service", "event_type"],
        )

    def get_dependency(self, worker_ctx: WorkerContext) -> MetricsServer:
        """
        Returns an instance of
        :class:`~nameko_prometheus.dependencies.MetricsServer` to be injected
        into the worker.
        """
        return MetricsServer()

    def worker_setup(self, worker_ctx: WorkerContext) -> None:
        """
        Called before service worker starts.
        """
        # Record the start time; worker_result pops this to compute latency.
        self.worker_starts[worker_ctx] = time.perf_counter()

    def worker_result(
        self, worker_ctx: WorkerContext, result=None, exc_info=None
    ) -> None:
        """
        Called after service worker completes.

        At this point the default metrics such as worker latency are observed,
        regardless of whether the worker finished successfully or raised an
        exception.
        """
        try:
            # pop (not get) so completed contexts never accumulate; raises
            # KeyError when worker_setup was never called for this context.
            start = self.worker_starts.pop(worker_ctx)
            entrypoint = worker_ctx.entrypoint
            logger.debug(f"Got result from entrypoint: {entrypoint}")
            duration = time.perf_counter() - start
            # Dispatch on the entrypoint type to pick the matching metrics.
            if isinstance(entrypoint, HttpRequestHandler):
                http_method = entrypoint.method
                url = entrypoint.url
                # Derive the HTTP status code from the response the
                # entrypoint builds for either the exception or the result.
                if exc_info:
                    _, exc, _ = exc_info
                    status_code = entrypoint.response_from_exception(exc).status_code
                else:
                    status_code = entrypoint.response_from_result(result).status_code
                logger.debug(f"Tracing HTTP request: {http_method} {url} {status_code}")
                self.http_request_total_counter.labels(
                    http_method=http_method, endpoint=url, status_code=status_code
                ).inc()
                self.http_request_latency_histogram.labels(
                    http_method=http_method, endpoint=url, status_code=status_code
                ).observe(duration)
            elif isinstance(entrypoint, Rpc):
                method_name = entrypoint.method_name
                logger.debug(f"Tracing RPC request: {method_name}")
                self.rpc_request_total_counter.labels(method_name=method_name).inc()
                self.rpc_request_latency_histogram.labels(
                    method_name=method_name
                ).observe(duration)
            elif isinstance(entrypoint, EventHandler):
                source_service = entrypoint.source_service
                event_type = entrypoint.event_type
                logger.debug(f"Tracing event handler: {source_service} {event_type}")
                self.events_total_counter.labels(
                    source_service=source_service, event_type=event_type
                ).inc()
                self.events_latency_histogram.labels(
                    source_service=source_service, event_type=event_type
                ).observe(duration)
            else:
                # Unknown entrypoint types are logged, not counted.
                logger.warning(
                    f"Entrypoint {entrypoint} is not traceable by nameko_prometheus"
                )
        except KeyError:
            logger.info("No worker_ctx in request start dictionary")
# Expose the metrics endpoint, then poll the given hosts with fping forever.
start_http_server(9116)
targets = sys.argv[1].split(',')
while True:
    # One fping run per cycle: 10 probes per host, hosts fed on stdin.
    proc = subprocess.Popen(
        'fping -A -C 10 -f - -i 10 -q -r 0'.split(),
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    _, raw = proc.communicate(input='\n'.join(targets).encode('utf-8'))
    # fping's per-host summary (on stderr) looks like:
    # 77.75.79.53 : 3.56 3.68 2.51 2.63 2.93 7.67 2.49 2.86 -
    for row in raw.decode('utf-8').splitlines():
        if not row:
            continue
        row = row.strip()
        target_name, samples = row.rsplit(':', 1)
        observe = h.labels(target=target_name.strip()).observe
        for sample in samples.strip().split():
            if not sample:
                continue
            # '-' marks a lost probe; record a huge sentinel latency.
            millis = 99999999999999999 if sample == '-' else float(sample)
            observe(millis / 1000)
    # Pause between polling cycles.
    time.sleep(10)
"""
Copyright 2022 The Magma Authors.

This source code is licensed under the BSD-style license found in the
LICENSE file in the root directory of this source tree.

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from prometheus_client import Histogram

# Metrics for current configuration controller status:
# latency of GRPC handlers, partitioned by the request name.
GRPC_REQUEST_PROCESSING_TIME = Histogram(
    'dp_rc_grpc_request_processing_seconds',
    'Time spent processing a GRPC request',
    ('name',),
)

# Pre-bound per-request children of the histogram above, one per handler.
_timer = GRPC_REQUEST_PROCESSING_TIME.labels
GET_CBSD_STATE_PROCESSING_TIME = _timer('get_cbsd_state')
GET_DB_STATE_PROCESSING_TIME = _timer('get_database_state')
DELETE_CBSD_PROCESSING_TIME = _timer('delete_cbsd')
ACKNOWLEDGE_UPDATE_PROCESSING_TIME = _timer('acknowledge_cbsd_update')
INSERT_TO_DB_PROCESSING_TIME = _timer('insert_requests_to_db')
STORE_AVAILABLE_FREQUENCIES_PROCESSING_TIME = _timer('store_available_frequencies_in_db')
    "shadowsocks network transmit bytes",
    labelnames=[
        "ss_node",
    ],
)
# Each metric is bound to this node's hostname once at import time.
# NOTE(review): rebinding the module-level name to the labelled child
# discards the parent metric object — confirm nothing else needs it.
NETWORK_TRANSMIT_BYTES = NETWORK_TRANSMIT_BYTES.labels(ss_node=NODE_HOST_NAME)

# Time spent encrypting payloads on this node.
ENCRYPT_DATA_TIME = Histogram(
    "encrypt_data_time_seconds",
    "shadowsocks encrypt data time seconds",
    labelnames=[
        "ss_node",
    ],
)
ENCRYPT_DATA_TIME = ENCRYPT_DATA_TIME.labels(ss_node=NODE_HOST_NAME)

# Time spent decrypting payloads on this node.
DECRYPT_DATA_TIME = Histogram(
    "decrypt_data_time_seconds",
    "shadowsocks decrypt data time seconds",
    labelnames=[
        "ss_node",
    ],
)
DECRYPT_DATA_TIME = DECRYPT_DATA_TIME.labels(ss_node=NODE_HOST_NAME)

# Time to look up the access user (definition continues past this chunk).
FIND_ACCESS_USER_TIME = Histogram(
    "find_access_user_time_seconds",
    "time to find access user",
# A random float identifies this client instance in the metric labels.
random.seed()
client_id = random.random()

# Separate registries for the counter and the histogram metrics.
histogram_reg = CollectorRegistry()
counter_reg = CollectorRegistry()
c = Counter('coinbase_http_response_total', 'HTTP responses counted by status_code',
            ['client', 'method', 'code', 'message'], registry=counter_reg)
req_time = Histogram('coinbase_request_seconds', 'Time spent processing request',
                     ['client', 'method'], registry=histogram_reg)
# Pre-bound per-API-method children of the latency histogram.
get_accounts_time = req_time.labels(client=client_id, method='get_accounts')
update_user_time = req_time.labels(client=client_id, method='update_current_user')
request_money_time = req_time.labels(client=client_id, method='request_money')


@update_user_time.time()
def update_user_request():
    # Rename the current user to a random 10-letter lowercase string and
    # count the outcome; the decorator times the whole call.
    try:
        client.update_current_user(name=''.join(
            random.choice(string.ascii_lowercase) for i in range(10)))
        c.labels(client=client_id, method='update_current_user', code="200", message='ok').inc()
    except CoinbaseError as E:
class LyraMetrics:
    """
    Standard class for pushing metrics to a Prometheus push gateway.
    """

    # Push gateway address; defaults to the constant until __init__ resolves it.
    pushgateway_host: str = LyraMetricsConsts.PUSH_GATEWAY_HOST_DEFAULT.value
    # Name and version of the model the metrics are reported for.
    model_name: str = None
    model_version: str = None
    # Per-instance registry and the three metric families registered in it.
    _metrics_registry: CollectorRegistry = None
    _metrics_counter: Counter = None
    _metrics_gauge: Gauge = None
    _metrics_histogram: Histogram = None

    def __init__(self, host: str, model_name: str, model_version: str = "undefined"):
        """
        Validate the inputs and register the counter/gauge/histogram families.

        :param host: push gateway host, resolved through ``get_host``.
        :param model_name: non-empty model identifier used as a label value.
        :param model_version: model version label, defaults to "undefined".
        :raises ValueError: if ``host`` is not a string, the metrics host is
            not defined, or ``model_name`` is empty.
        """
        if not isinstance(host, str):
            raise ValueError(f"push gateway host isn't string a value")
        if not is_host_defined(host):
            raise ValueError(
                f"lyra metrics host is not defined, set {LyraMetricsConsts.METRICS_HOST_ENV.value} variable"
            )
        if not model_name or model_name == "" or model_name is None:
            raise ValueError(f"lyra metrics model name is not set")
        self.pushgateway_host = get_host(host)
        self._metrics_registry = CollectorRegistry()
        self.model_name = model_name
        self.model_version = model_version
        if get_namespace() is not None:
            # NOTE(review): `ns` is assigned but never used below; every
            # metric recomputes get_namespace() inline — confirm intent.
            ns = get_namespace()
        self._metrics_counter = Counter(
            name="lyra_metrics_counter",
            documentation=
            "A counter is a cumulative metric that represents a single monotonically increasing counter whose value can only increase or be reset to zero on restart. For example, you can use a counter to represent the number of requests served, tasks completed, or errors. Do not use a counter to expose a value that can decrease. For example, do not use a counter for the number of currently running processes; instead use a gauge",
            namespace=(get_namespace() if get_namespace() is not None else ""),
            labelnames=[
                LyraMetricsConsts.LABEL_MODEL_NAME.value,
                LyraMetricsConsts.LABEL_MODEL_VERSION.value,
                LyraMetricsConsts.LABEL_METRIC_NAME.value,
            ],
            registry=self._metrics_registry,
        )
        self._metrics_gauge = Gauge(
            name="lyra_metrics_gauge",
            documentation=
            "A gauge is a metric that represents a single numerical value that can arbitrarily go up and down. Gauges are typically used for measured values like temperatures or current memory usage, but also counts that can go up and down, like the number of concurrent requests",
            namespace=(get_namespace() if get_namespace() is not None else ""),
            labelnames=[
                LyraMetricsConsts.LABEL_MODEL_NAME.value,
                LyraMetricsConsts.LABEL_MODEL_VERSION.value,
                LyraMetricsConsts.LABEL_METRIC_NAME.value,
            ],
            registry=self._metrics_registry,
        )
        self._metrics_histogram = Histogram(
            name="lyra_metrics_histogram",
            documentation=
            "A histogram samples observations (usually things like request durations or response sizes) and counts them in configurable buckets. It also provides a sum of all observed values",
            namespace=(get_namespace() if get_namespace() is not None else ""),
            labelnames=[
                LyraMetricsConsts.LABEL_MODEL_NAME.value,
                LyraMetricsConsts.LABEL_MODEL_VERSION.value,
                LyraMetricsConsts.LABEL_METRIC_NAME.value,
            ],
            registry=self._metrics_registry,
        )

    @property
    def registry(self) -> Optional[CollectorRegistry]:
        """
        Getter for the registry.
        """
        return self._metrics_registry

    def counter(self, metric_name: str, value: Any):
        """
        Increment the counter metric by ``value`` and push to the gateway.
        """
        self._metrics_counter.labels(
            metric_name=metric_name,
            model_name=self.model_name,
            model_version=self.model_version,
        ).inc(value)
        # Synchronous push on every call — each update hits the gateway.
        push_to_gateway(
            self.pushgateway_host,
            job=f"{LyraMetricsConsts.JOB_PREFIX.value}_counter",
            registry=self._metrics_registry,
        )
        pass

    def set(self, metric_name: str, value: Any):
        """
        Set the gauge value (coerced to int) and push to the gateway.
        """
        self._metrics_gauge.labels(
            metric_name=metric_name,
            model_name=self.model_name,
            model_version=self.model_version,
        ).set(int(value))
        push_to_gateway(
            self.pushgateway_host,
            job=f"{LyraMetricsConsts.JOB_PREFIX.value}_gauge",
            registry=self._metrics_registry,
        )
        pass

    def observe(self, metric_name: str, value: Any):
        """
        Record an observation on the histogram and push to the gateway.
        """
        self._metrics_histogram.labels(
            metric_name=metric_name,
            model_name=self.model_name,
            model_version=self.model_version,
        ).observe(value)
        push_to_gateway(
            self.pushgateway_host,
            job=f"{LyraMetricsConsts.JOB_PREFIX.value}_histogram",
            registry=self._metrics_registry,
        )
        pass
from prometheus_client import generate_latest, CONTENT_TYPE_LATEST, Histogram
from werkzeug.exceptions import UnprocessableEntity

from .application_generator import ApplicationGenerator
from .common import make_safe_name
from .configmap_generator import ConfigMapGenerator
from .deployer import Deployer
from .models import ApplicationConfiguration
from .models import Release
from .status import status

# Flask blueprint holding the web endpoints of this module.
web = Blueprint("web", __name__)

# Request latency histogram, partitioned by page; one pre-bound child per
# endpoint so the handlers can use the .time() decorator directly.
request_histogram = Histogram("web_request_latency", "Request latency in seconds", ["page"])
status_histogram = request_histogram.labels("status")
generate_application_histogram = request_histogram.labels(
    "generate_paasbetaapplication")
generate_configmap_histogram = request_histogram.labels("generate_configmap")
deploy_histogram = request_histogram.labels("deploy")
metrics_histogram = request_histogram.labels("metrics")
health_histogram = request_histogram.labels("health")

# Maps deployment states to Bootstrap CSS context classes for rendering.
BOOTSTRAP_STATUS = dict(UNKNOWN="warning", SUCCESS="success", RUNNING="info", FAILED="danger")


# The decorated /health view function continues past this chunk.
@web.route("/health", methods=["GET"])
@health_histogram.time()
from prometheus_client import start_http_server, Histogram
import random
import time

# Histogram of simulated per-function execution time, labelled by function name.
function_exec = Histogram('function_exec_time', 'Time spend processing a function', ['func_name'])


def func1():
    """Sleep 0.2s normally; 2% of calls simulate a slow 2s outlier."""
    time.sleep(2 if random.random() < 0.02 else 0.2)


def func2():
    """Sleep 0.4s normally; half of the calls take 0.6s instead."""
    time.sleep(0.6 if random.random() < 0.5 else 0.4)


# Serve /metrics on port 9100, then measure both functions forever.
start_http_server(9100)
while True:
    for name, fn in (('func1', func1), ('func2', func2)):
        began = time.time()
        fn()
        function_exec.labels(func_name=name).observe(time.time() - began)