def inc_counter(self, key, amount=1):
    """Increment counter."""
    prometheus_counter = Gauge(  # pylint: disable=no-value-for-parameter
        key
    )
    prometheus_counter.inc(amount)
def publish_comments(self, comment, link):
    """Publish the comment on the customer case."""
    comment_endpoint = f'https://api.access.redhat.com/rs/cases/{self.ticket}/comments'
    payload = {
        "label": "Solution by the bot",
        "text": comment,
        "uri": link,
        "draft": False,
        "caseNumber": str(self.ticket),
        "public": False
    }
    comment_response = requests.post(comment_endpoint, json=payload,
                                     auth=(self.rhn_username, self.rhn_password))
    if comment_response.status_code in (200, 201):
        print('comment to customer cases was successfully published')
        return True
    print('comment to customer cases was NOT successfully published')
    metric_name = (self.job + '-publish-comment').replace('-', '_')
    job_comment_metric = Gauge(metric_name, 'Error of comment publish on customer case',
                               registry=prometheus_registry)
    job_comment_metric.inc()
    return False
def main():
    g = Gauge('bcr_gauge_example', 'Testing how Prometheus Gauge works')
    start_http_server(8000)
    while True:
        g.inc(3)
        time.sleep(5)
        g.dec(2)
def test_nolabels(self):
    gauge = Gauge('g', 'help', registry=self.registry)
    gauge.inc()
    self.gb.push()
    self.t.join()
    self.assertEqual(b'g 1.0 1434898897\n', self.data)
def ensure_backups(args):
    push_job_started_metric(args.prom_push_gateway_endpoint, job_ensure_backups)
    start_time = time.time()

    # ensure-backups job specific metrics
    bigtable_backup_job_num_tables_backed_up = Gauge(
        'bigtable_backup_job_num_tables_backed_up',
        'Number of table backups found during last run', registry=registry)
    bigtable_backup_job_num_backup_ups = Gauge(
        'bigtable_backup_job_num_backup_ups',
        'Sum of number of backups per table found during last run', registry=registry)

    # Read all the existing backups
    popen = subprocess.Popen(['bigtable-backup', 'list-backups', '-ojson',
                              '--backup-path', args.destination_path],
                             stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    popen.wait()

    # Build and push metrics related to existing backups
    backups = json.loads(popen.stdout.readline())

    if (args.duration is None and args.period_from is None) or \
            (args.duration is not None and args.period_from is not None):
        raise ValueError("Exactly one of --duration or --period-from must be set")

    bigtable_backup_job_num_tables_backed_up.set(len(backups))
    for __, timestamps in backups.items():
        bigtable_backup_job_num_backup_ups.inc(len(timestamps))
    push_metrics(args.prom_push_gateway_endpoint, job_ensure_backups)

    if args.period_from is None:
        period_from = datetime.utcnow() - timedelta(days=args.duration)
        args.period_from = valid_date(period_from.strftime("%Y-%m-%d"))
        args.period_to = valid_date(datetime.utcnow().strftime("%Y-%m-%d"))

    oldest_table_number = int(args.period_from.timestamp() / args.periodic_table_duration)
    newest_table_number = int(args.period_to.timestamp() / args.periodic_table_duration)
    active_table_number = time.time() / args.periodic_table_duration

    print("Checking that the right backups exist")
    while oldest_table_number <= newest_table_number:
        table_id = args.bigtable_table_id_prefix + str(oldest_table_number)
        oldest_table_number += 1

        if table_id not in backups:
            print("backup for {} not found".format(table_id))
            create_backup(table_id, args)
            bigtable_backup_job_backups_created.inc(1)

    print("Checking whether all the backups are created after their period is over "
          "and deleting old unwanted backups")
    for table_id, timestamps in backups.items():
        table_number = int(table_id.rsplit("_", 1)[-1])
        last_timestamp_from_table_number = find_last_timestamp_from_table_number(
            table_number, args.periodic_table_duration)

        # Check whether a backup was created after the last timestamp of the table's period.
        if last_timestamp_from_table_number > timestamps[-1]:
            create_backup(table_id, args)

        # Retain only the most recent backup for non-active tables.
        if table_number != active_table_number and len(timestamps) > 1:
            for timestamp in timestamps[:-1]:
                delete_backup(table_id, str(timestamp), args)

    push_job_finished_metric(args.prom_push_gateway_endpoint, job_ensure_backups,
                             int(time.time() - start_time))
def send_gauge(cls, metrics_name, help_info, value, inc=None):
    # Note: Gauge() registers the metric in the default registry, so calling
    # this twice with the same metrics_name raises a duplicated-timeseries
    # ValueError.
    g = Gauge(metrics_name, help_info)
    if inc is None:
        g.set(value)
    elif inc:
        g.inc(value)
    else:
        g.dec(value)
def increment(self, stat, by=1):
    self.stats[stat] += by
    # Update the associated Prometheus gauge.
    if stat not in self.prom_gauges:
        gauge = Gauge(sanitize_name("felix_" + self.name + " " + stat),
                      "%s: %s" % (self.name, stat))
        self.prom_gauges[stat] = gauge
    else:
        gauge = self.prom_gauges[stat]
    gauge.inc(by)
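# `sanitize_name` is used above but not defined in this excerpt. Prometheus
# metric names must match [a-zA-Z_:][a-zA-Z0-9_:]*, so a minimal sketch of
# such a helper (an assumption, not the project's actual implementation)
# could be:
import re

def sanitize_name(name):
    # Collapse anything that is not a valid metric-name character into '_'.
    return re.sub(r'[^a-zA-Z0-9_:]', '_', name)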
class OperationMetricSet:
    """Collection of Prometheus metrics representing a logical operation"""

    requests: Counter
    requests_duration: Histogram
    exceptions: Counter
    requests_in_progress: Gauge

    def __init__(self, operation_name: str, labels: List[str]):
        self.requests = Counter(
            f"pyncette_{operation_name}_total",
            f"Total count of {operation_name} operations",
            labels,
        )
        self.requests_duration = Histogram(
            f"pyncette_{operation_name}_duration_seconds",
            f"Histogram of {operation_name} processing time",
            labels,
        )
        self.exceptions = Counter(
            f"pyncette_{operation_name}_failures_total",
            f"Total count of {operation_name} failures",
            [*labels, "exception_type"],
        )
        self.requests_in_progress = Gauge(
            f"pyncette_{operation_name}_in_progress",
            f"Gauge of {operation_name} operations currently being processed",
            labels,
        )

    @contextlib.asynccontextmanager
    async def measure(self, **labels: Dict[str, str]) -> AsyncIterator[None]:
        """An async context manager that measures the execution of the wrapped code"""
        if labels:
            self.requests_in_progress.labels(**labels).inc()
            self.requests.labels(**labels).inc()
        else:
            self.requests_in_progress.inc()
            self.requests.inc()

        before_time = time.perf_counter()
        try:
            yield
        except Exception as e:
            self.exceptions.labels(**labels, exception_type=type(e).__name__).inc()
            raise e from None
        finally:
            if labels:
                self.requests_duration.labels(**labels).observe(
                    time.perf_counter() - before_time
                )
                self.requests_in_progress.labels(**labels).dec()
            else:
                self.requests_duration.observe(time.perf_counter() - before_time)
                self.requests_in_progress.dec()
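# A minimal usage sketch for OperationMetricSet.measure, assuming an asyncio
# entry point; the operation and label names here are illustrative only.
import asyncio

async def _measure_demo():
    metrics = OperationMetricSet("poll_task", ["task_name"])
    async with metrics.measure(task_name="heartbeat"):
        await asyncio.sleep(0.01)  # the measured work

# asyncio.run(_measure_demo())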
class TestGauge(unittest.TestCase):
    def setUp(self):
        self.registry = CollectorRegistry()
        self.gauge = Gauge('g', 'help', registry=self.registry)

    def test_gauge(self):
        self.assertEqual(0, self.registry.get_sample_value('g'))
        self.gauge.inc()
        self.assertEqual(1, self.registry.get_sample_value('g'))
        self.gauge.dec(3)
        self.assertEqual(-2, self.registry.get_sample_value('g'))
        self.gauge.set(9)
        self.assertEqual(9, self.registry.get_sample_value('g'))

    def test_function_decorator(self):
        self.assertEqual(0, self.registry.get_sample_value('g'))

        @self.gauge.track_inprogress()
        def f():
            self.assertEqual(1, self.registry.get_sample_value('g'))

        f()
        self.assertEqual(0, self.registry.get_sample_value('g'))

    def test_block_decorator(self):
        self.assertEqual(0, self.registry.get_sample_value('g'))
        with self.gauge.track_inprogress():
            self.assertEqual(1, self.registry.get_sample_value('g'))
        self.assertEqual(0, self.registry.get_sample_value('g'))

    def test_gauge_function(self):
        x = {}
        self.gauge.set_function(lambda: len(x))
        self.assertEqual(0, self.registry.get_sample_value('g'))
        self.gauge.inc()
        self.assertEqual(0, self.registry.get_sample_value('g'))
        x['a'] = None
        self.assertEqual(1, self.registry.get_sample_value('g'))

    def test_time_function_decorator(self):
        self.assertEqual(0, self.registry.get_sample_value('g'))

        @self.gauge.time()
        def f():
            time.sleep(.001)

        f()
        self.assertNotEqual(0, self.registry.get_sample_value('g'))

    def test_time_block_decorator(self):
        self.assertEqual(0, self.registry.get_sample_value('g'))
        with self.gauge.time():
            time.sleep(.001)
        self.assertNotEqual(0, self.registry.get_sample_value('g'))
class TelemetryClient(object):
    @Inject
    def __init__(self, environment: SystemEnvironmentProperties):
        self.endpoint = environment.get("PROMETHEUS_GATEWAY_ENDPOINT")
        self.registry = CollectorRegistry()
        self.get_request_counter = Counter("invertpdf_get_request_count",
                                           "Number of successful GET requests",
                                           registry=self.registry)
        self.post_request_counter = Counter("invertpdf_post_request_count",
                                            "Number of successful POST requests",
                                            registry=self.registry)
        self.duration_histogram = Histogram(
            "invertpdf_request_duration_ms", "Request duration",
            registry=self.registry,
            buckets=[0, 50, 100, 200, 500, 1000, 2000, 5000, 10000,
                     30000, 60000, 1800000, 3600000])
        self.failure_counter = Counter("invertpdf_failed_requests",
                                       "Number of failed requests",
                                       registry=self.registry)
        self.requests_in_progress = Gauge("invertpdf_requests_in_progress",
                                          "Number of pending requests",
                                          registry=self.registry)
        self.free_disk = Gauge("invertpdf_free_disk_space",
                               "Free disk space on tmpfs",
                               registry=self.registry)
        self.logger = logging.getLogger(self.__class__.__name__)

    def track_request(self, method: str, duration: int):
        self.logger.info(f"Request took {duration}ms.")
        self.duration_histogram.observe(duration)
        if method == "GET":
            self.get_request_counter.inc()
        elif method == "POST":
            self.post_request_counter.inc()

    def track_failure(self, method: str, duration: int):
        self.failure_counter.inc()
        self.duration_histogram.observe(duration)

    def track_start(self):
        self.requests_in_progress.inc()

    def track_end(self):
        self.requests_in_progress.dec()

    def submit(self):
        push_to_gateway(self.endpoint, "invertpdf", self.registry)
def update_stats(name):
    metric = metrics.get(name, None)
    value = int(request.args.get("value", 1))
    if metric is None:
        metric = Gauge(name, name)
        metrics[name] = metric
    if request.method == "DELETE":
        metric.dec(value)
    elif request.method == "POST":
        metric.inc(value)
    elif request.method == "PATCH":
        metric.set(value)
    return ""
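# A hedged sketch of how the view above might be wired into a Flask app; the
# route, the app object and the `metrics` dict are assumptions, not part of
# the original snippet.
from flask import Flask

app = Flask(__name__)
metrics = {}

app.add_url_rule("/stats/<name>", view_func=update_stats,
                 methods=["GET", "POST", "PATCH", "DELETE"])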
def execute_citellus(self, sosreport_dir):
    """Run Citellus on the customer ticket sos-report."""
    if self.check_sosreports(sosreport_dir):
        os.system(f'python3 citellus/citellus.py {sosreport_dir}')
        _LOGGER.info('Citellus execution on the sosreport completed successfully')
        return True
    _LOGGER.error('Unable to provide sosreport to Citellus for execution')
    metric_name = (self.job + '-sosreport-error').replace('-', '_')
    job_comment_metric = Gauge(metric_name, 'unable to send sosreport to citellus',
                               registry=prometheus_registry)
    job_comment_metric.inc()
    return False
def get_solutions(self, sosreport_dir):
    """Gather the solutions from the access.redhat.com solutions API."""
    with open(f'{sosreport_dir}/citellus.json') as f:
        report = json.load(f)
    seen_kbase_ids = []
    solution_data = []
    for hash_key, plugin in report['results'].items():
        if plugin.get('result').get('rc') == 20:
            if plugin.get('kb') and self.redhat_solutions in plugin.get('kb'):
                kbase_id = re.search(r'\d+$', plugin.get('kb')).group(0)
                if kbase_id not in seen_kbase_ids:
                    seen_kbase_ids.append(kbase_id)
                    url = 'https://api.access.redhat.com/rs/solutions/' + kbase_id
                    response = requests.get(url, auth=(self.rhn_username, self.rhn_password))
                    if response.status_code == 200:
                        solution = ''
                        try:
                            tree = ET.fromstring(response.text)
                            resolution = tree.find('{http://www.redhat.com/gss/strata}resolution')
                            solution = resolution.find('{http://www.redhat.com/gss/strata}text').text
                        except Exception as e:
                            _LOGGER.error('xml parsing of the solution failed: %s', e)
                            metric_name = (self.job + '-solution-xml-parse-'
                                           + str(kbase_id)).replace('-', '_')
                            job_comment_metric = Gauge(metric_name, 'solution xml parsing failed',
                                                       registry=prometheus_registry)
                            job_comment_metric.inc()
                        if solution:
                            plugin['result']['solution'] = solution
                    else:
                        _LOGGER.error('Request to solution api failed!')
                        metric_name = (self.job + '-solution-request-'
                                       + str(kbase_id)).replace('-', '_')
                        job_comment_metric = Gauge(metric_name,
                                                   'solution request failed due to authentication',
                                                   registry=prometheus_registry)
                        job_comment_metric.inc()
                    solution_data.append(plugin)
    solution_data = sorted(solution_data, key=lambda val: val['priority'], reverse=True)
    return solution_data
def main(self):
    """Execute the SBR OpenStack bot."""
    job_name = (self.job + '-job-exec-time').replace('-', '_')
    job_metric_time = Gauge(job_name, 'Runtime of application job execution',
                            registry=prometheus_registry)
    try:
        with job_metric_time.time():
            solutions = list()
            complete = False
            remote_host, remote_port, remote_dir = self.get_ticket_config()
            if remote_host and remote_port and remote_dir:
                self.ssh_copy_attachments(remote_host, remote_port, remote_dir)
            if os.path.isdir(self.path):
                sosreports = self.get_all_sosreports()
                print("List of extracted sosreports: ", sosreports)
                for sosreport in sosreports:
                    execution_path = f"{self.path}/{sosreport}"
                    self.execute_citellus(execution_path)
                    solution_data = self.get_solutions(execution_path)
                    solutions.append(solution_data)
                    comment, link = self.generate_comments(solution_data)
                    print("Comment:", comment)
                    complete = True
                    if comment:
                        complete = self.publish_comments(comment, link)
            if complete:
                print('Script successfully completed')
            else:
                metric_name = (self.job + '-application-failed').replace('-', '_')
                job_comment_metric = Gauge(metric_name, 'Script unable to process the ticket',
                                           registry=prometheus_registry)
                job_comment_metric.inc()
                _LOGGER.info('Script unable to process the ticket')
                print('Script Failed!')
    except Exception as e:
        print("Script Failed!")
        traceback.print_exc()
    self.pushgateway(self.job)
def get_all_sosreports(self):
    """Extract the sosreports based on their compression type."""
    print('Extracting the compressed file!')
    sosreports = list()
    metric_count = 0
    for sosreport in os.listdir(self.path):
        try:
            if sosreport.startswith("."):
                continue
            if tarfile.is_tarfile(f'{self.path}/{sosreport}'):
                print("sosreport is compressed as a tar file")
                sosreport_tar_obj = tarfile.open(f'{self.path}/{sosreport}')
                sosreport_tar_obj.extractall(path=self.path)
                os.chmod(f'{self.path}/{sosreport_tar_obj.getnames()[0]}', 0o755)
                os.remove(f'{self.path}/{sosreport}')
                sosreports.append(sosreport_tar_obj.getnames()[0])
                _LOGGER.info('Extracted the tar compressed sosreport from attachments')
            elif zipfile.is_zipfile(f'{self.path}/{sosreport}'):
                print("sosreport is compressed as a zip file")
                sosreport_zip_obj = zipfile.ZipFile(f'{self.path}/{sosreport}')
                sosreport_zip_obj.extractall(path=f'{self.path}')
                os.chmod(f'{self.path}/{sosreport_zip_obj.namelist()[0]}', 0o755)
                os.remove(f'{self.path}/{sosreport}')
                sosreports.append(sosreport_zip_obj.namelist()[0])
                _LOGGER.info('Extracted the zipped sosreport from attachments')
            else:
                print("failed sosreport extraction! compression type is not tar or zip! file:",
                      sosreport)
        except Exception:
            print('Error occurred in file: ', sosreport)
            metric_name = (self.job + '-sosreport-extract-'
                           + str(metric_count)).replace('-', '_')
            metric_count += 1
            job_comment_metric = Gauge(metric_name, 'unable to extract sosreport',
                                       registry=prometheus_registry)
            job_comment_metric.inc()
    return sosreports
def ssh_copy_attachments(self, remote_host, remote_port, remote_directory):
    """Copy the ticket attachments from the storage server to the /cases/<ticket> directory."""
    try:
        print("Ticket attachment Directory: ", remote_directory)
        escape_known_host = "-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no"
        scp_command = (f"sshpass -f {self.rh_pwd_dir} scp {escape_known_host} -r "
                       f"-P {remote_port} {self.user}@{remote_host}:{remote_directory} "
                       f"/cases/{self.ticket}")
        scp_process = subprocess.run(scp_command.split(' '))
        if scp_process.returncode == 0:
            _LOGGER.info('Successfully fetched the attachments')
        elif scp_process.returncode == 1:
            metric_name = (self.job + '-scp-error').replace('-', '_')
            job_comment_metric = Gauge(metric_name,
                                       'unable to scp ticket attachments as the file is not found',
                                       registry=prometheus_registry)
            job_comment_metric.inc(1)
            _LOGGER.error('Unable to fetch attachments as the file is not found')
        elif scp_process.returncode == 5:
            metric_name = (self.job + '-scp-error').replace('-', '_')
            job_comment_metric = Gauge(metric_name,
                                       'unable to scp ticket attachments due to authentication failure',
                                       registry=prometheus_registry)
            job_comment_metric.inc(5)
            _LOGGER.error('Unable to fetch attachments due to authentication')
        else:
            _LOGGER.error(f'Unable to fetch attachments due to error code {scp_process.returncode}')
            raise Exception('scp failed! Unable to fetch attachments.')
    except Exception:
        metric_name = (self.job + '-scp-error').replace('-', '_')
        job_comment_metric = Gauge(metric_name, 'unable to scp ticket attachments',
                                   registry=prometheus_registry)
        job_comment_metric.inc(2)
        _LOGGER.error('scp failed! Fetching attachments is not possible.')
        raise Exception('scp failed! Fetching attachments is not possible.')
    return True
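# The handlers above create a fresh, dynamically named Gauge for every error
# site. A sketch of the labelled-metric alternative that prometheus_client
# supports: one gauge registered once, with the failure stage as a label
# (the metric and label names here are illustrative, not from the original
# code).
bot_errors = Gauge('sbr_bot_errors_total', 'Errors raised by the bot, by stage',
                   ['stage'], registry=prometheus_registry)
bot_errors.labels(stage='scp').inc()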
from prometheus_client import Counter
from prometheus_client import Gauge

c = Counter('my_failures', 'Description of counter')
print(c)
c.inc()       # Increment by 1
print(c)
c.inc(1.6)    # Increment by given value
print(c)
print(c.collect())

g = Gauge("my_gauge", "my description of gauge")
g.set(1)
g.inc(1)
g.dec(2)


# track_inprogress(): increments the gauge on entry, decrements it on exit
@g.track_inprogress()
def f():
    pass
class RequestHandler:
    """
    Class that handles the requests arriving to the gateway and the result extracted
    from the requests future.

    :param metrics_registry: optional metrics registry for prometheus used if we need
        to expose metrics from the executor or from the data request handler
    :param runtime_name: optional runtime_name that will be registered during monitoring
    """

    def __init__(
        self,
        metrics_registry: Optional['CollectorRegistry'] = None,
        runtime_name: Optional[str] = None,
    ):
        self._request_init_time = {} if metrics_registry else None
        self._executor_endpoint_mapping = None

        if metrics_registry:
            with ImportExtensions(
                required=True,
                help_text='You need to install the `prometheus_client` to use the monitoring functionality of jina',
            ):
                from prometheus_client import Gauge, Summary

            self._receiving_request_metrics = Summary(
                'receiving_request_seconds',
                'Time spent processing request',
                registry=metrics_registry,
                namespace='jina',
                labelnames=('runtime_name',),
            ).labels(runtime_name)
            self._pending_requests_metrics = Gauge(
                'number_of_pending_requests',
                'Number of pending requests',
                registry=metrics_registry,
                namespace='jina',
                labelnames=('runtime_name',),
            ).labels(runtime_name)
        else:
            self._receiving_request_metrics = None
            self._pending_requests_metrics = None

    def handle_request(
        self, graph: 'TopologyGraph', connection_pool: 'GrpcConnectionPool'
    ) -> Callable[['Request'], 'asyncio.Future']:
        """
        Function that handles the requests arriving to the gateway. This will be passed to the streamer.

        :param graph: The TopologyGraph of the Flow.
        :param connection_pool: The connection pool to be used to send messages to specific nodes of the graph
        :return: Return a Function that given a Request will return a Future from where to extract the response
        """

        async def gather_endpoints(request_graph):
            nodes = request_graph.all_nodes
            try:
                tasks_to_get_endpoints = [
                    node.get_endpoints(connection_pool) for node in nodes
                ]
                endpoints = await asyncio.gather(*tasks_to_get_endpoints)
            except InternalNetworkError as err:
                err_code = err.code()
                if err_code == grpc.StatusCode.UNAVAILABLE:
                    err._details = (
                        err.details()
                        + f' |Gateway: Communication error with deployment at address(es) {err.dest_addr}. Head or worker(s) may be down.'
                    )
                    raise err
                else:
                    raise

            self._executor_endpoint_mapping = {}
            for node, (endp, _) in zip(nodes, endpoints):
                self._executor_endpoint_mapping[node.name] = endp.endpoints

        def _handle_request(request: 'Request') -> 'asyncio.Future':
            if self._receiving_request_metrics:
                self._request_init_time[request.request_id] = time.time()
            if self._pending_requests_metrics:
                self._pending_requests_metrics.inc()
            # important that the gateway needs to have an instance of the graph per request
            request_graph = copy.deepcopy(graph)
            if graph.has_filter_conditions:
                request_doc_ids = request.data.docs[
                    :, 'id'
                ]  # used to maintain order of docs that are filtered by executors
            tasks_to_respond = []
            tasks_to_ignore = []
            endpoint = request.header.exec_endpoint
            r = request.routes.add()
            r.executor = 'gateway'
            r.start_time.GetCurrentTime()
            # If the request is targeting a specific deployment, we can send directly
            # to the deployment instead of querying the graph
            for origin_node in request_graph.origin_nodes:
                leaf_tasks = origin_node.get_leaf_tasks(
                    connection_pool,
                    request,
                    None,
                    endpoint=endpoint,
                    executor_endpoint_mapping=self._executor_endpoint_mapping,
                    target_executor_pattern=request.header.target_executor,
                )
                # Every origin node returns a set of tasks that are the ones corresponding
                # to the leafs of each of their subtrees that unwrap all the previous tasks.
                # It starts like a chain of waiting for tasks from previous nodes
                tasks_to_respond.extend([task for ret, task in leaf_tasks if ret])
                tasks_to_ignore.extend([task for ret, task in leaf_tasks if not ret])

            def _sort_response_docs(response):
                # sort response docs according to their order in the initial request
                def sort_by_request_order(doc):
                    if doc.id in request_doc_ids:
                        return request_doc_ids.index(doc.id)
                    else:
                        return len(request_doc_ids)  # put new/unknown docs at the end

                sorted_docs = sorted(response.data.docs, key=sort_by_request_order)
                response.data.docs = DocumentArray(sorted_docs)

            async def _process_results_at_end_gateway(
                tasks: List[asyncio.Task], request_graph: TopologyGraph
            ) -> asyncio.Future:
                if self._executor_endpoint_mapping is None:
                    await asyncio.gather(gather_endpoints(request_graph))

                partial_responses = await asyncio.gather(*tasks)
                partial_responses, metadatas = zip(*partial_responses)
                filtered_partial_responses = list(
                    filter(lambda x: x is not None, partial_responses)
                )

                response = filtered_partial_responses[0]
                request_graph.add_routes(response)

                if graph.has_filter_conditions:
                    _sort_response_docs(response)

                return response

            # In case of empty topologies
            if not tasks_to_respond:
                r.end_time.GetCurrentTime()
                future = asyncio.Future()
                future.set_result((request, {}))
                tasks_to_respond.append(future)
            return asyncio.ensure_future(
                _process_results_at_end_gateway(tasks_to_respond, request_graph)
            )

        return _handle_request

    def handle_result(self) -> Callable[['Request'], 'asyncio.Future']:
        """
        Function that handles the result when extracted from the request future

        :return: Return a Function that returns a request to be returned to the client
        """

        def _handle_result(result: 'Request'):
            """
            Function that handles the result when extracted from the request future

            :param result: The result returned to the gateway. It extracts the request
                to be returned to the client
            :return: Returns a request to be returned to the client
            """
            for route in result.routes:
                if route.executor == 'gateway':
                    route.end_time.GetCurrentTime()

            if self._receiving_request_metrics:
                init_time = self._request_init_time.pop(
                    result.request_id
                )  # need to pop otherwise it stays in memory forever
                self._receiving_request_metrics.observe(time.time() - init_time)
            if self._pending_requests_metrics:
                self._pending_requests_metrics.dec()
            return result

        return _handle_result
c = Counter('static_increment_counter', 'Counter that increments by 50 every 2 seconds')
g = Gauge('static_increment_gauge', 'Gauge that increments by 50 every 2 seconds')
c_rand = Counter('random_increment_counter', 'Counter that increments in a random fashion every 15 seconds')
g_rand = Gauge('random_increment_gauge', 'Gauge that increments in a random fashion every 15 seconds')

run = 0
random.seed()
while True:
    log.info("*******RUN %i **********", run)
    # Increment the counter and gauge by the same value at each run
    c.inc(50)
    g.inc(50)
    rand = random.randint(1, 1000)
    c_rand.inc(rand)
    g_rand.inc(rand)
    log.info("Incremented Rand by {0}".format(rand))
    # Scrape endpoint to get metric values logged to stdout
    # (the URL credentials were redacted in the original)
    metrics = requests.get("http://*****:*****@example.com")
    log.info(metrics.text)
    time.sleep(15)
    run += 1
class BroadcastWebsocketStats():
    def __init__(self, local_hostname, remote_hostname):
        self._local_hostname = local_hostname
        self._remote_hostname = remote_hostname
        self._registry = CollectorRegistry()

        # TODO: More robust replacement
        self.name = self.safe_name(self._local_hostname)
        self.remote_name = self.safe_name(self._remote_hostname)

        self._messages_received_total = Counter(
            f'awx_{self.remote_name}_messages_received_total',
            'Number of messages received, to be forwarded, by the broadcast websocket system',
            registry=self._registry)
        self._messages_received = Gauge(
            f'awx_{self.remote_name}_messages_received',
            'Number of forwarded messages received by the broadcast websocket system, '
            'for the duration of the current connection',
            registry=self._registry)
        self._connection = Enum(
            f'awx_{self.remote_name}_connection',
            'Websocket broadcast connection',
            states=['disconnected', 'connected'],
            registry=self._registry)
        self._connection_start = Gauge(
            f'awx_{self.remote_name}_connection_start',
            'Time the connection was established',
            registry=self._registry)
        self._messages_received_per_minute = Gauge(
            f'awx_{self.remote_name}_messages_received_per_minute',
            'Messages received per minute',
            registry=self._registry)
        self._internal_messages_received_per_minute = FixedSlidingWindow()

    def safe_name(self, s):
        # Replace all non alpha-numeric characters with _
        return re.sub('[^0-9a-zA-Z]+', '_', s)

    def unregister(self):
        self._registry.unregister(f'awx_{self.remote_name}_messages_received')
        self._registry.unregister(f'awx_{self.remote_name}_connection')

    def record_message_received(self):
        self._internal_messages_received_per_minute.record()
        self._messages_received.inc()
        self._messages_received_total.inc()

    def record_connection_established(self):
        self._connection.state('connected')
        self._connection_start.set_to_current_time()
        self._messages_received.set(0)

    def record_connection_lost(self):
        self._connection.state('disconnected')

    def get_connection_duration(self):
        return (datetime.datetime.now() - self._connection_established_ts).total_seconds()

    def render(self):
        msgs_per_min = self._internal_messages_received_per_minute.render()
        self._messages_received_per_minute.set(msgs_per_min)

    def serialize(self):
        self.render()
        registry_data = generate_latest(self._registry).decode('UTF-8')
        return registry_data
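# `FixedSlidingWindow` is referenced above but not defined in this excerpt. A
# minimal sketch of a one-minute sliding window (an assumption, not AWX's
# actual implementation):
import time

class FixedSlidingWindow:
    def __init__(self, window_seconds=60):
        self.window_seconds = window_seconds
        self._timestamps = []

    def record(self):
        # Remember when a message arrived.
        self._timestamps.append(time.time())

    def render(self):
        # Drop entries older than the window and return the remaining count.
        cutoff = time.time() - self.window_seconds
        self._timestamps = [t for t in self._timestamps if t >= cutoff]
        return len(self._timestamps)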
# Test code for exporting metrics to a localhost push gateway
from prometheus_client import (CollectorRegistry, Gauge, push_to_gateway,
                               Summary, Histogram, Counter, start_http_server)
import os

registry = CollectorRegistry()
_PACKAGES_NEW = Gauge('packages_added', 'Packages newly added', registry=registry)

# Implement a logic for checking condition to increase gauge count
for i in range(10):
    # some logic for checking if any new packages have been added
    packages_added = True
    if packages_added:
        _PACKAGES_NEW.inc()

push_gateway = os.getenv('PROMETHEUS_PUSH_GATEWAY', 'pushgateway:9091')
if push_gateway:
    try:
        push_to_gateway(push_gateway, job='package-releases', registry=registry)
    except Exception as e:
        print('An error occurred pushing the metrics: {}'.format(str(e)))
# Create a metric to track time spent and requests made.
counter = Counter('sobi3ch_counter', 'Description of a counter')
gauge = Gauge('sobi3ch_gauge', 'Description of gauge')
gauge.set(50)
SUMMARY = Summary('sobi3ch_summary_request_processing_seconds', 'Time spent processing request')
histogram = Histogram('sobi3ch_histogram_request_latency_seconds', 'Description of histogram')


# Decorate function with metric.
@SUMMARY.time()
def process_request(t):
    """A dummy function that takes some time."""
    time.sleep(t)


if __name__ == '__main__':
    # Start up the server to expose the metrics.
    start_http_server(8000)
    # Generate some requests.
    while True:
        r = random.random()
        process_request(r)
        if r > 0.8:
            counter.inc()
        if r < 0.5:
            gauge.inc()  # Increment by 1
        else:
            gauge.dec()
        histogram.observe(4.7)  # Observe 4.7 (seconds in this case)
class PrometheusMonitor(Monitor):
    """
    Prometheus Faust Sensor.

    This sensor records statistics using prometheus_client and exposes them
    using the aiohttp server running under /metrics by default.

    Usage:
        import faust
        from faust.sensors.prometheus import PrometheusMonitor

        app = faust.App('example', broker='kafka://')
        app.monitor = PrometheusMonitor(app, pattern='/metrics')
    """

    ERROR = 'error'
    COMPLETED = 'completed'
    KEYS_RETRIEVED = 'keys_retrieved'
    KEYS_UPDATED = 'keys_updated'
    KEYS_DELETED = 'keys_deleted'

    def __init__(self, app: AppT, pattern: str = '/metrics',
                 **kwargs: Any) -> None:
        self.app = app
        self.pattern = pattern

        if prometheus_client is None:
            raise ImproperlyConfigured(
                'prometheus_client requires `pip install prometheus_client`.')

        self._initialize_metrics()
        self.expose_metrics()
        super().__init__(**kwargs)

    def _initialize_metrics(self) -> None:
        """Initialize Prometheus metrics."""
        # On message received
        self.messages_received = Counter('messages_received', 'Total messages received')
        self.active_messages = Gauge('active_messages', 'Total active messages')
        self.messages_received_per_topics = Counter(
            'messages_received_per_topic', 'Messages received per topic', ['topic'])
        self.messages_received_per_topics_partition = Gauge(
            'messages_received_per_topics_partition',
            'Messages received per topic/partition', ['topic', 'partition'])
        self.events_runtime_latency = Histogram('events_runtime_ms', 'Events runtime in ms')

        # On Event Stream in
        self.total_events = Counter('total_events', 'Total events received')
        self.total_active_events = Gauge('total_active_events', 'Total active events')
        self.total_events_per_stream = Counter(
            'total_events_per_stream', 'Events received per Stream', ['stream'])

        # On table changes get/set/del keys
        self.table_operations = Counter(
            'table_operations', 'Total table operations', ['table', 'operation'])

        # On message send
        self.topic_messages_sent = Counter(
            'topic_messages_sent', 'Total messages sent per topic', ['topic'])
        self.total_sent_messages = Counter('total_sent_messages', 'Total messages sent')
        self.producer_send_latency = Histogram(
            'producer_send_latency', 'Producer send latency in ms')
        self.total_error_messages_sent = Counter(
            'total_error_messages_sent', 'Total error messages sent')
        self.producer_error_send_latency = Histogram(
            'producer_error_send_latency', 'Producer error send latency in ms')

        # Assignment
        self.assignment_operations = Counter(
            'assignment_operations',
            'Total assignment operations (completed/error)', ['operation'])
        self.assign_latency = Histogram('assign_latency', 'Assignment latency in ms')

        # Rebalances
        self.total_rebalances = Gauge('total_rebalances', 'Total rebalances')
        self.total_rebalances_recovering = Gauge(
            'total_rebalances_recovering', 'Total rebalances recovering')
        self.revalance_done_consumer_latency = Histogram(
            'revalance_done_consumer_latency',
            'Consumer replying that rebalance is done to broker in ms')
        self.revalance_done_latency = Histogram(
            'revalance_done_latency', 'Rebalance finished latency in ms')

        # Count Metrics by name
        self.count_metrics_by_name = Gauge(
            'metrics_by_name', 'Total metrics by name', ['metric'])

        # Web
        self.http_status_codes = Counter(
            'http_status_codes', 'Total http_status code', ['status_code'])
        self.http_latency = Histogram('http_latency', 'Http response latency in ms')

        # Topic/Partition Offsets
        self.topic_partition_end_offset = Gauge(
            'topic_partition_end_offset',
            'Offset ends per topic/partition', ['topic', 'partition'])
        self.topic_partition_offset_commited = Gauge(
            'topic_partition_offset_commited',
            'Offset committed per topic/partition', ['topic', 'partition'])
        self.consumer_commit_latency = Histogram(
            'consumer_commit_latency', 'Consumer commit latency in ms')

    def on_message_in(self, tp: TP, offset: int, message: Message) -> None:
        """Call before message is delegated to streams."""
        super().on_message_in(tp, offset, message)

        self.messages_received.inc()
        self.active_messages.inc()
        self.messages_received_per_topics.labels(topic=tp.topic).inc()
        self.messages_received_per_topics_partition.labels(
            topic=tp.topic, partition=tp.partition).set(offset)

    def on_stream_event_in(self, tp: TP, offset: int, stream: StreamT,
                           event: EventT) -> typing.Optional[typing.Dict]:
        """Call when stream starts processing an event."""
        state = super().on_stream_event_in(tp, offset, stream, event)
        self.total_events.inc()
        self.total_active_events.inc()
        self.total_events_per_stream.labels(
            stream=f'stream.{self._stream_label(stream)}.events').inc()
        return state

    def _stream_label(self, stream: StreamT) -> str:
        return self._normalize(
            stream.shortlabel.lstrip('Stream:'),
        ).strip('_').lower()

    def on_stream_event_out(self, tp: TP, offset: int, stream: StreamT,
                            event: EventT, state: typing.Dict = None) -> None:
        """Call when stream is done processing an event."""
        super().on_stream_event_out(tp, offset, stream, event, state)
        self.total_active_events.dec()
        self.events_runtime_latency.observe(
            self.secs_to_ms(self.events_runtime[-1]))

    def on_message_out(self, tp: TP, offset: int, message: Message) -> None:
        """Call when message is fully acknowledged and can be committed."""
        super().on_message_out(tp, offset, message)
        self.active_messages.dec()

    def on_table_get(self, table: CollectionT, key: typing.Any) -> None:
        """Call when value in table is retrieved."""
        super().on_table_get(table, key)
        self.table_operations.labels(table=f'table.{table.name}',
                                     operation=self.KEYS_RETRIEVED).inc()

    def on_table_set(self, table: CollectionT, key: typing.Any,
                     value: typing.Any) -> None:
        """Call when new value for key in table is set."""
        super().on_table_set(table, key, value)
        self.table_operations.labels(table=f'table.{table.name}',
                                     operation=self.KEYS_UPDATED).inc()

    def on_table_del(self, table: CollectionT, key: typing.Any) -> None:
        """Call when key in a table is deleted."""
        super().on_table_del(table, key)
        self.table_operations.labels(table=f'table.{table.name}',
                                     operation=self.KEYS_DELETED).inc()

    def on_commit_completed(self, consumer: ConsumerT,
                            state: typing.Any) -> None:
        """Call when consumer commit offset operation completed."""
        super().on_commit_completed(consumer, state)
        self.consumer_commit_latency.observe(
            self.ms_since(typing.cast(float, state)))

    def on_send_initiated(self, producer: ProducerT, topic: str,
                          message: PendingMessage, keysize: int,
                          valsize: int) -> typing.Any:
        """Call when message added to producer buffer."""
        self.topic_messages_sent.labels(topic=f'topic.{topic}').inc()
        return super().on_send_initiated(
            producer, topic, message, keysize, valsize)

    def on_send_completed(self, producer: ProducerT, state: typing.Any,
                          metadata: RecordMetadata) -> None:
        """Call when producer finished sending message."""
        super().on_send_completed(producer, state, metadata)
        self.total_sent_messages.inc()
        self.producer_send_latency.observe(
            self.ms_since(typing.cast(float, state)))

    def on_send_error(self, producer: ProducerT, exc: BaseException,
                      state: typing.Any) -> None:
        """Call when producer was unable to publish message."""
        super().on_send_error(producer, exc, state)
        self.total_error_messages_sent.inc()
        self.producer_error_send_latency.observe(
            self.ms_since(typing.cast(float, state)))

    def on_assignment_error(self, assignor: PartitionAssignorT,
                            state: typing.Dict, exc: BaseException) -> None:
        """Partition assignor did not complete assignment due to error."""
        super().on_assignment_error(assignor, state, exc)
        self.assignment_operations.labels(operation=self.ERROR).inc()
        self.assign_latency.observe(self.ms_since(state['time_start']))

    def on_assignment_completed(self, assignor: PartitionAssignorT,
                                state: typing.Dict) -> None:
        """Partition assignor completed assignment."""
        super().on_assignment_completed(assignor, state)
        self.assignment_operations.labels(operation=self.COMPLETED).inc()
        self.assign_latency.observe(self.ms_since(state['time_start']))

    def on_rebalance_start(self, app: AppT) -> typing.Dict:
        """Cluster rebalance in progress."""
        state = super().on_rebalance_start(app)
        self.total_rebalances.inc()
        return state

    def on_rebalance_return(self, app: AppT, state: typing.Dict) -> None:
        """Consumer replied assignment is done to broker."""
        super().on_rebalance_return(app, state)
        self.total_rebalances.dec()
        self.total_rebalances_recovering.inc()
        self.revalance_done_consumer_latency.observe(
            self.ms_since(state['time_return']))

    def on_rebalance_end(self, app: AppT, state: typing.Dict) -> None:
        """Cluster rebalance fully completed (including recovery)."""
        super().on_rebalance_end(app, state)
        self.total_rebalances_recovering.dec()
        self.revalance_done_latency.observe(self.ms_since(state['time_end']))

    def count(self, metric_name: str, count: int = 1) -> None:
        """Count metric by name."""
        super().count(metric_name, count=count)
        self.count_metrics_by_name.labels(metric=metric_name).inc(count)

    def on_tp_commit(self, tp_offsets: TPOffsetMapping) -> None:
        """Call when offset in topic partition is committed."""
        super().on_tp_commit(tp_offsets)
        for tp, offset in tp_offsets.items():
            self.topic_partition_offset_commited.labels(
                topic=tp.topic, partition=tp.partition).set(offset)

    def track_tp_end_offset(self, tp: TP, offset: int) -> None:
        """Track new topic partition end offset for monitoring lags."""
        super().track_tp_end_offset(tp, offset)
        self.topic_partition_end_offset.labels(
            topic=tp.topic, partition=tp.partition).set(offset)

    def on_web_request_end(self, app: AppT, request: web.Request,
                           response: typing.Optional[web.Response],
                           state: typing.Dict, *,
                           view: web.View = None) -> None:
        """Web server finished working on request."""
        super().on_web_request_end(app, request, response, state, view=view)
        status_code = int(state['status_code'])
        self.http_status_codes.labels(status_code=status_code).inc()
        self.http_latency.observe(self.ms_since(state['time_end']))

    def expose_metrics(self) -> None:
        """Expose Prometheus metrics using the current aiohttp application."""

        @self.app.page(self.pattern)
        async def metrics_handler(self: _web.View,
                                  request: _web.Request) -> _web.Response:
            headers = {
                'Content-Type': 'text/plain; version=0.0.4; charset=utf-8',
            }

            return cast(_web.Response,
                        Response(body=generate_latest(REGISTRY),
                                 headers=headers, status=200))
# Assumed reconstruction: the definitions of `nodes_total` and `online` fall
# outside this excerpt; they are sketched here from how they are used below.
nodes_total = Gauge('total', 'total nodes', namespace='gluon',
                    registry=registry)
online = Gauge('online', 'node is online', ['nodeid', 'hostname', 'fw'],
               namespace='gluon', registry=registry)
nodes_online = Gauge('total_online', 'total online nodes', namespace='gluon',
                     registry=registry)
clients_total = Gauge('clients_total', 'clients total', namespace='gluon',
                      registry=registry)
traffic_total = Gauge('traffic_total', 'traffic total', ['type'],
                      namespace='gluon', registry=registry)

for node in data:
    nodes_total.inc()
    nid = node['nodeid']
    d = node['last_response']
    hostname = d['nodeinfo']['hostname']

    # default labels
    deflbl = {
        'nodeid': nid,
        'hostname': hostname,
        'fw': d['nodeinfo']['software']['firmware']['release']
    }

    # check node status
    if node['status'] != 'Up':
        online.labels(**deflbl).set(0)
# Assumed reconstruction: the opening of the `jokes` list falls outside this
# excerpt; a first entry is sketched here so the fragment parses.
jokes = [{
    'id': gen_new_uuid(),
    'content': 'This is a joke',
    'reactions': 0
}, {
    'id': gen_new_uuid(),
    'content': 'This is a second joke',
    'reactions': 0
}]
number_jokes_counter.inc()
number_jokes_counter.inc()

channel_members = [{
    'id': gen_new_uuid(),
    'name': 'Horgix'
}, {
    'id': gen_new_uuid(),
    'name': 'Frédéric'
}]
number_channel_members_gauge.inc()
number_channel_members_gauge.inc()


@app.route('/')
def main():
    pass  # requests tracked by default


@app.route('/jokes')
def get_jokes():
    return api_response_from_dict(jokes)


@app.route('/add_joke')
def add_joke():
class Prometheus(service.BuildbotService):
    '''
    This service exposes buildbot metrics to Prometheus.

    Metrics state is initialised at service start and is (mostly) retained
    through a reconfiguration.

    Instance attributes holding a Prometheus metrics item are prefixed with
    a symbol indicating the kind of metric they are. For example:

    - Counters: c_<attr_label>
    - Gauges: g_<attr_label>
    - Histogram: h_<attr_label>
    - Summary: s_<attr_label>
    '''

    name = "Prometheus"
    namespace = 'buildbot'

    def __init__(self, port=9101, **kwargs):
        service.BuildbotService.__init__(self, **kwargs)
        self.port = port
        self.server = None
        self.consumers = []
        self.registry = None
        self.create_metrics()

    @defer.inlineCallbacks
    def reconfigService(self, builders=None, **kwargs):
        '''
        Accumulated metrics are maintained through a reconfigure.
        '''
        log.msg("Reconfiguring Prometheus reporter")
        yield service.BuildbotService.reconfigService(self)
        self.registerConsumers()

    @defer.inlineCallbacks
    def startService(self):
        log.msg("Starting Prometheus reporter")
        yield service.BuildbotService.startService(self)
        root = Resource()
        root.putChild(b'metrics', MetricsResource(registry=self.registry))
        self.server = reactor.listenTCP(self.port, Site(root))
        log.msg("Prometheus service starting on {}".format(self.server.port))

    @defer.inlineCallbacks
    def stopService(self):
        log.msg("Stopping Prometheus reporter")
        yield self.server.stopListening()
        yield service.BuildbotService.stopService(self)
        self.removeConsumers()

    def create_metrics(self):
        '''
        Create the Prometheus metrics that will be exposed.
        '''
        log.msg("Creating Prometheus metrics")
        self.registry = CollectorRegistry()

        # build metrics
        builds_labels = ['builder_id', 'worker_id']
        self.g_builds_duration = Gauge(
            'builds_duration_seconds',
            'Number of seconds spent performing builds',
            labelnames=builds_labels, namespace=self.namespace,
            registry=self.registry)
        self.c_builds_success = Counter(
            'builds_success', 'Number of builds reporting success',
            labelnames=builds_labels, namespace=self.namespace,
            registry=self.registry)
        self.c_builds_failure = Counter(
            'builds_failure', 'Number of builds reporting failure',
            labelnames=builds_labels, namespace=self.namespace,
            registry=self.registry)
        self.c_builds_error = Counter(
            'builds_error', 'Number of builds reporting error',
            labelnames=builds_labels, namespace=self.namespace,
            registry=self.registry)

        # builders metrics
        builders_labels = ['builder_id', 'builder_name']
        self.g_builders_running_total = Gauge(
            'builders_running_total', 'Total number of builders running',
            namespace=self.namespace, registry=self.registry)
        self.g_builders_running = Gauge(
            'builders_running', 'Number of builders running',
            labelnames=builders_labels, namespace=self.namespace,
            registry=self.registry)

        # buildsets metrics
        buildsets_labels = ['buildset_id']
        self.g_buildsets_duration = Gauge(
            'buildsets_duration_seconds',
            'Number of seconds spent performing buildsets',
            labelnames=buildsets_labels, namespace=self.namespace,
            registry=self.registry)
        self.c_buildsets_success = Counter(
            'buildsets_success', 'Number of buildsets reporting success',
            labelnames=buildsets_labels, namespace=self.namespace,
            registry=self.registry)
        self.c_buildsets_failure = Counter(
            'buildsets_failure', 'Number of buildsets reporting failure',
            labelnames=buildsets_labels, namespace=self.namespace,
            registry=self.registry)
        self.c_buildsets_error = Counter(
            'buildsets_error', 'Number of buildsets reporting error',
            labelnames=buildsets_labels, namespace=self.namespace,
            registry=self.registry)

        # build requests metrics
        build_requests_labels = ['builder_id']
        self.g_build_requests_duration = Gauge(
            'build_requests_duration_seconds',
            'Number of seconds spent performing build requests',
            labelnames=build_requests_labels, namespace=self.namespace,
            registry=self.registry)
        self.c_build_requests_success = Counter(
            'build_requests_success',
            'Number of build requests reporting success',
            labelnames=build_requests_labels, namespace=self.namespace,
            registry=self.registry)
        self.c_build_requests_failure = Counter(
            'build_requests_failure',
            'Number of build requests reporting failure',
            labelnames=build_requests_labels, namespace=self.namespace,
            registry=self.registry)
        self.c_build_requests_error = Counter(
            'build_requests_error',
            'Number of build requests reporting error',
            labelnames=build_requests_labels, namespace=self.namespace,
            registry=self.registry)

        # steps metrics
        steps_labels = ['step_number', 'step_name', 'builder_id', 'worker_id']
        self.g_steps_duration = Gauge(
            'steps_duration_seconds',
            'Number of seconds spent performing build steps',
            labelnames=steps_labels, namespace=self.namespace,
            registry=self.registry)
        self.c_steps_success = Counter(
            'steps_success', 'Number of steps reporting success',
            labelnames=steps_labels, namespace=self.namespace,
            registry=self.registry)
        self.c_steps_failure = Counter(
            'steps_failure', 'Number of steps reporting failure',
            labelnames=steps_labels, namespace=self.namespace,
            registry=self.registry)
        self.c_steps_error = Counter(
            'steps_error', 'Number of steps reporting error',
            labelnames=steps_labels, namespace=self.namespace,
            registry=self.registry)

        # workers metrics
        workers_labels = ['worker_id', 'worker_name']
        self.g_workers_running_total = Gauge(
            'workers_running_total', 'Total number of workers running',
            namespace=self.namespace, registry=self.registry)
        self.g_workers_running = Gauge(
            'workers_running', 'Number of workers running',
            labelnames=workers_labels, namespace=self.namespace,
            registry=self.registry)

    @defer.inlineCallbacks
    def registerConsumers(self):
        self.removeConsumers()
        startConsuming = self.master.mq.startConsuming
        handlers = (
            (('builds', None, None), self.buildsConsumer),
            (('builders', None, None), self.buildersConsumer),
            (('buildsets', None, None), self.buildSetsConsumer),
            (('buildrequests', None, None), self.buildRequestsConsumer),
            (('steps', None, None), self.stepsConsumer),
            (('workers', None, None), self.workersConsumer),
        )
        for routingKey, handler in handlers:
            consumer = yield startConsuming(handler, routingKey)
            self.consumers.append(consumer)

    @defer.inlineCallbacks
    def removeConsumers(self):
        for consumer in self.consumers:
            yield consumer.stopConsuming()
        self.consumers = []

    # @defer.inlineCallbacks
    def buildsConsumer(self, key, msg):
        '''
        This method is responsible for updating build related metrics.
        There are four build metrics:

        - buildbot_builds_duration_seconds,
        - buildbot_builds_success,
        - buildbot_builds_failure,
        - buildbot_builds_error

        buildbot_builds_duration_seconds is a gauge metric used to track the
        duration of individual builds by making use of Prometheus multi
        dimensional labels. As builds complete, an instance of this metric
        is created by passing builder_id and worker_id labels and then
        setting the value. This allows visualisation tools to query and
        filter metrics for specific builder combinations. Similarly, the
        other counter metrics record success, failure and error states for
        each build.
        '''
        action = key[2]
        labels = dict(builder_id=msg['builderid'], worker_id=msg['workerid'])
        # build_info = yield self.master.data.get(("builds", msg['buildid']))
        if action == 'finished':
            assert msg['complete']
            build_started = msg['started_at']
            build_finished = msg['complete_at']
            build_duration = build_finished - build_started
            duration_seconds = build_duration
            self.g_builds_duration.labels(**labels).set(duration_seconds)

            build_status = resolve_results_status(msg['results'])
            if build_status == 'success':
                self.c_builds_success.labels(**labels).inc()
            elif build_status == 'failure':
                self.c_builds_failure.labels(**labels).inc()
            elif build_status == 'error':
                self.c_builds_error.labels(**labels).inc()

    def buildersConsumer(self, key, msg):
        '''
        The Buildmaster runs a collection of Builders, each of which handles
        a single type of build (e.g. full versus quick), on one or more
        workers. Builders serve as a kind of queue for a particular type of
        build. Each Builder gets a separate column in the waterfall display.
        In general, each Builder runs independently.

        Each builder is a long-lived object which controls a sequence of
        Builds. Each Builder is created when the config file is first
        parsed, and lives forever (or rather until it is removed from the
        config file). It mediates the connections to the workers that do
        all the work, and is responsible for creating the Build objects -
        Builds.

        This method is responsible for updating builder related metrics.
        There are two builder metrics ``buildbot_builders_running_total``
        and ``buildbot_builders_running``.

        ``buildbot_builders_running_total`` is a gauge metric used to track
        the total number of running builders. As builders start the metric
        is increased and as they stop the metric is decreased. No extra
        labels are used with this metric.

        ``buildbot_builders_running`` is a gauge metric used to track the
        running state of individual builders by making use of Prometheus
        multi dimensional labels. As builders start, an instance of this
        metric is created by passing ``builder_id`` and ``builder_name``
        labels and then incremented. When the builder stops the same gauge
        metric is decreased. This means that a gauge value of 1 indicates
        started while a gauge value of 0 indicates stopped.
        '''
        action = key[2]
        labels = dict(builder_id=msg['builderid'], builder_name=msg['name'])
        if action == 'started':
            self.g_builders_running_total.inc()
            self.g_builders_running.labels(**labels).inc()
        elif action == 'stopped':
            self.g_builders_running_total.dec()
            self.g_builders_running.labels(**labels).dec()

    # @defer.inlineCallbacks
    def buildSetsConsumer(self, key, msg):
        '''
        A BuildSet is the name given to a set of Builds that all
        compile/test the same version of the tree on multiple Builders. In
        general, all these component Builds will perform the same sequence
        of Steps, using the same source code, but on different platforms or
        against a different set of libraries.

        Each scheduler creates and submits BuildSet objects to the
        BuildMaster. The buildmaster is responsible for turning the
        BuildSet into a set of BuildRequest objects and queueing them on
        the appropriate Builders.

        This method is responsible for updating build set related metrics.
        There are four build set metrics:

        - buildbot_buildsets_duration_seconds,
        - buildbot_buildsets_success,
        - buildbot_buildsets_failure,
        - buildbot_buildsets_error

        buildbot_buildsets_duration_seconds is a gauge metric used to track
        the duration of individual build sets by making use of Prometheus
        multi dimensional labels. As build sets complete, an instance of
        this metric is created by passing buildset_id labels and then
        setting the value. This allows visualisation tools to query and
        filter metrics for specific builder combinations. Similarly, the
        other counter metrics record success, failure and error states for
        each build set.
        '''
        action = key[2]
        # TODO: substitute bsid for something more useful. bsid is just
        # a number that increments. A better choice would be something
        # like the repo, project, etc
        labels = dict(buildset_id=msg['bsid'])
        # buildset_info = yield self.master.data.get(("buildsets", msg['bsid']))
        if action == 'complete':
            assert msg['complete']
            buildset_started = msg['submitted_at']
            buildset_finished = msg['complete_at']
            buildset_duration = buildset_finished - buildset_started
            duration_seconds = buildset_duration
            self.g_buildsets_duration.labels(**labels).set(duration_seconds)

            bs_success = resolve_results_status(msg['results'])
            if bs_success == 'success':
                self.c_buildsets_success.labels(**labels).inc()
            elif bs_success == 'failure':
                self.c_buildsets_failure.labels(**labels).inc()
            elif bs_success == 'error':
                self.c_buildsets_error.labels(**labels).inc()

    def buildRequestsConsumer(self, key, msg):
        '''
        A BuildRequest is a request to build a specific set of source code
        on a single Builder. Each Builder runs the BuildRequest as soon as
        it can (i.e. when an associated worker becomes free).

        This method is responsible for updating build request related
        metrics. There are four build request metrics:

        - buildbot_build_requests_duration_seconds
        - buildbot_build_requests_success
        - buildbot_build_requests_failure
        - buildbot_build_requests_error

        buildbot_build_requests_duration_seconds is a gauge metric used to
        track the duration of individual build requests by making use of
        Prometheus multi dimensional labels. As build requests complete, an
        instance of this metric is created by passing builder_id labels and
        then setting the value. This allows visualisation tools to query
        and filter metrics for specific builder combinations. Similarly,
        the other counter metrics record success, failure and error states
        for each build request.
        '''
        action = key[2]
        labels = dict(builder_id=msg['builderid'])
        if action == 'complete':
            assert msg['complete']
            br_started = msg['submitted_at']
            br_finished = msg['complete_at']
            br_duration = br_finished - br_started
            duration_seconds = br_duration
            self.g_build_requests_duration.labels(
                **labels).set(duration_seconds)

            br_success = resolve_results_status(msg['results'])
            if br_success == 'success':
                self.c_build_requests_success.labels(**labels).inc()
            elif br_success == 'failure':
                self.c_build_requests_failure.labels(**labels).inc()
            elif br_success == 'error':
                self.c_build_requests_error.labels(**labels).inc()

    @defer.inlineCallbacks
    def stepsConsumer(self, key, msg):
        '''
        This method is responsible for updating step related metrics. There
        are four steps metrics:

        - buildbot_steps_duration_seconds,
        - buildbot_steps_success
        - buildbot_steps_failure
        - buildbot_steps_error

        buildbot_steps_duration_seconds is a gauge metric used to track the
        duration of individual steps by making use of Prometheus multi
        dimensional labels. As steps complete, an instance of this metric
        is created by passing step_number, step_name, builder_id and
        worker_id labels and then setting the value. This allows
        visualisation tools to query and filter metrics for specific step,
        builder and worker combinations. Similarly, the other counter
        metrics record success, failure and error states for each step.
        '''
        action = key[2]
        build_info = yield self.master.data.get(("builds", msg['buildid']))
        labels = dict(
            step_number=msg['number'],
            step_name=msg['name'],
            builder_id=build_info['builderid'],
            worker_id=build_info['workerid'])
        if action == 'finished':
            assert msg['complete']
            step_started = msg['started_at']
            step_finished = msg['complete_at']
            step_duration = step_finished - step_started
            duration_seconds = step_duration
            self.g_steps_duration.labels(**labels).set(duration_seconds)

            step_success = resolve_results_status(msg['results'])
            if step_success == 'success':
                self.c_steps_success.labels(**labels).inc()
            elif step_success == 'failure':
                self.c_steps_failure.labels(**labels).inc()
            elif step_success == 'error':
                self.c_steps_error.labels(**labels).inc()

    def workersConsumer(self, key, msg):
        '''
        This method is responsible for updating worker related metrics.
        There are two worker metrics ``buildbot_workers_running_total`` and
        ``buildbot_workers_running``.

        ``buildbot_workers_running_total`` is a gauge metric used to track
        the total number of running workers. As workers connect the metric
        is increased and as they disconnect the metric is decreased. No
        extra labels are used with this metric.

        ``buildbot_workers_running`` is a gauge metric used to track the
        running state of individual workers by making use of Prometheus
        multi dimensional labels. As workers connect, an instance of this
        metric is created by passing ``worker_id`` and ``worker_name``
        labels and then incremented. When the worker disconnects the same
        gauge metric is decreased. This means that a gauge value of 1
        indicates connected while a gauge value of 0 indicates
        disconnected.
        '''
        action = key[2]
        labels = dict(worker_id=msg['workerid'], worker_name=msg['name'])
        if action == 'connected':
            self.g_workers_running_total.inc()
            self.g_workers_running.labels(**labels).inc()
        elif action == 'disconnected':
            self.g_workers_running_total.dec()
            self.g_workers_running.labels(**labels).dec()
def process_request(t):
    """A dummy function that takes some time."""
    time.sleep(t)


if __name__ == '__main__':
    # Start up the server to expose the metrics.
    start_http_server(8111)

    # examples for counter/gauge/summary/histogram
    c = Counter('myfake_failures_total', 'Description of counter')
    g = Gauge('myfake_inprogress_requests', 'Description of gauge')
    s = Summary('myfake_summary_request_latency_seconds', 'Description of summary')
    h = Histogram('myfake_histogram_request_latency_seconds', 'Description of histogram')

    while True:
        # counter example
        c.inc()                    # Increment by 1
        # c.inc(random.random())   # Increment by given value

        # gauge example
        g.inc()        # Increment by 1
        # g.dec(10)    # Decrement by given value
        # g.set(4.2)   # Set to a given value

        # summary example
        s.observe(1.1)   # Observe 1.1 (seconds in this case)

        # Generate some requests.
        process_request(random.random())

        # histogram example
        h.observe(4.7)   # Observe 4.7 (seconds in this case)