class VarzPoller(Poller):
    """Poller for the ``/varz`` endpoint that republishes its data as metrics.

    The response is expected to be a mapping with ``server_uptime``,
    ``accepting_builds`` and a ``builders`` mapping of builder name to a
    per-builder info mapping.
    """

    endpoint = '/varz'

    # Master-wide metrics.
    uptime = ts_mon.FloatMetric('uptime')
    accepting_builds = ts_mon.BooleanMetric('buildbot/master/accepting_builds')

    # Per-builder metrics; each is reported with an extra 'builder' field.
    connected = ts_mon.GaugeMetric('buildbot/master/builders/connected_slaves')
    current_builds = ts_mon.GaugeMetric(
        'buildbot/master/builders/current_builds')
    pending_builds = ts_mon.GaugeMetric(
        'buildbot/master/builders/pending_builds')
    state = ts_mon.StringMetric('buildbot/master/builders/state')
    total = ts_mon.GaugeMetric('buildbot/master/builders/total_slaves')

    def handle_response(self, data):
        """Push the values found in a decoded /varz response into the metrics.

        Args:
          data: mapping holding 'server_uptime', 'accepting_builds' and
              'builders'. Keys missing from a builder's info fall back to 0
              (or 'unknown' for the state).
        """
        uptime_value = data['server_uptime']
        self.uptime.set(uptime_value, fields=self.fields())

        accepting = data['accepting_builds']
        self.accepting_builds.set(accepting, self.fields())

        # (metric, key in the builder info, value used when the key is absent),
        # listed in the order the metrics are updated.
        per_builder = (
            (self.connected, 'connected_slaves', 0),
            (self.current_builds, 'current_builds', 0),
            (self.pending_builds, 'pending_builds', 0),
            (self.state, 'state', 'unknown'),
            (self.total, 'total_slaves', 0),
        )

        for builder_name, builder_info in data['builders'].iteritems():
            fields = self.fields({'builder': builder_name})
            for metric, key, fallback in per_builder:
                metric.set(builder_info.get(key, fallback), fields=fields)
def main(argv):  # pragma: no cover
    """Send the monitoring event(s) described by the command-line arguments.

    Does nothing when no arguments are passed, to make it safe to import this
    module (main() is executed on import, because this file is called
    __main__).

    Sending is best-effort: any exception raised while parsing the arguments
    or sending the event is caught, its traceback is printed to stderr, and
    the 'send_monitoring_event/success' metric is set to False. The return
    status is not changed by such a failure.

    Args:
      argv: sequence of command-line arguments, handed to
          send_event.get_arguments().

    Returns:
      0 in general, or 2 when none of the event-selecting options was given.
    """
    if not argv:
        return 0

    status = 0
    success_metric = ts_mon.BooleanMetric('send_monitoring_event/success')

    try:
        args = send_event.get_arguments(argv)
        send_event.process_argparse_options(args)

        if args.build_event_type:
            success_metric.set(send_event.send_build_event(args))
        elif args.service_event_type:
            success_metric.set(send_event.send_service_event(args))
        elif args.events_from_file:
            success_metric.set(send_event.send_events_from_file(args))
        else:
            # sys.stderr.write() produces the same output as the Python 2
            # "print >> sys.stderr" statement and also works on Python 3.
            sys.stderr.write(
                'At least one of the --*-event-type options or '
                '--events-from-file should be provided. Nothing '
                'was sent.' + '\n')
            status = 2
            success_metric.set(False)
    except Exception:
        # Keep the failure non-fatal for the caller, but do not hide it:
        # report the traceback before recording the failure.
        import traceback
        traceback.print_exc()
        success_metric.set(False)
    finally:
        event_mon.close()
        try:
            ts_mon.flush()
        except ts_mon.MonitoringNoConfiguredMonitorError:
            # No monitor configured means there is nowhere to flush to.
            pass
    return status
def Boolean(name, reset_after=False):
    """Creates and returns the boolean metric handle called |name|.

    |reset_after| is accepted in the signature but is not used here.
    """
    handle = ts_mon.BooleanMetric(name)
    return handle
def Boolean(name):
    """Creates and returns the boolean metric handle called |name|."""
    boolean_metric = ts_mon.BooleanMetric(name)
    return boolean_metric
def BooleanMetric(name, reset_after=False, description=None,
                  field_spec=_MISSING):
    """Creates and returns the boolean metric handle called |name|.

    |description| and |field_spec| are forwarded to ts_mon.BooleanMetric;
    |reset_after| is accepted in the signature but is not used here.
    """
    forwarded = {
        'description': description,
        'field_spec': field_spec,
    }
    return ts_mon.BooleanMetric(name, **forwarded)
# Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. import logging import sys import traceback from infra_libs import app from infra_libs import event_mon from infra_libs import ts_mon from infra.tools.send_monitoring_event import common success_metric = ts_mon.BooleanMetric('send_monitoring_event/success', 'Set to True if the monitoring event was sent successfully', None) class SendMonitoringEvent(app.BaseApplication): DESCRIPTION = """Send an event to the monitoring pipeline. Examples: run.py infra.tools.send_monitoring_event --service-event-type=START \\ --service-event-revinfo <filename> run.py infra.tools.send_monitoring_event \\ --service-event-stack-trace "<stack trace>" run.py infra.tools.send_monitoring_event --build-event-type=SCHEDULER \\ --build-event-build-name=foo
from twisted.spread import pb
from twisted.python import log
from twisted.internet import error, reactor, task
from twisted.application import service, internet
from twisted.cred import credentials

import buildslave
from buildslave.pbutil import ReconnectingPBClientFactory
from buildslave.commands import registry, base
from buildslave import monkeypatches

from infra_libs import ts_mon

# Module-level ts_mon metric handles for the buildbot slave. Each one is
# constructed from a metric name, a human-readable description and a field
# list (None where the metric declares no fields).

# Boolean metric with no fields: slave <-> master connection state.
connected_metric = ts_mon.BooleanMetric(
    'buildbot/slave/connected',
    'Whether the slave is currently connected to its master.',
    None)

# Counter of connection failures, keyed by a string 'reason' field.
connection_failures_metric = ts_mon.CounterMetric(
    'buildbot/slave/connection_failures',
    'Count of failures connecting to the buildbot master.',
    [ts_mon.StringField('reason')])

# Boolean metric keyed by a string 'builder' field.
running_metric = ts_mon.BooleanMetric(
    'buildbot/slave/is_building',
    'Whether a build step is currently in progress.',
    [ts_mon.StringField('builder')])

# Counter of build steps, keyed by 'builder' (string) and 'success' (boolean).
steps_metric = ts_mon.CounterMetric(
    'buildbot/slave/steps',
    'Count of build steps run by each builder on this slave.',
    [ts_mon.StringField('builder'), ts_mon.BooleanField('success')])
def loop(task, sleep_timeout, duration=None, max_errors=None, time_mod=time):
    """Runs the task in a loop for a given duration.

    Handles and logs all uncaught exceptions. ``task`` callback should return
    True on success, and False (or raise an exception) in error.

    Doesn't leak any exceptions (including KeyboardInterrupt). A
    KeyboardInterrupt stops the loop and marks the run as having seen a
    success, so an interrupted loop is reported as successful unless it had
    already been aborted for too many consecutive errors.

    Every attempt increments the 'proc/outer_loop/count' counter and adds its
    elapsed time to 'proc/outer_loop/durations'; the overall outcome is
    written to 'proc/outer_loop/success' right before returning.

    Args:
      @param task: Callable with no arguments returning True or False.
      @param sleep_timeout: A function returning how long to sleep between
                            task invocations (sec), called once per loop.
      @param duration: How long to run the loop (sec), or None for forever.
      @param max_errors: Max number of consecutive errors before loop aborts,
                         or None for no limit.
      @param time_mod: Object implementing the interface of the standard
                       `time` module. Used by tests to mock time.time and
                       time.sleep.

    Returns:
      @returns LoopResults: ``success`` is True when the loop was not aborted
               for too many errors and at least one attempt succeeded (or the
               loop was interrupted); ``error_count`` is the total number of
               failed attempts.
    """
    deadline = None if duration is None else (time_mod.time() + duration)
    errors_left = max_errors
    seen_success = False
    failed = False
    loop_count = 0
    error_count = 0

    count_metric = ts_mon.CounterMetric('proc/outer_loop/count')
    success_metric = ts_mon.BooleanMetric('proc/outer_loop/success')
    durations_metric = ts_mon.DistributionMetric('proc/outer_loop/durations')

    try:
        while True:
            # Log that new attempt is starting.
            start = time_mod.time()
            LOGGER.info('-------------------')
            if deadline is not None:
                LOGGER.info(
                    'Begin loop %d (%.1f sec to deadline)',
                    loop_count, deadline - start)
            else:
                LOGGER.info('Begin loop %d', loop_count)

            # Do it. Abort if number of consecutive errors is too large.
            attempt_success = False
            try:
                with ts_mon.ScopedIncrementCounter(count_metric) as cm:
                    attempt_success = task()
                    if not attempt_success:  # pragma: no cover
                        # Due to branch coverage bug in coverage.py
                        cm.set_failure()
            except KeyboardInterrupt:
                # Handled by the outer try so that the whole loop stops.
                raise
            except Exception:
                # attempt_success stays False, so this counts as an error.
                LOGGER.exception('Uncaught exception in the task')
            finally:
                # Runs for successful, failed and interrupted attempts alike.
                elapsed = time_mod.time() - start
                LOGGER.info('End loop %d (%f sec)', loop_count, elapsed)
                durations_metric.add(elapsed)
                LOGGER.info('-------------------')

            # Reset error counter on success, or abort on too many errors.
            if attempt_success:
                seen_success = True
                errors_left = max_errors
            else:
                error_count += 1
                if errors_left is not None:
                    errors_left -= 1
                    if errors_left <= 0:
                        failed = True
                        # Logger.warning() is the supported spelling; warn()
                        # is a deprecated alias.
                        LOGGER.warning(
                            'Too many consecutive errors (%d), stopping.',
                            max_errors)
                        break

            # Sleep before trying again.
            # TODO(vadimsh): Make sleep timeout dynamic.
            now = time_mod.time()
            timeout = sleep_timeout()
            # Stop if the deadline would pass while sleeping.
            if deadline is not None and now + timeout >= deadline:
                when = now - deadline
                if when > 0:
                    LOGGER.info(
                        'Deadline reached %.1f sec ago, stopping.', when)
                else:
                    LOGGER.info('Deadline is in %.1f sec, stopping now', -when)
                break
            LOGGER.debug('Sleeping %.1f sec', timeout)
            time_mod.sleep(timeout)

            loop_count += 1
    except KeyboardInterrupt:
        seen_success = True
        LOGGER.warning('Stopping due to KeyboardInterrupt')

    success = not failed and seen_success
    success_metric.set(success)
    return LoopResults(success, error_count)
LoopResults = collections.namedtuple( 'LoopResults', [ # True on no errors or if all failed attempts were successfully retried. 'success', # Total number of errors seen (some may have been fixed with retries). 'error_count', ], ) count_metric = ts_mon.CounterMetric( 'proc/outer_loop/count', 'Counter of loop iterations for this process, by success or failure', [ts_mon.StringField('status')]) success_metric = ts_mon.BooleanMetric('proc/outer_loop/success', 'Set immediately before the loop exits', None) durations_metric = ts_mon.CumulativeDistributionMetric( 'proc/outer_loop/durations', 'Times (in seconds) taken to execute the task', None) def loop(task, sleep_timeout, duration=None, max_errors=None, time_mod=time): """Runs the task in a loop for a given duration. Handles and logs all uncaught exceptions. ``task`` callback should return True on success, and False (or raise an exception) in error. Doesn't leak any exceptions (including KeyboardInterrupt). Args:
import os import time import buildbot.status.results from buildbot.status.base import StatusReceiverMultiService from twisted.internet import defer, reactor, task, threads from twisted.python import log, threadpool from infra_libs import ts_mon uptime = ts_mon.FloatMetric('buildbot/master/uptime', 'Time (in seconds) since the master was started', [ts_mon.StringField('master')]) accepting_builds = ts_mon.BooleanMetric( 'buildbot/master/accepting_builds', 'Whether the master\'s BuildRequestDistributor is running', [ts_mon.StringField('master')]) connected = ts_mon.GaugeMetric( 'buildbot/master/builders/connected_slaves', 'Number of slaves currently connected, per builder', [ts_mon.StringField('master'), ts_mon.StringField('builder')]) current_builds = ts_mon.GaugeMetric( 'buildbot/master/builders/current_builds', 'Number of builds currently running, per builder', [ts_mon.StringField('master'), ts_mon.StringField('builder')]) pending_builds = ts_mon.GaugeMetric( 'buildbot/master/builders/pending_builds', 'Number of builds pending, per builder',