Ejemplo n.º 1
0
class VarzPoller(Poller):
    """Publishes ts_mon metrics from the data served at the '/varz' endpoint.

    How the endpoint is fetched and how handle_response() gets invoked is up
    to the Poller base class, which is not defined here.
    """

    endpoint = '/varz'

    # Metrics set once per response.
    uptime = ts_mon.FloatMetric('uptime')
    accepting_builds = ts_mon.BooleanMetric('buildbot/master/accepting_builds')

    # Metrics set once per builder; each value carries a 'builder' field.
    connected = ts_mon.GaugeMetric('buildbot/master/builders/connected_slaves')
    current_builds = ts_mon.GaugeMetric(
        'buildbot/master/builders/current_builds')
    pending_builds = ts_mon.GaugeMetric(
        'buildbot/master/builders/pending_builds')
    state = ts_mon.StringMetric('buildbot/master/builders/state')
    total = ts_mon.GaugeMetric('buildbot/master/builders/total_slaves')

    def handle_response(self, data):
        """Sets every metric of this poller from one response.

        Args:
          data: Mapping with the keys 'server_uptime', 'accepting_builds' and
            'builders'. 'builders' maps each builder name to a mapping that
            may contain 'connected_slaves', 'current_builds',
            'pending_builds', 'state' and 'total_slaves'. Missing numeric
            entries are reported as 0 and a missing state as 'unknown'.

        Raises:
          KeyError: if one of the three top-level keys is absent.
        """
        common_fields = self.fields()
        self.uptime.set(data['server_uptime'], fields=common_fields)
        self.accepting_builds.set(data['accepting_builds'],
                                  fields=common_fields)

        # items() is available on both Python 2 and Python 3 mappings.
        for builder_name, builder_info in data['builders'].items():
            fields = self.fields({'builder': builder_name})

            self.connected.set(builder_info.get('connected_slaves', 0),
                               fields=fields)
            self.current_builds.set(builder_info.get('current_builds', 0),
                                    fields=fields)
            self.pending_builds.set(builder_info.get('pending_builds', 0),
                                    fields=fields)
            self.state.set(builder_info.get('state', 'unknown'), fields=fields)
            self.total.set(builder_info.get('total_slaves', 0), fields=fields)
Ejemplo n.º 2
0
def main(argv):  # pragma: no cover
    """Sends the monitoring event selected by the command-line arguments.

    Args:
      argv: Sequence of command-line arguments (without the program name).
        When empty, nothing is done.

    Returns:
      2 when none of the event options was given; 0 otherwise, including when
      sending the event failed or raised (the outcome is reported through the
      'send_monitoring_event/success' metric instead).
    """
    import traceback

    # Does nothing when no arguments are passed, to make it safe to import this
    # module (main() is executed on import, because this file is called __main__).
    status = 0

    if not argv:
        return status

    success_metric = ts_mon.BooleanMetric('send_monitoring_event/success')

    try:
        args = send_event.get_arguments(argv)

        send_event.process_argparse_options(args)

        if args.build_event_type:
            success_metric.set(send_event.send_build_event(args))

        elif args.service_event_type:
            success_metric.set(send_event.send_service_event(args))

        elif args.events_from_file:
            success_metric.set(send_event.send_events_from_file(args))

        else:
            # sys.stderr.write() behaves the same on Python 2 and Python 3,
            # unlike the 'print >>' statement.
            sys.stderr.write(
                'At least one of the --*-event-type options or '
                '--events-from-file should be provided. Nothing '
                'was sent.\n')
            status = 2
            success_metric.set(False)
    except Exception:
        # The failure is reported through the metric and on stderr only;
        # `status` is not modified here.
        success_metric.set(False)
        traceback.print_exc()
    finally:
        event_mon.close()
        try:
            ts_mon.flush()
        except ts_mon.MonitoringNoConfiguredMonitorError:
            # No monitor is configured, so there is nothing to flush to.
            pass
    return status
Ejemplo n.º 3
0
def Boolean(name, reset_after=False):
    """Creates a ts_mon boolean metric called |name| and returns it.

    |reset_after| is accepted but is not used by this function.
    """
    metric = ts_mon.BooleanMetric(name)
    return metric
Ejemplo n.º 4
0
def Boolean(name):
    """Creates a ts_mon boolean metric called |name| and returns it."""
    metric = ts_mon.BooleanMetric(name)
    return metric
Ejemplo n.º 5
0
def BooleanMetric(name, reset_after=False, description=None,
                  field_spec=_MISSING):
  """Creates a ts_mon boolean metric called |name| and returns it.

  |description| and |field_spec| are forwarded unchanged to
  ts_mon.BooleanMetric. |reset_after| is accepted but is not used by this
  function body.
  """
  metric = ts_mon.BooleanMetric(
      name,
      description=description,
      field_spec=field_spec)
  return metric
Ejemplo n.º 6
0
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import logging
import sys
import traceback

from infra_libs import app
from infra_libs import event_mon
from infra_libs import ts_mon

from infra.tools.send_monitoring_event import common


# Boolean metric with no extra fields, reporting whether the event was sent.
success_metric = ts_mon.BooleanMetric(
    'send_monitoring_event/success',
    description='Set to True if the monitoring event was sent successfully',
    field_spec=None)


class SendMonitoringEvent(app.BaseApplication):
  DESCRIPTION = """Send an event to the monitoring pipeline.

    Examples:
    run.py infra.tools.send_monitoring_event --service-event-type=START \\
                                     --service-event-revinfo <filename>

    run.py infra.tools.send_monitoring_event \\
                                     --service-event-stack-trace "<stack trace>"

    run.py infra.tools.send_monitoring_event --build-event-type=SCHEDULER \\
                                     --build-event-build-name=foo
Ejemplo n.º 7
0
from twisted.spread import pb
from twisted.python import log
from twisted.internet import error, reactor, task
from twisted.application import service, internet
from twisted.cred import credentials

import buildslave
from buildslave.pbutil import ReconnectingPBClientFactory
from buildslave.commands import registry, base
from buildslave import monkeypatches

from infra_libs import ts_mon

# Module-level ts_mon metrics describing this slave.

# No extra fields.
connected_metric = ts_mon.BooleanMetric(
    'buildbot/slave/connected',
    description='Whether the slave is currently connected to its master.',
    field_spec=None)

# Broken down by the 'reason' string field.
connection_failures_metric = ts_mon.CounterMetric(
    'buildbot/slave/connection_failures',
    description='Count of failures connecting to the buildbot master.',
    field_spec=[ts_mon.StringField('reason')])

# Broken down by the 'builder' string field.
running_metric = ts_mon.BooleanMetric(
    'buildbot/slave/is_building',
    description='Whether a build step is currently in progress.',
    field_spec=[ts_mon.StringField('builder')])

# Broken down by 'builder' and by the boolean 'success' field.
steps_metric = ts_mon.CounterMetric(
    'buildbot/slave/steps',
    description='Count of build steps run by each builder on this slave.',
    field_spec=[
        ts_mon.StringField('builder'),
        ts_mon.BooleanField('success'),
    ])
Ejemplo n.º 8
0
def loop(task, sleep_timeout, duration=None, max_errors=None, time_mod=time):
  """Runs the task in a loop for a given duration.

  Handles and logs all uncaught exceptions. ``task`` callback should return True
  on success, and False (or raise an exception) in error.

  Doesn't leak any exceptions (including KeyboardInterrupt). A
  KeyboardInterrupt stops the loop and is counted as having seen a success.

  Every attempt's duration is added to the 'proc/outer_loop/durations' metric
  and the final result is stored in 'proc/outer_loop/success'.

  Args:
    task: Callable with no arguments returning True or False.
    sleep_timeout: A function returning how long to sleep between task
        invocations (sec), called once per loop.
    duration: How long to run the loop (sec), or None for forever.
    max_errors: Max number of consecutive errors before loop aborts, or None
        for no limit.
    time_mod: Object implementing the interface of the standard `time`
        module. Used by tests to mock time.time and time.sleep.

  Returns:
    LoopResults(success, error_count). `success` is True when at least one
    attempt succeeded (or the loop was interrupted from the keyboard) and the
    loop did not stop because of too many consecutive errors. `error_count`
    is the total number of failed attempts.
  """
  deadline = None if duration is None else (time_mod.time() + duration)
  errors_left = max_errors
  seen_success = False
  failed = False
  loop_count = 0
  error_count = 0
  count_metric = ts_mon.CounterMetric('proc/outer_loop/count')
  success_metric = ts_mon.BooleanMetric('proc/outer_loop/success')
  durations_metric = ts_mon.DistributionMetric('proc/outer_loop/durations')
  try:
    while True:
      # Log that new attempt is starting.
      start = time_mod.time()
      LOGGER.info('-------------------')
      if deadline is not None:
        LOGGER.info(
            'Begin loop %d (%.1f sec to deadline)',
            loop_count, deadline - start)
      else:
        LOGGER.info('Begin loop %d', loop_count)

      # Do it. Abort if number of consecutive errors is too large.
      attempt_success = False
      try:
        with ts_mon.ScopedIncrementCounter(count_metric) as cm:
          attempt_success = task()
          if not attempt_success:  # pragma: no cover
            cm.set_failure()       # Due to branch coverage bug in coverage.py
      except KeyboardInterrupt:
        # Handled by the outer try block, which stops the whole loop.
        raise
      except Exception:
        # attempt_success is still False here, so this counts as an error.
        LOGGER.exception('Uncaught exception in the task')
      finally:
        elapsed = time_mod.time() - start
        LOGGER.info('End loop %d (%f sec)', loop_count, elapsed)
        durations_metric.add(elapsed)
        LOGGER.info('-------------------')

      # Reset error counter on success, or abort on too many errors.
      if attempt_success:
        seen_success = True
        errors_left = max_errors
      else:
        error_count += 1
        if errors_left is not None:
          errors_left -= 1
          if errors_left <= 0:
            failed = True
            # warning() is used because warn() is a deprecated alias of it
            # (assuming LOGGER is a standard logging.Logger).
            LOGGER.warning(
                'Too many consecutive errors (%d), stopping.', max_errors)
            break

      # Sleep before trying again.
      # TODO(vadimsh): Make sleep timeout dynamic.
      now = time_mod.time()
      timeout = sleep_timeout()
      # Stop right away if the sleep would end at or after the deadline.
      if deadline is not None and now + timeout >= deadline:
        when = now - deadline
        if when > 0:
          LOGGER.info('Deadline reached %.1f sec ago, stopping.', when)
        else:
          LOGGER.info('Deadline is in %.1f sec, stopping now', -when)
        break
      LOGGER.debug('Sleeping %.1f sec', timeout)
      time_mod.sleep(timeout)

      loop_count += 1
  except KeyboardInterrupt:
    seen_success = True
    LOGGER.warning('Stopping due to KeyboardInterrupt')

  success = not failed and seen_success
  success_metric.set(success)
  return LoopResults(success, error_count)
Ejemplo n.º 9
0
# Result tuple with two fields, in this order:
#   success: True on no errors or if all failed attempts were successfully
#       retried.
#   error_count: Total number of errors seen (some may have been fixed with
#       retries).
LoopResults = collections.namedtuple('LoopResults', 'success error_count')

# Module-level ts_mon metrics for the outer loop.

# Broken down by the 'status' string field.
count_metric = ts_mon.CounterMetric(
    'proc/outer_loop/count',
    description=(
        'Counter of loop iterations for this process, by success or failure'),
    field_spec=[ts_mon.StringField('status')])

# No extra fields.
success_metric = ts_mon.BooleanMetric(
    'proc/outer_loop/success',
    description='Set immediately before the loop exits',
    field_spec=None)

# No extra fields.
durations_metric = ts_mon.CumulativeDistributionMetric(
    'proc/outer_loop/durations',
    description='Times (in seconds) taken to execute the task',
    field_spec=None)


def loop(task, sleep_timeout, duration=None, max_errors=None, time_mod=time):
    """Runs the task in a loop for a given duration.

  Handles and logs all uncaught exceptions. ``task`` callback should return True
  on success, and False (or raise an exception) in error.

  Doesn't leak any exceptions (including KeyboardInterrupt).

  Args:
Ejemplo n.º 10
0
import os
import time

import buildbot.status.results

from buildbot.status.base import StatusReceiverMultiService
from twisted.internet import defer, reactor, task, threads
from twisted.python import log, threadpool

from infra_libs import ts_mon

# Metrics with a single 'master' string field.
uptime = ts_mon.FloatMetric(
    'buildbot/master/uptime',
    description='Time (in seconds) since the master was started',
    field_spec=[ts_mon.StringField('master')])
accepting_builds = ts_mon.BooleanMetric(
    'buildbot/master/accepting_builds',
    description='Whether the master\'s BuildRequestDistributor is running',
    field_spec=[ts_mon.StringField('master')])

# Metrics with 'master' and 'builder' string fields.
connected = ts_mon.GaugeMetric(
    'buildbot/master/builders/connected_slaves',
    description='Number of slaves currently connected, per builder',
    field_spec=[
        ts_mon.StringField('master'),
        ts_mon.StringField('builder'),
    ])
current_builds = ts_mon.GaugeMetric(
    'buildbot/master/builders/current_builds',
    description='Number of builds currently running, per builder',
    field_spec=[
        ts_mon.StringField('master'),
        ts_mon.StringField('builder'),
    ])
pending_builds = ts_mon.GaugeMetric(
    'buildbot/master/builders/pending_builds',
    'Number of builds pending, per builder',