Example #1
    def _get(self, id, timeout):
        """Gets a container from the pool.

        @param id: A ContainerId to assign to the new container.
        @param timeout: A timeout (in seconds) to wait for the pool.  If a
                        container is not available from the pool within the
                        given period, None will be returned.

        @return: A container from the pool.
        """
        logging.debug('Received get request (id=%s)', id)
        container = self._pool.get(timeout)
        # Assign an ID to the container as soon as it is removed from the pool.
        # This associates the container with the process to which it will be
        # handed off.
        if container is not None:
            logging.debug('Assigning container (name=%s, id=%s)',
                          container.name, id)
            container.id = id
        else:
            logging.debug('No container (id=%s)', id)
        metrics.Counter(METRICS_PREFIX + '/container_requests',
                        field_spec=[ts_mon.BooleanField('success')]).increment(
                            fields={'success': (container is not None)})
        return container
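
The pattern in _get above recurs throughout these examples: a Counter declared
with a BooleanField in its field_spec, then incremented with a fields dict that
matches the spec. A minimal standalone sketch, assuming the `metrics` wrapper
comes from chromite.lib and using a hypothetical metric name:

from chromite.lib import metrics  # assumed home of the `metrics` wrapper
from infra_libs import ts_mon

def record_attempt(succeeded):
    """Increment a success/failure counter (hypothetical metric name)."""
    metrics.Counter(
        'example/attempts',
        field_spec=[ts_mon.BooleanField('success')]).increment(
            fields={'success': succeeded})
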
Example #2
    def __init__(self, container_path=constants.DEFAULT_CONTAINER_PATH,
                 container_factory=None):
        """Initialize a ContainerBucket.

        @param container_path: Path to the directory used to store containers.
                               Default is set to AUTOSERV/container_path in
                               global config.
        @param container_factory: A factory for creating Containers.
        """
        self.container_path = os.path.realpath(container_path)
        if container_factory is not None:
            self._factory = container_factory
        else:
            # Pass in the container path so that the bucket is hermetic (i.e. so
            # that if the container path is customized, the base image doesn't
            # fall back to using the default container path).
            base_image_ok = True
            try:
                container = BaseImage(self.container_path).get()
            except error.ContainerError:
                base_image_ok = False
                raise
            finally:
                metrics.Counter(METRICS_PREFIX + '/base_image',
                                field_spec=[ts_mon.BooleanField('corrupted')]
                                ).increment(
                                    fields={'corrupted': not base_image_ok})
            self._factory = ContainerFactory(
                base_container=container,
                lxc_path=self.container_path)
        self.container_cache = {}
Example #3
def EmitStop(_m, graceful):
    """Emits a Counter metric for apache service stops.

    @param _m: A regex match object.
    @param graceful: Whether apache was stopped gracefully.
    """
    metrics.Counter(STOP_METRIC,
                    description="A metric counting Apache service stops.",
                    field_spec=[ts_mon.BooleanField('graceful')
                                ]).increment(fields={'graceful': graceful})
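
A hypothetical way EmitStop might be invoked, e.g. from a log watcher that
matches apache stop lines; the regex and log format below are illustrative,
not taken from the real watcher:

import re

# Illustrative pattern only; the real watcher's regex will differ.
_STOP_RE = re.compile(r'shutting down(?P<graceful> gracefully)?')

def _process_line(line):
    m = _STOP_RE.search(line)
    if m is not None:
        EmitStop(m, m.group('graceful') is not None)
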
Example #4
  def testGetMetricFieldSpec(self):
    """Test each field type gets its FieldSpec."""
    fields = {
        'int': 12,
        'bool': True,
        'str': 'string',
    }
    expected_fieldspec = [ts_mon.IntegerField('int'),
                          ts_mon.BooleanField('bool'),
                          ts_mon.StringField('str')]
    self.assertEqual(ts_mon_config.GetMetricFieldSpec(fields),
                     expected_fieldspec)
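
For context, a sketch of what a GetMetricFieldSpec helper could look like,
inferred only from this test's expectations; the real ts_mon_config
implementation may differ:

from infra_libs import ts_mon

def GetMetricFieldSpec(fields):
    """Map sample field values to ts_mon Field objects by Python type."""
    spec = []
    for name, value in fields.items():
        # bool must be checked before int, since bool is a subclass of int.
        if isinstance(value, bool):
            spec.append(ts_mon.BooleanField(name))
        elif isinstance(value, int):
            spec.append(ts_mon.IntegerField(name))
        elif isinstance(value, str):
            spec.append(ts_mon.StringField(name))
        else:
            raise ValueError('unsupported field type for %r' % name)
    return spec
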
Example #5
    def _create_workers(self):
        """Spawns workers to handle container requests.

        This method modifies the _workers list and should only be called from
        within run().
        """
        if self._pool.full():
            return

        # Do not exceed the worker limit.
        if len(self._workers) >= self._worker_max:
            return

        too_many_errors = len(self._error_timestamps) >= _MAX_ERRORS_PER_HOUR
        metrics.Counter(METRICS_PREFIX + '/error_throttled',
                        field_spec=[
                            ts_mon.BooleanField('throttled')
                        ]).increment(fields={'throttled': too_many_errors})
        # Throttle if too many errors occur.
        if too_many_errors:
            logging.warning('Error throttled (until %d)',
                            self._error_timestamps[0] + 3600)
            return

        # Create workers to refill the pool.
        qsize = self._pool.qsize()
        shortfall = self._pool.maxsize - qsize
        old_worker_count = len(self._workers)

        # Avoid spamming - only log if the monitor is taking some action.  Log
        # this before creating worker threads, because we are counting live
        # threads and want to avoid race conditions w.r.t. threads actually
        # starting.
        if (old_worker_count < shortfall
                and old_worker_count < self._worker_max):
            # This can include workers that aren't currently in the
            # self._workers list, e.g. workers that were dropped from the list
            # because they timed out.
            active_workers = sum(
                [1 for t in threading.enumerate() if type(t) is _Worker])
            # qsize    : Current size of the container pool.
            # shortfall: Number of empty slots currently in the pool.
            # workers  : m+n, where m is the current number of active worker
            #            threads and n is the number of new threads created.
            logging.debug('qsize:%d shortfall:%d workers:%d', qsize, shortfall,
                          active_workers)
        if len(self._workers) < shortfall:
            worker = _Worker(self._factory, self._on_worker_result,
                             self._on_worker_error)
            worker.start()
            self._workers.append(worker)
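
The throttle above reads _error_timestamps, a list of error times from the
last hour. A sketch of how that window might be maintained; this helper and
its wiring are assumptions, not part of the example:

import time

def _record_error(self):
    """Append an error timestamp and drop entries older than one hour."""
    now = time.time()
    self._error_timestamps.append(now)
    while self._error_timestamps and self._error_timestamps[0] < now - 3600:
        self._error_timestamps.pop(0)
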
Example #6
def main(argv):
    """Entry point for dut_mon."""
    logging.getLogger().setLevel(logging.INFO)

    with ts_mon_config.SetupTsMonGlobalState('dut_mon', indirect=True):
        afe = frontend.AFE()
        counters = collections.defaultdict(lambda: 0)

        field_spec = [ts_mon.StringField('board'),
                      ts_mon.StringField('model'),
                      ts_mon.StringField('pool'),
                      ts_mon.BooleanField('is_locked'),
                      ts_mon.StringField('status'),
                      ]
        dut_count = metrics.Gauge('chromeos/autotest/dut_mon/dut_count',
                                  description='The number of duts in a given '
                                              'state and bucket.',
                                  field_spec=field_spec)
        tick_count = metrics.Counter('chromeos/autotest/dut_mon/tick',
                                     description='Tick counter of dut_mon.')

        while True:
            # Note: We reset all counters to zero in each loop rather than
            # creating a new defaultdict, because we want to ensure that any
            # gauges that were previously set to a nonzero value by this process
            # get set back to zero if necessary.
            for k in counters:
                counters[k] = 0

            logging.info('Fetching all hosts.')
            hosts = afe.get_hosts()
            logging.info('Fetched %s hosts.', len(hosts))
            for host in hosts:
                fields = _get_bucket_for_host(host)
                counters[fields] += 1

            for field, value in counters.iteritems():
                logging.info('%s %s', field, value)
                dut_count.set(value, fields=field.__dict__)

            tick_count.increment()
            logging.info('Sleeping for 2 minutes.')
            time.sleep(120)
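
main() relies on _get_bucket_for_host returning a hashable bucket whose
attributes line up with field_spec. A hedged sketch using a namedtuple (the
host attribute names are assumptions); on Python 2.7 a namedtuple's __dict__
property returns its fields as a dict, which satisfies the
dut_count.set(value, fields=field.__dict__) call above:

import collections

_Bucket = collections.namedtuple(
    '_Bucket', ['board', 'model', 'pool', 'is_locked', 'status'])

def _get_bucket_for_host(host):
    """Sketch: map an AFE host to its metric bucket."""
    return _Bucket(board=host.board, model=host.model, pool=host.pool,
                   is_locked=bool(host.locked), status=host.status)
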
Example #7
    def create_container(self, new_id):
        """Creates a new container.

        Attempts to retrieve a container from the container pool.  If that
        operation fails, this falls back to the parent class behaviour.

        @param new_id: ContainerId to assign to the new container.  Containers
                       must be assigned an ID before they can be released from
                       the container pool.

        @return: The new container.
        """
        container = None
        if self._client:
            try:
                container = self._client.get_container(
                    new_id, _CONTAINER_POOL_TIMEOUT)
            except Exception:
                logging.exception('Error communicating with container pool.')
            else:
                if container is not None:
                    logging.debug('Retrieved container from pool: %s',
                                  container.name)
                    return container
        metrics.Counter(
            METRICS_PREFIX + '/containers_served',
            field_spec=[
                ts_mon.BooleanField('from_pool')
            ]).increment(fields={'from_pool': (container is not None)})
        if container is not None:
            return container

        # If the container pool did not yield a container, make one locally.
        logging.warning('Unable to obtain container from pre-populated pool.  '
                        'Creating container locally.  This slows server tests '
                        'down and should be debugged even if local creation '
                        'works out.')
        return super(_PoolBasedFactory, self).create_container(new_id)
Example #8
from __future__ import absolute_import

from framework import authdata
from framework import sql
from framework import xsrf

from gae_ts_mon.handlers import TSMonJSHandler

from google.appengine.api import users

from infra_libs import ts_mon

STANDARD_FIELDS = [
    ts_mon.StringField('client_id'),
    ts_mon.StringField('host_name'),
    ts_mon.BooleanField('document_visible'),
]

# User action metrics.
ISSUE_CREATE_LATENCY_METRIC = ts_mon.CumulativeDistributionMetric(
    'monorail/frontend/issue_create_latency',
    ('Latency between Issue Entry form submission and page load of '
     'the subsequent issue page.'),
    field_spec=STANDARD_FIELDS,
    units=ts_mon.MetricsDataUnits.MILLISECONDS)
ISSUE_UPDATE_LATENCY_METRIC = ts_mon.CumulativeDistributionMetric(
    'monorail/frontend/issue_update_latency',
    ('Latency between Issue Update form submission and page load of '
     'the subsequent issue page.'),
    field_spec=STANDARD_FIELDS,
    units=ts_mon.MetricsDataUnits.MILLISECONDS)
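
These metrics are served via a TSMonJSHandler subclass; a minimal sketch
assuming the gae_ts_mon register_metrics API, with the XSRF check stubbed out
(the real handler validates tokens via framework.xsrf):

class MonorailTSMonJSHandler(TSMonJSHandler):
  """Sketch: accepts JS-reported values for the latency distributions."""

  def __init__(self, request=None, response=None):
    super(MonorailTSMonJSHandler, self).__init__(request, response)
    self.register_metrics(
        [ISSUE_CREATE_LATENCY_METRIC, ISSUE_UPDATE_LATENCY_METRIC])

  def xsrf_is_valid(self, body):
    # Stubbed for the sketch; the real check verifies body['token'].
    return True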
Example #9
@contextlib.contextmanager
def SecondsInstanceTimer(name, fields=None, description=None,
                         field_spec=_MISSING, record_on_exception=True,
                         add_exception_field=False):
  """Record the time of an operation to a FloatMetric.

  Records the time taken inside of the context block, to the
  Float metric named |name|, with the given fields.  This is
  a non-cumulative metric; this represents the absolute time
  taken for a specific block.  The duration is stored in a float
  to provide flexibility in the future for higher accuracy.

  Examples:
    # Time the doSomething() call, with field values that are independent of the
    # results of the operation.
    with SecondsInstanceTimer('timer/name', fields={'foo': 'bar'},
                              description='My timer',
                              field_spec=[ts_mon.StringField('foo'),
                                          ts_mon.BooleanField('success')]):
      doSomething()

    # Time the doSomethingElse call, with field values that depend on the
    # results of that operation. Note that it is important that a default value
    # is specified for these fields, in case an exception is thrown by
    # doSomethingElse()
    f = {'success': False, 'foo': 'bar'}
    with SecondsInstanceTimer('timer/name', fields=f, description='My timer',
                              field_spec=[ts_mon.StringField('foo')]) as c:
      doSomethingElse()
      c['success'] = True

    # Incorrect Usage!
    with SecondsInstanceTimer('timer/name', description='My timer') as c:
      doSomething()
      c['foo'] = bar # 'foo' is not a valid field, because no default
                     # value for it was specified in the context constructor.
                     # It will be silently ignored.

  Args:
    name: The name of the metric to create
    fields: The fields of the metric to create.
    description: A string description of the metric.
    field_spec: A sequence of ts_mon.Field objects to specify the field schema.
    record_on_exception: Whether to record metrics if an exception is raised.
    add_exception_field: Whether to add a BooleanField('encountered_exception')
        to the FieldSpec provided, and set its value to True iff an exception
        was raised in the context.

  Yields:
    Float based metric measuring the duration of execution.
  """
  if field_spec is not None and field_spec is not _MISSING:
    field_spec.append(ts_mon.BooleanField('encountered_exception'))

  m = FloatMetric(name, description=description, field_spec=field_spec)
  f = dict(fields or {})
  keys = list(f)
  t0 = _GetSystemClock()

  error = True
  try:
    yield f
    error = False
  finally:
    if record_on_exception and add_exception_field:
      keys.append('encountered_exception')
      f.setdefault('encountered_exception', error)
    # Filter out keys that were not part of the initial key set. This is to
    # avoid inconsistent fields.
    # TODO(akeshet): Doing this filtering isn't super efficient. Would be better
    # to implement some key-restricted subclass or wrapper around dict, and just
    # yield that above rather than yielding a regular dict.
    if record_on_exception or not error:
      dt = _GetSystemClock() - t0
      m.set(dt, fields={k: f[k] for k in keys})
Example #10
@contextlib.contextmanager
def SecondsTimer(name, fields=None, description=None, field_spec=_MISSING,
                 scale=1, record_on_exception=True, add_exception_field=False):
  """Record the time of an operation to a CumulativeSecondsDistributionMetric.

  Records the time taken inside of the context block, to the
  CumulativeSecondsDistribution named |name|, with the given fields.

  Usage:

  # Time the doSomething() call, with field values that are independent of the
  # results of the operation.
  with SecondsTimer('timer/name', fields={'foo': 'bar'},
                    description="My timer",
                    field_spec=[ts_mon.StringField('foo'),
                                ts_mon.BooleanField('success')]):
    doSomething()

  # Time the doSomethingElse call, with field values that depend on the results
  # of that operation. Note that it is important that a default value is
  # specified for these fields, in case an exception is thrown by
  # doSomethingElse()
  f = {'success': False, 'foo': 'bar'}
  with SecondsTimer('timer/name', fields=f, description="My timer",
                    field_spec=[ts_mon.StringField('foo')]) as c:
    doSomethingElse()
    c['success'] = True

  # Incorrect Usage!
  with SecondsTimer('timer/name', description="My timer") as c:
    doSomething()
    c['foo'] = bar # 'foo' is not a valid field, because no default
                   # value for it was specified in the context constructor.
                   # It will be silently ignored.

  Args:
    name: The name of the metric to create
    fields: The fields of the metric to create.
    description: A string description of the metric.
    field_spec: A sequence of ts_mon.Field objects to specify the field schema.
    scale: A float to scale the CumulativeSecondsDistribution buckets by.
    record_on_exception: Whether to record metrics if an exception is raised.
    add_exception_field: Whether to add a BooleanField("encountered_exception")
        to the FieldSpec provided, and set its value to True iff an exception
        was raised in the context.
  """
  if field_spec is not None and field_spec is not _MISSING:
    field_spec.append(ts_mon.BooleanField('encountered_exception'))

  m = CumulativeSecondsDistribution(
      name, scale=scale, description=description, field_spec=field_spec)
  f = dict(fields or {})
  keys = list(f)
  t0 = datetime.datetime.now()

  error = True
  try:
    yield f
    error = False
  finally:
    if record_on_exception and add_exception_field:
      keys.append('encountered_exception')
      f.setdefault('encountered_exception', error)
    # Filter out keys that were not part of the initial key set. This is to
    # avoid inconsistent fields.
    # TODO(akeshet): Doing this filtering isn't super efficient. Would be better
    # to implement some key-restricted subclass or wrapper around dict, and just
    # yield that above rather than yielding a regular dict.
    if record_on_exception or not error:
      dt = (datetime.datetime.now() - t0).total_seconds()
      m.add(dt, fields={k: f[k] for k in keys})
Example #11
connected_metric = ts_mon.BooleanMetric(
    'buildbot/slave/connected',
    'Whether the slave is currently connected to its master.', None)
connection_failures_metric = ts_mon.CounterMetric(
    'buildbot/slave/connection_failures',
    'Count of failures connecting to the buildbot master.',
    [ts_mon.StringField('reason')])
running_metric = ts_mon.BooleanMetric(
    'buildbot/slave/is_building',
    'Whether a build step is currently in progress.',
    [ts_mon.StringField('builder')])
steps_metric = ts_mon.CounterMetric(
    'buildbot/slave/steps',
    'Count of build steps run by each builder on this slave.',
    [ts_mon.StringField('builder'),
     ts_mon.BooleanField('success')])
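
For illustration, a sketch of how these module-level metrics might be updated
at runtime; set() and increment() are standard infra_libs ts_mon calls, but
this helper and its call sites are hypothetical:

def _report_connection(connected, reason=None):
    """Record slave connection state; `reason` applies only to failures."""
    connected_metric.set(connected)
    if not connected and reason is not None:
        connection_failures_metric.increment(fields={'reason': reason})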


class UnknownCommand(pb.Error):
    pass


class SlaveBuilder(pb.Referenceable, service.Service):
    """This is the local representation of a single Builder: it handles a
    single kind of build (like an all-warnings build). It has a name and a
    home directory. The rest of its behavior is determined by the master.
    """

    stopCommandOnShutdown = True

    # remote is a ref to the Builder object on the master side, and is set
Example #12
# Copyright 2019 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from gae_ts_mon.handlers import TSMonJSHandler
from infra_libs import ts_mon

FIELDS = [
    ts_mon.IntegerField('fe_version'),
    ts_mon.BooleanField('signed_in'),
]

METRICS = [
    ts_mon.CumulativeDistributionMetric(
        'chromeperf/load/page',
        'page loadEventEnd - fetchStart',
        units=ts_mon.MetricsDataUnits.MILLISECONDS,
        field_spec=FIELDS),
    ts_mon.CumulativeDistributionMetric(
        'chromeperf/load/chart',
        'chart load latency',
        units=ts_mon.MetricsDataUnits.MILLISECONDS,
        field_spec=FIELDS),
    ts_mon.CumulativeDistributionMetric(
        'chromeperf/load/alerts',
        'alerts load latency',
        units=ts_mon.MetricsDataUnits.MILLISECONDS,
        field_spec=FIELDS),
    ts_mon.CumulativeDistributionMetric(
        'chromeperf/action/triage',
        'alert triage latency',
Example #13
class LoadApiClientConfigs(webapp2.RequestHandler):

    config_loads = ts_mon.CounterMetric(
        'monorail/client_config_svc/loads',
        'Results of fetches from luci-config.',
        [ts_mon.BooleanField('success'),
         ts_mon.StringField('type')])

    def get(self):
        authorization_token, _ = app_identity.get_access_token(
            framework_constants.OAUTH_SCOPE)
        response = urlfetch.fetch(LUCI_CONFIG_URL,
                                  method=urlfetch.GET,
                                  follow_redirects=False,
                                  headers={
                                      'Content-Type':
                                      'application/json; charset=UTF-8',
                                      'Authorization':
                                      'Bearer ' + authorization_token
                                  })

        if response.status_code != 200:
            logging.error('Invalid response from luci-config: %r', response)
            self.config_loads.increment({
                'success': False,
                'type': 'luci-cfg-error'
            })
            self.abort(500, 'Invalid response from luci-config')

        try:
            content_text = self._process_response(response)
        except Exception as e:
            self.abort(500, str(e))

        logging.info('luci-config content decoded: %r.', content_text)
        configs = ClientConfig(configs=content_text,
                               key_name='api_client_configs')
        configs.put()
        self.config_loads.increment({'success': True, 'type': 'success'})

    def _process_response(self, response):
        try:
            content = json.loads(response.content)
        except ValueError:
            logging.error('Response was not JSON: %r', response.content)
            self.config_loads.increment({
                'success': False,
                'type': 'json-load-error'
            })
            raise

        try:
            config_content = content['content']
        except KeyError:
            logging.error('JSON contained no content: %r', content)
            self.config_loads.increment({
                'success': False,
                'type': 'json-key-error'
            })
            raise

        try:
            content_text = base64.b64decode(config_content)
        except TypeError:
            logging.error('Content was not b64: %r', config_content)
            self.config_loads.increment({
                'success': False,
                'type': 'b64-decode-error'
            })
            raise

        try:
            cfg = api_clients_config_pb2.ClientCfg()
            protobuf.text_format.Merge(content_text, cfg)
        except:
            logging.error('Content was not a valid ClientCfg proto: %r',
                          content_text)
            self.config_loads.increment({
                'success': False,
                'type': 'proto-load-error'
            })
            raise

        return content_text
Example #14
    def PerformStage(self):
        """Perform the actual work for this stage.

        This includes final metadata archival, updating CIDB with our final
        status, and producing a logged build result summary.
        """
        build_identifier, _ = self._run.GetCIDBHandle()
        build_id = build_identifier.cidb_id
        buildbucket_id = build_identifier.buildbucket_id
        if results_lib.Results.BuildSucceededSoFar(self.buildstore,
                                                   buildbucket_id, self.name):
            final_status = constants.BUILDER_STATUS_PASSED
        else:
            final_status = constants.BUILDER_STATUS_FAILED

        if not hasattr(self._run.attrs, 'release_tag'):
            # If, for some reason, the sync stage did not complete and
            # release_tag was not set, set it to None here because
            # ArchiveResults() depends on the existence of this attr.
            self._run.attrs.release_tag = None

        # Set up our report metadata.
        self._run.attrs.metadata.UpdateWithDict(
            self.GetReportMetadata(
                final_status=final_status,
                completion_instance=self._completion_instance))

        src_root = self._build_root
        # Workspace builders use a different buildroot for overlays.
        if self._run.config.workspace_branch and self._run.options.workspace:
            src_root = self._run.options.workspace

        # Add tags for the arches and statuses of the build.
        # arches requires crossdev, which isn't available early in the build.
        arches = []
        for board in self._run.config['boards']:
            toolchains = toolchain.GetToolchainsForBoard(board,
                                                         buildroot=src_root)
            default = list(
                toolchain.FilterToolchains(toolchains, 'default', True))
            if default:
                try:
                    arches.append(toolchain.GetArchForTarget(default[0]))
                except cros_build_lib.RunCommandError as e:
                    logging.warning(
                        'Unable to retrieve arch for board %s default toolchain %s: %s',
                        board, default, e)
        tags = {
            'arches': arches,
            'status': final_status,
        }
        results = self._run.attrs.metadata.GetValue('results')
        for stage in results:
            tags['stage_status:%s' % stage['name']] = stage['status']
            tags['stage_summary:%s' % stage['name']] = stage['summary']
        self._run.attrs.metadata.UpdateKeyDictWithDict(constants.METADATA_TAGS,
                                                       tags)

        # Some operations can only be performed if a valid version is available.
        try:
            self._run.GetVersionInfo()
            self.ArchiveResults(final_status)
            metadata_url = os.path.join(self.upload_url,
                                        constants.METADATA_JSON)
        except cbuildbot_run.VersionNotSetError:
            logging.error('A valid version was never set for this run. '
                          'Can not archive results.')
            metadata_url = ''

        results_lib.Results.Report(sys.stdout,
                                   current_version=(self._run.attrs.release_tag
                                                    or ''))

        # Upload goma log if used for BuildPackage and TestSimpleChrome.
        _UploadAndLinkGomaLogIfNecessary(
            'BuildPackages', self._run.config.name, self._run.options.goma_dir,
            self._run.options.goma_client_json,
            self._run.attrs.metadata.GetValueWithDefault('goma_tmp_dir'))
        _UploadAndLinkGomaLogIfNecessary(
            'TestSimpleChromeWorkflow', self._run.config.name,
            self._run.options.goma_dir, self._run.options.goma_client_json,
            self._run.attrs.metadata.GetValueWithDefault(
                'goma_tmp_dir_for_simple_chrome'))

        if self.buildstore.AreClientsReady():
            status_for_db = final_status

            # TODO(pprabhu): After BuildData and CBuildbotMetadata are merged,
            # remove this extra temporary object creation.
            # XXX:HACK We're creating a BuildData with an empty URL. Don't try to
            # MarkGathered this object.
            build_data = metadata_lib.BuildData(
                '', self._run.attrs.metadata.GetDict())
            # TODO(akeshet): Find a clearer way to get the "primary upload url" for
            # the metadata.json file. One alternative is _GetUploadUrls(...)[0].
            # Today it seems that element 0 of its return list is the primary upload
            # url, but there is no guarantee or unit test coverage of that.
            self.buildstore.FinishBuild(build_id,
                                        status=status_for_db,
                                        summary=build_data.failure_message,
                                        metadata_url=metadata_url)

            duration = self._GetBuildDuration()

            mon_fields = {
                'status': status_for_db,
                'build_config': self._run.config.name,
                'important': self._run.config.important
            }
            metrics.Counter(
                constants.MON_BUILD_COMP_COUNT).increment(fields=mon_fields)
            metrics.CumulativeSecondsDistribution(
                constants.MON_BUILD_DURATION).add(duration, fields=mon_fields)

            if self._run.options.sanity_check_build:
                metrics.Counter(
                    constants.MON_BUILD_SANITY_COMP_COUNT).increment(
                        fields=mon_fields)
                metrics.Gauge(
                    constants.MON_BUILD_SANITY_ID,
                    description=
                    'The build number of the latest sanity build. Used '
                    'for recovering the link to the latest failing build '
                    'in the alert when a sanity build fails.',
                    field_spec=[
                        ts_mon.StringField('status'),
                        ts_mon.StringField('build_config'),
                        ts_mon.StringField('builder_name'),
                        ts_mon.BooleanField('important')
                    ]).set(self._run.buildnumber,
                           fields=dict(
                               mon_fields,
                               builder_name=self._run.GetBuilderName()))

            if config_lib.IsMasterCQ(self._run.config):
                self_destructed = self._run.attrs.metadata.GetValueWithDefault(
                    constants.SELF_DESTRUCTED_BUILD, False)
                mon_fields = {
                    'status': status_for_db,
                    'self_destructed': self_destructed
                }
                metrics.CumulativeSecondsDistribution(
                    constants.MON_CQ_BUILD_DURATION).add(duration,
                                                         fields=mon_fields)
                annotator_link = uri_lib.ConstructAnnotatorUri(build_id)
                logging.PrintBuildbotLink('Build annotator', annotator_link)

            # From this point forward, treat all exceptions as warnings.
            self._post_completion = True

            # Dump report about things we retry.
            retry_stats.ReportStats(sys.stdout)