Example #1
  def __init__(self, label, description=''):
    """Initialize a context manager labeled `label` that measures the wall
    time between enter and exit.

    If description is truthy, the measurement is reported to ts_mon.
    """
    self._label = label
    self._start = None
    self.seconds = 0
    self._metric = None
    if description:
      self._metric = gae_ts_mon.FloatMetric(
          self._Suffix(), description, [gae_ts_mon.StringField('label')],
          's')
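The constructor above only records state; a minimal sketch of how the matching `__enter__`/`__exit__` might use it follows, assuming the standard-library `time` module is imported and the usual ts_mon `FloatMetric.set(value, fields)` call. The real methods are not part of this example.

  # Hedged sketch only: the actual __enter__/__exit__ are not shown above.
  def __enter__(self):
    self._start = time.time()
    return self

  def __exit__(self, exc_type, exc_value, traceback):
    self.seconds = time.time() - self._start
    if self._metric:
      # Report the measured wall time, keyed by the configured label.
      self._metric.set(self.seconds, {'label': self._label})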
Example #2
# requires the argument to have create_time and start_time.
add_build_scheduling_duration = _duration_adder(  # pragma: no branch
    'scheduling_durations', 'Duration between build creation and start',
    lambda b: _ts_delta_sec(b.create_time, b.start_time))

BUILD_COUNT_PROD = gae_ts_mon.GaugeMetric(
    _METRIC_PREFIX_PROD + 'count', 'Number of pending/running prod builds',
    _build_fields('bucket', 'builder', 'status'))
BUILD_COUNT_EXPERIMENTAL = gae_ts_mon.GaugeMetric(
    _METRIC_PREFIX_EXPERIMENTAL + 'count',
    'Number of pending/running experimental builds',
    _build_fields('bucket', 'builder', 'status'))

MAX_AGE_SCHEDULED = gae_ts_mon.FloatMetric(
    'buildbucket/builds/max_age_scheduled',
    'Age of the oldest SCHEDULED build.',
    (_build_fields('bucket', 'builder') +
     [gae_ts_mon.BooleanField('must_be_never_leased')]),
    units=gae_ts_mon.MetricsDataUnits.SECONDS)
SEQUENCE_NUMBER_GEN_DURATION_MS = gae_ts_mon.CumulativeDistributionMetric(
    'buildbucket/sequence_number/gen_duration',
    'Duration of a sequence number generation in ms',
    [gae_ts_mon.StringField('sequence')],
    # Bucketer for 1ms..5s range
    bucketer=BUCKETER_5_SEC,
    units=gae_ts_mon.MetricsDataUnits.MILLISECONDS)
TAG_INDEX_INCONSISTENT_ENTRIES = gae_ts_mon.NonCumulativeDistributionMetric(
    'buildbucket/tag_index/inconsistent_entries',
    'Number of inconsistent entries encountered during build search',
    [gae_ts_mon.StringField('tag')],
    # We can't have more than 1000 entries in a tag index.
    bucketer=BUCKETER_1K)
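`_build_fields` and `_ts_delta_sec` are helpers that this snippet references but does not define; the sketches below show plausible implementations. They are assumptions for illustration, not the actual buildbucket code.

# Hypothetical helper sketches; the real buildbucket helpers may differ.
def _build_fields(*names):
  # One StringField per requested build dimension.
  return [gae_ts_mon.StringField(name) for name in names]


def _ts_delta_sec(earlier, later):
  # Difference between two datetime values, in seconds.
  return (later - earlier).total_seconds()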
Example #3
    ],
    bucketer=_bucketer)


# Global metric. Target fields:
# - hostname = 'autogen:<executor_id>' (bot id).
# The status value must be 'ready', 'running', or, when the bot cannot run a
# job, anything else, possibly swarming-specific, e.g. 'quarantined' or
# 'dead'.
# Note that 'running' will report data as long as the job is running,
# so it is best to restrict data to status == 'pending'.
_jobs_max_pending_duration = gae_ts_mon.FloatMetric(
    'jobs/max_pending_duration',
    'Maximum pending seconds of pending jobs.', [
        gae_ts_mon.StringField('spec_name'),
        gae_ts_mon.StringField('project_id'),
        gae_ts_mon.StringField('subproject_id'),
        gae_ts_mon.StringField('pool'),
        gae_ts_mon.StringField('status'),
    ])


# Global metric. Metric fields:
# - busy = Whether or not the count is for machines that are busy.
# - machine_type = server.lease_management.MachineType.key.id().
_machine_types_actual_size = gae_ts_mon.GaugeMetric(
    'swarming/machine_types/actual_size',
    'Actual number of Machine Provider bots per MachineType.', [
        gae_ts_mon.BooleanField('busy'),
        gae_ts_mon.StringField('machine_type'),
    ])
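Given the field sets described in the comments above, reporting values could look like the following sketch. The field values and the surrounding code are made up; `set(value, fields=..., target_fields=...)` is the usual gae_ts_mon call for global metrics.

# Hypothetical reporting calls; all concrete values below are illustrative.
_jobs_max_pending_duration.set(
    123.4,
    fields={
        'spec_name': 'linux_rel',
        'project_id': 'chromium',
        'subproject_id': '',
        'pool': 'default',
        'status': 'pending',
    },
    target_fields={'hostname': 'autogen:bot-1'})

_machine_types_actual_size.set(
    42, fields={'busy': False, 'machine_type': 'example-machine-type'})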
Example #4
class ProcessIssue(webapp2.RequestHandler):
  time_since_first_flake = gae_ts_mon.FloatMetric(
      'flakiness_pipeline/time_since_first_flake',
      'The delay in seconds from the moment the first flake occurrence in '
      'this flakiness period happens until the time an issue is created to '
      'track it.',
      None)
  time_since_threshold_exceeded = gae_ts_mon.FloatMetric(
      'flakiness_pipeline/time_since_threshold_exceeded',
      'The delay in seconds from the moment of the flake occurrence that '
      'makes the flake exceed the threshold until the time an issue is '
      'created to track it.',
      None)
  issue_updates = gae_ts_mon.CounterMetric(
      'flakiness_pipeline/issue_updates',
      'Issues updated/created',
      [gae_ts_mon.StringField('operation')])

  @ndb.transactional
  def _get_flake_update_singleton_key(self):
    singleton_key = ndb.Key('FlakeUpdateSingleton', 'singleton')
    if not singleton_key.get():
      FlakeUpdateSingleton(key=singleton_key).put()
    return singleton_key

  @ndb.transactional
  def _increment_update_counter(self):
    FlakeUpdate(parent=self._get_flake_update_singleton_key()).put()

  @ndb.non_transactional
  def _time_difference(self, flaky_run):
    return (flaky_run.success_run.get().time_finished -
            flaky_run.failure_run_time_finished).total_seconds()

  @ndb.non_transactional
  def _is_same_day(self, flaky_run):
    time_since_finishing = (
        datetime.datetime.utcnow() - flaky_run.failure_run_time_finished)
    return time_since_finishing <= datetime.timedelta(days=1)

  @ndb.non_transactional
  def _get_new_flakes(self, flake):
    num_runs = len(flake.occurrences) - flake.num_reported_flaky_runs
    flaky_runs = ndb.get_multi(flake.occurrences[-num_runs:])
    flaky_runs = [run for run in flaky_runs if run is not None]
    return [
      flaky_run for flaky_run in flaky_runs
      if self._is_same_day(flaky_run) and
         self._time_difference(flaky_run) <= MAX_TIME_DIFFERENCE_SECONDS]

  @staticmethod
  @ndb.non_transactional
  def _find_flakiness_period_occurrences(flake):
    """Finds all occurrences in the current flakiness period."""
    assert flake.occurrences, 'Flake entity has no occurrences'
    flaky_runs = sorted([run for run in ndb.get_multi(flake.occurrences)
                             if run is not None],
                        key=lambda run: run.failure_run_time_finished)

    cur = flaky_runs[-1]
    for i, prev in enumerate(reversed(flaky_runs[:-1])):
      if (cur.failure_run_time_finished - prev.failure_run_time_finished >
          MAX_GAP_FOR_FLAKINESS_PERIOD):
        return flaky_runs[-i-1:]  # not including prev, but including cur
      cur = prev
    return flaky_runs

  @staticmethod
  def _get_time_threshold_exceeded(flakiness_period_occurrences):
    assert flakiness_period_occurrences, 'No occurrences in flakiness period'
    window = []
    for flaky_run in flakiness_period_occurrences:  # pragma: no cover
      window.append(flaky_run)

      # Remove flaky runs that happened more than a day before the latest run.
      flaky_run_finished = flaky_run.failure_run_time_finished
      window = [
          prev_run for prev_run in window
          if flaky_run_finished - prev_run.failure_run_time_finished <=
             datetime.timedelta(days=1)
      ]

      if len(window) >= MIN_REQUIRED_FLAKY_RUNS:
        return flaky_run.failure_run_time_finished

  @ndb.transactional
  def _recreate_issue_for_flake(self, flake):
    """Updates a flake to re-create an issue and creates a respective task."""
    flake.old_issue_id = flake.issue_id
    flake.issue_id = 0
    taskqueue.add(url='/issues/process/%s' % flake.key.urlsafe(),
                  queue_name='issue-updates', transactional=True)

  @staticmethod
  @ndb.non_transactional
  def _update_new_occurrences_with_issue_id(name, new_flaky_runs, issue_id):
    # TODO(sergiyb): Find a way to do this asynchronously to avoid blocking the
    # transaction-bound method calling this. Possible solutions are to use
    # put_multi_async (need to find a way to test this) or to use deferred
    # execution.
    for fr in new_flaky_runs:
      for occ in fr.flakes:
        if occ.failure == name:
          occ.issue_id = issue_id
    ndb.put_multi(new_flaky_runs)

  @staticmethod
  @ndb.non_transactional
  def _report_flakes_to_findit(flake, flaky_runs):
    try:
      findit.FindItAPI().flake(flake, flaky_runs)
    except (httplib.HTTPException, apiclient.errors.Error):
      logging.warning('Failed to send flakes to FindIt', exc_info=True)

  @staticmethod
  def follow_duplication_chain(api, starting_issue_id):
    """Finds last merged-into issue in the deduplication chain.

    Args:
      api: Issue Tracker API object.
      starting_issue_id: ID of the issue to start with.

    Returns:
      Issue object for the last issue in the chain (can be the same issue as
      passed in if it is not marked as Duplicate) or None if duplication loop is
      detected.
    """
    seen_issues = set()
    flake_issue = api.getIssue(starting_issue_id)
    # We need to check both status and merged_into, since it's possible to
    # create an issue with Duplicate status but without merged_into field set
    # and vice versa (see http://crbug.com/669054 and http://crbug.com/669056).
    while flake_issue.status == 'Duplicate' and flake_issue.merged_into:
      seen_issues.add(flake_issue.id)
      if flake_issue.merged_into in seen_issues:
        logging.info('Detected issue duplication loop: %s.', seen_issues)
        return None
      flake_issue = api.getIssue(flake_issue.merged_into)

    return flake_issue

  @ndb.transactional
  def _update_issue(self, api, flake, new_flakes, now):
    """Updates an issue on the issue tracker."""
    flake_issue = self.follow_duplication_chain(api, flake.issue_id)

    if flake_issue is None:
      # If the issue duplication loop was detected, we re-create the issue.
      self._recreate_issue_for_flake(flake)
      return

    if flake_issue.id != flake.issue_id:
      # Update the issue ID stored in datastore to avoid following the
      # deduplication chain next time.
      flake.issue_id = flake_issue.id

    if not flake_issue.open:
      # If the issue was closed, we do not update it. This allows changes made
      # to reduce flakiness to propagate and take effect. If after
      # DAYS_TO_REOPEN_ISSUE days we still detect flakiness, we will create a
      # new issue.
      recent_cutoff = now - datetime.timedelta(days=DAYS_TO_REOPEN_ISSUE)
      if flake_issue.updated < recent_cutoff:
        self._recreate_issue_for_flake(flake)
      return

    # Make sure the issue is in the appropriate bug queue while flakiness is
    # ongoing, since the sheriffs are supposed to disable flaky tests. For
    # steps, only return it to the queue if there is no owner on the bug.
    suffix = None
    queue_name, expected_label = get_queue_details(flake.name)
    if expected_label not in flake_issue.labels:
      if not flake.is_step or not flake_issue.owner:
        flake_issue.labels.append(expected_label)
        suffix = RETURN_TO_QUEUE_SUFFIX % queue_name

    new_flaky_runs_msg = FLAKY_RUNS_TEMPLATE % {
        'name': flake.name,
        'new_flakes_count': len(new_flakes),
        'flakes_url': FLAKES_URL_TEMPLATE % flake.key.urlsafe(),
        'suffix': ' %s' % suffix if suffix else ''}
    api.update(flake_issue, comment=new_flaky_runs_msg)
    self.issue_updates.increment_by(1, {'operation': 'update'})
    logging.info('Updated issue %d for flake %s with %d flake runs',
                 flake.issue_id, flake.name, len(new_flakes))
    self._update_new_occurrences_with_issue_id(
        flake.name, new_flakes, flake_issue.id)
    flake.num_reported_flaky_runs = len(flake.occurrences)
    flake.issue_last_updated = now

    self._report_flakes_to_findit(flake, new_flakes)

  @ndb.transactional
  def _create_issue(self, api, flake, new_flakes, now):
    _, qlabel = get_queue_details(flake.name)
    labels = ['Type-Bug', 'Pri-1', 'Via-TryFlakes', qlabel]
    if is_trooper_flake(flake.name):
      other_queue_msg = TROOPER_QUEUE_MSG
    else:
      other_queue_msg = SHERIFF_QUEUE_MSG

    summary = SUMMARY_TEMPLATE % {'name': flake.name}
    description = DESCRIPTION_TEMPLATE % {
        'summary': summary,
        'flakes_url': FLAKES_URL_TEMPLATE % flake.key.urlsafe(),
        'flakes_count': len(new_flakes),
        'other_queue_msg': other_queue_msg,
        'footer': CTF_CAN_FILE_BUGS_FOR_TESTS if flake.is_step
                                              else DESCRIPTION_TEST_FOOTER}
    if flake.old_issue_id:
      description = REOPENED_DESCRIPTION_TEMPLATE % {
          'description': description, 'old_issue': flake.old_issue_id}

    new_issue = issue.Issue({'summary': summary,
                             'description': description,
                             'status': 'Untriaged',
                             'labels': labels,
                             'components': ['Tests>Flaky']})
    flake_issue = api.create(new_issue)
    flake.issue_id = flake_issue.id
    self._update_new_occurrences_with_issue_id(
        flake.name, new_flakes, flake_issue.id)
    flake.num_reported_flaky_runs = len(flake.occurrences)
    flake.issue_last_updated = now
    self.issue_updates.increment_by(1, {'operation': 'create'})
    logging.info('Created a new issue %d for flake %s', flake.issue_id,
                 flake.name)

    self._report_flakes_to_findit(flake, new_flakes)

    # Find all flakes in the current flakiness period to compute metrics. The
    # flakiness period is a series of flakes with a gap no larger than
    # MAX_GAP_FOR_FLAKINESS_PERIOD seconds.
    period_flakes = self._find_flakiness_period_occurrences(flake)

    # Compute the delay since the first flake in the current flakiness period.
    time_since_first_flake = (
        now - period_flakes[0].failure_run_time_finished).total_seconds()
    self.time_since_first_flake.set(time_since_first_flake)
    logging.info('Reported time_since_first_flake %d for flake %s',
                 time_since_first_flake, flake.name)

    # Find the first flake that exceeded the threshold needed to create an
    # issue and report the delay from the moment that flake happened until we
    # actually created the issue.
    time_since_threshold_exceeded = (
        now - self._get_time_threshold_exceeded(period_flakes)).total_seconds()
    self.time_since_threshold_exceeded.set(time_since_threshold_exceeded)
    logging.info('Reported time_since_threshold_exceeded %d for flake %s',
                 time_since_threshold_exceeded, flake.name)


  @ndb.transactional(xg=True)  # pylint: disable=E1120
  def post(self, urlsafe_key):
    api = issue_tracker_api.IssueTrackerAPI('chromium')

    # Check if we should stop processing this issue because we've posted too
    # many updates to issue tracker today already.
    day_ago = datetime.datetime.utcnow() - datetime.timedelta(days=1)
    num_updates_last_day = FlakeUpdate.query(
        FlakeUpdate.time_updated > day_ago,
        ancestor=self._get_flake_update_singleton_key()).count()
    if num_updates_last_day >= MAX_UPDATED_ISSUES_PER_DAY:
      logging.info('Too many issues updated in the last 24 hours')
      return

    now = datetime.datetime.utcnow()
    flake = ndb.Key(urlsafe=urlsafe_key).get()
    logging.info('Processing %s', flake.key)

    # Only update/file issues if there are new flaky runs.
    if flake.num_reported_flaky_runs == len(flake.occurrences):
      logging.info(
          'No new flakes (reported %d, total %d)',
          flake.num_reported_flaky_runs, len(flake.occurrences))
      return

    # Retrieve flaky runs outside of the transaction, because we are not
    # planning to modify them and because there could be more of them than the
    # number of groups supported by cross-group transactions on AppEngine.
    new_flakes = self._get_new_flakes(flake)

    if len(new_flakes) < MIN_REQUIRED_FLAKY_RUNS:
      logging.info('Too few new flakes: %d', len(new_flakes))
      return

    if flake.issue_id > 0:
      # Update issues at most once a day.
      if flake.issue_last_updated > now - datetime.timedelta(days=1):
        logging.info('Issue was updated less than 24 hours ago')
        return

      self._update_issue(api, flake, new_flakes, now)
      self._increment_update_counter()
    else:
      self._create_issue(api, flake, new_flakes, now)
      # Don't update the issue just yet, this may fail, and we need the
      # transaction to succeed in order to avoid filing duplicate bugs.
      self._increment_update_counter()

    # Note that if the transaction fails for some reason at this point, we may
    # post updates or create issues multiple times. On the other hand, this
    # should be extremely rare because we set the number of concurrently
    # running tasks to 1, therefore there should be no contention for updating
    # this issue's entity.
    flake.put()
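`ProcessIssue.post()` takes the urlsafe flake key from the URL, matching the `taskqueue.add` call in `_recreate_issue_for_flake`. A hedged sketch of how the handler might be routed, assuming a standard webapp2 application (the actual route table is not part of this example):

# Hypothetical route registration; the real application setup is not shown.
app = webapp2.WSGIApplication([
    webapp2.Route(r'/issues/process/<urlsafe_key>', ProcessIssue),
])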
Example #5
jobs_completed = gae_ts_mon.CounterMetric(
    'jobs/completed',
    description='Number of completed jobs.')


jobs_durations = gae_ts_mon.CumulativeDistributionMetric(
    'jobs/durations', bucketer=_bucketer,
    description='Cycle times of completed jobs, in seconds.')


jobs_pending_durations = gae_ts_mon.NonCumulativeDistributionMetric(
    'jobs/pending_durations', bucketer=_bucketer,
    description='Pending times of active jobs, in seconds.')

jobs_max_pending_duration = gae_ts_mon.FloatMetric(
    'jobs/max_pending_duration',
    description='Maximum pending seconds of pending jobs.')

# Similar to jobs/completed and jobs/durations, but with a dedup field.
# - project_id: e.g. 'chromium'
# - subproject_id: e.g. 'blink'. Set to empty string if not used.
# - spec_name: name of a job specification, e.g. '<master>:<builder>:<test>'
#     for buildbot jobs.
# - deduped: boolean describing whether the job was deduped or not.
jobs_requested = gae_ts_mon.CounterMetric(
    'jobs/requested',
    description='Number of requested jobs over time.')


# Swarming-specific metric. Metric fields:
# - project_id: e.g. 'chromium'
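A hedged sketch of how the metrics declared above might be updated when a job completes or is polled. The values and call sites are assumptions; `increment()`, `add()`, and `set()` are the standard gae_ts_mon metric methods.

# Hypothetical update calls with made-up values; the metrics above are
# declared without metric fields, so no fields dict is passed.
cycle_time_sec = 42.0      # assumed: duration of a completed job
pending_time_sec = 3.5     # assumed: how long an active job has been pending

jobs_completed.increment()
jobs_durations.add(cycle_time_sec)
jobs_pending_durations.add(pending_time_sec)
jobs_max_pending_duration.set(pending_time_sec)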