Code Example #1
def FindMatchingWaterfallStep(build_step, test_name):
    """Finds the matching Waterfall step and checks whether it is supported.

  Only Swarmed and gtest-based steps are supported at the moment.

  Args:
    build_step (BuildStep): A build step on Waterfall or Commit Queue. It
        will be updated with the matching Waterfall step and whether it is
        Swarmed and supported.
    test_name (str): The name of the test.
  """

    build_step.swarmed = False
    build_step.supported = False

    http_client = HttpClientAppengine()

    if build_step.on_cq:
        wf_master_name, wf_builder_name, wf_build_number, wf_step_name, metadata = (
            _GetMatchingWaterfallBuildStep(build_step, http_client))

        build_step.wf_master_name = wf_master_name
        build_step.wf_builder_name = wf_builder_name
        build_step.wf_build_number = wf_build_number
        build_step.wf_step_name = wf_step_name

        if not build_step.has_matching_waterfall_step:
            return
    else:
        build_step.wf_master_name = build_step.master_name
        build_step.wf_builder_name = build_step.builder_name
        build_step.wf_build_number = build_step.build_number
        build_step.wf_step_name = build_step.step_name
        metadata = buildbot.GetStepLog(build_step.master_name,
                                       build_step.builder_name,
                                       build_step.build_number,
                                       build_step.step_name, http_client,
                                       'step_metadata')
        if not metadata:
            logging.error("Couldn't get step_metadata")
            return

    # Query Swarming for isolated data.
    build_step.swarmed = bool(metadata.get('swarm_task_ids'))

    if build_step.swarmed:
        # Retrieve a sample output from Isolate.
        task_id = metadata['swarm_task_ids'][0]
        output = swarming_util.GetIsolatedOutputForTask(task_id, http_client)
        if output:
            # Guess from the format.
            build_step.supported = (
                isinstance(output, dict)
                and isinstance(output.get('all_tests'), list)
                and test_name in output.get('all_tests', [])
                and isinstance(output.get('per_iteration_data'), list) and all(
                    isinstance(i, dict)
                    for i in output.get('per_iteration_data')))
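
A minimal usage sketch for the function above. It is a hypothetical caller, not part of the source: the BuildStep constructor arguments are assumed, and the master/builder/step names are placeholders.

# Hypothetical caller; all field values are placeholders for illustration.
build_step = BuildStep(master_name='tryserver.chromium.linux',
                       builder_name='linux_chromium_rel_ng',
                       build_number=123,
                       step_name='browser_tests (with patch)')
build_step.on_cq = True  # Treat this as a Commit Queue build step.

FindMatchingWaterfallStep(build_step, 'SuiteName.TestName')
if build_step.swarmed and build_step.supported:
    # The step maps to a Swarmed, gtest-based Waterfall step and can be
    # analyzed further.
    pass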
Code Example #2
    def run(self, failure_info):
        """Extracts failure signals from failed steps.

    Args:
      failure_info (dict): Output of pipeline DetectFirstFailurePipeline.run().

    Returns:
      A dict like below:
      {
        'step_name1': waterfall.failure_signal.FailureSignal.ToDict(),
        ...
      }
    """
        signals = {}
        if not failure_info['failed'] or not failure_info['chromium_revision']:
            # Bail out if no failed step or no chromium revision.
            return signals

        # Bail out on infra failures.
        if failure_info.get('failure_type') == failure_type.INFRA:
            return signals

        master_name = failure_info['master_name']
        builder_name = failure_info['builder_name']
        build_number = failure_info['build_number']

        for step_name in failure_info.get('failed_steps', []):
            if not waterfall_config.StepIsSupportedForMaster(
                    step_name, master_name):
                # Bail out if the step is not supported.
                continue

            step = WfStep.Get(master_name, builder_name, build_number,
                              step_name)
            if step and step.log_data:
                failure_log = step.log_data
            else:
                # TODO: Do test-level analysis instead of step-level analysis.
                # TODO: Use Swarming test results instead of archived gtest
                # results.
                gtest_result = buildbot.GetGtestResultLog(
                    master_name, builder_name, build_number, step_name)
                if gtest_result:
                    failure_log = _GetReliableTestFailureLog(gtest_result)

                if gtest_result is None or failure_log == 'invalid':
                    if not lock_util.WaitUntilDownloadAllowed(
                            master_name):  # pragma: no cover
                        raise pipeline.Retry(
                            'Failed to pull log of step %s of master %s' %
                            (step_name, master_name))
                    try:
                        failure_log = buildbot.GetStepLog(
                            master_name, builder_name, build_number, step_name,
                            self.HTTP_CLIENT)
                    except ResponseTooLargeError:  # pragma: no cover
                        logging.exception(
                            'Log of step "%s" is too large for urlfetch.',
                            step_name)
                        # If the stdio log of a step is too large, we don't
                        # want to pull it again in the next run, because that
                        # might lead to a DDoS on the master.
                        # TODO: Use archived stdio logs in Google Storage instead.
                        failure_log = 'Stdio log is too large for urlfetch.'

                    if not failure_log:  # pragma: no cover
                        raise pipeline.Retry(
                            'Failed to pull stdio of step %s of master %s' %
                            (step_name, master_name))

                # Save step log in datastore and avoid downloading again during retry.
                if not step:  # pragma: no cover
                    step = WfStep.Create(master_name, builder_name,
                                         build_number, step_name)

                step.log_data = _ExtractStorablePortionOfLog(failure_log)

                try:
                    step.put()
                except Exception as e:  # pragma: no cover
                    # Sometimes, the step log is too large to save in datastore.
                    logging.exception(e)

            # TODO: save result in datastore?
            if step.isolated:
                try:
                    json_failure_log = (json.loads(failure_log)
                                        if failure_log != 'flaky' else {})
                except ValueError:  # pragma: no cover
                    json_failure_log = {}
                    logging.warning('failure_log %s is not valid JSON.',
                                    failure_log)

                signals[step_name] = {'tests': {}}
                step_signal = FailureSignal()

                for test_name, test_failure_log in (
                        json_failure_log.iteritems()):
                    signals[step_name]['tests'][
                        test_name] = extractors.ExtractSignal(
                            master_name, builder_name, step_name, test_name,
                            base64.b64decode(test_failure_log)).ToDict()

                    # Save signals in test failure log to step level.
                    step_signal.MergeFrom(
                        signals[step_name]['tests'][test_name])

                signals[step_name]['files'] = step_signal.files
                signals[step_name]['keywords'] = step_signal.keywords
            else:
                signals[step_name] = extractors.ExtractSignal(
                    master_name, builder_name, step_name, None,
                    failure_log).ToDict()

        return signals
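
For orientation, a sketch of how the dict returned by run() might be consumed downstream. The consumer itself is hypothetical; it assumes FailureSignal.ToDict() renders 'files' as a mapping from suspected file path to line numbers, which the merging code above suggests.

# Hypothetical consumer of the signals dict produced by run() above.
for step_name, signal in signals.iteritems():  # Python 2, like the source.
    # 'tests' is present only for isolated gtest steps (see the branch on
    # step.isolated above).
    for file_path, line_numbers in signal.get('files', {}).iteritems():
        logging.info('%s: %s implicated at lines %s',
                     step_name, file_path, line_numbers)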
Code Example #3
  def _callback(self, callback_params, pipeline_id=None):
    """Updates the TryJobData entities with status from buildbucket."""
    # callback_params may have been serialized if the callback was converted to
    # a URL.
    if isinstance(callback_params, basestring):
      callback_params = json.loads(callback_params)

    self.last_params = callback_params

    _ = pipeline_id  # We do nothing with this id.

    try_job_id = callback_params['try_job_id']
    assert try_job_id

    urlsafe_try_job_key = callback_params['urlsafe_try_job_key']
    try_job_type = callback_params['try_job_type']
    deadline = callback_params['deadline']
    already_set_started = callback_params['already_set_started']
    error_count = callback_params['error_count']
    max_error_times = callback_params['max_error_times']
    default_pipeline_wait_seconds = callback_params[
        'default_pipeline_wait_seconds']
    timeout_hours = callback_params['timeout_hours']
    backoff_time = callback_params['backoff_time']

    if try_job_type == failure_type.FLAKY_TEST:
      try_job_data = FlakeTryJobData.Get(try_job_id)
    else:
      try_job_data = WfTryJobData.Get(try_job_id)

    error, build = buildbucket_client.GetTryJobs([try_job_id])[0]

    if error:
      if error_count < max_error_times:
        error_count += 1
        self.delay_callback(
            backoff_time,
            callback_params={
                'try_job_id': try_job_id,
                'try_job_type': try_job_type,
                'urlsafe_try_job_key': urlsafe_try_job_key,
                'deadline': deadline,
                'already_set_started': already_set_started,
                'error_count': error_count,
                'max_error_times': max_error_times,
                'default_pipeline_wait_seconds': default_pipeline_wait_seconds,
                'timeout_hours': timeout_hours,
                'backoff_time': backoff_time * 2,
            }
        )
        return
      else:  # pragma: no cover
        # Buildbucket has returned errors more than max_error_times times;
        # retry the whole pipeline.
        _UpdateTryJobMetadata(
            try_job_data, try_job_type, build, error, False)
        raise pipeline.Retry(
            'Error "%s" occurred. Reason: "%s"' % (error.message,
                                                   error.reason))
    elif build.status == BuildbucketBuild.COMPLETED:
      swarming_task_id = buildbot.GetSwarmingTaskIdFromUrl(
          build.url)

      if swarming_task_id:
        try:
          report = json.loads(swarming_util.GetStepLog(
              try_job_id, 'report', HttpClientAppengine(), 'report'))
        except (ValueError, TypeError) as e:  # pragma: no cover
          report = {}
          logging.exception(
              'Failed to load result report for swarming/%s '
              'due to exception %s.' % (swarming_task_id, e.message))
      else:
        try_job_master_name, try_job_builder_name, try_job_build_number = (
            buildbot.ParseBuildUrl(build.url))

        try:
          report = json.loads(buildbot.GetStepLog(
              try_job_master_name, try_job_builder_name, try_job_build_number,
              'report', HttpClientAppengine(), 'report'))
        except (ValueError, TypeError) as e:  # pragma: no cover
          report = {}
          logging.exception(
              'Failed to load result report for %s/%s/%s due to exception %s.'
              % (try_job_master_name, try_job_builder_name,
                 try_job_build_number, e.message))

      _UpdateTryJobMetadata(
          try_job_data, try_job_type, build, error, False,
          report if report else {})
      result_to_update = self._UpdateTryJobResult(
          urlsafe_try_job_key, try_job_type, try_job_id,
          build.url, BuildbucketBuild.COMPLETED, report)
      self.complete(result_to_update[-1])
      return
    else:
      error_count = 0
      backoff_time = default_pipeline_wait_seconds
      if build.status == BuildbucketBuild.STARTED and not (
          already_set_started):
        # It is possible this branch is skipped if a fast build goes from
        # 'SCHEDULED' to 'COMPLETED' between queries, so start_time may be
        # unavailable.
        start_time = time_util.MicrosecondsToDatetime(build.updated_time)
        self._UpdateTryJobResult(
            urlsafe_try_job_key, try_job_type, try_job_id,
            build.url, BuildbucketBuild.STARTED)

        already_set_started = True

        # Update as much try job metadata as soon as possible to avoid data
        # loss in case of errors.
        try_job_data.start_time = start_time
        try_job_data.request_time = (
            time_util.MicrosecondsToDatetime(build.request_time))
        try_job_data.try_job_url = build.url
        try_job_data.callback_url = self.get_callback_url(
            callback_params=json.dumps({
                'try_job_id': try_job_id,
                'try_job_type': try_job_type,
                'urlsafe_try_job_key': urlsafe_try_job_key,
                'deadline': deadline,
                'already_set_started': already_set_started,
                'error_count': error_count,
                'max_error_times': max_error_times,
                'default_pipeline_wait_seconds': default_pipeline_wait_seconds,
                'timeout_hours': timeout_hours,
                'backoff_time': backoff_time,
            })
        )
        try_job_data.put()

    if time.time() > deadline:  # pragma: no cover
      _UpdateTryJobMetadata(
          try_job_data, try_job_type, build, error, True)
      # Explicitly abort the whole pipeline.
      raise pipeline.Abort(
          'Try job %s timed out after %d hours.' % (
              try_job_id, timeout_hours))

    # Ensure last_buildbucket_response is always the most recent
    # whenever available during intermediate queries.
    _UpdateLastBuildbucketResponse(try_job_data, build)
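
The error branch above implements capped retries with exponential backoff between buildbucket polls. A standalone distillation of that pattern, with hypothetical names (not part of the source):

  def _handle_buildbucket_error(self, params):
    # Hypothetical sketch of the retry branch above: wait, double the
    # backoff, and give up after max_error_times failures.
    if params['error_count'] < params['max_error_times']:
      next_params = dict(params)
      next_params['error_count'] = params['error_count'] + 1
      next_params['backoff_time'] = params['backoff_time'] * 2
      # Wait backoff_time seconds before polling buildbucket again.
      self.delay_callback(params['backoff_time'], callback_params=next_params)
    else:
      # Give up on polling; let the pipeline framework retry from scratch.
      raise pipeline.Retry('Too many buildbucket errors.')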
Code Example #4
def testGetStepLogStdioIfNoStream(self, *_):
  self.assertEqual('log1/nlog2', buildbot.GetStepLog(
      self.master_name, self.builder_name, self.build_number, self.step_name,
      self.http_client))
Code Example #5
def testGetStepMetadataStreamNone(self, *_):
  step_metadata = buildbot.GetStepLog(
      self.master_name, self.builder_name, self.build_number, self.step_name,
      self.http_client, 'step_metadata')
  self.assertIsNone(step_metadata)
Code Example #6
def testGetStepMetadata(self, *_):
  step_metadata = buildbot.GetStepLog(
      self.master_name, self.builder_name, self.build_number, self.step_name,
      self.http_client, 'step_metadata')
  self.assertEqual(step_metadata, wf_testcase.SAMPLE_STEP_METADATA)
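
Read together, tests #4 through #6 exercise the two calling conventions of buildbot.GetStepLog: with no log name it returns the step's raw stdio text, and with a named log such as 'step_metadata' it returns a parsed object, or None when the stream is missing. A hedged illustration; the argument values are placeholders.

# Placeholder arguments for illustration only.
stdio_log = buildbot.GetStepLog(
    'chromium.linux', 'Linux Tests', 123, 'browser_tests', http_client)
# stdio_log: the step's raw stdio text (a str), per test #4.

step_metadata = buildbot.GetStepLog(
    'chromium.linux', 'Linux Tests', 123, 'browser_tests', http_client,
    'step_metadata')
# step_metadata: a parsed dict such as wf_testcase.SAMPLE_STEP_METADATA, or
# None when the log stream is unavailable, per tests #5 and #6.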
Code Example #7
def _GetMatchingWaterfallBuildStep(cq_build_step, http_client):
    """Returns the matching Waterfall build step of the given CQ one.

  Args:
    cq_build_step (BuildStep): A build step on Commit Queue.
    http_client (RetryHttpClient): A http client to send http requests.

  Returns:
      (master_name, builder_name, build_number, step_name, step_metadata)
    or
      None
  """
    no_matching_result = (None, None, None, None, None)

    # 0. Get step_metadata.
    step_metadata = buildbot.GetStepLog(cq_build_step.master_name,
                                        cq_build_step.builder_name,
                                        cq_build_step.build_number,
                                        cq_build_step.step_name, http_client,
                                        'step_metadata')
    if not step_metadata:
        logging.error("Couldn't get step_metadata")
        return no_matching_result

    # 1. Map a cq trybot to the matching waterfall buildbot:
    # get master_name and builder_name.
    wf_master_name = step_metadata.get('waterfall_mastername')
    wf_builder_name = step_metadata.get('waterfall_buildername')
    if not wf_master_name or not wf_builder_name:
        # Either waterfall_mastername or waterfall_buildername doesn't exist.
        logging.info('%s/%s has no matching Waterfall buildbot',
                     cq_build_step.master_name, cq_build_step.builder_name)
        return no_matching_result  # No matching Waterfall buildbot.

    # 2. Get "name" of the CQ trybot step.

    # Name of the step in the tags of a Swarming task.
    # Can't use step name, as cq one is with "(with patch)" while waterfall one
    # without.
    name = step_metadata.get('canonical_step_name')
    # The OS in which the test runs on. The same test binary might run on two
    # different OS platforms.
    os_name = step_metadata.get('dimensions', {}).get('os')
    if not name or not os_name:
        logging.error('Couldn\'t find name/os')
        return no_matching_result  # No name of the step.

    # TODO: cache and throttle QPS to the same master.
    # 3. Retrieve latest completed build cycle on the buildbot.
    builds = buildbot.GetRecentCompletedBuilds(wf_master_name, wf_builder_name,
                                               http_client)
    if not builds:
        logging.error("Couldn't find latest builds.")
        return no_matching_result  # No recent completed builds.

    # 4. Check whether there is matching step.
    tasks = swarming_util.ListSwarmingTasksDataByTags(wf_master_name,
                                                      wf_builder_name,
                                                      builds[0], http_client, {
                                                          'name': name,
                                                          'os': os_name
                                                      })
    if tasks:  # A matching Swarming task was found.
        wf_step_name = swarming_util.GetTagValue(tasks[0].get('tags', []),
                                                 'stepname')
        logging.info('%s/%s/%s is mapped to %s/%s/%s',
                     cq_build_step.master_name, cq_build_step.builder_name,
                     cq_build_step.step_name, wf_master_name, wf_builder_name,
                     wf_step_name)
        return (wf_master_name, wf_builder_name, builds[0], wf_step_name,
                step_metadata)

    return no_matching_result
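
A small sketch of calling the helper above; cq_build_step stands in for a real BuildStep from a Commit Queue run. Because the failure value is a tuple of five Nones rather than a bare None, unpacking is always safe.

# Hypothetical caller; cq_build_step is an assumed BuildStep instance.
http_client = HttpClientAppengine()
wf_master, wf_builder, wf_build, wf_step, step_metadata = (
    _GetMatchingWaterfallBuildStep(cq_build_step, http_client))
if wf_master is None:
    # No matching Waterfall step was found.
    logging.info('No Waterfall counterpart for %s', cq_build_step.step_name)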
Code Example #8
def ScheduleAnalysisIfNeeded(
        normalized_test,
        original_test,
        bug_id=None,
        allow_new_analysis=False,
        force=False,
        manually_triggered=False,
        user_email=None,
        triggering_source=triggering_sources.FINDIT_PIPELINE,
        queue_name=constants.DEFAULT_QUEUE):
    """Schedules an analysis if needed and returns the MasterFlakeAnalysis.

  When the build failure was already analyzed and a new analysis is scheduled,
  the returned WfAnalysis will still have the result of last completed analysis.

  Args:
    normalized_test (TestInfo): Info of the normalized flaky test after mapping
       a CQ trybot step to a Waterfall buildbot step, striping prefix "PRE_"
       from a gtest, etc.
    original_test (TestInfo): Info of the original flaky test.
    bug_id (int): The monorail bug id to update when analysis is done.
    allow_new_analysis (bool): Indicate whether a new analysis is allowed.
    force (bool): Indicate whether to force a rerun of current analysis.
    manually_triggered (bool): True if the analysis was requested manually,
      such as by a Chromium sheriff.
    user_email (str): The email of the user requesting the analysis.
    triggering_source (int): From where this analysis was triggered, such as
      through Findit pipeline, UI, or through Findit API.
    queue_name (str): The App Engine queue to run the analysis.

  Returns:
    A MasterFlakeAnalysis instance.
    None if no analysis was scheduled and the user has no permission to.
  """
    flake_settings = waterfall_config.GetCheckFlakeSettings()
    use_nearby_neighbor = flake_settings.get('swarming_rerun',
                                             {}).get('use_nearby_neighbor',
                                                     False)

    need_new_analysis, analysis = _NeedANewAnalysis(
        normalized_test,
        original_test,
        flake_settings,
        bug_id=bug_id,
        allow_new_analysis=allow_new_analysis,
        force=force,
        user_email=user_email,
        triggering_source=triggering_source)

    if need_new_analysis:
        # _NeedANewAnalysis just created master_flake_analysis. Use the latest
        # version number and pass that along to the other pipelines for updating
        # results and data.
        logging.info(
            'A new master flake analysis was successfully saved for %s (%s) and '
            'will be captured in version %s', repr(normalized_test),
            repr(original_test), analysis.version_number)

        step_metadata = buildbot.GetStepLog(normalized_test.master_name,
                                            normalized_test.builder_name,
                                            normalized_test.build_number,
                                            normalized_test.step_name,
                                            HttpClientAppengine(),
                                            'step_metadata')

        pipeline_job = RecursiveFlakePipeline(
            normalized_test.master_name,
            normalized_test.builder_name,
            normalized_test.build_number,
            normalized_test.step_name,
            normalized_test.test_name,
            analysis.version_number,
            triggering_build_number=normalized_test.build_number,
            step_metadata=step_metadata,
            manually_triggered=manually_triggered,
            use_nearby_neighbor=use_nearby_neighbor)
        pipeline_job.target = appengine_util.GetTargetNameForModule(
            constants.WATERFALL_BACKEND)
        pipeline_job.start(queue_name=queue_name)

    return analysis
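
A hedged invocation example for the function above. The TestInfo objects, bug id, and email are placeholders; only parameters from the signature above are used.

# Placeholder invocation; normalized_test/original_test are TestInfo objects.
analysis = ScheduleAnalysisIfNeeded(
    normalized_test,
    original_test,
    bug_id=654321,  # Placeholder monorail bug id.
    allow_new_analysis=True,
    manually_triggered=True,
    user_email='sheriff@example.com',
    triggering_source=triggering_sources.FINDIT_PIPELINE)
if analysis is None:
    # The user lacked permission, so no analysis was scheduled.
    pass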