The examples below all demonstrate the same App Engine idiom: build a pipeline or task-queue task, route it to a backend module with appengine_util.GetTargetNameForModule(...), and start it on a named queue.

Example #1
    def HandleGet(self):
        client_id = self.request.get('client_id', CrashClient.CRACAS)

        now = time_util.GetUTCNow()
        last_week = time_util.GetUTCNow() - timedelta(days=7)

        start_date, end_date = time_util.GetStartEndDates(
            self.request.get('start_date'),
            self.request.get('end_date'),
            default_start=last_week,
            default_end=now)

        publish_to_client = bool(self.request.get('publish'))
        count = 0
        for crash_keys in IterateCrashBatches(client_id, start_date, end_date):
            pipeline = RerunPipeline(client_id, crash_keys, publish_to_client)
            # Attribute defined outside __init__ - pylint: disable=W0201
            pipeline.target = appengine_util.GetTargetNameForModule(
                RERUN_SERVICE)
            pipeline.start(queue_name=RERUN_QUEUE)
            count += 1

        if count == 0:
            message = 'No rerun pipeline started.'
        else:
            message = '%d rerun pipeline(s) kicked off.' % count

        return {'data': {'message': message}}
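Every example in this listing repeats the routing idiom above. Here is a minimal, self-contained sketch of it; the Pipeline class and GetTargetNameForModule below are illustrative stand-ins, not the real pipeline or appengine_util APIs.

class Pipeline(object):
    """Stand-in for pipeline.Pipeline: records its routing target and queue."""

    def __init__(self, *args):
        self.args = args
        self.target = None  # Module routing target, assigned by the caller.

    def start(self, queue_name='default'):
        print('started %r on queue=%s, target=%s' % (
            self.args, queue_name, self.target))


def GetTargetNameForModule(module_name):
    # Stand-in: the real helper resolves the deployed version of the module
    # and returns a routing target such as '<version>.<module>'.
    return 'current-version.' + module_name


# The recurring idiom: construct, route to a backend module, then start.
job = Pipeline('some-analysis-input')
job.target = GetTargetNameForModule('waterfall-backend')
job.start(queue_name='default-queue')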
Example #2
 def _ContinueTryJobPipeline(self, pipeline_input, failure_info):
     master_name, builder_name, build_number = (
         pipeline_input.build_key.GetParts())
     heuristic_result = {
         'failure_info': failure_info,
         'heuristic_result': None
     }
     start_waterfall_try_job_inputs = StartTestTryJobInputs(
         build_key=BuildKey(master_name=master_name,
                            builder_name=builder_name,
                            build_number=build_number),
         build_completed=pipeline_input.build_completed,
         force=pipeline_input.force,
         heuristic_result=TestHeuristicAnalysisOutput.FromSerializable(
             heuristic_result),
         consistent_failures=CollectSwarmingTaskResultsOutputs.
         FromSerializable({}))
     try_job_pipeline = StartTestTryJobPipeline(
         start_waterfall_try_job_inputs)
     try_job_pipeline.target = appengine_util.GetTargetNameForModule(
         constants.WATERFALL_BACKEND)
     try_job_pipeline.start(queue_name=constants.WATERFALL_ANALYSIS_QUEUE)
     logging.info(
         'A try job pipeline for build %s, %s, %s starts after heuristic '
         'analysis was aborted. Check pipeline at: %s.', master_name,
         builder_name, build_number, self.pipeline_status_path)
Example #3
def AnalyzeRecentCommitPosition(analysis_urlsafe_key):
  """Schedules an analysis of a recent commit for a MasterFlakeAnalysis.

  Args:
    analysis_urlsafe_key (str): The url-safe key of the analysis for which to
      analyze a recent commit position.
  """
  analysis = ndb.Key(urlsafe=analysis_urlsafe_key).get()
  assert analysis, 'Analysis missing unexpectedly!'

  analyze_recent_flakiness_input = AnalyzeRecentFlakinessInput(
      analysis_urlsafe_key=analysis_urlsafe_key)

  if (analysis.status in [analysis_status.RUNNING, analysis_status.PENDING] or
      analysis.analyze_recent_flakiness_status == analysis_status.RUNNING):
    # Bail out if the analysis is still in progress.
    return

  pipeline_job = AnalyzeRecentFlakinessPipeline(analyze_recent_flakiness_input)
  pipeline_job.target = appengine_util.GetTargetNameForModule(
      constants.WATERFALL_BACKEND)
  pipeline_job.start(queue_name=constants.DEFAULT_QUEUE)

  analysis.Update(
      analyze_recent_flakiness_status=analysis_status.RUNNING,
      analyze_recent_flakiness_pipeline_status_path=(
          pipeline_job.pipeline_status_path))

  analysis.LogInfo(
      'An analysis of recent flakiness was scheduled with path {}'.format(
          pipeline_job.pipeline_status_path))
Example #4
  def delay_callback(self, countdown, callback_params, name=None):
    target = appengine_util.GetTargetNameForModule(constants.WATERFALL_BACKEND)
    task = self.get_callback_task(
        countdown=countdown, target=target,
        params={'callback_params': json.dumps(callback_params)},
        name=name)
    task.add(queue_name=constants.WATERFALL_ANALYSIS_QUEUE)
Example #5
def _AsyncProcessFlakeReport(flake_analysis_request, user_email, is_admin):
  """Pushes a task on the backend to process the flake report."""
  target = appengine_util.GetTargetNameForModule(constants.WATERFALL_BACKEND)
  payload = pickle.dumps((flake_analysis_request, user_email, is_admin))
  taskqueue.add(
      url=constants.WATERFALL_PROCESS_FLAKE_ANALYSIS_REQUEST_URL,
      payload=payload, target=target,
      queue_name=constants.WATERFALL_FLAKE_ANALYSIS_REQUEST_QUEUE)
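Example #5 serializes its arguments with pickle before enqueueing; the worker behind WATERFALL_PROCESS_FLAKE_ANALYSIS_REQUEST_URL presumably reverses the encoding. A small round-trip sketch with placeholder values:

import pickle

# Placeholder values; the real payload carries a FlakeAnalysisRequest entity.
payload = pickle.dumps(('flake_analysis_request', 'user@example.com', False))
flake_analysis_request, user_email, is_admin = pickle.loads(payload)
assert user_email == 'user@example.com'
assert is_admin is False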
Example #6
def _AsyncProcessFailureAnalysisRequests(builds):
  """Pushes a task on the backend to process requests of failure analysis."""
  target = appengine_util.GetTargetNameForModule(constants.WATERFALL_BACKEND)
  payload = json.dumps({'builds': builds})
  taskqueue.add(
      url=constants.WATERFALL_PROCESS_FAILURE_ANALYSIS_REQUESTS_URL,
      payload=payload, target=target,
      queue_name=constants.WATERFALL_FAILURE_ANALYSIS_REQUEST_QUEUE)
  # Needed for @Cached to work, but ignored by caller.
  return 'Only semantically None.'
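The odd trailing return in Example #6 exists so the decorated function yields a truthy value. A plausible reading (an assumption, not @Cached's real implementation) is that the caching decorator skips caching falsy results, so returning None would defeat the cache:

def Cached(func):
    cache = {}
    def wrapper(*args):
        key = repr(args)
        if key in cache:
            return cache[key]
        result = func(*args)
        if result:  # A falsy result would never be cached.
            cache[key] = result
        return result
    return wrapper

@Cached
def _Process(builds):
    print('processing %r' % (builds,))
    return 'Only semantically None.'

_Process(('b1',))
_Process(('b1',))  # Second call is served from the cache; nothing is printed.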
Example #7
def ScheduleAnalysisIfNeeded(master_name,
                             builder_name,
                             build_number,
                             failed_steps=None,
                             build_completed=False,
                             force=False,
                             queue_name=constants.DEFAULT_QUEUE):
    """Schedules an analysis if needed and returns the build analysis.

  When the build failure was already analyzed and a new analysis is scheduled,
  the returned WfAnalysis will still have the result of the last completed
  analysis.

  Args:
    master_name (str): The master name of the failed build.
    builder_name (str): The builder name of the failed build.
    build_number (int): The build number of the failed build.
    failed_steps (list): The names of all failed steps reported for the build.
    build_completed (bool): Indicates whether the build is completed.
    force (bool): If True, a fresh analysis will be triggered even when an
        old one was already completed; otherwise bail out.
    queue_name (str): The task queue to be used for pipeline tasks.

  Returns:
    A WfAnalysis instance.
  """
    if NeedANewAnalysis(master_name, builder_name, build_number, failed_steps,
                        build_completed, force):
        pipeline_job = AnalyzeBuildFailurePipeline(master_name, builder_name,
                                                   build_number,
                                                   build_completed, force)
        # Explicitly run analysis in the backend module "waterfall-backend".
        # Note: Just setting the target in queue.yaml does NOT work for pipeline
        # when deployed to App Engine, but it does work in dev-server locally.
        # A possible reason is that pipeline will pick a default target if none is
        # specified explicitly, and the default target is used rather than the one
        # in the queue.yaml file, but this contradicts the documentation in
        # https://cloud.google.com/appengine/docs/python/taskqueue/tasks#Task.
        pipeline_job.target = appengine_util.GetTargetNameForModule(
            constants.WATERFALL_BACKEND)
        pipeline_job.start(queue_name=queue_name)

        logging.info('An analysis was scheduled for build %s, %s, %s: %s',
                     master_name, builder_name, build_number,
                     pipeline_job.pipeline_status_path())
    else:
        logging.info('An analysis is not needed for build %s, %s, %s',
                     master_name, builder_name, build_number)

    return WfAnalysis.Get(master_name, builder_name, build_number)
Example #8
    def HandleGet(self):
        client_id = self.request.get('client_id', CrashClient.CRACAS)
        key = self.request.get('key')
        if not key:
            return self.CreateError(
                'Should provide key of the analysis to rerun.')

        pipeline = RerunPipeline(client_id, [key],
                                 publish_to_client=bool(
                                     self.request.get('publish')))
        # Attribute defined outside __init__ - pylint: disable=W0201
        pipeline.target = appengine_util.GetTargetNameForModule(RERUN_SERVICE)
        pipeline.start(queue_name=RERUN_QUEUE)

        return {'data': {'success': True}}
Example #9
def AsyncProcessFlakeReport(flake_analysis_request, user_email, is_admin):
    """Pushes a task on the backend to process the flake report."""
    if appengine_util.IsStaging():
        # Bails out for staging.
        logging.info(
            'Got flake_analysis_request for %s on staging. No flake '
            'analysis runs on staging.', flake_analysis_request.name)
        return

    target = appengine_util.GetTargetNameForModule(constants.WATERFALL_BACKEND)
    payload = pickle.dumps((flake_analysis_request, user_email, is_admin))
    taskqueue.add(url=constants.WATERFALL_PROCESS_FLAKE_ANALYSIS_REQUEST_URL,
                  payload=payload,
                  target=target,
                  queue_name=constants.WATERFALL_FLAKE_ANALYSIS_REQUEST_QUEUE)
Example #10
def _HandlePossibleFailuresInBuild(project, bucket, builder_name, build_id,
                                   build_result):  # pragma: no cover
    """Schedules a taskqueue task to process a completed failed build."""
    try:
        taskqueue.add(
            name='buildfailure-%s' % build_id,  # Avoid duplicate tasks.
            url='/findit/internal/v2/task/build-completed',
            payload=json.dumps({
                'project': project,
                'bucket': bucket,
                'builder_name': builder_name,
                'build_id': build_id,
                'build_result': build_result,
            }),
            target=appengine_util.GetTargetNameForModule('findit-backend'),
            queue_name='failure-detection-queue')
    except (taskqueue.TombstonedTaskError, taskqueue.TaskAlreadyExistsError):
        logging.warning('Build %s was already scheduled to be processed',
                        build_id)
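Example #10 leans on task names for de-duplication: enqueueing a second task with the same name raises, and the handler treats that as "already scheduled". A toy in-memory sketch of that contract (stand-ins, not the real google.appengine.api.taskqueue):

class TaskAlreadyExistsError(Exception):
    pass

_seen_task_names = set()

def add(name, **kwargs):
    # Mimics taskqueue.add's name-based de-duplication.
    if name in _seen_task_names:
        raise TaskAlreadyExistsError(name)
    _seen_task_names.add(name)
    print('enqueued %s' % name)

build_id = 8123456789  # Hypothetical buildbucket id.
try:
    add(name='buildfailure-%s' % build_id)
    add(name='buildfailure-%s' % build_id)  # Duplicate: raises.
except TaskAlreadyExistsError:
    print('Build %s was already scheduled to be processed' % build_id)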
Example #11
def StartAnalysis(json_crash_data):
    """Creates a pipeline object to perform the analysis, and start it.

  Args:
    client_id (CrashClient): Can be CrashClient.FRACAS, CrashClient.CRACAS or
      CrashClient.CLUSTERFUZZ.
    identifiers (dict): key value pairs to uniquely identify a crash.
    need_analysis (bool): Whether or not we should schedule
      CrashAnalysisPipeline.
  """
    # N.B., we cannot pass ``predator_client`` directly to the _pipeline_cls,
    # because it is not JSON-serializable (and there's no way to make it such,
    # since JSON-serializability is defined by JSON-encoders rather than
    # as methods on the objects being encoded).
    pipeline = crash_pipeline.CrashWrapperPipeline(json_crash_data)
    # Attribute defined outside __init__ - pylint: disable=W0201
    pipeline.target = appengine_util.GetTargetNameForModule(
        constants.CRASH_BACKEND[json_crash_data['client_id']])
    queue_name = constants.CRASH_ANALYSIS_QUEUE[json_crash_data['client_id']]
    pipeline.start(queue_name=queue_name)
Example #12
def StartNewAnalysis(client_id, identifiers):
  """Creates a pipeline object to perform the analysis, and starts it.

  Args:
    client_id (CrashClient): Can be CrashClient.FRACAS, CrashClient.CRACAS or
      CrashClient.CLUSTERFUZZ.
    identifiers (dict): key value pairs to uniquely identify a crash.
  """
  logging.info('New %s analysis is scheduled for %s',
               client_id, repr(identifiers))
  # N.B., we cannot pass ``findit_client`` directly to the _pipeline_cls,
  # because it is not JSON-serializable (and there's no way to make it such,
  # since JSON-serializability is defined by JSON-encoders rather than
  # as methods on the objects being encoded).
  pipeline = crash_pipeline.CrashWrapperPipeline(client_id, identifiers)
  # Attribute defined outside __init__ - pylint: disable=W0201
  pipeline.target = appengine_util.GetTargetNameForModule(
      constants.CRASH_BACKEND[client_id])
  queue_name = constants.CRASH_ANALYSIS_QUEUE[client_id]
  pipeline.start(queue_name=queue_name)
Example #13
def _EnqueueDetectFlakeByBuildTasks(build_id, flake_type_desc):
    """Enqueues a task to detect a type of flakes for the build in the row.

  Caches task names to deduplicate tasks for the same build and flake_type.
  """
    target = appengine_util.GetTargetNameForModule(
        constants.FLAKE_DETECTION_BACKEND)
    params = DetectFlakesFromFlakyCQBuildParam(
        build_id=build_id, flake_type_desc=flake_type_desc).ToSerializable()

    try:
        task_name = 'detect-flake-{}-{}'.format(
            build_id, flake_type_desc.replace(' ', '_'))
        taskqueue.add(name=task_name,
                      url=_DETECT_FLAKES_IN_BUILD_TASK_URL,
                      payload=json.dumps(params),
                      target=target,
                      queue_name=constants.FLAKE_DETECTION_MULTITASK_QUEUE)
        return task_name
    except (taskqueue.TombstonedTaskError, taskqueue.TaskAlreadyExistsError):
        logging.info('%s flakes of build %s were already checked.',
                     flake_type_desc, build_id)
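The .replace(' ', '_') in Example #13 matters because task names only allow a restricted character set (roughly letters, digits, hyphens, and underscores, per the taskqueue docs). A quick checker under that assumption:

import re

_TASK_NAME_RE = re.compile(r'^[a-zA-Z0-9_-]{1,500}$')

def IsValidTaskName(name):
    return bool(_TASK_NAME_RE.match(name))

assert IsValidTaskName('detect-flake-123-cq_false_rejection')
assert not IsValidTaskName('detect-flake-123-cq false rejection')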
Example #14
    def _ContinueTryJobPipeline(self, pipeline_input, failure_info, signals):
        heuristic_result = {
            'failure_info': failure_info,
            'signals': signals,
            'heuristic_result': None
        }
        start_compile_try_job_input = StartCompileTryJobInput(
            build_key=pipeline_input.build_key,
            heuristic_result=CompileHeuristicAnalysisOutput.FromSerializable(
                heuristic_result),
            build_completed=pipeline_input.build_completed,
            force=pipeline_input.force)
        try_job_pipeline = StartCompileTryJobPipeline(
            start_compile_try_job_input)
        try_job_pipeline.target = appengine_util.GetTargetNameForModule(
            constants.WATERFALL_BACKEND)
        try_job_pipeline.start(queue_name=constants.WATERFALL_ANALYSIS_QUEUE)

        master_name, builder_name, build_number = (
            pipeline_input.build_key.GetParts())
        logging.info(
            'A try job pipeline for build %s, %s, %s starts after heuristic '
            'analysis was aborted. Check pipeline at: %s.', master_name,
            builder_name, build_number, self.pipeline_status_path)
Example #15
  def HandlePost(self):
    # TODO(robertocn): Find out why one of these works for local testing, and
    # the other one for deploy-test-prod
    try:
      envelope = json.loads(self.request.body)
    except ValueError:
      envelope = json.loads(self.request.params.get('data'))
    try:
      token = envelope['message']['attributes']['auth_token']
      if token != GetVerificationToken():
        return {'return_code': 400}
      build_id = envelope['message']['attributes']['build_id']
      payload = base64.b64decode(envelope['message']['data'])
      # Expected payload format:
      # json.dumps({
      #   'build_id': '123412342130498',  # Buildbucket id
      #   'user_data': json.dumps({
      #       'Message-Type': 'BuildbucketStatusChange'}),
      #       # Plus any data from MakePubsubCallback
      #   })
      message = json.loads(payload)
      user_data = json.loads(message['user_data'])

      if user_data['Message-Type'] == 'BuildbucketStatusChange':
        for kind in ['WfTryJobData', 'FlakeTryJobData']:
          try_job_data = ndb.Key(kind, build_id).get()
          if not try_job_data:
            continue
          if try_job_data.callback_url:
            url = try_job_data.callback_url
            # TODO(robertocn): After a transitional period, all try_job_data
            # entities should have a target defined. Remove the or clause.
            target = try_job_data.callback_target or (
                appengine_util.GetTargetNameForModule(
                    constants.WATERFALL_BACKEND))
            taskqueue.add(method='GET', url=url, target=target,
                          queue_name=constants.WATERFALL_ANALYSIS_QUEUE)
            return {}
          else:
            logging.warning('The tryjob referenced by pubsub does not have an '
                            'associated pipeline callback url.')
            # We return 200 because we don't want pubsub to retry the push.
            return {}
        logging.warning('The build is not known by findit.')
        # We return 200 because we don't want pubsub to retry the push.
        return {}
      else:
        # We raise an exception instead of accepting the push because we might
        # be an older version (than the one that sent the new message type)
        raise Exception('Unsupported message type %s' % user_data[
            'Message-Type'])
    except KeyError:
      raise Exception('The message was not in the expected format: \n'
                      '{"message": {\n'
                      '  "attributes": {\n'
                      '    "auth_token": <valid_token>,\n'
                      '    "build_id": <Buildbucket id>,\n'
                      '  }\n'
                      '  "data": <serialization of {\n'
                      '    "build_id": <Buildbucket id>,  # Second copy.\n'
                      '    "user_data": <serialization of {\n'
                      '      "Message-Type": "BuildbucketStatusChange"\n'
                      '    }>\n'
                      '  }>\n'
                      '}}\n')
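Example #15 documents the expected message format inline. Here is a sketch that builds such an envelope with hypothetical values and then reverses the handler's decoding steps:

import base64
import json

user_data = json.dumps({'Message-Type': 'BuildbucketStatusChange'})
data = base64.b64encode(json.dumps({
    'build_id': '123412342130498',  # Hypothetical Buildbucket id.
    'user_data': user_data,
}).encode('utf-8'))
envelope = {
    'message': {
        'attributes': {
            'auth_token': 'valid-token',  # Hypothetical token.
            'build_id': '123412342130498',
        },
        'data': data,
    }
}

# The handler's decoding, in reverse order of the encoding above.
message = json.loads(base64.b64decode(envelope['message']['data']))
assert json.loads(message['user_data'])['Message-Type'] == (
    'BuildbucketStatusChange')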
Example #16
def ScheduleAnalysisIfNeeded(master_name,
                             builder_name,
                             build_number,
                             failed_steps=None,
                             build_completed=False,
                             force=False,
                             queue_name=constants.DEFAULT_QUEUE):
    """Schedules an analysis if needed and returns the build analysis.

  When the build failure was already analyzed and a new analysis is scheduled,
  the returned WfAnalysis will still have the result of the last completed
  analysis.

  Args:
    master_name (str): The master name of the failed build.
    builder_name (str): The builder name of the failed build.
    build_number (int): The build number of the failed build.
    failed_steps (list): The names of all failed steps reported for the build.
    build_completed (bool): Indicates whether the build is completed.
    force (bool): If True, a fresh analysis will be triggered even when an
        old one was already completed; otherwise bail out.
    queue_name (str): The task queue to be used for pipeline tasks.

  Returns:
    A WfAnalysis instance.
  """

    if NeedANewAnalysis(master_name, builder_name, build_number, failed_steps,
                        build_completed, force):
        failure_info, should_proceed = ci_failure.GetBuildFailureInfo(
            master_name, builder_name, build_number)
        if not should_proceed:
            return WfAnalysis.Get(master_name, builder_name, build_number)
        build_key = BuildKey(master_name=master_name,
                             builder_name=builder_name,
                             build_number=build_number)

        if failure_info['failure_type'] == failure_type.COMPILE:
            # Use new compile pipelines.
            # TODO(crbug/869684): Use a gauge metric to track intermittent statuses.
            compile_pipeline_input = AnalyzeCompileFailureInput(
                build_key=build_key,
                current_failure_info=CompileFailureInfo.FromSerializable(
                    failure_info),
                build_completed=build_completed,
                force=force)
            pipeline_job = AnalyzeCompileFailurePipeline(
                compile_pipeline_input)
        else:
            # TODO(crbug/869684): Use a gauge metric to track intermittent statuses.
            test_pipeline_input = AnalyzeTestFailureInput(
                build_key=build_key,
                current_failure_info=TestFailureInfo.FromSerializable(
                    failure_info),
                build_completed=build_completed,
                force=force)
            pipeline_job = AnalyzeTestFailurePipeline(test_pipeline_input)

        # Explicitly run analysis in the backend module "waterfall-backend".
        # Note: Just setting the target in queue.yaml does NOT work for pipeline
        # when deployed to App Engine, but it does work in dev-server locally.
        # A possible reason is that pipeline will pick a default target if none is
        # specified explicitly, and the default target is used rather than the one
        # in the queue.yaml file, but this contradicts the documentation in
        # https://cloud.google.com/appengine/docs/python/taskqueue/tasks#Task.
        pipeline_job.target = appengine_util.GetTargetNameForModule(
            constants.WATERFALL_BACKEND)
        pipeline_job.start(queue_name=queue_name)

        logging.info('An analysis was scheduled for build %s, %s, %s: %s',
                     master_name, builder_name, build_number,
                     pipeline_job.pipeline_status_path)
    else:
        logging.info('An analysis is not needed for build %s, %s, %s',
                     master_name, builder_name, build_number)

    return WfAnalysis.Get(master_name, builder_name, build_number)
Example #17
def ScheduleAnalysisIfNeeded(
        normalized_test,
        original_test,
        flake_key,
        bug_id=None,
        allow_new_analysis=False,
        force=False,
        manually_triggered=False,
        user_email=None,
        triggering_source=triggering_sources.FINDIT_PIPELINE,
        queue_name=constants.DEFAULT_QUEUE):
    """Schedules an analysis if needed and returns the MasterFlakeAnalysis.

  When the flaky test was already analyzed and a new analysis is scheduled,
  the returned MasterFlakeAnalysis will still have the result of the last
  completed analysis.

  Args:
    normalized_test (TestInfo): Info of the normalized flaky test after mapping
      a CQ trybot step to a Waterfall buildbot step, stripping the prefix
      "PRE_" from a gtest, etc.
    original_test (TestInfo): Info of the original flaky test.
    flake_key (ndb.Key): The key to the Flake responsible for triggering this
      analysis.
    bug_id (int): The monorail bug id to update when analysis is done.
    allow_new_analysis (bool): Indicate whether a new analysis is allowed.
    force (bool): Indicate whether to force a rerun of current analysis.
    manually_triggered (bool): True if the analysis was requested manually,
      such as by a Chromium sheriff.
    user_email (str): The email of the user requesting the analysis.
    triggering_source (int): From where this analysis was triggered, such as
      through Findit pipeline, UI, or through Findit API.
    queue_name (str): The App Engine queue to run the analysis.

  Returns:
    A MasterFlakeAnalysis instance.
    None if no analysis was scheduled and the user has no permission to.
  """
    need_new_analysis, analysis = _NeedANewAnalysis(
        normalized_test,
        original_test,
        flake_key,
        bug_id=bug_id,
        allow_new_analysis=allow_new_analysis,
        force=force,
        user_email=user_email,
        triggering_source=triggering_source)

    if need_new_analysis:
        # _NeedANewAnalysis just created master_flake_analysis. Use the latest
        # version number and pass that along to the other pipelines for updating
        # results and data.
        logging.info(
            'A new master flake analysis was successfully saved for %s (%s) and '
            'will be captured in version %s', repr(normalized_test),
            repr(original_test), analysis.version_number)

        step_metadata = (step_util.LegacyGetStepMetadata(
            normalized_test.master_name, normalized_test.builder_name,
            normalized_test.build_number,
            normalized_test.step_name) or step_util.LegacyGetStepMetadata(
                original_test.master_name, original_test.builder_name,
                original_test.build_number, original_test.step_name))

        logging.info('Initializing flake analysis pipeline for key: %s',
                     analysis.key)

        starting_build_info = build_util.GetBuildInfo(
            normalized_test.master_name, normalized_test.builder_name,
            normalized_test.build_number)

        original_build_info = build_util.GetBuildInfo(
            original_test.master_name, original_test.builder_name,
            original_test.build_number)

        assert starting_build_info, (
            'Failed to get starting build for flake analysis')
        starting_commit_position = starting_build_info.commit_position

        assert starting_commit_position is not None, (
            'Cannot analyze flake without a starting commit position')

        assert original_build_info, 'Failed to get original build info'

        # Get the dimensions of the bot for when try jobs are needed to compile.
        dimensions = try_job_service.GetDimensionsFromBuildInfo(
            starting_build_info)

        analyze_flake_input = AnalyzeFlakeInput(
            analysis_urlsafe_key=analysis.key.urlsafe(),
            analyze_commit_position_parameters=NextCommitPositionOutput(
                culprit_commit_id=None,
                next_commit_id=CommitID(
                    commit_position=starting_commit_position,
                    revision=starting_build_info.chromium_revision)),
            commit_position_range=IntRange(lower=None,
                                           upper=starting_commit_position),
            dimensions=ListOfBasestring.FromSerializable(dimensions),
            manually_triggered=manually_triggered,
            retries=0,
            rerun=force,
            step_metadata=StepMetadata.FromSerializable(step_metadata))

        pipeline_job = AnalyzeFlakePipeline(analyze_flake_input)

        pipeline_job.target = appengine_util.GetTargetNameForModule(
            constants.WATERFALL_BACKEND)
        pipeline_job.start(queue_name=queue_name)
        analysis.pipeline_status_path = pipeline_job.pipeline_status_path
        analysis.root_pipeline_id = pipeline_job.root_pipeline_id
        analysis.build_id = starting_build_info.buildbucket_id
        analysis.original_build_id = original_build_info.buildbucket_id
        analysis.put()
        analysis.LogInfo((
            'A flake analysis was scheduled using commit-based pipelines with '
            'path {}').format(pipeline_job.pipeline_status_path))
    else:
        logging.info('A flake analysis not necessary for build %s, %s, %s, %s',
                     normalized_test.master_name, normalized_test.builder_name,
                     normalized_test.build_number, normalized_test.step_name)

    return analysis
Example #18
  def run(self, master_name, builder_name, build_number, step_name,
          task_id=None, *args):
    """Monitors a swarming task.

    Args:
      master_name (str): The master name.
      builder_name (str): The builder name.
      build_number (str): The build number.
      step_name (str): The failed test step name.
      task_id (str): The task id to query the swarming server on the progress
        of a swarming task.
    """
    call_args = self._GetArgs(master_name, builder_name, build_number,
                              step_name, *args)
    task = self._GetSwarmingTask(*call_args)

    task_id = task_id or task.task_id

    if not task_id:
      # The swarming task encountered an error when being triggered.
      if not task.error:  # pragma: no branch
        task.error = {
            'error': 'Undetected error in swarming task. No task id found!',
            'message': 'Undetected error in swarming task. No task id found!'
        }
        task.put()
      return

    # Check the callback url to make this method idempotent on retry.
    if task.callback_url and self.pipeline_id in task.callback_url:
      return

    timeout_hours = waterfall_config.GetSwarmingSettings().get(
        'task_timeout_hours')
    deadline = time.time() + timeout_hours * 60 * 60
    server_query_interval_seconds = waterfall_config.GetSwarmingSettings().get(
        'server_query_interval_seconds')
    task_started = False
    task_completed = False
    step_name_no_platform = None

    if task_id.lower() in (NO_TASK, NO_TASK_EXCEPTION):  # pragma: no branch
      # This situation happens in flake analysis: if the step with the flaky
      # test didn't exist in the checked build, or the build hit an exception
      # so the step didn't run at all, we should skip the build.
      has_valid_artifact = task_id != NO_TASK_EXCEPTION
      task.task_id = None
      task.status = analysis_status.SKIPPED
      task.put()
      self._UpdateMasterFlakeAnalysis(
          *call_args, pass_rate=-1, flake_swarming_task=task,
          has_valid_artifact=has_valid_artifact)
      self.complete(self._GetPipelineResult(
          step_name, step_name_no_platform, task))
      return

    self.last_params = {
        'task_id': task_id,
        'step_name': step_name,
        'call_args': call_args,
        'deadline': deadline,
        'server_query_interval_seconds': server_query_interval_seconds,
        'task_started': task_started,
        'task_completed': task_completed,
        'step_name_no_platform': step_name_no_platform,
    }

    task.callback_url = self.get_callback_url(callback_params=json.dumps(
        self.last_params))
    task.callback_target = appengine_util.GetTargetNameForModule(
        constants.WATERFALL_BACKEND)
    task.put()

    # Guarantee one callback 10 minutes after the deadline to clean up even if
    # Swarming fails to call us back.
    self.delay_callback((timeout_hours * 60 + 10) * 60, self.last_params,
                        name=task_id + '_cleanup_task')

    # Run immediately in case the task already went from scheduled to started.
    self.callback(callback_params=self.last_params)
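The cleanup callback in Example #18 is deliberately scheduled past the deadline. For a hypothetical 24-hour timeout, the countdown works out to ten minutes after the deadline expires:

timeout_hours = 24  # Hypothetical value of the 'task_timeout_hours' setting.
deadline_seconds = timeout_hours * 60 * 60          # 86400
cleanup_countdown = (timeout_hours * 60 + 10) * 60  # 87000
assert cleanup_countdown - deadline_seconds == 10 * 60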
Example #19
    def run(self,
            master_name,
            builder_name,
            triggering_build_number,
            current_build_number,
            step_name,
            test_name,
            version_number,
            step_metadata=None,
            use_nearby_neighbor=False,
            manually_triggered=False):
        # Get MasterFlakeAnalysis success list corresponding to parameters.
        analysis = MasterFlakeAnalysis.GetVersion(master_name,
                                                  builder_name,
                                                  triggering_build_number,
                                                  step_name,
                                                  test_name,
                                                  version=version_number)

        flake_swarming_task = FlakeSwarmingTask.Get(master_name, builder_name,
                                                    current_build_number,
                                                    step_name, test_name)

        # Don't call another pipeline if we fail.
        if flake_swarming_task.status == analysis_status.ERROR:
            # Report the last flake swarming task's error that it encountered.
            # TODO(lijeffrey): Another neighboring swarming task may be needed in this
            # one's place instead of failing altogether.
            error = flake_swarming_task.error or {
                'error': 'Swarming task failed',
                'message':
                'The last swarming task did not complete as expected'
            }

            _UpdateAnalysisStatusUponCompletion(analysis, None,
                                                analysis_status.ERROR, error)
            logging.error('Error in Swarming task')
            yield UpdateFlakeBugPipeline(analysis.key.urlsafe())
            return

        if not analysis.algorithm_parameters:
            # Uses analysis' own algorithm_parameters.
            flake_settings = waterfall_config.GetCheckFlakeSettings()
            analysis.algorithm_parameters = flake_settings
            analysis.put()
        algorithm_settings = analysis.algorithm_parameters.get(
            'swarming_rerun')

        data_points = _NormalizeDataPoints(analysis.data_points)
        # Figure out what build_number to trigger a swarming rerun on next, if any.
        next_build_number, suspected_build, iterations_to_rerun = (
            lookback_algorithm.GetNextRunPointNumber(data_points,
                                                     algorithm_settings))
        if iterations_to_rerun:
            # Need to rerun the first build with more iterations.
            _UpdateIterationsToRerun(analysis, iterations_to_rerun)
            _RemoveRerunBuildDataPoint(analysis, next_build_number)
            analysis.put()

        max_build_numbers_to_look_back = algorithm_settings.get(
            'max_build_numbers_to_look_back', _DEFAULT_MAX_BUILD_NUMBERS)
        last_build_number = max(
            0, triggering_build_number - max_build_numbers_to_look_back)

        if ((next_build_number < last_build_number
             or next_build_number >= triggering_build_number)
                and not iterations_to_rerun):  # Finished.
            build_confidence_score = None
            if suspected_build is not None:
                # Use steppiness as the confidence score.
                build_confidence_score = confidence.SteppinessForBuild(
                    analysis.data_points, suspected_build)

            # Update suspected build and the confidence score.
            _UpdateAnalysisStatusUponCompletion(
                analysis,
                suspected_build,
                analysis_status.COMPLETED,
                None,
                build_confidence_score=build_confidence_score)

            if build_confidence_score is None:
                logging.info(
                    ('Skipping try jobs due to no suspected flake build being '
                     'identified'))
            elif not _HasSufficientConfidenceToRunTryJobs(analysis):
                logging.info(
                    ('Skipping try jobs due to insufficient confidence in '
                     'suspected build'))
            else:
                # Hook up with try-jobs. Based on analysis of historical data, 60%
                # confidence could filter out almost all false positives.
                suspected_build_point = analysis.GetDataPointOfSuspectedBuild()
                assert suspected_build_point

                blamed_cls, lower_bound = _GetFullBlamedCLsAndLowerBound(
                    suspected_build_point, analysis.data_points)

                if blamed_cls:
                    if len(blamed_cls) > 1:
                        logging.info(
                            'Running try-jobs against commits in regressions')
                        start_commit_position = suspected_build_point.commit_position - 1
                        start_revision = blamed_cls[start_commit_position]
                        build_info = build_util.GetBuildInfo(
                            master_name, builder_name, triggering_build_number)
                        parent_mastername = build_info.parent_mastername or master_name
                        parent_buildername = build_info.parent_buildername or builder_name
                        cache_name = swarming_util.GetCacheName(
                            parent_mastername, parent_buildername)
                        dimensions = waterfall_config.GetTrybotDimensions(
                            parent_mastername, parent_buildername)
                        yield RecursiveFlakeTryJobPipeline(
                            analysis.key.urlsafe(), start_commit_position,
                            start_revision, lower_bound, cache_name,
                            dimensions)
                        return  # No update to bug yet.
                    else:
                        logging.info(
                            'Single commit in the blame list of suspected build'
                        )
                        culprit_confidence_score = confidence.SteppinessForCommitPosition(
                            analysis.data_points,
                            suspected_build_point.commit_position)
                        culprit = recursive_flake_try_job_pipeline.CreateCulprit(
                            suspected_build_point.git_hash,
                            suspected_build_point.commit_position,
                            culprit_confidence_score)
                        UpdateAnalysisUponCompletion(analysis, culprit,
                                                     analysis_status.COMPLETED,
                                                     None)
                else:
                    logging.error(
                        'Cannot run flake try jobs against empty blame list')
                    error = {
                        'error': 'Could not start try jobs',
                        'message': 'Empty blame list'
                    }
                    UpdateAnalysisUponCompletion(analysis, None,
                                                 analysis_status.ERROR, error)

            yield UpdateFlakeBugPipeline(analysis.key.urlsafe())
            return

        pipeline_job = RecursiveFlakePipeline(
            master_name,
            builder_name,
            next_build_number,
            step_name,
            test_name,
            version_number,
            triggering_build_number,
            step_metadata=step_metadata,
            manually_triggered=manually_triggered,
            use_nearby_neighbor=use_nearby_neighbor,
            step_size=(current_build_number - next_build_number))
        # Disable attribute 'target' defined outside __init__ pylint warning,
        # because pipeline generates its own __init__ based on run function.
        pipeline_job.target = (  # pylint: disable=W0201
            appengine_util.GetTargetNameForModule(constants.WATERFALL_BACKEND))
        pipeline_job.start(
            queue_name=self.queue_name or constants.DEFAULT_QUEUE)
Example #20
    def run(self,
            master_name,
            builder_name,
            preferred_run_build_number,
            step_name,
            test_name,
            version_number,
            triggering_build_number,
            step_metadata=None,
            manually_triggered=False,
            use_nearby_neighbor=False,
            step_size=0,
            retries=0):
        """Pipeline to determine the regression range of a flaky test.

    Args:
      master_name (str): The master name.
      builder_name (str): The builder name.
      preferred_run_build_number (int): The build number the check flake
        algorithm should perform a swarming rerun on, but may be overridden to
        use the results of a nearby neighbor if use_nearby_neighbor is True.
      step_name (str): The step name.
      test_name (str): The test name.
      version_number (int): The version to save analysis results and data to.
      triggering_build_number (int): The build number that triggered this
        analysis.
      step_metadata (dict): Step_metadata for the test.
      manually_triggered (bool): True if the analysis is from manual request,
        like by a Chromium sheriff.
      use_nearby_neighbor (bool): Whether the optimization for using the
        swarming results of a nearby build number, if available, should be used
        in place of triggering a new swarming task on
        preferred_run_build_number.
      step_size (int): The difference in build numbers since the last call to
        RecursiveFlakePipeline to determine the bounds for how far a nearby
        build's swarming task results should be used. Only relevant if
        use_nearby_neighbor is True.
      retries (int): Number of retries of this pipeline. If retries exceeds
        _MAX_RETRY_TIMES, start this pipeline off peak hours.
    Returns:
      A dict of lists for reliable/flaky tests.
    """

        # If retries has not exceeded the max count, start the analysis only if
        # bots are available; past the max count, start it unconditionally.
        can_start_analysis = (self._BotsAvailableForTask(step_metadata)
                              if retries <= _MAX_RETRY_TIMES else True)

        if not can_start_analysis:
            retries += 1
            pipeline_job = RecursiveFlakePipeline(
                master_name,
                builder_name,
                preferred_run_build_number,
                step_name,
                test_name,
                version_number,
                triggering_build_number,
                step_metadata,
                manually_triggered=manually_triggered,
                use_nearby_neighbor=use_nearby_neighbor,
                step_size=step_size,
                retries=retries)
            # Disable attribute 'target' defined outside __init__ pylint warning,
            # because pipeline generates its own __init__ based on run function.
            pipeline_job.target = (  # pylint: disable=W0201
                appengine_util.GetTargetNameForModule(
                    constants.WATERFALL_BACKEND))

            if retries > _MAX_RETRY_TIMES:
                pipeline_job._StartOffPSTPeakHours(
                    queue_name=self.queue_name or constants.DEFAULT_QUEUE)
                logging.info(
                    'Retries exceeded max count; RecursiveFlakePipeline on '
                    'MasterFlakeAnalysis %s/%s/%s/%s/%s will start off peak '
                    'hours', master_name, builder_name, triggering_build_number,
                    step_name, test_name)
            else:
                pipeline_job._RetryWithDelay(
                    queue_name=self.queue_name or constants.DEFAULT_QUEUE)
                countdown = retries * _BASE_COUNT_DOWN_SECONDS
                logging.info(
                    'No available swarming bots; RecursiveFlakePipeline on '
                    'MasterFlakeAnalysis %s/%s/%s/%s/%s will be retried after '
                    '%d seconds', master_name, builder_name,
                    triggering_build_number, step_name, test_name, countdown)
        else:
            # Bots are available, or the pipeline is starting off peak hours;
            # trigger the task.
            flake_analysis = MasterFlakeAnalysis.GetVersion(
                master_name,
                builder_name,
                triggering_build_number,
                step_name,
                test_name,
                version=version_number)

            logging.info(
                'Running RecursiveFlakePipeline on MasterFlakeAnalysis'
                ' %s/%s/%s/%s/%s', master_name, builder_name,
                triggering_build_number, step_name, test_name)
            logging.info('MasterFlakeAnalysis %s version %s', flake_analysis,
                         version_number)

            if flake_analysis.status != analysis_status.RUNNING:  # pragma: no branch
                flake_analysis.status = analysis_status.RUNNING
                flake_analysis.start_time = time_util.GetUTCNow()
                flake_analysis.put()

            # TODO(lijeffrey): Allow custom parameters supplied by user.
            iterations = flake_analysis.algorithm_parameters.get(
                'swarming_rerun', {}).get('iterations_to_rerun', 100)
            hard_timeout_seconds = _GetHardTimeoutSeconds(
                master_name, builder_name, triggering_build_number, step_name,
                iterations)
            actual_run_build_number = _GetBestBuildNumberToRun(
                master_name, builder_name, preferred_run_build_number,
                step_name, test_name, step_size,
                iterations) if use_nearby_neighbor else (
                    preferred_run_build_number)
            # Call trigger pipeline (flake style).
            task_id = yield TriggerFlakeSwarmingTaskPipeline(
                master_name, builder_name, actual_run_build_number, step_name,
                [test_name], iterations, hard_timeout_seconds)

            with pipeline.InOrder():
                yield ProcessFlakeSwarmingTaskResultPipeline(
                    master_name, builder_name, actual_run_build_number,
                    step_name, task_id, triggering_build_number, test_name,
                    version_number)
                yield NextBuildNumberPipeline(
                    master_name,
                    builder_name,
                    triggering_build_number,
                    actual_run_build_number,
                    step_name,
                    test_name,
                    version_number,
                    step_metadata=step_metadata,
                    use_nearby_neighbor=use_nearby_neighbor,
                    manually_triggered=manually_triggered)
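Example #20 retries with a linear backoff: the Nth retry waits N * _BASE_COUNT_DOWN_SECONDS before requeueing. With a hypothetical base of 120 seconds:

_BASE_COUNT_DOWN_SECONDS = 120  # Hypothetical; the real constant isn't shown.
print([retries * _BASE_COUNT_DOWN_SECONDS for retries in (1, 2, 3)])
# -> [120, 240, 360]: each failed attempt waits a little longer than the last.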
Example #21
    def run(self, urlsafe_flake_analysis_key, urlsafe_try_job_key,
            lower_boundary_commit_position, cache_name, dimensions):
        """Determines the next commit position to run a try job on.

    Args:
      urlsafe_flake_analysis_key (str): The url-safe key to the corresponding
          flake analysis that triggered this pipeline.
      urlsafe_try_job_key (str): The url-safe key to the try job that was just
          run.
      lower_boundary_commit_position (int): The lower bound of commit position
          that can run a try job.
      cache_name (str): The name of the builder cache for the try job to use.
      dimensions (list): The dimensions of the bot that should run the try job.
    """
        flake_analysis = ndb.Key(urlsafe=urlsafe_flake_analysis_key).get()
        try_job = ndb.Key(urlsafe=urlsafe_try_job_key).get()
        assert flake_analysis
        assert try_job
        assert try_job.try_job_ids

        try_job_id = try_job.try_job_ids[-1]
        try_job_data = FlakeTryJobData.Get(try_job_id)

        # Don't call another pipeline if the previous try job failed.
        if try_job_data.error:
            UpdateAnalysisUponCompletion(flake_analysis, None,
                                         analysis_status.ERROR,
                                         try_job_data.error)
            yield UpdateFlakeBugPipeline(flake_analysis.key.urlsafe())
            return

        suspected_build_data_point = (
            flake_analysis.GetDataPointOfSuspectedBuild())

        # Because there are hard lower and upper bounds, only the data points
        # involved in try jobs should be considered when determining the next
        # commit position to test.
        try_job_data_points = _GetNormalizedTryJobDataPoints(flake_analysis)
        algorithm_settings = flake_analysis.algorithm_parameters.get(
            'try_job_rerun', {})

        # Figure out what commit position to trigger the next try job on, if any.
        next_commit_position, suspected_commit_position, _ = (
            lookback_algorithm.GetNextRunPointNumber(
                try_job_data_points, algorithm_settings,
                lower_boundary_commit_position))

        if suspected_commit_position is not None:  # Finished.
            confidence_score = confidence.SteppinessForCommitPosition(
                flake_analysis.data_points, suspected_commit_position)
            culprit_revision = suspected_build_data_point.GetRevisionAtCommitPosition(
                suspected_commit_position)
            culprit = CreateCulprit(culprit_revision,
                                    suspected_commit_position,
                                    confidence_score)
            UpdateAnalysisUponCompletion(flake_analysis, culprit,
                                         analysis_status.COMPLETED, None)

            yield UpdateFlakeBugPipeline(flake_analysis.key.urlsafe())
            return

        next_revision = suspected_build_data_point.GetRevisionAtCommitPosition(
            next_commit_position)

        pipeline_job = RecursiveFlakeTryJobPipeline(
            urlsafe_flake_analysis_key, next_commit_position, next_revision,
            lower_boundary_commit_position, cache_name, dimensions)
        # Disable attribute 'target' defined outside __init__ pylint warning,
        # because pipeline generates its own __init__ based on run function.
        pipeline_job.target = (  # pylint: disable=W0201
            appengine_util.GetTargetNameForModule(constants.WATERFALL_BACKEND))
        pipeline_job.start()
Example #22
  def run(self, urlsafe_try_job_key, try_job_type, try_job_id):
    """Monitors try job until it's complete.

    This method stores parameters in self so that the callback method can
    perform appropriate checks.

    callback(), defined below is expected to run when a pubsub notification from
    the buildbucket service is sent to this application indicating that the job
    has changed status.

    callback() is also run on two occasions separate from pubsub:
      - at the end of this run method (i.e. when creating this pipeline)
      - after timeout_hours have passed without the job completing.

    Args:
      urlsafe_try_job_key (str): The urlsafe key for the corresponding try job
        entity.
      try_job_type (str): The type of the try job.
      try_job_id (str): The try job id to query buildbucket with.
    """

    if not try_job_id:
      self.complete()
      return

    if try_job_type == failure_type.FLAKY_TEST:
      try_job_kind = FlakeTryJobData
    else:
      try_job_kind = WfTryJobData
    try_job_data = try_job_kind.Get(try_job_id)

    if not try_job_data:
      logging.error('%(kind)s entity does not exist for id %(id)s: creating it',
                    {'kind': try_job_kind, 'id': try_job_id})
      try_job_data = try_job_kind.Create(try_job_id)
      try_job_data.try_job_key = ndb.Key(urlsafe=urlsafe_try_job_key)

    # Check if callback url is already registered with the TryJobData entity to
    # guarantee this run method is idempotent when called again with the same
    # params.
    if try_job_data.callback_url and (
        self.pipeline_id in try_job_data.callback_url):
      return

    timeout_hours = waterfall_config.GetTryJobSettings().get(
        'job_timeout_hours')
    default_pipeline_wait_seconds = waterfall_config.GetTryJobSettings().get(
        'server_query_interval_seconds')
    max_error_times = waterfall_config.GetTryJobSettings().get(
        'allowed_response_error_times')

    # TODO(chanli): Make sure the total wait time equals timeout_hours,
    # regardless of retries.
    deadline = time.time() + timeout_hours * 60 * 60
    already_set_started = False
    backoff_time = default_pipeline_wait_seconds
    error_count = 0

    self.last_params = {
        'try_job_id': try_job_id,
        'try_job_type': try_job_type,
        'urlsafe_try_job_key': urlsafe_try_job_key,
        'deadline': deadline,
        'already_set_started': already_set_started,
        'error_count': error_count,
        'max_error_times': max_error_times,
        'default_pipeline_wait_seconds': default_pipeline_wait_seconds,
        'timeout_hours': timeout_hours,
        'backoff_time': backoff_time,
    }

    callback_url = self.get_callback_url(callback_params=json.dumps(
        self.last_params))

    try_job_data.callback_url = callback_url
    try_job_data.callback_target = appengine_util.GetTargetNameForModule(
        constants.WATERFALL_BACKEND)
    try_job_data.put()

    # Guarantee one callback 10 minutes after the deadline to clean up even if
    # buildbucket fails to call us back.
    self.delay_callback(
        (timeout_hours * 60 + 10) * 60,
        self.last_params,
        name=try_job_id + '_cleanup_task')

    # Run immediately in case the job already went from scheduled to started.
    self.callback(callback_params=self.last_params)
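Examples #18 and #22 guard against duplicate runs by checking whether their own pipeline_id is already baked into the stored callback URL. A toy illustration of that idempotency check, with hypothetical values:

pipeline_id = 'abc123'  # Hypothetical pipeline id.
stored_callback_url = '/callback?pipeline_id=abc123&params=...'

def AlreadyRegistered(callback_url, pid):
    # A rerun of the same pipeline finds its own id in the stored URL
    # and returns early instead of re-registering its callbacks.
    return bool(callback_url) and pid in callback_url

assert AlreadyRegistered(stored_callback_url, pipeline_id)
assert not AlreadyRegistered(stored_callback_url, 'def456')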
Example #23
def ScheduleAnalysisIfNeeded(
        normalized_test,
        original_test,
        bug_id=None,
        allow_new_analysis=False,
        force=False,
        manually_triggered=False,
        user_email=None,
        triggering_source=triggering_sources.FINDIT_PIPELINE,
        queue_name=constants.DEFAULT_QUEUE):
    """Schedules an analysis if needed and returns the MasterFlakeAnalysis.

  When the flaky test was already analyzed and a new analysis is scheduled,
  the returned MasterFlakeAnalysis will still have the result of the last
  completed analysis.

  Args:
    normalized_test (TestInfo): Info of the normalized flaky test after mapping
       a CQ trybot step to a Waterfall buildbot step, stripping the prefix
       "PRE_" from a gtest, etc.
    original_test (TestInfo): Info of the original flaky test.
    bug_id (int): The monorail bug id to update when analysis is done.
    allow_new_analysis (bool): Indicate whether a new analysis is allowed.
    force (bool): Indicate whether to force a rerun of current analysis.
    manually_triggered (bool): True if the analysis was requested manually,
      such as by a Chromium sheriff.
    user_email (str): The email of the user requesting the analysis.
    triggering_source (int): From where this analysis was triggered, such as
      through Findit pipeline, UI, or through Findit API.
    queue_name (str): The App Engine queue to run the analysis.

  Returns:
    A MasterFlakeAnalysis instance.
    None if no analysis was scheduled and the user has no permission to.
  """
    flake_settings = waterfall_config.GetCheckFlakeSettings()
    use_nearby_neighbor = flake_settings.get('swarming_rerun',
                                             {}).get('use_nearby_neighbor',
                                                     False)

    need_new_analysis, analysis = _NeedANewAnalysis(
        normalized_test,
        original_test,
        flake_settings,
        bug_id=bug_id,
        allow_new_analysis=allow_new_analysis,
        force=force,
        user_email=user_email,
        triggering_source=triggering_source)

    if need_new_analysis:
        # _NeedANewAnalysis just created master_flake_analysis. Use the latest
        # version number and pass that along to the other pipelines for updating
        # results and data.
        logging.info(
            'A new master flake analysis was successfully saved for %s (%s) and '
            'will be captured in version %s', repr(normalized_test),
            repr(original_test), analysis.version_number)

        step_metadata = buildbot.GetStepLog(normalized_test.master_name,
                                            normalized_test.builder_name,
                                            normalized_test.build_number,
                                            normalized_test.step_name,
                                            HttpClientAppengine(),
                                            'step_metadata')

        pipeline_job = RecursiveFlakePipeline(
            normalized_test.master_name,
            normalized_test.builder_name,
            normalized_test.build_number,
            normalized_test.step_name,
            normalized_test.test_name,
            analysis.version_number,
            triggering_build_number=normalized_test.build_number,
            step_metadata=step_metadata,
            manually_triggered=manually_triggered,
            use_nearby_neighbor=use_nearby_neighbor)
        pipeline_job.target = appengine_util.GetTargetNameForModule(
            constants.WATERFALL_BACKEND)
        pipeline_job.start(queue_name=queue_name)

    return analysis