def HandleGet(self):
  """Starts one rerun pipeline for every batch of crashes in a date range.

  Request parameters read:
    client_id: Client whose crashes are rerun; defaults to CrashClient.CRACAS.
    start_date, end_date: Handed to time_util.GetStartEndDates; the defaults
      passed along are "7 days ago" and "now".
    publish: Converted with bool() and forwarded to RerunPipeline.

  Returns:
    A dict of the form {'data': {'message': <str>}} saying how many rerun
    pipelines were started.
  """
  client_id = self.request.get('client_id', CrashClient.CRACAS)

  now = time_util.GetUTCNow()
  last_week = time_util.GetUTCNow() - timedelta(days=7)
  start_date, end_date = time_util.GetStartEndDates(
      self.request.get('start_date'),
      self.request.get('end_date'),
      default_start=last_week,
      default_end=now)
  publish_to_client = bool(self.request.get('publish'))

  started = 0
  for batch_keys in IterateCrashBatches(client_id, start_date, end_date):
    rerun_job = RerunPipeline(client_id, batch_keys, publish_to_client)
    # Attribute defined outside __init__ - pylint: disable=W0201
    rerun_job.target = appengine_util.GetTargetNameForModule(RERUN_SERVICE)
    rerun_job.start(queue_name=RERUN_QUEUE)
    started += 1

  if started:
    message = '%d rerun pipeline(s) kicked off.' % started
  else:
    message = 'No rerun pipeline started.'
  return {'data': {'message': message}}
def _ContinueTryJobPipeline(self, pipeline_input, failure_info):
  """Starts a test try job pipeline after heuristic analysis was aborted.

  Args:
    pipeline_input: Input of this pipeline. Its build_key, build_completed
      and force fields are forwarded to the try job pipeline.
    failure_info: Failure info wrapped into a heuristic result whose
      'heuristic_result' entry is None.
  """
  master_name, builder_name, build_number = (
      pipeline_input.build_key.GetParts())

  # No heuristic findings are available; only the failure info is carried on.
  aborted_heuristic_result = {
      'failure_info': failure_info,
      'heuristic_result': None
  }

  try_job_inputs = StartTestTryJobInputs(
      build_key=BuildKey(
          master_name=master_name,
          builder_name=builder_name,
          build_number=build_number),
      build_completed=pipeline_input.build_completed,
      force=pipeline_input.force,
      heuristic_result=TestHeuristicAnalysisOutput.FromSerializable(
          aborted_heuristic_result),
      consistent_failures=CollectSwarmingTaskResultsOutputs.FromSerializable(
          {}))

  job = StartTestTryJobPipeline(try_job_inputs)
  job.target = appengine_util.GetTargetNameForModule(
      constants.WATERFALL_BACKEND)
  job.start(queue_name=constants.WATERFALL_ANALYSIS_QUEUE)

  logging.info(
      'A try job pipeline for build %s, %s, %s starts after heuristic '
      'analysis was aborted. Check pipeline at: %s.', master_name,
      builder_name, build_number, self.pipeline_status_path)
def AnalyzeRecentCommitPosition(analysis_urlsafe_key):
  """Schedules a recent-flakiness analysis for a MasterFlakeAnalysis.

  Nothing is scheduled while the analysis itself is running or pending, or
  while a recent-flakiness analysis is already running.

  Args:
    analysis_urlsafe_key (str): The url-safe key of the analysis for which a
        recent commit position should be analyzed.
  """
  analysis = ndb.Key(urlsafe=analysis_urlsafe_key).get()
  assert analysis, 'Analysis missing unexpectedly!'

  pipeline_input = AnalyzeRecentFlakinessInput(
      analysis_urlsafe_key=analysis_urlsafe_key)

  # Bail out if the analysis is still in progress.
  if analysis.status in [analysis_status.RUNNING, analysis_status.PENDING]:
    return
  if analysis.analyze_recent_flakiness_status == analysis_status.RUNNING:
    return

  job = AnalyzeRecentFlakinessPipeline(pipeline_input)
  job.target = appengine_util.GetTargetNameForModule(
      constants.WATERFALL_BACKEND)
  job.start(queue_name=constants.DEFAULT_QUEUE)

  # Record on the analysis that the recent-flakiness run is now in flight.
  analysis.Update(
      analyze_recent_flakiness_status=analysis_status.RUNNING,
      analyze_recent_flakiness_pipeline_status_path=(
          job.pipeline_status_path))
  analysis.LogInfo(
      'An analysis of recent flakiness was scheduled with path {}'.format(
          job.pipeline_status_path))
def delay_callback(self, countdown, callback_params, name=None):
  """Enqueues a delayed callback task targeting the waterfall backend.

  Args:
    countdown: Delay forwarded to get_callback_task (presumably seconds --
      confirm against get_callback_task).
    callback_params: JSON-serializable value; sent JSON-encoded under the
      'callback_params' key of the task params.
    name: Optional task name forwarded to get_callback_task.
  """
  backend_target = appengine_util.GetTargetNameForModule(
      constants.WATERFALL_BACKEND)
  task_params = {'callback_params': json.dumps(callback_params)}
  callback_task = self.get_callback_task(
      countdown=countdown,
      target=backend_target,
      params=task_params,
      name=name)
  callback_task.add(queue_name=constants.WATERFALL_ANALYSIS_QUEUE)
def _AsyncProcessFlakeReport(flake_analysis_request, user_email, is_admin):
  """Enqueues a backend task that processes a flake report.

  The three arguments are pickled together as a tuple and sent as the task
  payload.

  Args:
    flake_analysis_request: The flake analysis request to process.
    user_email: Email of the requesting user.
    is_admin: Whether the requesting user is an admin.
  """
  backend_target = appengine_util.GetTargetNameForModule(
      constants.WATERFALL_BACKEND)
  pickled_request = pickle.dumps(
      (flake_analysis_request, user_email, is_admin))
  taskqueue.add(
      url=constants.WATERFALL_PROCESS_FLAKE_ANALYSIS_REQUEST_URL,
      payload=pickled_request,
      target=backend_target,
      queue_name=constants.WATERFALL_FLAKE_ANALYSIS_REQUEST_QUEUE)
def _AsyncProcessFailureAnalysisRequests(builds):
  """Enqueues a backend task that processes failure analysis requests.

  Args:
    builds: JSON-serializable description of the builds; sent JSON-encoded
      under the 'builds' key of the task payload.

  Returns:
    A fixed placeholder string. It exists only so that @Cached has a value to
    work with; callers ignore it.
  """
  backend_target = appengine_util.GetTargetNameForModule(
      constants.WATERFALL_BACKEND)
  request_body = json.dumps({'builds': builds})
  taskqueue.add(
      url=constants.WATERFALL_PROCESS_FAILURE_ANALYSIS_REQUESTS_URL,
      payload=request_body,
      target=backend_target,
      queue_name=constants.WATERFALL_FAILURE_ANALYSIS_REQUEST_QUEUE)

  # Needed for @Cached to work, but ignored by caller.
  return 'Only semantically None.'
def ScheduleAnalysisIfNeeded(master_name,
                             builder_name,
                             build_number,
                             failed_steps=None,
                             build_completed=False,
                             force=False,
                             queue_name=constants.DEFAULT_QUEUE):
  """Schedules a build failure analysis if needed and returns the analysis.

  If the failure was analyzed before and a new analysis gets scheduled, the
  returned WfAnalysis still carries the result of the last completed analysis.

  Args:
    master_name (str): The master name of the failed build.
    builder_name (str): The builder name of the failed build.
    build_number (int): The build number of the failed build.
    failed_steps (list): The names of all failed steps reported for the build.
    build_completed (bool): Indicate whether the build is completed.
    force (bool): If True, a fresh new analysis will be triggered even when an
        old one was completed already; otherwise bail out.
    queue_name (str): The task queue to be used for pipeline tasks.

  Returns:
    A WfAnalysis instance.
  """
  if not NeedANewAnalysis(master_name, builder_name, build_number,
                          failed_steps, build_completed, force):
    logging.info('An analysis is not needed for build %s, %s, %s',
                 master_name, builder_name, build_number)
    return WfAnalysis.Get(master_name, builder_name, build_number)

  analysis_job = AnalyzeBuildFailurePipeline(
      master_name, builder_name, build_number, build_completed, force)

  # Pin the pipeline to the backend module "waterfall-backend" explicitly.
  # Note: Relying on the target in queue.yaml does NOT work for pipeline when
  # deployed to App Engine, although it does work in dev-server locally.
  # A possible reason is that pipeline picks a default target when none is
  # given explicitly, and that default wins over the one in queue.yaml, but
  # this contradicts the documentation in
  # https://cloud.google.com/appengine/docs/python/taskqueue/tasks#Task.
  analysis_job.target = appengine_util.GetTargetNameForModule(
      constants.WATERFALL_BACKEND)
  analysis_job.start(queue_name=queue_name)

  logging.info('An analysis was scheduled for build %s, %s, %s: %s',
               master_name, builder_name, build_number,
               analysis_job.pipeline_status_path())

  return WfAnalysis.Get(master_name, builder_name, build_number)
def HandleGet(self):
  """Starts a rerun pipeline for the single analysis named by 'key'.

  Request parameters read:
    client_id: Defaults to CrashClient.CRACAS.
    key: Key of the analysis to rerun; required.
    publish: Converted with bool() and forwarded as publish_to_client.

  Returns:
    The result of self.CreateError(...) when 'key' is missing, otherwise
    {'data': {'success': True}}.
  """
  client_id = self.request.get('client_id', CrashClient.CRACAS)

  key = self.request.get('key')
  if not key:
    return self.CreateError('Should provide key of the analysis to rerun.')

  publish = bool(self.request.get('publish'))
  rerun_job = RerunPipeline(client_id, [key], publish_to_client=publish)
  # Attribute defined outside __init__ - pylint: disable=W0201
  rerun_job.target = appengine_util.GetTargetNameForModule(RERUN_SERVICE)
  rerun_job.start(queue_name=RERUN_QUEUE)

  return {'data': {'success': True}}
def AsyncProcessFlakeReport(flake_analysis_request, user_email, is_admin):
  """Enqueues a backend task that processes a flake report.

  On staging nothing is enqueued; the request is only logged.

  Args:
    flake_analysis_request: The flake analysis request; its ``name`` is
      logged when running on staging.
    user_email: Email of the requesting user.
    is_admin: Whether the requesting user is an admin.
  """
  if appengine_util.IsStaging():
    # No flake analysis is run on staging, so drop the request here.
    logging.info(
        'Got flake_analysis_request for %s on staging. No flake '
        'analysis runs on staging.', flake_analysis_request.name)
    return

  backend_target = appengine_util.GetTargetNameForModule(
      constants.WATERFALL_BACKEND)
  # The three arguments travel together as one pickled tuple.
  pickled_request = pickle.dumps(
      (flake_analysis_request, user_email, is_admin))
  taskqueue.add(
      url=constants.WATERFALL_PROCESS_FLAKE_ANALYSIS_REQUEST_URL,
      payload=pickled_request,
      target=backend_target,
      queue_name=constants.WATERFALL_FLAKE_ANALYSIS_REQUEST_QUEUE)
def _HandlePossibleFailuresInBuild(project, bucket, builder_name, build_id,
                                   build_result):  # pragma: no cover
  """Enqueues a task to process a completed failed build.

  The task is named after the build id, so a second attempt for the same
  build is rejected by the task queue; that rejection is logged as a warning
  instead of being raised.

  Args:
    project: Sent in the payload under 'project'.
    bucket: Sent in the payload under 'bucket'.
    builder_name: Sent in the payload under 'builder_name'.
    build_id: Used for the task name and sent under 'build_id'.
    build_result: Sent in the payload under 'build_result'.
  """
  # Naming the task after the build avoids duplicate tasks.
  task_name = 'buildfailure-%s' % build_id
  task_payload = json.dumps({
      'project': project,
      'bucket': bucket,
      'builder_name': builder_name,
      'build_id': build_id,
      'build_result': build_result,
  })
  duplicate_task_errors = (taskqueue.TombstonedTaskError,
                           taskqueue.TaskAlreadyExistsError)

  try:
    taskqueue.add(
        name=task_name,
        url='/findit/internal/v2/task/build-completed',
        payload=task_payload,
        target=appengine_util.GetTargetNameForModule('findit-backend'),
        queue_name='failure-detection-queue')
  except duplicate_task_errors:
    logging.warning('Build %s was already scheduled to be processed',
                    build_id)
def StartAnalysis(json_crash_data):
  """Creates a crash analysis pipeline for the given crash and starts it.

  Args:
    json_crash_data (dict): Crash data handed to CrashWrapperPipeline. Its
        'client_id' entry selects both the backend module and the task queue
        the pipeline runs on.
  """
  # N.B., ``predator_client`` cannot be passed directly to the _pipeline_cls,
  # because it is not JSON-serializable (and there's no way to make it such,
  # since JSON-serializability is defined by JSON-encoders rather than
  # as methods on the objects being encoded).
  analysis_job = crash_pipeline.CrashWrapperPipeline(json_crash_data)

  backend = constants.CRASH_BACKEND[json_crash_data['client_id']]
  # Attribute defined outside __init__ - pylint: disable=W0201
  analysis_job.target = appengine_util.GetTargetNameForModule(backend)

  analysis_queue = constants.CRASH_ANALYSIS_QUEUE[
      json_crash_data['client_id']]
  analysis_job.start(queue_name=analysis_queue)
def StartNewAnalysis(client_id, identifiers):
  """Creates a crash analysis pipeline for the given crash and starts it.

  Args:
    client_id (CrashClient): Can be CrashClient.FRACAS, CrashClient.CRACAS or
        CrashClient.CLUSTERFUZZ. Selects the backend module and task queue.
    identifiers (dict): key value pairs to uniquely identify a crash.
  """
  logging.info('New %s analysis is scheduled for %s', client_id,
               repr(identifiers))

  # N.B., ``findit_client`` cannot be passed directly to the _pipeline_cls,
  # because it is not JSON-serializable (and there's no way to make it such,
  # since JSON-serializability is defined by JSON-encoders rather than
  # as methods on the objects being encoded).
  analysis_job = crash_pipeline.CrashWrapperPipeline(client_id, identifiers)

  backend = constants.CRASH_BACKEND[client_id]
  # Attribute defined outside __init__ - pylint: disable=W0201
  analysis_job.target = appengine_util.GetTargetNameForModule(backend)

  analysis_queue = constants.CRASH_ANALYSIS_QUEUE[client_id]
  analysis_job.start(queue_name=analysis_queue)
def _EnqueueDetectFlakeByBuildTasks(build_id, flake_type_desc):
  """Enqueues a task to detect one type of flakes for a build.

  The task name is derived from the build id and the flake type, so the task
  queue rejects a second task for the same pair.

  Args:
    build_id: Id of the build to check.
    flake_type_desc (str): Description of the flake type; spaces are replaced
        with underscores when building the task name.

  Returns:
    The task name when the task was enqueued, or None when a task with that
    name already existed.
  """
  backend_target = appengine_util.GetTargetNameForModule(
      constants.FLAKE_DETECTION_BACKEND)
  serialized_params = DetectFlakesFromFlakyCQBuildParam(
      build_id=build_id, flake_type_desc=flake_type_desc).ToSerializable()
  task_name = 'detect-flake-{}-{}'.format(
      build_id, flake_type_desc.replace(' ', '_'))

  try:
    taskqueue.add(
        name=task_name,
        url=_DETECT_FLAKES_IN_BUILD_TASK_URL,
        payload=json.dumps(serialized_params),
        target=backend_target,
        queue_name=constants.FLAKE_DETECTION_MULTITASK_QUEUE)
  except (taskqueue.TombstonedTaskError, taskqueue.TaskAlreadyExistsError):
    logging.info('%s flakes of build %s was already checked.',
                 flake_type_desc, build_id)
  else:
    return task_name
def _ContinueTryJobPipeline(self, pipeline_input, failure_info, signals):
  """Starts a compile try job pipeline after heuristic analysis was aborted.

  Args:
    pipeline_input: Input of this pipeline. Its build_key, build_completed
      and force fields are forwarded to the try job pipeline.
    failure_info: Failure info wrapped into the heuristic result.
    signals: Signals wrapped into the heuristic result.
  """
  # No heuristic findings are available; only the inputs are carried on.
  aborted_heuristic_result = {
      'failure_info': failure_info,
      'signals': signals,
      'heuristic_result': None
  }

  try_job_input = StartCompileTryJobInput(
      build_key=pipeline_input.build_key,
      heuristic_result=CompileHeuristicAnalysisOutput.FromSerializable(
          aborted_heuristic_result),
      build_completed=pipeline_input.build_completed,
      force=pipeline_input.force)

  job = StartCompileTryJobPipeline(try_job_input)
  job.target = appengine_util.GetTargetNameForModule(
      constants.WATERFALL_BACKEND)
  job.start(queue_name=constants.WATERFALL_ANALYSIS_QUEUE)

  master_name, builder_name, build_number = (
      pipeline_input.build_key.GetParts())
  logging.info(
      'A try job pipeline for build %s, %s, %s starts after heuristic '
      'analysis was aborted. Check pipeline at: %s.', master_name,
      builder_name, build_number, self.pipeline_status_path)
def HandlePost(self):
  """Handles a pubsub push about a Buildbucket try job status change.

  The push envelope is read from the request body, or from the 'data' request
  parameter when the body is not valid JSON. When the try job is known and
  has a callback url, a GET task for that url is enqueued.

  Returns:
    {'return_code': 400} when the auth token does not match; otherwise an
    empty dict, so that pubsub does not retry the push.

  Raises:
    Exception: If the message type is unsupported, or if a KeyError occurs
      while handling the message (reported as a malformed message).
  """
  # TODO(robertocn): Find out why one of these works for local testing, and
  # the other one for deploy-test-prod
  try:
    envelope = json.loads(self.request.body)
  except ValueError:
    envelope = json.loads(self.request.params.get('data'))

  try:
    if envelope['message']['attributes']['auth_token'] != (
        GetVerificationToken()):
      return {'return_code': 400}

    build_id = envelope['message']['attributes']['build_id']
    # Expected payload format:
    # json.dumps({
    #   'build_id': '123412342130498',  # Buildbucket id
    #   'user_data': json.dumps({
    #       'Message-Type': 'BuildbucketStatusChange'}),
    #   # Plus any data from MakePubsubCallback
    # })
    message = json.loads(base64.b64decode(envelope['message']['data']))
    user_data = json.loads(message['user_data'])

    if user_data['Message-Type'] != 'BuildbucketStatusChange':
      # Raise instead of accepting the push, because this might be an older
      # version than the one that sent the new message type.
      raise Exception(
          'Unsupported message type %s' % user_data['Message-Type'])

    for kind in ['WfTryJobData', 'FlakeTryJobData']:
      try_job_data = ndb.Key(kind, build_id).get()
      if not try_job_data:
        continue

      if not try_job_data.callback_url:
        logging.warning('The tryjob referenced by pubsub does not have an '
                        'associated pipeline callback url.')
        # Return 200 because pubsub should not retry the push.
        return {}

      # TODO(robertocn): After a transitional period, all try_job_data
      # entities should have a target defined. Remove the or clause.
      callback_target = try_job_data.callback_target or (
          appengine_util.GetTargetNameForModule(constants.WATERFALL_BACKEND))
      taskqueue.add(
          method='GET',
          url=try_job_data.callback_url,
          target=callback_target,
          queue_name=constants.WATERFALL_ANALYSIS_QUEUE)
      return {}

    logging.warning('The build is not known by findit.')
    # Return 200 because pubsub should not retry the push.
    return {}
  except KeyError:
    raise Exception('The message was not in the expected format: \n'
                    '{"message": {\n'
                    ' "attributes": {\n'
                    ' "auth_token": <valid_token>,\n'
                    ' "build_id": <Buildbucket id>,\n'
                    ' }\n'
                    ' "data": <serialization of {\n'
                    ' "build_id": <Buildbucket id>, # Second copy.\n'
                    ' "user_data": <serialization of {\n'
                    ' "Message-Type": "BuildbucketStatusChange"\n'
                    ' }>\n'
                    ' }>\n'
                    '}}\n')
def ScheduleAnalysisIfNeeded(master_name,
                             builder_name,
                             build_number,
                             failed_steps=None,
                             build_completed=False,
                             force=False,
                             queue_name=constants.DEFAULT_QUEUE):
  """Schedules a compile or test failure analysis if one is needed.

  If the failure was analyzed before and a new analysis gets scheduled, the
  returned WfAnalysis still carries the result of the last completed analysis.

  Args:
    master_name (str): The master name of the failed build.
    builder_name (str): The builder name of the failed build.
    build_number (int): The build number of the failed build.
    failed_steps (list): The names of all failed steps reported for the build.
    build_completed (bool): Indicate whether the build is completed.
    force (bool): If True, a fresh new analysis will be triggered even when an
        old one was completed already; otherwise bail out.
    queue_name (str): The task queue to be used for pipeline tasks.

  Returns:
    A WfAnalysis instance.
  """
  if not NeedANewAnalysis(master_name, builder_name, build_number,
                          failed_steps, build_completed, force):
    logging.info('An analysis is not needed for build %s, %s, %s',
                 master_name, builder_name, build_number)
    return WfAnalysis.Get(master_name, builder_name, build_number)

  failure_info, should_proceed = ci_failure.GetBuildFailureInfo(
      master_name, builder_name, build_number)
  if not should_proceed:
    return WfAnalysis.Get(master_name, builder_name, build_number)

  build_key = BuildKey(
      master_name=master_name,
      builder_name=builder_name,
      build_number=build_number)

  # Compile failures and test failures are analyzed by separate pipelines.
  # TODO(crbug/869684): Use a gauge metric to track intermittent statuses.
  if failure_info['failure_type'] == failure_type.COMPILE:
    analysis_job = AnalyzeCompileFailurePipeline(
        AnalyzeCompileFailureInput(
            build_key=build_key,
            current_failure_info=CompileFailureInfo.FromSerializable(
                failure_info),
            build_completed=build_completed,
            force=force))
  else:
    analysis_job = AnalyzeTestFailurePipeline(
        AnalyzeTestFailureInput(
            build_key=build_key,
            current_failure_info=TestFailureInfo.FromSerializable(
                failure_info),
            build_completed=build_completed,
            force=force))

  # Pin the pipeline to the backend module "waterfall-backend" explicitly.
  # Note: Relying on the target in queue.yaml does NOT work for pipeline when
  # deployed to App Engine, although it does work in dev-server locally.
  # A possible reason is that pipeline picks a default target when none is
  # given explicitly, and that default wins over the one in queue.yaml, but
  # this contradicts the documentation in
  # https://cloud.google.com/appengine/docs/python/taskqueue/tasks#Task.
  analysis_job.target = appengine_util.GetTargetNameForModule(
      constants.WATERFALL_BACKEND)
  analysis_job.start(queue_name=queue_name)

  logging.info('An analysis was scheduled for build %s, %s, %s: %s',
               master_name, builder_name, build_number,
               analysis_job.pipeline_status_path)

  return WfAnalysis.Get(master_name, builder_name, build_number)
def ScheduleAnalysisIfNeeded(
    normalized_test,
    original_test,
    flake_key,
    bug_id=None,
    allow_new_analysis=False,
    force=False,
    manually_triggered=False,
    user_email=None,
    triggering_source=triggering_sources.FINDIT_PIPELINE,
    queue_name=constants.DEFAULT_QUEUE):
  """Schedules a flake analysis if needed and returns the analysis.

  Args:
    normalized_test (TestInfo): Info of the normalized flaky test after mapping
        a CQ trybot step to a Waterfall buildbot step, stripping prefix "PRE_"
        from a gtest, etc.
    original_test (TestInfo): Info of the original flaky test.
    flake_key (ndb.Key): The key to the Flake responsible for triggering this
        analysis.
    bug_id (int): The monorail bug id to update when analysis is done.
    allow_new_analysis (bool): Indicate whether a new analysis is allowed.
    force (bool): Indicate whether to force a rerun of current analysis.
    manually_triggered (bool): True if the analysis was requested manually,
        such as by a Chromium sheriff.
    user_email (str): The email of the user requesting the analysis.
    triggering_source (int): From where this analysis was triggered, such as
        through Findit pipeline, UI, or through Findit API.
    queue_name (str): The App Engine queue to run the analysis.

  Returns:
    Whatever analysis _NeedANewAnalysis returned: the MasterFlakeAnalysis, or
    presumably None when nothing was scheduled and the user lacks permission
    (decided inside _NeedANewAnalysis -- confirm there).
  """
  need_new_analysis, analysis = _NeedANewAnalysis(
      normalized_test,
      original_test,
      flake_key,
      bug_id=bug_id,
      allow_new_analysis=allow_new_analysis,
      force=force,
      user_email=user_email,
      triggering_source=triggering_source)

  if not need_new_analysis:
    logging.info('A flake analysis not necessary for build %s, %s, %s, %s',
                 normalized_test.master_name, normalized_test.builder_name,
                 normalized_test.build_number, normalized_test.step_name)
    return analysis

  # _NeedANewAnalysis just created master_flake_analysis. Its latest version
  # number is what the other pipelines use for updating results and data.
  logging.info(
      'A new master flake analysis was successfully saved for %s (%s) and '
      'will be captured in version %s', repr(normalized_test),
      repr(original_test), analysis.version_number)

  # Prefer the metadata of the normalized step; fall back to the original.
  step_metadata = (
      step_util.LegacyGetStepMetadata(
          normalized_test.master_name, normalized_test.builder_name,
          normalized_test.build_number, normalized_test.step_name) or
      step_util.LegacyGetStepMetadata(
          original_test.master_name, original_test.builder_name,
          original_test.build_number, original_test.step_name))

  logging.info('Initializing flake analysis pipeline for key: %s',
               analysis.key)

  starting_build_info = build_util.GetBuildInfo(
      normalized_test.master_name, normalized_test.builder_name,
      normalized_test.build_number)
  original_build_info = build_util.GetBuildInfo(
      original_test.master_name, original_test.builder_name,
      original_test.build_number)

  assert starting_build_info, (
      'Failed to get starting build for flake analysis')
  starting_commit_position = starting_build_info.commit_position
  assert starting_commit_position is not None, (
      'Cannot analyze flake without a starting commit position')
  assert original_build_info, 'Failed to get original build info'

  # Get the dimensions of the bot for when try jobs are needed to compile.
  dimensions = try_job_service.GetDimensionsFromBuildInfo(starting_build_info)

  flake_input = AnalyzeFlakeInput(
      analysis_urlsafe_key=analysis.key.urlsafe(),
      analyze_commit_position_parameters=NextCommitPositionOutput(
          culprit_commit_id=None,
          next_commit_id=CommitID(
              commit_position=starting_commit_position,
              revision=starting_build_info.chromium_revision)),
      commit_position_range=IntRange(
          lower=None, upper=starting_commit_position),
      dimensions=ListOfBasestring.FromSerializable(dimensions),
      manually_triggered=manually_triggered,
      retries=0,
      rerun=force,
      step_metadata=StepMetadata.FromSerializable(step_metadata))

  flake_job = AnalyzeFlakePipeline(flake_input)
  flake_job.target = appengine_util.GetTargetNameForModule(
      constants.WATERFALL_BACKEND)
  flake_job.start(queue_name=queue_name)

  # Remember on the analysis which pipeline and builds it belongs to.
  analysis.pipeline_status_path = flake_job.pipeline_status_path
  analysis.root_pipeline_id = flake_job.root_pipeline_id
  analysis.build_id = starting_build_info.buildbucket_id
  analysis.original_build_id = original_build_info.buildbucket_id
  analysis.put()

  analysis.LogInfo(
      ('A flake analysis was scheduled using commit-based pipelines with '
       'path {}').format(flake_job.pipeline_status_path))

  return analysis
def run(self, master_name, builder_name, build_number, step_name,
        task_id=None, *args):
  """Monitors a swarming task.

  Registers a callback url on the swarming task entity, schedules a cleanup
  callback for after the deadline, and triggers one callback immediately.
  Returns early when no task id is available, when this pipeline is already
  registered on the task, or when the task id marks a skipped build.

  Args:
    master_name (str): The master name.
    builder_name (str): The builder name.
    build_number (str): The build number.
    step_name (str): The failed test step name.
    task_id (str): The task id to query the swarming server on the progress
        of a swarming task. Falls back to the task id stored on the task.
    *args: Extra positional arguments forwarded to self._GetArgs.
  """
  call_args = self._GetArgs(master_name, builder_name, build_number,
                            step_name, *args)
  task = self._GetSwarmingTask(*call_args)

  task_id = task_id or task.task_id

  if not task_id:
    # The swarming task encountered an error when being triggered.
    if not task.error:  # pragma no branch
      task.error = {
          'error': 'Undetected error in swarming task. No task id found!',
          'message': 'Undetected error in swarming task. No task id found!'
      }
      task.put()
    return

  # Check to make this method idempotent: if this pipeline's id is already
  # part of the stored callback url, the callback was registered before.
  if task.callback_url and self.pipeline_id in task.callback_url:
    return

  timeout_hours = waterfall_config.GetSwarmingSettings().get(
      'task_timeout_hours')
  # Absolute deadline as a time.time() timestamp, i.e. in seconds.
  deadline = time.time() + timeout_hours * 60 * 60
  server_query_interval_seconds = waterfall_config.GetSwarmingSettings().get(
      'server_query_interval_seconds')
  task_started = False
  task_completed = False
  step_name_no_platform = None

  if task_id.lower() in (NO_TASK, NO_TASK_EXCEPTION):  # pragma: no branch
    # This situation happens in flake analysis: if the step with flaky test
    # didn't exist in checked build or the build had exception so the step
    # with flaky test didn't run at all, we should skip the build.
    # NOTE(review): the check above lower-cases task_id, but the comparison
    # below uses the raw task_id -- confirm task ids are always lower case.
    has_valid_artifact = task_id != NO_TASK_EXCEPTION
    task.task_id = None
    task.status = analysis_status.SKIPPED
    task.put()
    self._UpdateMasterFlakeAnalysis(
        *call_args,
        pass_rate=-1,
        flake_swarming_task=task,
        has_valid_artifact=has_valid_artifact)
    self.complete(
        self._GetPipelineResult(step_name, step_name_no_platform, task))
    return

  # State handed to every callback; it is also embedded into the callback url.
  self.last_params = {
      'task_id': task_id,
      'step_name': step_name,
      'call_args': call_args,
      'deadline': deadline,
      'server_query_interval_seconds': server_query_interval_seconds,
      'task_started': task_started,
      'task_completed': task_completed,
      'step_name_no_platform': step_name_no_platform,
  }

  task.callback_url = self.get_callback_url(
      callback_params=json.dumps(self.last_params))
  task.callback_target = appengine_util.GetTargetNameForModule(
      constants.WATERFALL_BACKEND)
  task.put()

  # Guarantee one callback 10 minutes after the deadline to clean up even if
  # Swarming fails to call us back.
  self.delay_callback(
      (timeout_hours * 60 + 10) * 60,
      self.last_params,
      name=task_id + '_cleanup_task')

  # Run immediately in case the task already went from scheduled to started.
  self.callback(callback_params=self.last_params)
def run(self, master_name, builder_name, triggering_build_number,
        current_build_number, step_name, test_name, version_number,
        step_metadata=None, use_nearby_neighbor=False,
        manually_triggered=False):
  """Decides what to do after a flake swarming task for a build finished.

  Depending on the analysis data points this either reports an error, finishes
  the build-level analysis (possibly handing over to try jobs), or starts a
  new RecursiveFlakePipeline on the next build number.

  Args:
    master_name (str): The master name.
    builder_name (str): The builder name.
    triggering_build_number (int): The build number that triggered the
        analysis; used to look up the MasterFlakeAnalysis.
    current_build_number (int): The build number whose FlakeSwarmingTask is
        inspected.
    step_name (str): The step name.
    test_name (str): The test name.
    version_number (int): The version of the MasterFlakeAnalysis to use.
    step_metadata: Forwarded unchanged to RecursiveFlakePipeline.
    use_nearby_neighbor (bool): Forwarded unchanged to RecursiveFlakePipeline.
    manually_triggered (bool): Forwarded unchanged to RecursiveFlakePipeline.

  Yields:
    UpdateFlakeBugPipeline or RecursiveFlakeTryJobPipeline child pipelines,
    depending on the branch taken.
  """
  # Get MasterFlakeAnalysis success list corresponding to parameters.
  analysis = MasterFlakeAnalysis.GetVersion(
      master_name,
      builder_name,
      triggering_build_number,
      step_name,
      test_name,
      version=version_number)

  flake_swarming_task = FlakeSwarmingTask.Get(
      master_name, builder_name, current_build_number, step_name, test_name)

  # Don't call another pipeline if we fail.
  if flake_swarming_task.status == analysis_status.ERROR:
    # Report the last flake swarming task's error that it encountered.
    # TODO(lijeffrey): Another neighboring swarming task may be needed in this
    # one's place instead of failing altogether.
    error = flake_swarming_task.error or {
        'error': 'Swarming task failed',
        'message': 'The last swarming task did not complete as expected'
    }

    _UpdateAnalysisStatusUponCompletion(analysis, None, analysis_status.ERROR,
                                        error)
    logging.error('Error in Swarming task')
    yield UpdateFlakeBugPipeline(analysis.key.urlsafe())
    return

  if not analysis.algorithm_parameters:
    # The analysis has no parameters stored yet: take the current check-flake
    # settings and persist them on the analysis.
    flake_settings = waterfall_config.GetCheckFlakeSettings()
    analysis.algorithm_parameters = flake_settings
    analysis.put()
  algorithm_settings = analysis.algorithm_parameters.get('swarming_rerun')

  data_points = _NormalizeDataPoints(analysis.data_points)
  # Figure out what build_number to trigger a swarming rerun on next, if any.
  next_build_number, suspected_build, iterations_to_rerun = (
      lookback_algorithm.GetNextRunPointNumber(data_points,
                                               algorithm_settings))
  if iterations_to_rerun:
    # Need to rerun the first build with more iterations.
    _UpdateIterationsToRerun(analysis, iterations_to_rerun)
    _RemoveRerunBuildDataPoint(analysis, next_build_number)
    analysis.put()

  max_build_numbers_to_look_back = algorithm_settings.get(
      'max_build_numbers_to_look_back', _DEFAULT_MAX_BUILD_NUMBERS)
  # Oldest build number that may still be examined; never below 0.
  last_build_number = max(
      0, triggering_build_number - max_build_numbers_to_look_back)

  # The build-level search is over once the next build number falls outside
  # [last_build_number, triggering_build_number) and no rerun with more
  # iterations was requested.
  if ((next_build_number < last_build_number or
       next_build_number >= triggering_build_number) and
      not iterations_to_rerun):  # Finished.
    build_confidence_score = None
    if suspected_build is not None:
      # Use steppiness as the confidence score.
      build_confidence_score = confidence.SteppinessForBuild(
          analysis.data_points, suspected_build)

    # Update suspected build and the confidence score.
    _UpdateAnalysisStatusUponCompletion(
        analysis,
        suspected_build,
        analysis_status.COMPLETED,
        None,
        build_confidence_score=build_confidence_score)

    if build_confidence_score is None:
      logging.info(('Skipping try jobs due to no suspected flake build being '
                    'identified'))
    elif not _HasSufficientConfidenceToRunTryJobs(analysis):
      logging.info(('Skipping try jobs due to insufficient confidence in '
                    'suspected build'))
    else:
      # Hook up with try-jobs. Based on analysis of historical data, 60%
      # confidence could filter out almost all false positives.
      suspected_build_point = analysis.GetDataPointOfSuspectedBuild()
      assert suspected_build_point

      blamed_cls, lower_bound = _GetFullBlamedCLsAndLowerBound(
          suspected_build_point, analysis.data_points)

      if blamed_cls:
        if len(blamed_cls) > 1:
          logging.info('Running try-jobs against commits in regressions')
          # Try jobs start at the commit right before the suspected build's
          # commit position; blamed_cls is indexed by commit position here.
          start_commit_position = suspected_build_point.commit_position - 1
          start_revision = blamed_cls[start_commit_position]
          build_info = build_util.GetBuildInfo(master_name, builder_name,
                                               triggering_build_number)
          # Use the parent master/builder when the build info names one.
          parent_mastername = build_info.parent_mastername or master_name
          parent_buildername = build_info.parent_buildername or builder_name
          cache_name = swarming_util.GetCacheName(parent_mastername,
                                                  parent_buildername)
          dimensions = waterfall_config.GetTrybotDimensions(
              parent_mastername, parent_buildername)
          yield RecursiveFlakeTryJobPipeline(
              analysis.key.urlsafe(), start_commit_position, start_revision,
              lower_bound, cache_name, dimensions)
          return  # No update to bug yet.
        else:
          logging.info('Single commit in the blame list of suspected build')
          # With a single blamed commit no try job is needed: that commit is
          # recorded as the culprit directly.
          culprit_confidence_score = confidence.SteppinessForCommitPosition(
              analysis.data_points, suspected_build_point.commit_position)
          culprit = recursive_flake_try_job_pipeline.CreateCulprit(
              suspected_build_point.git_hash,
              suspected_build_point.commit_position,
              culprit_confidence_score)
          UpdateAnalysisUponCompletion(analysis, culprit,
                                       analysis_status.COMPLETED, None)
      else:
        logging.error('Cannot run flake try jobs against empty blame list')
        error = {
            'error': 'Could not start try jobs',
            'message': 'Empty blame list'
        }
        UpdateAnalysisUponCompletion(analysis, None, analysis_status.ERROR,
                                     error)

    yield UpdateFlakeBugPipeline(analysis.key.urlsafe())
    return

  # Not finished yet: continue the search on the next build number.
  pipeline_job = RecursiveFlakePipeline(
      master_name,
      builder_name,
      next_build_number,
      step_name,
      test_name,
      version_number,
      triggering_build_number,
      step_metadata=step_metadata,
      manually_triggered=manually_triggered,
      use_nearby_neighbor=use_nearby_neighbor,
      step_size=(current_build_number - next_build_number))
  # Disable attribute 'target' defined outside __init__ pylint warning,
  # because pipeline generates its own __init__ based on run function.
  pipeline_job.target = (  # pylint: disable=W0201
      appengine_util.GetTargetNameForModule(constants.WATERFALL_BACKEND))
  pipeline_job.start(queue_name=self.queue_name or constants.DEFAULT_QUEUE)
def run(self, master_name, builder_name, preferred_run_build_number,
        step_name, test_name, version_number, triggering_build_number,
        step_metadata=None, manually_triggered=False,
        use_nearby_neighbor=False, step_size=0, retries=0):
  """Pipeline to determine the regression range of a flaky test.

  When no swarming bots are available, a fresh RecursiveFlakePipeline with an
  incremented retry count is scheduled instead (with a delay, or off peak
  hours once the retry limit is exceeded) and nothing is yielded.

  Args:
    master_name (str): The master name.
    builder_name (str): The builder name.
    preferred_run_build_number (int): The build number the check flake
        algorithm should perform a swarming rerun on, but may be overridden to
        use the results of a nearby neighbor if use_nearby_neighbor is True.
    step_name (str): The step name.
    test_name (str): The test name.
    version_number (int): The version to save analysis results and data to.
    triggering_build_number (int): The build number that triggered this
        analysis.
    step_metadata (dict): Step_metadata for the test.
    manually_triggered (bool): True if the analysis is from manual request,
        like by a Chromium sheriff.
    use_nearby_neighbor (bool): Whether the optimization for using the
        swarming results of a nearby build number, if available, should be
        used in place of triggering a new swarming task on
        preferred_run_build_number.
    step_size (int): The difference in build numbers since the last call to
        RecursiveFlakePipeline to determine the bounds for how far a nearby
        build's swarming task results should be used. Only relevant if
        use_nearby_neighbor is True.
    retries (int): Number of retries of this pipeline. If retries exceeds
        _MAX_RETRY_TIMES, start this pipeline off peak hours.

  Yields:
    TriggerFlakeSwarmingTaskPipeline, then (in order)
    ProcessFlakeSwarmingTaskResultPipeline and NextBuildNumberPipeline, when
    the analysis can start.
  """
  # If retries has not exceeded max count and there are available bots,
  # we can start the analysis. Past the retry limit the analysis starts
  # regardless of bot availability.
  can_start_analysis = (
      self._BotsAvailableForTask(step_metadata)
      if retries <= _MAX_RETRY_TIMES else True)

  if not can_start_analysis:
    retries += 1
    pipeline_job = RecursiveFlakePipeline(
        master_name,
        builder_name,
        preferred_run_build_number,
        step_name,
        test_name,
        version_number,
        triggering_build_number,
        step_metadata,
        manually_triggered=manually_triggered,
        use_nearby_neighbor=use_nearby_neighbor,
        step_size=step_size,
        retries=retries)
    # Disable attribute 'target' defined outside __init__ pylint warning,
    # because pipeline generates its own __init__ based on run function.
    pipeline_job.target = (  # pylint: disable=W0201
        appengine_util.GetTargetNameForModule(constants.WATERFALL_BACKEND))

    if retries > _MAX_RETRY_TIMES:
      pipeline_job._StartOffPSTPeakHours(
          queue_name=self.queue_name or constants.DEFAULT_QUEUE)
      logging.info(
          'Retrys exceed max count, RecursiveFlakePipeline on '
          'MasterFlakeAnalysis %s/%s/%s/%s/%s will start off peak '
          'hour', master_name, builder_name, triggering_build_number,
          step_name, test_name)
    else:
      pipeline_job._RetryWithDelay(
          queue_name=self.queue_name or constants.DEFAULT_QUEUE)
      # Only used for the log line below.
      countdown = retries * _BASE_COUNT_DOWN_SECONDS
      # The trailing space after "after" matters: adjacent string literals
      # are concatenated without any separator.
      logging.info(
          'No available swarming bots, RecursiveFlakePipeline on '
          'MasterFlakeAnalysis %s/%s/%s/%s/%s will be tried after '
          '%d seconds', master_name, builder_name, triggering_build_number,
          step_name, test_name, countdown)
  else:
    # Bots are available or pipeline starts off peak hours, trigger the task.
    flake_analysis = MasterFlakeAnalysis.GetVersion(
        master_name,
        builder_name,
        triggering_build_number,
        step_name,
        test_name,
        version=version_number)

    logging.info(
        'Running RecursiveFlakePipeline on MasterFlakeAnalysis'
        ' %s/%s/%s/%s/%s', master_name, builder_name,
        triggering_build_number, step_name, test_name)
    logging.info('MasterFlakeAnalysis %s version %s', flake_analysis,
                 version_number)

    # Mark the analysis as running the first time it gets here.
    if flake_analysis.status != analysis_status.RUNNING:  # pragma: no branch
      flake_analysis.status = analysis_status.RUNNING
      flake_analysis.start_time = time_util.GetUTCNow()
      flake_analysis.put()

    # TODO(lijeffrey): Allow custom parameters supplied by user.
    iterations = flake_analysis.algorithm_parameters.get(
        'swarming_rerun', {}).get('iterations_to_rerun', 100)
    hard_timeout_seconds = _GetHardTimeoutSeconds(
        master_name, builder_name, triggering_build_number, step_name,
        iterations)
    # Only look for a nearby build to reuse when the optimization is enabled.
    actual_run_build_number = _GetBestBuildNumberToRun(
        master_name, builder_name, preferred_run_build_number, step_name,
        test_name, step_size, iterations) if use_nearby_neighbor else (
            preferred_run_build_number)

    # Call trigger pipeline (flake style).
    task_id = yield TriggerFlakeSwarmingTaskPipeline(
        master_name, builder_name, actual_run_build_number, step_name,
        [test_name], iterations, hard_timeout_seconds)

    # The result must be processed before the next build number is chosen.
    with pipeline.InOrder():
      yield ProcessFlakeSwarmingTaskResultPipeline(
          master_name, builder_name, actual_run_build_number, step_name,
          task_id, triggering_build_number, test_name, version_number)
      yield NextBuildNumberPipeline(
          master_name,
          builder_name,
          triggering_build_number,
          actual_run_build_number,
          step_name,
          test_name,
          version_number,
          step_metadata=step_metadata,
          use_nearby_neighbor=use_nearby_neighbor,
          manually_triggered=manually_triggered)
def run(self, urlsafe_flake_analysis_key, urlsafe_try_job_key,
        lower_boundary_commit_position, cache_name, dimensions):
  """Decides what to do after the latest flake try job has been recorded.

  Exactly one of three outcomes is produced:
    - The latest try job carries an error: the analysis is finalized with
      analysis_status.ERROR and the bug-update pipeline is yielded.
    - The lookback algorithm reports a suspected commit position: a culprit is
      created, the analysis is finalized with analysis_status.COMPLETED and
      the bug-update pipeline is yielded.
    - Otherwise a RecursiveFlakeTryJobPipeline is started for the next commit
      position.

  Args:
    urlsafe_flake_analysis_key (str): The url-safe key to the flake analysis
        that triggered this pipeline.
    urlsafe_try_job_key (str): The url-safe key to the try job that was just
        run.
    lower_boundary_commit_position (int): The lowest commit position a try job
        is allowed to run at.
    cache_name: Passed through unchanged to RecursiveFlakeTryJobPipeline.
    dimensions: Passed through unchanged to RecursiveFlakeTryJobPipeline.
  """
  analysis = ndb.Key(urlsafe=urlsafe_flake_analysis_key).get()
  finished_try_job = ndb.Key(urlsafe=urlsafe_try_job_key).get()
  assert analysis
  assert finished_try_job
  assert finished_try_job.try_job_ids

  # Only the most recently recorded try job id is inspected.
  latest_run = FlakeTryJobData.Get(finished_try_job.try_job_ids[-1])

  if latest_run.error:
    # The previous try job failed: record the error on the analysis and update
    # the bug instead of scheduling another try job.
    UpdateAnalysisUponCompletion(
        analysis, None, analysis_status.ERROR, latest_run.error)
    yield UpdateFlakeBugPipeline(analysis.key.urlsafe())
    return

  suspected_build_point = analysis.GetDataPointOfSuspectedBuild()

  # Hard lower and upper bounds apply, so only the data points that came from
  # try jobs are handed to the lookback algorithm.
  candidate_points = _GetNormalizedTryJobDataPoints(analysis)
  rerun_settings = analysis.algorithm_parameters.get('try_job_rerun', {})

  next_position, suspected_position, _ = (
      lookback_algorithm.GetNextRunPointNumber(
          candidate_points, rerun_settings, lower_boundary_commit_position))

  if suspected_position is None:
    # Not finished yet: run another try job at the next commit position.
    next_revision = suspected_build_point.GetRevisionAtCommitPosition(
        next_position)
    followup = RecursiveFlakeTryJobPipeline(
        urlsafe_flake_analysis_key, next_position, next_revision,
        lower_boundary_commit_position, cache_name, dimensions)
    # 'target' is set outside __init__ on purpose: the pipeline library
    # generates __init__ from the run signature, hence the pylint waiver.
    followup.target = (  # pylint: disable=W0201
        appengine_util.GetTargetNameForModule(constants.WATERFALL_BACKEND))
    followup.start()
    return

  # Finished: a suspected commit position was identified.
  score = confidence.SteppinessForCommitPosition(
      analysis.data_points, suspected_position)
  revision = suspected_build_point.GetRevisionAtCommitPosition(
      suspected_position)
  culprit = CreateCulprit(revision, suspected_position, score)
  UpdateAnalysisUponCompletion(
      analysis, culprit, analysis_status.COMPLETED, None)
  yield UpdateFlakeBugPipeline(analysis.key.urlsafe())
def run(self, urlsafe_try_job_key, try_job_type, try_job_id):
  """Registers a callback for a try job and starts monitoring it.

  The parameters needed by callback() are stored in self.last_params and also
  serialized into the callback url saved on the try job data entity.
  callback() is expected to run when a pubsub notification from the
  buildbucket service tells this application that the job changed status. In
  addition, this method triggers callback() itself:
    - once directly, as its last statement;
    - once through a delayed task scheduled 10 minutes after the configured
      timeout.

  If try_job_id is empty the pipeline is completed immediately. If the try job
  data entity already holds a callback url containing this pipeline's id,
  nothing is done, which keeps repeated calls with the same params idempotent.

  Args:
    urlsafe_try_job_key (str): The urlsafe key for the corresponding try job
        entity.
    try_job_type (str): The type of the try job.
    try_job_id (str): The try job id to query buildbucket with.
  """
  if not try_job_id:
    self.complete()
    return

  # Flaky-test try jobs are tracked by a different entity kind than the rest.
  data_model = (
      FlakeTryJobData
      if try_job_type == failure_type.FLAKY_TEST else WfTryJobData)

  job_data = data_model.Get(try_job_id)
  if not job_data:
    logging.error('%(kind)s entity does not exist for id %(id)s: creating it',
                  {'kind': data_model, 'id': try_job_id})
    job_data = data_model.Create(try_job_id)
    job_data.try_job_key = ndb.Key(urlsafe=urlsafe_try_job_key)

  # A callback url that already mentions this pipeline means a previous call
  # with the same params registered it; bail out to stay idempotent.
  registered_url = job_data.callback_url
  if registered_url and self.pipeline_id in registered_url:
    return

  timeout_hours = waterfall_config.GetTryJobSettings().get(
      'job_timeout_hours')
  poll_interval_seconds = waterfall_config.GetTryJobSettings().get(
      'server_query_interval_seconds')
  max_error_times = waterfall_config.GetTryJobSettings().get(
      'allowed_response_error_times')

  # TODO(chanli): Make sure total wait time equals to timeout_hours
  # regardless of retries.
  deadline = time.time() + timeout_hours * 60 * 60

  self.last_params = {
      'try_job_id': try_job_id,
      'try_job_type': try_job_type,
      'urlsafe_try_job_key': urlsafe_try_job_key,
      'deadline': deadline,
      'already_set_started': False,
      'error_count': 0,
      'max_error_times': max_error_times,
      'default_pipeline_wait_seconds': poll_interval_seconds,
      'timeout_hours': timeout_hours,
      # The backoff starts at the regular polling interval.
      'backoff_time': poll_interval_seconds,
  }

  job_data.callback_url = self.get_callback_url(
      callback_params=json.dumps(self.last_params))
  job_data.callback_target = appengine_util.GetTargetNameForModule(
      constants.WATERFALL_BACKEND)
  job_data.put()

  # One guaranteed callback 10 minutes past the deadline, so cleanup happens
  # even if buildbucket never calls back.
  self.delay_callback(
      (timeout_hours * 60 + 10) * 60,
      self.last_params,
      name=try_job_id + '_cleanup_task')

  # Invoke right away in case the job already moved from scheduled to started.
  self.callback(callback_params=self.last_params)
def ScheduleAnalysisIfNeeded(
    normalized_test,
    original_test,
    bug_id=None,
    allow_new_analysis=False,
    force=False,
    manually_triggered=False,
    user_email=None,
    triggering_source=triggering_sources.FINDIT_PIPELINE,
    queue_name=constants.DEFAULT_QUEUE):
  """Starts a flake analysis pipeline when one is needed.

  _NeedANewAnalysis decides whether a new analysis is required and supplies
  the analysis entity. When a new one is required, the step metadata of the
  normalized test is fetched and a RecursiveFlakePipeline is started on the
  given queue. The analysis from _NeedANewAnalysis is returned either way.

  Args:
    normalized_test (TestInfo): Info of the normalized flaky test after mapping
        a CQ trybot step to a Waterfall buildbot step, stripping prefix "PRE_"
        from a gtest, etc.
    original_test (TestInfo): Info of the original flaky test.
    bug_id (int): The monorail bug id to update when analysis is done.
    allow_new_analysis (bool): Indicate whether a new analysis is allowed.
    force (bool): Indicate whether to force a rerun of current analysis.
    manually_triggered (bool): True if the analysis was requested manually,
        such as by a Chromium sheriff.
    user_email (str): The email of the user requesting the analysis.
    triggering_source (int): From where this analysis was triggered, such as
        through Findit pipeline, UI, or through Findit API.
    queue_name (str): The App Engine queue to run the analysis.

  Returns:
    The MasterFlakeAnalysis handed back by _NeedANewAnalysis. It may be None;
    presumably that happens when no analysis was scheduled and the requester
    is not permitted to schedule one (decided inside _NeedANewAnalysis).
  """
  settings = waterfall_config.GetCheckFlakeSettings()
  swarming_rerun_settings = settings.get('swarming_rerun', {})
  use_nearby_neighbor = swarming_rerun_settings.get(
      'use_nearby_neighbor', False)

  need_new_analysis, analysis = _NeedANewAnalysis(
      normalized_test,
      original_test,
      settings,
      bug_id=bug_id,
      allow_new_analysis=allow_new_analysis,
      force=force,
      user_email=user_email,
      triggering_source=triggering_source)

  if not need_new_analysis:
    return analysis

  # _NeedANewAnalysis just saved a new analysis version; that version number is
  # passed to the pipeline so results are recorded against it.
  logging.info(
      'A new master flake analysis was successfully saved for %s (%s) and '
      'will be captured in version %s', repr(normalized_test),
      repr(original_test), analysis.version_number)

  master_name = normalized_test.master_name
  builder_name = normalized_test.builder_name
  build_number = normalized_test.build_number
  step_name = normalized_test.step_name

  step_metadata = buildbot.GetStepLog(
      master_name, builder_name, build_number, step_name,
      HttpClientAppengine(), 'step_metadata')

  # The triggering build is also the first build the pipeline runs on.
  flake_pipeline = RecursiveFlakePipeline(
      master_name,
      builder_name,
      build_number,
      step_name,
      normalized_test.test_name,
      analysis.version_number,
      triggering_build_number=build_number,
      step_metadata=step_metadata,
      manually_triggered=manually_triggered,
      use_nearby_neighbor=use_nearby_neighbor)
  flake_pipeline.target = appengine_util.GetTargetNameForModule(
      constants.WATERFALL_BACKEND)
  flake_pipeline.start(queue_name=queue_name)

  return analysis