def testObscureMasterFlakeAnalysis(self):
  """Tests that emails are obscured only on sufficiently-old analyses.

  Creates one MasterFlakeAnalysis whose triage/request timestamps fall
  outside the retention windows and one recent analysis within them, invokes
  the /obscure-emails handler, and verifies that only the old analysis has
  its triage user name and triggering email obscured.
  """
  self.mock_current_user(user_email='*****@*****.**', is_admin=True)

  # Pin "now" so the relative cutoff times computed below are deterministic.
  # Uses 5, not the legacy leading-zero literal 05, which is Python-2-only
  # octal-style syntax and a SyntaxError under Python 3 (same value).
  mocked_utcnow = datetime(2017, 5, 5, 22, 50, 10)
  self.MockUTCNow(mocked_utcnow)

  # Timestamps inside the retention windows (must NOT be obscured).
  valid_record_time = obscure_emails._TimeBeforeNow(days=1)
  valid_request_time = obscure_emails._TimeBeforeNow(days=5)
  # Timestamps past the retention windows (must be obscured).
  # NOTE(review): the module-level constants are spelled "RENTENSION"
  # (sic) — kept as-is since they are defined in obscure_emails.
  invalid_record_time = obscure_emails._TimeBeforeNow(
      days=obscure_emails._TRIAGE_RECORD_RENTENSION_DAYS + 10)
  invalid_request_time = obscure_emails._TimeBeforeNow(
      days=obscure_emails._REQUEST_RECORD_RENTENSION_DAYS + 10)

  old_analysis = MasterFlakeAnalysis.Create('m', 'b', 1, 's', 't')
  old_analysis.triage_history.append(
      TriageResult(user_name='*****@*****.**'))
  old_analysis.triage_email_obscured = False
  old_analysis.triage_record_last_add = invalid_record_time
  old_analysis.triggering_user_email = '*****@*****.**'
  old_analysis.triggering_user_email_obscured = False
  old_analysis.request_time = invalid_request_time
  old_analysis.Save()

  recent_analysis = MasterFlakeAnalysis.Create('m', 'b', 1000, 's', 't')
  recent_analysis.triage_history.append(
      TriageResult(user_name='*****@*****.**'))
  recent_analysis.triage_email_obscured = False
  recent_analysis.triage_record_last_add = valid_record_time
  recent_analysis.triggering_user_email = '*****@*****.**'
  recent_analysis.triggering_user_email_obscured = False
  recent_analysis.request_time = valid_request_time
  recent_analysis.Save()

  response = self.test_app.get('/obscure-emails', params={'format': 'json'})
  expected_response = {
      'failure_triage_count': 0,
      'flake_triage_count': 1,
      'flake_request_aggregated_count': 0,
      'flake_request_count': 1,
  }
  self.assertEqual(expected_response, response.json_body)

  # The old analysis should now have both emails obscured.
  old_analysis = MasterFlakeAnalysis.GetVersion('m', 'b', 1, 's', 't')
  self.assertEqual('*****@*****.**',
                   old_analysis.triage_history[0].user_name)
  self.assertTrue(old_analysis.triage_email_obscured)
  self.assertEqual('*****@*****.**', old_analysis.triggering_user_email)
  self.assertTrue(old_analysis.triggering_user_email_obscured)

  # The recent analysis should be untouched.
  recent_analysis = MasterFlakeAnalysis.GetVersion(
      'm', 'b', 1000, 's', 't')
  self.assertEqual('*****@*****.**',
                   recent_analysis.triage_history[0].user_name)
  self.assertFalse(recent_analysis.triage_email_obscured)
  self.assertEqual('*****@*****.**',
                   recent_analysis.triggering_user_email)
  self.assertFalse(recent_analysis.triggering_user_email_obscured)
def _UpdateMasterFlakeAnalysis(self, master_name, builder_name, build_number,
                               step_name, master_build_number, test_name,
                               version_number, pass_rate, flake_swarming_task,
                               has_valid_artifact=True):
  """Records the swarming task's outcome on the corresponding analysis.

  Appends a new DataPoint (pass rate, task id, and git/blame information for
  the analyzed build) plus the raw swarming task results to the versioned
  MasterFlakeAnalysis, then persists it.
  """
  analysis = MasterFlakeAnalysis.GetVersion(
      master_name, builder_name, master_build_number, step_name, test_name,
      version=version_number)
  logging.info(
      'Updating MasterFlakeAnalysis swarming task data %s/%s/%s/%s/%s',
      master_name, builder_name, master_build_number, step_name, test_name)

  # Git information about the build that was run.
  current_build = build_util.GetBuildInfo(master_name, builder_name,
                                          build_number)

  point = DataPoint()
  point.build_number = build_number
  point.pass_rate = pass_rate
  point.task_id = flake_swarming_task.task_id
  point.has_valid_artifact = has_valid_artifact
  point.commit_position = current_build.commit_position
  point.git_hash = current_build.chromium_revision

  if build_number > 0:
    # Link this data point to the preceding build so the commit range
    # (blame list) between the two builds can be computed.
    previous_build = build_util.GetBuildInfo(master_name, builder_name,
                                             build_number - 1)
    point.previous_build_commit_position = previous_build.commit_position
    point.previous_build_git_hash = previous_build.chromium_revision
    point.blame_list = _GetCommitsBetweenRevisions(
        previous_build.chromium_revision, current_build.chromium_revision)
  else:
    # Build 0 has no predecessor; fall back to the build's own blame list.
    point.blame_list = current_build.blame_list

  analysis.data_points.append(point)

  # TODO(lijeffrey): Determine whether or not this flake swarming task
  # was a cache hit (already ran results for more iterations than were
  # requested) and update results['cache_hit'] accordingly.
  analysis.swarming_rerun_results.append(
      flake_swarming_task.GetFlakeSwarmingTaskData())
  analysis.put()
def _SaveLastAttemptedSwarmingTask(self, master_name, builder_name,
                                   build_number, step_name, task_id, *args):
  """Saves the last-attempted swarming task id onto the matching analysis."""
  # The trailing varargs carry the identifiers of the versioned analysis.
  master_build_number, test_name, version_number = args

  flake_analysis = MasterFlakeAnalysis.GetVersion(
      master_name, builder_name, master_build_number, step_name, test_name,
      version=version_number)
  flake_analysis.last_attempted_build_number = build_number
  flake_analysis.last_attempted_swarming_task_id = task_id
  flake_analysis.put()
def _LogUnexpectedAbort(self):
  """Marks the analysis as errored if this pipeline aborted unexpectedly."""
  if not self.was_aborted:
    return

  flake_analysis = MasterFlakeAnalysis.GetVersion(
      self.master_name, self.builder_name, self.triggering_build_number,
      self.step_name, self.test_name, version=self.version_number)
  # Nothing to record if the analysis is missing or already finished.
  if not flake_analysis or flake_analysis.completed:
    return

  flake_analysis.status = analysis_status.ERROR
  flake_analysis.result_status = None
  # Keep any error already recorded; only fill in a generic one if absent.
  if not flake_analysis.error:
    flake_analysis.error = {
        'error': 'RecursiveFlakePipeline was aborted unexpectedly',
        'message': 'RecursiveFlakePipeline was aborted unexpectedly'
    }
  flake_analysis.put()
def testCheckTestsRunStatusesWhenTestDoesNotExist(self, mocked_fn, _):
  """Checks run-status handling for a test absent from the failure log.

  The swarming task should record zero tries/successes and the analysis
  should get a data point with a negative (sentinel) pass rate.
  """
  # Stub out the build-info lookup with a fixed commit position/revision.
  mocked_build = BuildInfo(self.master_name, self.builder_name,
                           self.build_number)
  mocked_build.commit_position = 12345
  mocked_build.chromium_revision = 'a1b2c3d4'
  mocked_fn.return_value = mocked_build

  test_name = 'TestSuite1.new_test'

  analysis = MasterFlakeAnalysis.Create(self.master_name, self.builder_name,
                                        self.build_number, self.step_name,
                                        test_name)
  analysis.Save()

  task = FlakeSwarmingTask.Create(self.master_name, self.builder_name,
                                  self.build_number, self.step_name,
                                  test_name)
  task.put()

  result_pipeline = ProcessFlakeSwarmingTaskResultPipeline()
  statuses = result_pipeline._CheckTestsRunStatuses(
      base_test._SAMPLE_FAILURE_LOG, self.master_name, self.builder_name,
      self.build_number, self.step_name, self.build_number, test_name,
      self.version_number)
  self.assertEqual(base_test._EXPECTED_TESTS_STATUS, statuses)

  # A test missing from the log should accumulate no tries or successes.
  task = FlakeSwarmingTask.Get(self.master_name, self.builder_name,
                               self.build_number, self.step_name, test_name)
  self.assertEqual(0, task.tries)
  self.assertEqual(0, task.successes)

  # The appended data point should carry a negative sentinel pass rate.
  analysis = MasterFlakeAnalysis.GetVersion(self.master_name,
                                            self.builder_name,
                                            self.build_number,
                                            self.step_name, test_name,
                                            self.version_number)
  self.assertTrue(analysis.data_points[-1].pass_rate < 0)
def run(self, master_name, builder_name, triggering_build_number,
        current_build_number, step_name, test_name, version_number,
        step_metadata=None, use_nearby_neighbor=False,
        manually_triggered=False):
  """Decides the next build to analyze, or finalizes the flake analysis.

  Reads the swarming task result for current_build_number, feeds the
  accumulated data points to the lookback algorithm, and then either:
  - recurses via a new RecursiveFlakePipeline on the next build number,
  - reruns the current point with more iterations, or
  - finalizes the analysis (optionally kicking off try jobs against the
    suspected build's blame list) and updates the associated bug.
  """
  # Get MasterFlakeAnalysis success list corresponding to parameters.
  analysis = MasterFlakeAnalysis.GetVersion(
      master_name, builder_name, triggering_build_number, step_name,
      test_name, version=version_number)

  flake_swarming_task = FlakeSwarmingTask.Get(
      master_name, builder_name, current_build_number, step_name, test_name)

  # Don't call another pipeline if we fail.
  if flake_swarming_task.status == analysis_status.ERROR:
    # Report the last flake swarming task's error that it encountered.
    # TODO(lijeffrey): Another neighboring swarming task may be needed in
    # this one's place instead of failing altogether.
    error = flake_swarming_task.error or {
        'error': 'Swarming task failed',
        'message': 'The last swarming task did not complete as expected'
    }

    _UpdateAnalysisStatusUponCompletion(analysis, None,
                                        analysis_status.ERROR, error)
    logging.error('Error in Swarming task')
    yield UpdateFlakeBugPipeline(analysis.key.urlsafe())
    return

  if not analysis.algorithm_parameters:
    # Uses analysis' own algorithm_parameters.
    flake_settings = waterfall_config.GetCheckFlakeSettings()
    analysis.algorithm_parameters = flake_settings
    analysis.put()
  algorithm_settings = analysis.algorithm_parameters.get('swarming_rerun')

  data_points = _NormalizeDataPoints(analysis.data_points)
  # Figure out what build_number to trigger a swarming rerun on next, if any.
  next_build_number, suspected_build, iterations_to_rerun = (
      lookback_algorithm.GetNextRunPointNumber(data_points,
                                               algorithm_settings))
  if iterations_to_rerun:
    # Need to rerun the first build with more iterations.
    _UpdateIterationsToRerun(analysis, iterations_to_rerun)
    _RemoveRerunBuildDataPoint(analysis, next_build_number)
    analysis.put()

  max_build_numbers_to_look_back = algorithm_settings.get(
      'max_build_numbers_to_look_back', _DEFAULT_MAX_BUILD_NUMBERS)
  last_build_number = max(
      0, triggering_build_number - max_build_numbers_to_look_back)

  # Out of lookback range (and no rerun pending) means the regression range
  # search is done.
  if ((next_build_number < last_build_number or
       next_build_number >= triggering_build_number) and
      not iterations_to_rerun):  # Finished.
    build_confidence_score = None
    if suspected_build is not None:
      # Use steppiness as the confidence score.
      build_confidence_score = confidence.SteppinessForBuild(
          analysis.data_points, suspected_build)

    # Update suspected build and the confidence score.
    _UpdateAnalysisStatusUponCompletion(
        analysis, suspected_build, analysis_status.COMPLETED, None,
        build_confidence_score=build_confidence_score)

    if build_confidence_score is None:
      logging.info(('Skipping try jobs due to no suspected flake build '
                    'being identified'))
    elif not _HasSufficientConfidenceToRunTryJobs(analysis):
      logging.info(('Skipping try jobs due to insufficient confidence in '
                    'suspected build'))
    else:
      # Hook up with try-jobs. Based on analysis of historical data, 60%
      # confidence could filter out almost all false positives.
      suspected_build_point = analysis.GetDataPointOfSuspectedBuild()
      assert suspected_build_point

      blamed_cls, lower_bound = _GetFullBlamedCLsAndLowerBound(
          suspected_build_point, analysis.data_points)

      if blamed_cls:
        if len(blamed_cls) > 1:
          # Multiple suspects: bisect the regression range with try jobs.
          logging.info('Running try-jobs against commits in regressions')
          start_commit_position = suspected_build_point.commit_position - 1
          start_revision = blamed_cls[start_commit_position]
          build_info = build_util.GetBuildInfo(master_name, builder_name,
                                               triggering_build_number)
          parent_mastername = build_info.parent_mastername or master_name
          parent_buildername = build_info.parent_buildername or builder_name
          cache_name = swarming_util.GetCacheName(parent_mastername,
                                                  parent_buildername)
          dimensions = waterfall_config.GetTrybotDimensions(
              parent_mastername, parent_buildername)
          yield RecursiveFlakeTryJobPipeline(
              analysis.key.urlsafe(), start_commit_position, start_revision,
              lower_bound, cache_name, dimensions)
          return  # No update to bug yet.
        else:
          # Exactly one commit in range: it is the culprit; no try job
          # bisection needed.
          logging.info('Single commit in the blame list of suspected build')
          culprit_confidence_score = confidence.SteppinessForCommitPosition(
              analysis.data_points, suspected_build_point.commit_position)
          culprit = recursive_flake_try_job_pipeline.CreateCulprit(
              suspected_build_point.git_hash,
              suspected_build_point.commit_position,
              culprit_confidence_score)
          # NOTE(review): this and the error path below call
          # UpdateAnalysisUponCompletion (no leading underscore), while the
          # paths above call _UpdateAnalysisStatusUponCompletion; presumably
          # a distinct imported helper that also records the culprit —
          # confirm it is in scope at module level.
          UpdateAnalysisUponCompletion(analysis, culprit,
                                       analysis_status.COMPLETED, None)
      else:
        logging.error('Cannot run flake try jobs against empty blame list')
        error = {
            'error': 'Could not start try jobs',
            'message': 'Empty blame list'
        }
        UpdateAnalysisUponCompletion(analysis, None, analysis_status.ERROR,
                                     error)

    yield UpdateFlakeBugPipeline(analysis.key.urlsafe())
    return

  # Not finished: recurse onto the next build number.
  pipeline_job = RecursiveFlakePipeline(
      master_name, builder_name, next_build_number, step_name, test_name,
      version_number, triggering_build_number, step_metadata=step_metadata,
      manually_triggered=manually_triggered,
      use_nearby_neighbor=use_nearby_neighbor,
      step_size=(current_build_number - next_build_number))
  # Disable attribute 'target' defined outside __init__ pylint warning,
  # because pipeline generates its own __init__ based on run function.
  pipeline_job.target = (  # pylint: disable=W0201
      appengine_util.GetTargetNameForModule(constants.WATERFALL_BACKEND))
  pipeline_job.start(queue_name=self.queue_name or constants.DEFAULT_QUEUE)
def run(self, master_name, builder_name, preferred_run_build_number,
        step_name, test_name, version_number, triggering_build_number,
        step_metadata=None, manually_triggered=False,
        use_nearby_neighbor=False, step_size=0, retries=0):
  """Pipeline to determine the regression range of a flaky test.

  Args:
    master_name (str): The master name.
    builder_name (str): The builder name.
    preferred_run_build_number (int): The build number the check flake
      algorithm should perform a swarming rerun on, but may be overridden to
      use the results of a nearby neighbor if use_nearby_neighbor is True.
    step_name (str): The step name.
    test_name (str): The test name.
    version_number (int): The version to save analysis results and data to.
    triggering_build_number (int): The build number that triggered this
      analysis.
    step_metadata (dict): Step_metadata for the test.
    manually_triggered (bool): True if the analysis is from manual request,
      like by a Chromium sheriff.
    use_nearby_neighbor (bool): Whether the optimization for using the
      swarming results of a nearby build number, if available, should be
      used in place of triggering a new swarming task on
      preferred_run_build_number.
    step_size (int): The difference in build numbers since the last call to
      RecursiveFlakePipeline to determine the bounds for how far a nearby
      build's swarming task results should be used. Only relevant if
      use_nearby_neighbor is True.
    retries (int): Number of retries of this pipeline. If retries exceeds
      the _MAX_RETRY_TIMES, start this pipeline off peak hours.

  Returns:
    A dict of lists for reliable/flaky tests.
  """
  # If retries has not exceeded max count and there are available bots,
  # we can start the analysis.
  can_start_analysis = (self._BotsAvailableForTask(step_metadata)
                       if retries <= _MAX_RETRY_TIMES else True)

  if not can_start_analysis:
    # No bots available: reschedule this same pipeline, either with a delay
    # or (once retries are exhausted) off PST peak hours.
    retries += 1

    pipeline_job = RecursiveFlakePipeline(
        master_name, builder_name, preferred_run_build_number, step_name,
        test_name, version_number, triggering_build_number, step_metadata,
        manually_triggered=manually_triggered,
        use_nearby_neighbor=use_nearby_neighbor, step_size=step_size,
        retries=retries)
    # Disable attribute 'target' defined outside __init__ pylint warning,
    # because pipeline generates its own __init__ based on run function.
    pipeline_job.target = (  # pylint: disable=W0201
        appengine_util.GetTargetNameForModule(constants.WATERFALL_BACKEND))

    if retries > _MAX_RETRY_TIMES:
      pipeline_job._StartOffPSTPeakHours(
          queue_name=self.queue_name or constants.DEFAULT_QUEUE)
      # Fixed typo: 'Retrys' -> 'Retries'.
      logging.info('Retries exceed max count, RecursiveFlakePipeline on '
                   'MasterFlakeAnalysis %s/%s/%s/%s/%s will start off peak '
                   'hour', master_name, builder_name,
                   triggering_build_number, step_name, test_name)
    else:
      pipeline_job._RetryWithDelay(
          queue_name=self.queue_name or constants.DEFAULT_QUEUE)
      countdown = retries * _BASE_COUNT_DOWN_SECONDS
      # The space before '%d' is required: without it the two
      # implicitly-concatenated literals rendered 'tried after300 seconds'.
      logging.info('No available swarming bots, RecursiveFlakePipeline on '
                   'MasterFlakeAnalysis %s/%s/%s/%s/%s will be tried after '
                   '%d seconds', master_name, builder_name,
                   triggering_build_number, step_name, test_name, countdown)
  else:
    # Bots are available or pipeline starts off peak hours, trigger the task.
    flake_analysis = MasterFlakeAnalysis.GetVersion(
        master_name, builder_name, triggering_build_number, step_name,
        test_name, version=version_number)

    logging.info('Running RecursiveFlakePipeline on MasterFlakeAnalysis'
                 ' %s/%s/%s/%s/%s', master_name, builder_name,
                 triggering_build_number, step_name, test_name)
    logging.info('MasterFlakeAnalysis %s version %s', flake_analysis,
                 version_number)

    if flake_analysis.status != analysis_status.RUNNING:  # pragma: no branch
      flake_analysis.status = analysis_status.RUNNING
      flake_analysis.start_time = time_util.GetUTCNow()
      flake_analysis.put()

    # TODO(lijeffrey): Allow custom parameters supplied by user.
    iterations = flake_analysis.algorithm_parameters.get(
        'swarming_rerun', {}).get('iterations_to_rerun', 100)
    hard_timeout_seconds = _GetHardTimeoutSeconds(
        master_name, builder_name, triggering_build_number, step_name,
        iterations)
    actual_run_build_number = _GetBestBuildNumberToRun(
        master_name, builder_name, preferred_run_build_number, step_name,
        test_name, step_size, iterations) if use_nearby_neighbor else (
            preferred_run_build_number)

    # Call trigger pipeline (flake style).
    task_id = yield TriggerFlakeSwarmingTaskPipeline(
        master_name, builder_name, actual_run_build_number, step_name,
        [test_name], iterations, hard_timeout_seconds)

    with pipeline.InOrder():
      yield ProcessFlakeSwarmingTaskResultPipeline(
          master_name, builder_name, actual_run_build_number, step_name,
          task_id, triggering_build_number, test_name, version_number)
      yield NextBuildNumberPipeline(
          master_name, builder_name, triggering_build_number,
          actual_run_build_number, step_name, test_name, version_number,
          step_metadata=step_metadata,
          use_nearby_neighbor=use_nearby_neighbor,
          manually_triggered=manually_triggered)
def HandleGet(self):
  """Renders the result page of a flake analysis.

  The analysis is resolved either directly from a datastore key ('key'
  query parameter), or from a build url plus step/test names, in which case
  a new analysis may be scheduled first for permitted users.
  """
  key = self.request.get('key')
  if key:
    # Direct lookup of an existing analysis by its urlsafe datastore key.
    analysis = ndb.Key(urlsafe=key).get()
    if not analysis:  # pragma: no cover
      return self.CreateError('Analysis of flake is not found', 404)
  else:
    # No key: identify the flake from build url + step/test names,
    # scheduling a new analysis if needed and permitted.
    build_url = self.request.get('url', '').strip()
    build_info = buildbot.ParseBuildUrl(build_url)
    if not build_info:  # pragma: no cover
      return self.CreateError('Unknown build info!', 400)
    master_name, builder_name, build_number = build_info

    step_name = self.request.get('step_name', '').strip()
    test_name = self.request.get('test_name', '').strip()
    bug_id = self.request.get('bug_id', '').strip()
    # TODO(lijeffrey): Add support for force flag to trigger a rerun.

    error = self._ValidateInput(step_name, test_name, bug_id)
    if error:  # pragma: no cover
      return error

    build_number = int(build_number)
    bug_id = int(bug_id) if bug_id else None
    user_email = auth_util.GetUserEmail()
    is_admin = auth_util.IsCurrentUserAdmin()

    request = FlakeAnalysisRequest.Create(test_name, False, bug_id)
    request.AddBuildStep(master_name, builder_name, build_number, step_name,
                         time_util.GetUTCNow())
    # `scheduled` is None when the current user may not trigger an analysis.
    scheduled = flake_analysis_service.ScheduleAnalysisForFlake(
        request, user_email, is_admin, triggering_sources.FINDIT_UI)

    analysis = MasterFlakeAnalysis.GetVersion(master_name, builder_name,
                                              build_number, step_name,
                                              test_name)

    if not analysis:
      if scheduled is None:
        # User does not have permission to trigger, nor was any previous
        # analysis triggered to view.
        return {
            'template': 'error.html',
            'data': {
                'error_message':
                    ('You could schedule an analysis for flaky test only '
                     'after you login with google.com account.'),
                'login_url': self.GetLoginUrl(),
            },
            'return_code': 401,
        }

      # Check if a previous request has already covered this analysis so use
      # the results from that analysis.
      request = FlakeAnalysisRequest.GetVersion(key=test_name)
      if not (request and request.analyses):
        return {
            'template': 'error.html',
            'data': {
                'error_message':
                    ('Flake analysis is not supported for this request. '
                     'Either'
                     ' the build step may not be supported or the test is '
                     'not '
                     'swarmed.'),
            },
            'return_code': 400,
        }

      analysis = request.FindMatchingAnalysisForConfiguration(
          master_name, builder_name)
      if not analysis:  # pragma: no cover
        logging.error('Flake analysis was deleted unexpectedly!')
        return {
            'template': 'error.html',
            'data': {
                'error_message': 'Flake analysis was deleted unexpectedly!',
            },
            'return_code': 400
        }

  # At this point `analysis` exists; assemble the data for the template.
  suspected_flake = _GetSuspectedFlakeInfo(analysis)
  culprit = _GetCulpritInfo(analysis)
  build_level_number, revision_level_number = _GetNumbersOfDataPointGroups(
      analysis.data_points)

  data = {
      'key': analysis.key.urlsafe(),
      'master_name': analysis.master_name,
      'builder_name': analysis.builder_name,
      'build_number': analysis.build_number,
      'step_name': analysis.step_name,
      'test_name': analysis.test_name,
      'pass_rates': [],
      'analysis_status': analysis.status_description,
      'try_job_status':
          analysis_status.STATUS_TO_DESCRIPTION.get(analysis.try_job_status),
      'last_attempted_swarming_task':
          _GetLastAttemptedSwarmingTaskDetails(analysis),
      'last_attempted_try_job': _GetLastAttemptedTryJobDetails(analysis),
      'version_number': analysis.version_number,
      'suspected_flake': suspected_flake,
      'culprit': culprit,
      'request_time': time_util.FormatDatetime(analysis.request_time),
      'build_level_number': build_level_number,
      'revision_level_number': revision_level_number,
      'error': analysis.error_message,
      'iterations_to_rerun': analysis.iterations_to_rerun,
      'show_input_ui': self._ShowInputUI(analysis)
  }

  # Triage history is admin-only and shown only for completed analyses.
  if (users.is_current_user_admin() and analysis.completed and
      analysis.triage_history):
    data['triage_history'] = analysis.GetTriageHistory()

  data['pending_time'] = time_util.FormatDuration(
      analysis.request_time, analysis.start_time or time_util.GetUTCNow())
  # Duration is only meaningful once the analysis has started.
  if analysis.status != analysis_status.PENDING:
    data['duration'] = time_util.FormatDuration(
        analysis.start_time, analysis.end_time or time_util.GetUTCNow())

  data['pass_rates'] = _GetCoordinatesData(analysis)

  return {'template': 'flake/result.html', 'data': data}
def _NeedANewAnalysis(normalized_test, original_test, flake_settings,
                      bug_id=None, allow_new_analysis=False, force=False,
                      user_email='',
                      triggering_source=triggering_sources.FINDIT_PIPELINE):
  """Checks status of analysis for the test and decides if a new one is needed.

  A MasterFlakeAnalysis entity for the given parameters will be created if
  none exists. When a new analysis is needed, this function will create and
  save a MasterFlakeAnalysis entity to the datastore.

  Args:
    normalized_test (TestInfo): Info of the normalized flaky test after
      mapping a CQ trybot step to a Waterfall buildbot step, striping prefix
      "PRE_" from a gtest, etc.
    original_test (TestInfo): Info of the original flaky test.
    flake_settings (dict): The flake settings run on this analysis.
    bug_id (int): The monorail bug id to update when analysis is done.
    allow_new_analysis (bool): Indicate whether a new analysis is allowed.
    force (bool): Indicate whether to force a rerun of current analysis.
    user_email (str): The user triggering this analysis.
    triggering_source (int): The source from which this analysis was
      triggered.

  Returns:
    (need_new_analysis, analysis)
    need_new_analysis (bool): True if an analysis is needed, otherwise False.
    analysis (MasterFlakeAnalysis): The MasterFlakeAnalysis entity.
  """
  existing = MasterFlakeAnalysis.GetVersion(normalized_test.master_name,
                                            normalized_test.builder_name,
                                            normalized_test.build_number,
                                            normalized_test.step_name,
                                            normalized_test.test_name)

  def _ResetAndPopulate(target):
    # Reinitialize the analysis with the information of this request.
    target.Reset()
    target.request_time = time_util.GetUTCNow()
    target.status = analysis_status.PENDING
    target.algorithm_parameters = flake_settings
    target.version = appengine_util.GetCurrentVersion()
    target.triggering_user_email = user_email
    target.triggering_user_email_obscured = False
    target.triggering_source = triggering_source
    target.original_master_name = original_test.master_name
    target.original_builder_name = original_test.builder_name
    target.original_build_number = original_test.build_number
    target.original_step_name = original_test.step_name
    target.original_test_name = original_test.test_name
    target.bug_id = bug_id

  if not existing:
    # No analysis yet; create one only if the caller is allowed to.
    if not allow_new_analysis:
      return False, None
    new_analysis = MasterFlakeAnalysis.Create(normalized_test.master_name,
                                              normalized_test.builder_name,
                                              normalized_test.build_number,
                                              normalized_test.step_name,
                                              normalized_test.test_name)
    _ResetAndPopulate(new_analysis)
    _, saved = new_analysis.Save()
    return saved, new_analysis

  if existing.status in (analysis_status.PENDING, analysis_status.RUNNING):
    # An analysis is already underway; reuse it as-is.
    return False, existing

  if (allow_new_analysis and force and
      existing.status in (analysis_status.ERROR,
                          analysis_status.COMPLETED)):
    # Forced rerun of a finished (or failed) analysis.
    _ResetAndPopulate(existing)
    _, saved = existing.Save()
    return saved, existing

  return False, existing