def FindMatchingWaterfallStep(build_step, test_name):
  """Finds the matching Waterfall step and checks whether it is supported.

  Only Swarmed and gtest-based steps are supported at the moment.

  Args:
    build_step (BuildStep): A build step on Waterfall or Commit Queue. It
        will be updated with the matching Waterfall step and whether it is
        Swarmed and supported.
    test_name (str): The name of the test.
  """
  build_step.swarmed = False
  build_step.supported = False

  http_client = HttpClientAppengine()

  if build_step.on_cq:
    wf_master_name, wf_builder_name, wf_build_number, wf_step_name, metadata = (
        _GetMatchingWaterfallBuildStep(build_step, http_client))

    build_step.wf_master_name = wf_master_name
    build_step.wf_builder_name = wf_builder_name
    build_step.wf_build_number = wf_build_number
    build_step.wf_step_name = wf_step_name

    if not build_step.has_matching_waterfall_step:
      return
  else:
    build_step.wf_master_name = build_step.master_name
    build_step.wf_builder_name = build_step.builder_name
    build_step.wf_build_number = build_step.build_number
    build_step.wf_step_name = build_step.step_name
    metadata = buildbot.GetStepLog(
        build_step.master_name, build_step.builder_name,
        build_step.build_number, build_step.step_name, http_client,
        'step_metadata')
    if not metadata:
      logging.error('Couldn\'t get step_metadata')
      return

  # Query Swarming for isolated data.
  build_step.swarmed = bool(metadata.get('swarm_task_ids'))

  if build_step.swarmed:
    # Retrieve a sample output from Isolate.
    task_id = metadata['swarm_task_ids'][0]
    output = swarming_util.GetIsolatedOutputForTask(task_id, http_client)
    if output:
      # Guess from the format.
      build_step.supported = (
          isinstance(output, dict) and
          isinstance(output.get('all_tests'), list) and
          test_name in output.get('all_tests', []) and
          isinstance(output.get('per_iteration_data'), list) and
          all(isinstance(i, dict)
              for i in output.get('per_iteration_data')))
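
# The "supported" check above is essentially a structural sniff of
# gtest-style isolated output. Below is a minimal, self-contained sketch of
# that same check; the sample payload is made up for illustration and only
# mirrors the two fields the code above reads ('all_tests' and
# 'per_iteration_data').
def _LooksLikeSupportedGtestOutput(output, test_name):
  return (
      isinstance(output, dict) and
      isinstance(output.get('all_tests'), list) and
      test_name in output.get('all_tests', []) and
      isinstance(output.get('per_iteration_data'), list) and
      all(isinstance(i, dict) for i in output.get('per_iteration_data')))


assert _LooksLikeSupportedGtestOutput(
    {
        'all_tests': ['SuiteA.Test1', 'SuiteA.Test2'],
        'per_iteration_data': [{'SuiteA.Test1': [{'status': 'FAILURE'}]}],
    },
    'SuiteA.Test1')
assert not _LooksLikeSupportedGtestOutput({'all_tests': None}, 'SuiteA.Test1')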
def run(self, failure_info):
  """Extracts failure signals from failed steps.

  Args:
    failure_info (dict): Output of pipeline DetectFirstFailurePipeline.run().

  Returns:
    A dict like below:
    {
      'step_name1': waterfall.failure_signal.FailureSignal.ToDict(),
      ...
    }
  """
  signals = {}
  if not failure_info['failed'] or not failure_info['chromium_revision']:
    # Bail out if there is no failed step or no chromium revision.
    return signals

  # Bail out on infra failure.
  if failure_info.get('failure_type') == failure_type.INFRA:
    return signals

  master_name = failure_info['master_name']
  builder_name = failure_info['builder_name']
  build_number = failure_info['build_number']

  for step_name in failure_info.get('failed_steps', []):
    if not waterfall_config.StepIsSupportedForMaster(step_name, master_name):
      # Bail out if the step is not supported.
      continue

    step = WfStep.Get(master_name, builder_name, build_number, step_name)
    if step and step.log_data:
      failure_log = step.log_data
    else:
      # TODO: do test-level analysis instead of step-level.
      # TODO: Use swarming test results instead of archived gtest results.
      gtest_result = buildbot.GetGtestResultLog(
          master_name, builder_name, build_number, step_name)
      if gtest_result:
        failure_log = _GetReliableTestFailureLog(gtest_result)

      if gtest_result is None or failure_log == 'invalid':
        if not lock_util.WaitUntilDownloadAllowed(
            master_name):  # pragma: no cover
          raise pipeline.Retry(
              'Failed to pull log of step %s of master %s' %
              (step_name, master_name))
        try:
          failure_log = buildbot.GetStepLog(
              master_name, builder_name, build_number, step_name,
              self.HTTP_CLIENT)
        except ResponseTooLargeError:  # pragma: no cover.
          logging.exception(
              'Log of step "%s" is too large for urlfetch.', step_name)
          # If the stdio log of a step is too large, we don't want to pull
          # it again in the next run, because that might lead to DDoS to
          # the master.
          # TODO: Use archived stdio logs in Google Storage instead.
          failure_log = 'Stdio log is too large for urlfetch.'

      if not failure_log:  # pragma: no cover
        raise pipeline.Retry(
            'Failed to pull stdio of step %s of master %s' %
            (step_name, master_name))

      # Save the step log in datastore to avoid downloading it again during
      # a retry.
      if not step:  # pragma: no cover
        step = WfStep.Create(master_name, builder_name, build_number,
                             step_name)

      step.log_data = _ExtractStorablePortionOfLog(failure_log)

      try:
        step.put()
      except Exception as e:  # pragma: no cover
        # Sometimes the step log is too large to save in datastore.
        logging.exception(e)

    # TODO: save result in datastore?
    if step.isolated:
      try:
        json_failure_log = (
            json.loads(failure_log) if failure_log != 'flaky' else {})
      except ValueError:  # pragma: no cover
        json_failure_log = {}
        logging.warning('failure_log %s is not valid JSON.', failure_log)

      signals[step_name] = {'tests': {}}
      step_signal = FailureSignal()

      for test_name, test_failure_log in json_failure_log.iteritems():
        signals[step_name]['tests'][test_name] = extractors.ExtractSignal(
            master_name, builder_name, step_name, test_name,
            base64.b64decode(test_failure_log)).ToDict()

        # Merge the signals of each test failure log into the step level.
        step_signal.MergeFrom(signals[step_name]['tests'][test_name])

      signals[step_name]['files'] = step_signal.files
      signals[step_name]['keywords'] = step_signal.keywords
    else:
      signals[step_name] = extractors.ExtractSignal(
          master_name, builder_name, step_name, None, failure_log).ToDict()

  return signals
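
# For isolated (Swarmed) steps, the stored log_data above is a JSON object
# mapping each failed test name to its base64-encoded failure log, which is
# why the loop b64-decodes every entry before extracting a signal. A
# self-contained sketch of just that decode step follows; the sample data is
# made up for illustration.
import base64
import json

sample_log_data = json.dumps({
    'SuiteA.Test1': base64.b64encode('FAILED ../../a/b.cc:12\n'),
})
decoded = {
    test_name: base64.b64decode(encoded_log)
    for test_name, encoded_log in json.loads(sample_log_data).iteritems()
}
assert decoded['SuiteA.Test1'].startswith('FAILED')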
def _callback(self, callback_params, pipeline_id=None):
  """Updates the TryJobData entities with status from buildbucket."""
  # callback_params may have been serialized if the callback was converted
  # to a URL.
  if isinstance(callback_params, basestring):
    callback_params = json.loads(callback_params)

  self.last_params = callback_params

  _ = pipeline_id  # We do nothing with this id.

  try_job_id = callback_params['try_job_id']
  assert try_job_id

  urlsafe_try_job_key = callback_params['urlsafe_try_job_key']
  try_job_type = callback_params['try_job_type']
  deadline = callback_params['deadline']
  already_set_started = callback_params['already_set_started']
  error_count = callback_params['error_count']
  max_error_times = callback_params['max_error_times']
  default_pipeline_wait_seconds = callback_params[
      'default_pipeline_wait_seconds']
  timeout_hours = callback_params['timeout_hours']
  backoff_time = callback_params['backoff_time']

  if try_job_type == failure_type.FLAKY_TEST:
    try_job_data = FlakeTryJobData.Get(try_job_id)
  else:
    try_job_data = WfTryJobData.Get(try_job_id)

  error, build = buildbucket_client.GetTryJobs([try_job_id])[0]

  if error:
    if error_count < max_error_times:
      error_count += 1
      self.delay_callback(
          backoff_time,
          callback_params={
              'try_job_id': try_job_id,
              'try_job_type': try_job_type,
              'urlsafe_try_job_key': urlsafe_try_job_key,
              'deadline': deadline,
              'already_set_started': already_set_started,
              'error_count': error_count,
              'max_error_times': max_error_times,
              'default_pipeline_wait_seconds': default_pipeline_wait_seconds,
              'timeout_hours': timeout_hours,
              'backoff_time': backoff_time * 2,
          })
      return
    else:  # pragma: no cover
      # Buildbucket has responded with errors more than max_error_times
      # times in a row; retry the pipeline.
      _UpdateTryJobMetadata(try_job_data, try_job_type, build, error, False)
      raise pipeline.Retry(
          'Error "%s" occurred. Reason: "%s"' %
          (error.message, error.reason))
  elif build.status == BuildbucketBuild.COMPLETED:
    swarming_task_id = buildbot.GetSwarmingTaskIdFromUrl(build.url)

    if swarming_task_id:
      try:
        report = json.loads(swarming_util.GetStepLog(
            try_job_id, 'report', HttpClientAppengine(), 'report'))
      except (ValueError, TypeError) as e:  # pragma: no cover
        report = {}
        logging.exception(
            'Failed to load result report for swarming/%s '
            'due to exception %s.' % (swarming_task_id, e.message))
    else:
      try_job_master_name, try_job_builder_name, try_job_build_number = (
          buildbot.ParseBuildUrl(build.url))

      try:
        report = json.loads(buildbot.GetStepLog(
            try_job_master_name, try_job_builder_name, try_job_build_number,
            'report', HttpClientAppengine(), 'report'))
      except (ValueError, TypeError) as e:  # pragma: no cover
        report = {}
        logging.exception(
            'Failed to load result report for %s/%s/%s due to exception %s.'
            % (try_job_master_name, try_job_builder_name,
               try_job_build_number, e.message))

    _UpdateTryJobMetadata(
        try_job_data, try_job_type, build, error, False,
        report if report else {})
    result_to_update = self._UpdateTryJobResult(
        urlsafe_try_job_key, try_job_type, try_job_id, build.url,
        BuildbucketBuild.COMPLETED, report)
    self.complete(result_to_update[-1])
    return
  else:
    error_count = 0
    backoff_time = default_pipeline_wait_seconds
    if build.status == BuildbucketBuild.STARTED and not already_set_started:
      # It is possible this branch is skipped if a fast build goes from
      # 'SCHEDULED' to 'COMPLETED' between queries, so start_time may be
      # unavailable.
      start_time = time_util.MicrosecondsToDatetime(build.updated_time)
      self._UpdateTryJobResult(
          urlsafe_try_job_key, try_job_type, try_job_id, build.url,
          BuildbucketBuild.STARTED)

      already_set_started = True

      # Update as much try job metadata as soon as possible to avoid data
      # loss in case of errors.
      try_job_data.start_time = start_time
      try_job_data.request_time = (
          time_util.MicrosecondsToDatetime(build.request_time))
      try_job_data.try_job_url = build.url
      try_job_data.callback_url = self.get_callback_url(
          callback_params=json.dumps({
              'try_job_id': try_job_id,
              'try_job_type': try_job_type,
              'urlsafe_try_job_key': urlsafe_try_job_key,
              'deadline': deadline,
              'already_set_started': already_set_started,
              'error_count': error_count,
              'max_error_times': max_error_times,
              'default_pipeline_wait_seconds': default_pipeline_wait_seconds,
              'timeout_hours': timeout_hours,
              'backoff_time': backoff_time,
          }))
      try_job_data.put()

  if time.time() > deadline:  # pragma: no cover
    _UpdateTryJobMetadata(try_job_data, try_job_type, build, error, True)
    # Explicitly abort the whole pipeline.
    raise pipeline.Abort(
        'Try job %s timed out after %d hours.' %
        (try_job_id, timeout_hours))

  # Ensure last_buildbucket_response is always the most recent
  # whenever available during intermediate queries.
  _UpdateLastBuildbucketResponse(try_job_data, build)
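
# A minimal sketch (not Findit code) of the error-retry schedule implemented
# above: each Buildbucket error re-arms the callback after backoff_time
# seconds and doubles backoff_time for the next attempt, until
# max_error_times attempts have been used. The parameter values below are
# hypothetical examples.
def _ErrorBackoffSchedule(default_pipeline_wait_seconds, max_error_times):
  delays = []
  backoff_time = default_pipeline_wait_seconds
  for _ in range(max_error_times):
    delays.append(backoff_time)
    backoff_time *= 2
  return delays


assert _ErrorBackoffSchedule(60, 5) == [60, 120, 240, 480, 960]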
def testGetStepLogStdioIfNoStream(self, *_):
  self.assertEqual(
      'log1\nlog2',
      buildbot.GetStepLog(self.master_name, self.builder_name,
                          self.build_number, self.step_name,
                          self.http_client))
def testGetStepMetadataStreamNone(self, *_):
  step_metadata = buildbot.GetStepLog(
      self.master_name, self.builder_name, self.build_number,
      self.step_name, self.http_client, 'step_metadata')
  self.assertIsNone(step_metadata)
def testGetStepMetadata(self, *_):
  step_metadata = buildbot.GetStepLog(
      self.master_name, self.builder_name, self.build_number,
      self.step_name, self.http_client, 'step_metadata')
  self.assertEqual(step_metadata, wf_testcase.SAMPLE_STEP_METADATA)
def _GetMatchingWaterfallBuildStep(cq_build_step, http_client):
  """Returns the matching Waterfall build step of the given CQ one.

  Args:
    cq_build_step (BuildStep): A build step on Commit Queue.
    http_client (RetryHttpClient): A http client to send http requests.

  Returns:
    (master_name, builder_name, build_number, step_name, step_metadata)
    or None
  """
  no_matching_result = (None, None, None, None, None)

  # 0. Get step_metadata.
  step_metadata = buildbot.GetStepLog(
      cq_build_step.master_name, cq_build_step.builder_name,
      cq_build_step.build_number, cq_build_step.step_name, http_client,
      'step_metadata')
  if not step_metadata:
    logging.error('Couldn\'t get step_metadata')
    return no_matching_result

  # 1. Map a CQ trybot to the matching Waterfall buildbot:
  # get master_name and builder_name.
  wf_master_name = step_metadata.get('waterfall_mastername')
  wf_builder_name = step_metadata.get('waterfall_buildername')
  if not wf_master_name or not wf_builder_name:
    # Either waterfall_mastername or waterfall_buildername doesn't exist.
    logging.info('%s/%s has no matching Waterfall buildbot',
                 cq_build_step.master_name, cq_build_step.builder_name)
    return no_matching_result  # No matching Waterfall buildbot.

  # 2. Get the "name" of the CQ trybot step, i.e. the name of the step in
  # the tags of a Swarming task. The step name itself can't be used for
  # matching, as the CQ one ends with "(with patch)" while the Waterfall
  # one doesn't.
  name = step_metadata.get('canonical_step_name')
  # The OS on which the test runs. The same test binary might run on two
  # different OS platforms.
  os_name = step_metadata.get('dimensions', {}).get('os')
  if not name or not os_name:
    logging.error('Couldn\'t find name/os')
    return no_matching_result  # No name or OS of the step.

  # TODO: cache and throttle QPS to the same master.
  # 3. Retrieve the latest completed build cycle on the buildbot.
  builds = buildbot.GetRecentCompletedBuilds(
      wf_master_name, wf_builder_name, http_client)
  if not builds:
    logging.error('Couldn\'t find latest builds.')
    return no_matching_result  # No recent completed builds.

  # 4. Check whether there is a matching step.
  tasks = swarming_util.ListSwarmingTasksDataByTags(
      wf_master_name, wf_builder_name, builds[0], http_client,
      {'name': name, 'os': os_name})
  if tasks:  # A matching step is found.
    wf_step_name = swarming_util.GetTagValue(
        tasks[0].get('tags', []), 'stepname')
    logging.info('%s/%s/%s is mapped to %s/%s/%s',
                 cq_build_step.master_name, cq_build_step.builder_name,
                 cq_build_step.step_name, wf_master_name, wf_builder_name,
                 wf_step_name)
    return (wf_master_name, wf_builder_name, builds[0], wf_step_name,
            step_metadata)

  return no_matching_result
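
# A hedged, self-contained sketch of steps 1, 2 and 4 above: the mapping
# relies on step_metadata carrying the Waterfall master/builder names plus
# the canonical step name and the 'os' dimension, which together form the
# Swarming tags used for matching. The payload below is made up for
# illustration only.
sample_step_metadata = {
    'waterfall_mastername': 'chromium.linux',
    'waterfall_buildername': 'Linux Tests',
    'canonical_step_name': 'browser_tests',
    'dimensions': {'os': 'Ubuntu-14.04'},
}
swarming_tags = {
    'name': sample_step_metadata['canonical_step_name'],
    'os': sample_step_metadata['dimensions']['os'],
}
assert swarming_tags == {'name': 'browser_tests', 'os': 'Ubuntu-14.04'}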
def ScheduleAnalysisIfNeeded(
    normalized_test, original_test, bug_id=None, allow_new_analysis=False,
    force=False, manually_triggered=False, user_email=None,
    triggering_source=triggering_sources.FINDIT_PIPELINE,
    queue_name=constants.DEFAULT_QUEUE):
  """Schedules an analysis if needed and returns the MasterFlakeAnalysis.

  When the build failure was already analyzed and a new analysis is
  scheduled, the returned WfAnalysis will still have the result of the last
  completed analysis.

  Args:
    normalized_test (TestInfo): Info of the normalized flaky test after
        mapping a CQ trybot step to a Waterfall buildbot step, stripping the
        prefix "PRE_" from a gtest, etc.
    original_test (TestInfo): Info of the original flaky test.
    bug_id (int): The monorail bug id to update when the analysis is done.
    allow_new_analysis (bool): Indicates whether a new analysis is allowed.
    force (bool): Indicates whether to force a rerun of the current
        analysis.
    manually_triggered (bool): True if the analysis was requested manually,
        such as by a Chromium sheriff.
    user_email (str): The email of the user requesting the analysis.
    triggering_source (int): From where this analysis was triggered, such
        as through the Findit pipeline, the UI, or the Findit API.
    queue_name (str): The App Engine queue to run the analysis.

  Returns:
    A MasterFlakeAnalysis instance, or None if no analysis was scheduled
    and the user has no permission to trigger one.
  """
  flake_settings = waterfall_config.GetCheckFlakeSettings()
  use_nearby_neighbor = flake_settings.get('swarming_rerun', {}).get(
      'use_nearby_neighbor', False)

  need_new_analysis, analysis = _NeedANewAnalysis(
      normalized_test, original_test, flake_settings, bug_id=bug_id,
      allow_new_analysis=allow_new_analysis, force=force,
      user_email=user_email, triggering_source=triggering_source)

  if need_new_analysis:
    # _NeedANewAnalysis just created master_flake_analysis. Use the latest
    # version number and pass that along to the other pipelines for
    # updating results and data.
    logging.info(
        'A new master flake analysis was successfully saved for %s (%s) '
        'and will be captured in version %s', repr(normalized_test),
        repr(original_test), analysis.version_number)

    step_metadata = buildbot.GetStepLog(
        normalized_test.master_name, normalized_test.builder_name,
        normalized_test.build_number, normalized_test.step_name,
        HttpClientAppengine(), 'step_metadata')

    pipeline_job = RecursiveFlakePipeline(
        normalized_test.master_name, normalized_test.builder_name,
        normalized_test.build_number, normalized_test.step_name,
        normalized_test.test_name, analysis.version_number,
        triggering_build_number=normalized_test.build_number,
        step_metadata=step_metadata,
        manually_triggered=manually_triggered,
        use_nearby_neighbor=use_nearby_neighbor)
    pipeline_job.target = appengine_util.GetTargetNameForModule(
        constants.WATERFALL_BACKEND)
    pipeline_job.start(queue_name=queue_name)

  return analysis
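
# Hedged usage sketch (not from the codebase): TestInfo's exact constructor
# is assumed here, and every master/builder/step/test identifier below is a
# made-up example. A caller that has already normalized a flaky CQ test
# might look roughly like:
#
#   analysis = ScheduleAnalysisIfNeeded(
#       normalized_test=TestInfo('chromium.linux', 'Linux Tests', 123,
#                                'browser_tests', 'SuiteA.Test1'),
#       original_test=TestInfo('tryserver.chromium.linux',
#                              'linux_chromium_rel_ng', 456,
#                              'browser_tests (with patch)', 'SuiteA.Test1'),
#       bug_id=987654,
#       allow_new_analysis=True,
#       manually_triggered=True)
#   if analysis is None:
#     # No analysis was scheduled and the user had no permission to
#     # trigger one.
#     pass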