def testRelativeChangeFromZero(self):
  """Tests what happens when relative change from zero is calculated."""
  # If the first number is zero, then the result is not a number.
  self.assertEqual(0, math_utils.RelativeChange(0, 0))
  self.assertTrue(math.isnan(math_utils.RelativeChange(0, 1)))
  self.assertTrue(math.isnan(math_utils.RelativeChange(0, -1)))

def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
                                 last_broken_rev):
  # TODO(sergiyb): We assume that value has "values" key, which may not be
  # the case for failure-bisects, where there is a single value only.
  broken_means = [
      state.value['values']
      for state in rev_states[:last_broken_rev.index + 1] if state.value]
  working_means = [
      state.value['values']
      for state in rev_states[first_working_rev.index:] if state.value]

  # Flatten the lists to calculate mean of all values.
  working_mean = sum(working_means, [])
  broken_mean = sum(broken_means, [])

  # Calculate the approximate size of the regression.
  mean_of_bad_runs = math_utils.Mean(broken_mean)
  mean_of_good_runs = math_utils.Mean(working_mean)

  regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                    mean_of_bad_runs)
  if math.isnan(regression_size):
    regression_size = 'zero-to-nonzero'

  regression_std_err = math.fabs(
      math_utils.PooledStandardError([working_mean, broken_mean]) /
      max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

  # Give a "confidence" in the bisect. Currently, we consider the values of
  # only the revisions at the breaking range (last known good and first known
  # bad); see the note in the docstring for FindBreakingRange.
  confidence_params = (sum([first_working_rev.value['values']], []),
                       sum([last_broken_rev.value['values']], []))
  confidence = cls.ConfidenceScore(*confidence_params)

  bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs

  return {
      'regression_size': regression_size,
      'regression_std_err': regression_std_err,
      'confidence': confidence,
      'bad_greater_than_good': bad_greater_than_good
  }

def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
                                 last_broken_rev):
  # TODO(sergiyb): We assume that value has "values" key, which may not be
  # the case for failure-bisects, where there is a single value only.
  broken_means = [
      state.value['values']
      for state in rev_states[:last_broken_rev.index + 1] if state.value]
  working_means = [
      state.value['values']
      for state in rev_states[first_working_rev.index:] if state.value]

  # Flatten the lists to calculate mean of all values.
  working_mean = sum(working_means, [])
  broken_mean = sum(broken_means, [])

  # Calculate the approximate size of the regression.
  mean_of_bad_runs = math_utils.Mean(broken_mean)
  mean_of_good_runs = math_utils.Mean(working_mean)

  regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                    mean_of_bad_runs)
  if math.isnan(regression_size):
    regression_size = 'zero-to-nonzero'

  regression_std_err = math.fabs(
      math_utils.PooledStandardError([working_mean, broken_mean]) /
      max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

  # Give a "confidence" in the bisect culprit by seeing whether the results
  # of the culprit revision and the revision before that appear to be
  # statistically significantly different.
  confidence = cls.ConfidenceScore(
      sum([first_working_rev.value['values']], []),
      sum([last_broken_rev.value['values']], []))

  bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs

  return {
      'regression_size': regression_size,
      'regression_std_err': regression_std_err,
      'confidence': confidence,
      'bad_greater_than_good': bad_greater_than_good
  }

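# Illustrative only: a small worked example of the regression_size arithmetic
# used by _ComputeRegressionStatistics above. It assumes math_utils.Mean
# returns the arithmetic mean and that math_utils.RelativeChange behaves as
# exercised by the RelativeChange tests in this file (change measured relative
# to its first argument); the sample values below are made up.
def _ExampleRegressionSize():
  good = [10.0, 10.2, 9.8]  # Flattened 'values' from the working revisions.
  bad = [15.1, 14.9, 15.0]  # Flattened 'values' from the broken revisions.
  # Mean(good) is 10.0 and Mean(bad) is 15.0, so RelativeChange(10.0, 15.0)
  # is 0.5 and the reported regression size is 50 (percent).
  return 100 * math_utils.RelativeChange(math_utils.Mean(good),
                                         math_utils.Mean(bad))
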
def testRelativeChange_Negative(self):
  # Note that the return value of RelativeChange is always positive.
  self.assertEqual(3.0, math_utils.RelativeChange(-1, 2))
  self.assertEqual(3.0, math_utils.RelativeChange(1, -2))
  self.assertEqual(1.0, math_utils.RelativeChange(-1, -2))

def testRelativeChange_FromZero(self):
  # If the first number is zero, then the result is not a number.
  self.assertEqual(0, math_utils.RelativeChange(0, 0))
  self.assertTrue(math.isnan(math_utils.RelativeChange(0, 1)))
  self.assertTrue(math.isnan(math_utils.RelativeChange(0, -1)))

def testRelativeChange_NonZero(self):
  # The change is relative to the first value, regardless of which is bigger.
  self.assertEqual(0.5, math_utils.RelativeChange(1.0, 1.5))
  self.assertEqual(0.5, math_utils.RelativeChange(2.0, 1.0))

def testRelativeChangeWithNegatives(self):
  """Tests that the relative change returned is always positive."""
  self.assertEqual(3.0, math_utils.RelativeChange(-1, 2))
  self.assertEqual(3.0, math_utils.RelativeChange(1, -2))
  self.assertEqual(1.0, math_utils.RelativeChange(-1, -2))

def testRelativeChange(self):
  """Tests the common cases for calculating relative change."""
  # The change is relative to the first value, regardless of which is bigger.
  self.assertEqual(0.5, math_utils.RelativeChange(1.0, 1.5))
  self.assertEqual(0.5, math_utils.RelativeChange(2.0, 1.0))

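# A minimal sketch of a RelativeChange implementation that is consistent with
# the tests above; this is an illustrative assumption, not the actual
# math_utils.RelativeChange source.
def _ExampleRelativeChange(before, after):
  """Returns the absolute change of |after| relative to |before|."""
  if before == after:
    return 0.0
  if before == 0:
    # A change away from zero has no well-defined relative size.
    return float('nan')
  return math.fabs(after - before) / math.fabs(before)
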
def GetResultsDict(self):
  """Prepares and returns information about the final results as a dict.

  Returns:
    A dictionary with the following fields:
    'first_working_revision': First good revision.
    'last_broken_revision': Last bad revision.
    'culprit_revisions': A list of revisions which contain the bad change
        introducing the failure.
    'other_regressions': A list of tuples representing other regressions
        which may have occurred.
    'regression_size': For performance bisects, this is a relative change of
        the mean metric value. For other bisects this field always contains
        'zero-to-nonzero'.
    'regression_std_err': For performance bisects, it is a pooled standard
        error for groups of good and bad runs. Not used for other bisects.
    'confidence': For performance bisects, it is a confidence that the good
        and bad runs are distinct groups. Not used for non-performance
        bisects.
    'revision_data_sorted': dict mapping revision ids to data about that
        revision. Each piece of revision data consists of a dict with the
        following keys:
            'passed': Represents whether the performance test was successful
                at that revision. Possible values include: 1 (passed),
                0 (failed), '?' (skipped), 'F' (build failed).
            'depot': The depot that this revision is from (i.e. WebKit).
            'external': If the revision is a 'src' revision, 'external'
                contains the revisions of each of the external libraries.
            'sort': A sort value for sorting the dict in order of commits.

        For example:
        {
          'CL #1': {
            'passed': False,
            'depot': 'chromium',
            'external': None,
            'sort': 0
          }
        }
  """
  revision_data_sorted = sorted(self.revision_data.iteritems(),
                                key=lambda x: x[1]['sort'])

  # Find range where it possibly broke.
  first_working_revision = None
  first_working_revision_index = -1
  last_broken_revision = None
  last_broken_revision_index = -1

  culprit_revisions = []
  other_regressions = []
  regression_size = 0.0
  regression_std_err = 0.0
  confidence = 0.0

  for i in xrange(len(revision_data_sorted)):
    k, v = revision_data_sorted[i]
    if v['passed'] == 1:
      if not first_working_revision:
        first_working_revision = k
        first_working_revision_index = i

    if not v['passed']:
      last_broken_revision = k
      last_broken_revision_index = i

  if last_broken_revision != None and first_working_revision != None:
    broken_means = []
    for i in xrange(0, last_broken_revision_index + 1):
      if revision_data_sorted[i][1]['value']:
        broken_means.append(revision_data_sorted[i][1]['value']['values'])

    working_means = []
    for i in xrange(first_working_revision_index, len(revision_data_sorted)):
      if revision_data_sorted[i][1]['value']:
        working_means.append(revision_data_sorted[i][1]['value']['values'])

    # Flatten the lists to calculate mean of all values.
    working_mean = sum(working_means, [])
    broken_mean = sum(broken_means, [])

    # Calculate the approximate size of the regression.
    mean_of_bad_runs = math_utils.Mean(broken_mean)
    mean_of_good_runs = math_utils.Mean(working_mean)

    regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                      mean_of_bad_runs)
    if math.isnan(regression_size):
      regression_size = 'zero-to-nonzero'

    regression_std_err = math.fabs(
        math_utils.PooledStandardError([working_mean, broken_mean]) /
        max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

    # Give a "confidence" in the bisect. At the moment we use how distinct the
    # values are before and after the last broken revision, and how noisy the
    # overall graph is.
    confidence = ConfidenceScore(working_means, broken_means)

    culprit_revisions = []

    cwd = os.getcwd()
    self._depot_registry.ChangeToDepotDir(
        self.revision_data[last_broken_revision]['depot'])

    if self.revision_data[last_broken_revision]['depot'] == 'cros':
      # Want to get a list of all the commits and what depots they belong
      # to so that we can grab info about each.
      cmd = ['repo', 'forall', '-c',
             'pwd ; git log --pretty=oneline --before=%d --after=%d' % (
                 last_broken_revision, first_working_revision + 1)]
      output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd)

      changes = []
      assert not return_code, ('An error occurred while running "%s"' %
                               ' '.join(cmd))

      last_depot = None
      cwd = os.getcwd()
      for l in output.split('\n'):
        if l:
          # Output will be in form:
          #   /path_to_depot
          #   /path_to_other_depot
          #   <SHA1>
          #   /path_again
          #   <SHA1>
          #   etc.
          if l[0] == '/':
            last_depot = l
          else:
            contents = l.split(' ')
            if len(contents) > 1:
              changes.append([last_depot, contents[0]])

      for c in changes:
        os.chdir(c[0])
        info = self._source_control.QueryRevisionInfo(c[1])
        culprit_revisions.append((c[1], info, None))
    else:
      for i in xrange(last_broken_revision_index, len(revision_data_sorted)):
        k, v = revision_data_sorted[i]
        if k == first_working_revision:
          break
        self._depot_registry.ChangeToDepotDir(v['depot'])
        info = self._source_control.QueryRevisionInfo(k)
        culprit_revisions.append((k, info, v['depot']))
    os.chdir(cwd)

    # Check for any other possible regression ranges.
    other_regressions = self._FindOtherRegressions(
        revision_data_sorted, mean_of_bad_runs > mean_of_good_runs)

  return {
      'first_working_revision': first_working_revision,
      'last_broken_revision': last_broken_revision,
      'culprit_revisions': culprit_revisions,
      'other_regressions': other_regressions,
      'regression_size': regression_size,
      'regression_std_err': regression_std_err,
      'confidence': confidence,
      'revision_data_sorted': revision_data_sorted
  }

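# Hedged usage sketch: one way a caller might consume the dict returned by
# GetResultsDict. The `bisector` argument and this helper are hypothetical;
# the field names come from the GetResultsDict docstring above.
def _PrintBisectSummary(bisector):
  results = bisector.GetResultsDict()
  print 'Suspected range: %s .. %s' % (results['last_broken_revision'],
                                       results['first_working_revision'])
  if results['regression_size'] == 'zero-to-nonzero':
    print 'Regression: the metric went from zero to nonzero.'
  else:
    print 'Regression size: %.2f%% (std err: %.2f%%, confidence: %s)' % (
        results['regression_size'], results['regression_std_err'],
        results['confidence'])
  for revision, _info, depot in results['culprit_revisions']:
    print 'Possible culprit in depot %s: %s' % (depot, revision)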