Example #1
    def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
                                     last_broken_rev):
        # TODO(sergiyb): We assume that value has "values" key, which may not be
        # the case for failure-bisects, where there is a single value only.
        broken_means = [
            state.value['values']
            for state in rev_states[:last_broken_rev.index + 1] if state.value
        ]

        working_means = [
            state.value['values']
            for state in rev_states[first_working_rev.index:] if state.value
        ]

        # Flatten the lists to calculate mean of all values.
        working_mean = sum(working_means, [])
        broken_mean = sum(broken_means, [])

        # Calculate the approximate size of the regression
        mean_of_bad_runs = math_utils.Mean(broken_mean)
        mean_of_good_runs = math_utils.Mean(working_mean)

        regression_size = 100 * math_utils.RelativeChange(
            mean_of_good_runs, mean_of_bad_runs)
        if math.isnan(regression_size):
            regression_size = 'zero-to-nonzero'

        regression_std_err = math.fabs(
            math_utils.PooledStandardError([working_mean, broken_mean]) /
            max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

        # Give a "confidence" in the bisect. Currently, we consider the values of
        # only the revisions at the breaking range (last known good and first known
        # bad) see the note in the docstring for FindBreakingRange.
        confidence_params = (
            sum([first_working_rev.value['values']], []),
            sum([last_broken_rev.value['values']], []))
        confidence = cls.ConfidenceScore(*confidence_params)

        bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs

        return {
            'regression_size': regression_size,
            'regression_std_err': regression_std_err,
            'confidence': confidence,
            'bad_greater_than_good': bad_greater_than_good
        }
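
The percentage arithmetic above relies on math_utils helpers that are not shown on this page. Purely as an illustration, the sketch below reproduces the same shape of calculation in plain Python, assuming RelativeChange is the absolute difference of the two means relative to the first mean, and using the standard error of the difference of means as a stand-in for PooledStandardError; the real math_utils implementations may differ.

import math


def _mean(values):
    # Hypothetical stand-in for math_utils.Mean.
    return sum(values) / float(len(values))


def _variance(values):
    # Sample variance (n - 1 denominator).
    m = _mean(values)
    return sum((v - m) ** 2 for v in values) / float(len(values) - 1)


def approximate_regression_size(good_values, bad_values):
    """Returns (regression_size_percent, regression_std_err_percent).

    Illustrative only: assumes RelativeChange == |bad - good| / |good| and
    uses the standard error of the difference of means in place of
    math_utils.PooledStandardError.
    """
    mean_good = _mean(good_values)
    mean_bad = _mean(bad_values)
    regression_size = 100 * abs(mean_bad - mean_good) / abs(mean_good)

    std_err = math.sqrt(_variance(good_values) / len(good_values) +
                        _variance(bad_values) / len(bad_values))
    regression_std_err = math.fabs(
        std_err / max(0.0001, min(mean_good, mean_bad))) * 100.0
    return regression_size, regression_std_err


# A metric that moves from ~100 to ~110 is roughly a 10% regression:
# approximate_regression_size([99.0, 100.0, 101.0], [109.0, 110.0, 111.0])
# -> (10.0, 0.816...)
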
Example #2
    def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
                                     last_broken_rev):
        # TODO(sergiyb): We assume that value has "values" key, which may not be
        # the case for failure-bisects, where there is a single value only.
        broken_means = [
            state.value['values']
            for state in rev_states[:last_broken_rev.index + 1] if state.value
        ]

        working_means = [
            state.value['values']
            for state in rev_states[first_working_rev.index:] if state.value
        ]

        # Flatten the lists to calculate mean of all values.
        working_mean = sum(working_means, [])
        broken_mean = sum(broken_means, [])

        # Calculate the approximate size of the regression
        mean_of_bad_runs = math_utils.Mean(broken_mean)
        mean_of_good_runs = math_utils.Mean(working_mean)

        regression_size = 100 * math_utils.RelativeChange(
            mean_of_good_runs, mean_of_bad_runs)
        if math.isnan(regression_size):
            regression_size = 'zero-to-nonzero'

        regression_std_err = math.fabs(
            math_utils.PooledStandardError([working_mean, broken_mean]) /
            max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

        # Give a "confidence" in the bisect culprit by seeing whether the results
        # of the culprit revision and the revision before that appear to be
        # statistically significantly different.
        confidence = cls.ConfidenceScore(
            sum([first_working_rev.value['values']], []),
            sum([last_broken_rev.value['values']], []))

        bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs

        return {
            'regression_size': regression_size,
            'regression_std_err': regression_std_err,
            'confidence': confidence,
            'bad_greater_than_good': bad_greater_than_good
        }
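
ConfidenceScore itself is not shown on this page. One plausible shape for such a score, sketched below purely as an assumption, is to turn the p-value of Welch's t-test (see the WelchsTTest example further down) into a value between 0 and 100, so that clearly distinct good and bad samples yield a high confidence. This is not the actual ConfidenceScore implementation.

def confidence_score_sketch(working_values, broken_values, welchs_ttest):
    """Hypothetical confidence score: 100 * (1 - p) from Welch's t-test.

    `welchs_ttest` is assumed to behave like the WelchsTTest helper shown
    later on this page, returning (t-statistic, degrees of freedom, p-value).
    """
    _t, _df, p_value = welchs_ttest(working_values, broken_values)
    return 100.0 * (1.0 - p_value)


# e.g. confidence_score_sketch([10.0, 10.1, 10.2], [12.0, 12.1, 11.9],
#                              WelchsTTest)
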
Example #3
    def _FindOtherRegressions(cls, revision_states, bad_greater_than_good):
        """Compiles a list of other possible regressions from the revision data.

    Args:
      revision_states: Sorted list of RevisionState objects.
      bad_greater_than_good: Whether the result value at the "bad" revision is
          numerically greater than the result value at the "good" revision.

    Returns:
      A list of [current_rev, previous_rev, confidence] for other places where
      there may have been a regression.
    """
        other_regressions = []
        previous_values = []
        prev_state = None
        for revision_state in revision_states:
            if revision_state.value:
                current_values = revision_state.value['values']
                if previous_values:
                    confidence_params = (
                        sum(previous_values, []),
                        sum([current_values], []))
                    confidence = cls.ConfidenceScore(
                        *confidence_params, accept_single_bad_or_good=True)
                    mean_of_prev_runs = math_utils.Mean(
                        sum(previous_values, []))
                    mean_of_current_runs = math_utils.Mean(current_values)

                    # Check that the potential regression is in the same
                    # direction as the overall regression: when the "bad" mean
                    # is greater than the "good" mean, a local regression in
                    # the same direction has the mean of the previous runs
                    # greater than the mean of the current runs.
                    prev_greater_than_current = (
                        mean_of_prev_runs > mean_of_current_runs)
                    if bad_greater_than_good:
                        is_same_direction = prev_greater_than_current
                    else:
                        is_same_direction = not prev_greater_than_current

                    # Only report potential regressions with high confidence.
                    if is_same_direction and confidence > 50:
                        other_regressions.append(
                            [revision_state, prev_state, confidence])
                previous_values.append(current_values)
                prev_state = revision_state
        return other_regressions
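
The classmethod version above needs RevisionState objects and cls.ConfidenceScore, so it cannot be run on its own. The sketch below applies the same "same direction and confidence > 50" filter to plain lists of per-revision sample lists, with a hypothetical confidence function passed in, to show how candidate regression points are selected.

def find_candidate_regressions(samples_per_revision, bad_greater_than_good,
                               confidence_fn):
    """Illustrative re-implementation of the filtering logic above.

    samples_per_revision: one list of values per revision, ordered the same
        way as the revision states (broken revisions first).
    confidence_fn: hypothetical callable returning a 0-100 confidence for
        (flattened previous samples, current samples).
    """
    candidates = []
    previous_values = []
    for index, current_values in enumerate(samples_per_revision):
        if previous_values:
            flattened_previous = sum(previous_values, [])
            confidence = confidence_fn(flattened_previous, current_values)
            prev_mean = sum(flattened_previous) / float(len(flattened_previous))
            curr_mean = sum(current_values) / float(len(current_values))

            prev_greater = prev_mean > curr_mean
            is_same_direction = (prev_greater if bad_greater_than_good
                                 else not prev_greater)
            if is_same_direction and confidence > 50:
                candidates.append((index, index - 1, confidence))
        previous_values.append(current_values)
    return candidates
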
    def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good):
        """Compiles a list of other possible regressions from the revision data.

    Args:
      revision_data_sorted: Sorted list of (revision, revision data) pairs.
      bad_greater_than_good: Whether the result value at the "bad" revision is
          numerically greater than the result value at the "good" revision.

    Returns:
      A list of [current_rev, previous_rev, confidence] for other places where
      there may have been a regression.
    """
        other_regressions = []
        previous_values = []
        previous_id = None
        for current_id, current_data in revision_data_sorted:
            current_values = current_data['value']
            if current_values:
                current_values = current_values['values']
                if previous_values:
                    confidence = ConfidenceScore(previous_values,
                                                 [current_values])
                    mean_of_prev_runs = math_utils.Mean(
                        sum(previous_values, []))
                    mean_of_current_runs = math_utils.Mean(current_values)

                    # Check that the potential regression is in the same direction as
                    # the overall regression. If the mean of the previous runs < the
                    # mean of the current runs, this local regression is in same
                    # direction.
                    prev_less_than_current = (
                        mean_of_prev_runs < mean_of_current_runs)
                    is_same_direction = (prev_less_than_current
                                         if bad_greater_than_good else
                                         not prev_less_than_current)

                    # Only report potential regressions with high confidence.
                    if is_same_direction and confidence > 50:
                        other_regressions.append(
                            [current_id, previous_id, confidence])
                previous_values.append(current_values)
                previous_id = current_id
        return other_regressions
 def testMeanCompareAlternateImplementation(self):
   """Tests Mean by comparing against an alternate implementation."""
   def AlternateMeanFunction(values):
     """Simple arithmetic mean function."""
     return sum(values) / float(len(values))
   test_values_lists = [[1], [5, 6.5, 1.2, 3], [-3, 0, 1, 4],
                        [-3, -1, 0.12, 0.752, 3.33, 8, 16, 32, 439]]
   for values in test_values_lists:
     self.assertEqual(
         AlternateMeanFunction(values),
         math_utils.Mean(values))
Example #6
def WelchsTTest(sample1, sample2):
  """Performs Welch's t-test on the two samples.

  Welch's t-test is an adaptation of Student's t-test which is used when the
  two samples may have unequal variances. It is also an independent two-sample
  t-test.

  Args:
    sample1: A collection of numbers.
    sample2: Another collection of numbers.

  Returns:
    A 3-tuple (t-statistic, degrees of freedom, p-value).
  """
  mean1 = math_utils.Mean(sample1)
  mean2 = math_utils.Mean(sample2)
  v1 = math_utils.Variance(sample1)
  v2 = math_utils.Variance(sample2)
  n1 = len(sample1)
  n2 = len(sample2)
  t = _TValue(mean1, mean2, v1, v2, n1, n2)
  df = _DegreesOfFreedom(v1, v2, n1, n2)
  p = _LookupPValue(t, df)
  return t, df, p
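
The _TValue, _DegreesOfFreedom, and _LookupPValue helpers are not included on this page. As a self-contained sketch, the textbook formulas behind the first two are shown below; the p-value step is omitted here (the _LookupPValue helper appears to use a lookup table rather than a closed form), and math_utils.Mean/Variance are replaced with inline sample statistics.

import math


def welch_t_and_df(sample1, sample2):
  """Welch's t-statistic and Welch-Satterthwaite degrees of freedom.

  Standalone illustration using the textbook formulas; not the helpers used
  by WelchsTTest above.
  """
  n1, n2 = len(sample1), len(sample2)
  mean1 = sum(sample1) / float(n1)
  mean2 = sum(sample2) / float(n2)
  # Sample variances with an n - 1 denominator.
  v1 = sum((x - mean1) ** 2 for x in sample1) / float(n1 - 1)
  v2 = sum((x - mean2) ** 2 for x in sample2) / float(n2 - 1)

  # t = (mean1 - mean2) / sqrt(v1/n1 + v2/n2)
  t = (mean1 - mean2) / math.sqrt(v1 / n1 + v2 / n2)

  # Welch-Satterthwaite approximation for the degrees of freedom.
  df = ((v1 / n1 + v2 / n2) ** 2 /
        ((v1 / n1) ** 2 / (n1 - 1) + (v2 / n2) ** 2 / (n2 - 1)))
  return t, df
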
Example #7
 def testMean_ShortList(self):
     self.assertEqual(0.5, math_utils.Mean([-3, 0, 1, 4]))
Example #8
 def testMean_OneValue(self):
     self.assertEqual(3.0, math_utils.Mean([3]))
 def testMeanShortList(self):
   """Tests the Mean function with a short list."""
   self.assertEqual(0.5, math_utils.Mean([-3, 0, 1, 4]))
 def testMeanSingleNum(self):
   """Tests the Mean function with a single number."""
   self.assertEqual(3.0, math_utils.Mean([3]))
    def GetResultsDict(self):
        """Prepares and returns information about the final resulsts as a dict.

    Returns:
      A dictionary with the following fields:

      'first_working_revision': First good revision.
      'last_broken_revision': Last bad revision.
      'culprit_revisions': A list of revisions that contain the bad change
          introducing the failure.
      'other_regressions': A list of tuples representing other regressions
          that may have occurred.
      'regression_size': For performance bisects, this is a relative change of
          the mean metric value. For other bisects this field always contains
          'zero-to-nonzero'.
      'regression_std_err': For performance bisects, it is a pooled standard
          error for groups of good and bad runs. Not used for other bisects.
      'confidence': For performance bisects, it is a confidence that the good
          and bad runs are distinct groups. Not used for non-performance
          bisects.
      'revision_data_sorted': dict mapping revision ids to data about that
          revision. Each piece of revision data consists of a dict with the
          following keys:

          'passed': Represents whether the performance test was successful at
              that revision. Possible values include: 1 (passed), 0 (failed),
              '?' (skipped), 'F' (build failed).
          'depot': The depot that this revision is from (e.g. WebKit).
          'external': If the revision is a 'src' revision, 'external' contains
              the revisions of each of the external libraries.
          'sort': A sort value for sorting the dict in order of commits.

          For example:
          {
            'CL #1':
            {
              'passed': False,
              'depot': 'chromium',
              'external': None,
              'sort': 0
            }
          }
    """
        revision_data_sorted = sorted(self.revision_data.iteritems(),
                                      key=lambda x: x[1]['sort'])

        # Find range where it possibly broke.
        first_working_revision = None
        first_working_revision_index = -1
        last_broken_revision = None
        last_broken_revision_index = -1

        culprit_revisions = []
        other_regressions = []
        regression_size = 0.0
        regression_std_err = 0.0
        confidence = 0.0

        for i in xrange(len(revision_data_sorted)):
            k, v = revision_data_sorted[i]
            if v['passed'] == 1:
                if not first_working_revision:
                    first_working_revision = k
                    first_working_revision_index = i

            if not v['passed']:
                last_broken_revision = k
                last_broken_revision_index = i

        if (last_broken_revision is not None
                and first_working_revision is not None):
            broken_means = []
            for i in xrange(0, last_broken_revision_index + 1):
                if revision_data_sorted[i][1]['value']:
                    broken_means.append(
                        revision_data_sorted[i][1]['value']['values'])

            working_means = []
            for i in xrange(first_working_revision_index,
                            len(revision_data_sorted)):
                if revision_data_sorted[i][1]['value']:
                    working_means.append(
                        revision_data_sorted[i][1]['value']['values'])

            # Flatten the lists to calculate mean of all values.
            working_mean = sum(working_means, [])
            broken_mean = sum(broken_means, [])

            # Calculate the approximate size of the regression
            mean_of_bad_runs = math_utils.Mean(broken_mean)
            mean_of_good_runs = math_utils.Mean(working_mean)

            regression_size = 100 * math_utils.RelativeChange(
                mean_of_good_runs, mean_of_bad_runs)
            if math.isnan(regression_size):
                regression_size = 'zero-to-nonzero'

            regression_std_err = math.fabs(
                math_utils.PooledStandardError([working_mean, broken_mean]) /
                max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

            # Give a "confidence" in the bisect. At the moment we use how distinct the
            # values are before and after the last broken revision, and how noisy the
            # overall graph is.
            confidence = ConfidenceScore(working_means, broken_means)

            culprit_revisions = []

            cwd = os.getcwd()
            self._depot_registry.ChangeToDepotDir(
                self.revision_data[last_broken_revision]['depot'])

            if self.revision_data[last_broken_revision]['depot'] == 'cros':
                # Want to get a list of all the commits and what depots they belong
                # to so that we can grab info about each.
                cmd = [
                    'repo', 'forall', '-c',
                    'pwd ; git log --pretty=oneline --before=%d --after=%d' %
                    (last_broken_revision, first_working_revision + 1)
                ]
                output, return_code = bisect_utils.RunProcessAndRetrieveOutput(
                    cmd)

                changes = []
                assert not return_code, ('An error occurred while running '
                                         '"%s"' % ' '.join(cmd))
                last_depot = None
                cwd = os.getcwd()
                for l in output.split('\n'):
                    if l:
                        # Output will be in form:
                        # /path_to_depot
                        # /path_to_other_depot
                        # <SHA1>
                        # /path_again
                        # <SHA1>
                        # etc.
                        if l[0] == '/':
                            last_depot = l
                        else:
                            contents = l.split(' ')
                            if len(contents) > 1:
                                changes.append([last_depot, contents[0]])
                for c in changes:
                    os.chdir(c[0])
                    info = self._source_control.QueryRevisionInfo(c[1])
                    culprit_revisions.append((c[1], info, None))
            else:
                for i in xrange(last_broken_revision_index,
                                len(revision_data_sorted)):
                    k, v = revision_data_sorted[i]
                    if k == first_working_revision:
                        break
                    self._depot_registry.ChangeToDepotDir(v['depot'])
                    info = self._source_control.QueryRevisionInfo(k)
                    culprit_revisions.append((k, info, v['depot']))
            os.chdir(cwd)

            # Check for any other possible regression ranges.
            other_regressions = self._FindOtherRegressions(
                revision_data_sorted, mean_of_bad_runs > mean_of_good_runs)

        return {
            'first_working_revision': first_working_revision,
            'last_broken_revision': last_broken_revision,
            'culprit_revisions': culprit_revisions,
            'other_regressions': other_regressions,
            'regression_size': regression_size,
            'regression_std_err': regression_std_err,
            'confidence': confidence,
            'revision_data_sorted': revision_data_sorted
        }
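
A caller is expected to read the fields described in the docstring from the returned dict. The snippet below is a hypothetical consumer; `bisect_results` is assumed to hold the dict returned by GetResultsDict(), and the field names follow the docstring above.

# Hypothetical caller; `bisect_results` is assumed to be the dict returned by
# GetResultsDict() above.
if bisect_results['first_working_revision'] is not None:
    print('Culprit range: %s .. %s' % (
        bisect_results['last_broken_revision'],
        bisect_results['first_working_revision']))
    print('Regression size: %s (std err %s%%), confidence %.1f%%' % (
        bisect_results['regression_size'],
        bisect_results['regression_std_err'],
        bisect_results['confidence']))
    for revision, info, depot in bisect_results['culprit_revisions']:
        print('Possible culprit %s in depot %s' % (revision, depot))
else:
    print('No working revision was found, so no culprit range is reported.')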