Example #1
0
    def test_metric_bounds_fixed_value(self, comparison, threshold_value,
                                       expected_bounds):
        # A `fixed_value` threshold is absolute, so the computed bounds do
        # not depend on any prior data points; an empty history suffices.
        history = []
        fixed_threshold = metrics.Threshold('fixed_value', threshold_value)
        actual_bounds = metrics.metric_bounds(
            history, fixed_threshold, comparison)

        self.assertSequenceAlmostEqual(actual_bounds, expected_bounds)
Example #2
0
    def test_metric_bounds_stddevs_from_mean(self, comparison, threshold_value,
                                             expected_bounds):
        # History of values 1..5: mean = 3, stddev = ~1.414.
        history = [
            metrics.MetricPoint(metric_value=v, wall_time=10 * v)
            for v in (1, 2, 3, 4, 5)
        ]

        stddev_threshold = metrics.Threshold('stddevs_from_mean',
                                             threshold_value)
        actual_bounds = metrics.metric_bounds(
            history, stddev_threshold, comparison)

        self.assertSequenceAlmostEqual(actual_bounds, expected_bounds, places=3)
Example #3
0
  def compute_bounds_and_report_errors(self, metrics_history, new_metrics,
                                       job_status):
    """Compute the bounds for metrics and report abnormal values.

    Any metric that is currently outside the expected bounds is reported to
    Stackdriver Error Reporting unless `alert_for_oob_metrics` is set to
    False in the regression test config. Even if this reporting is turned off,
    this method computes the upper and lower bounds for each metric to provide
    to BigQuery as a visual aid when rendering metrics history into charts.

    Args:
      metrics_history(dict): Historic values of each metric. Not mutated;
        the latest metrics are merged into an internal copy only.
      new_metrics(dict): Key is metric name and value is MetricPoint containing
        the latest aggregated value for that metric.
      job_status(string): Final state of the job, should be one of the status
        constants found in job_status_handler.py.

    Returns:
      metric_name_to_visual_bounds (dict): Key is metric name and value is a
        tuple of floats of the form (lower_bound, upper_bound).
    """
    if not self.regression_test_config:
      return {}
    success_conditions = self.regression_test_config.get(
        'metric_success_conditions')
    if not success_conditions:
      return {}

    metrics_history = metrics_history.copy()

    # Add the metrics from the latest run. These aren't in Bigquery yet.
    # Rebuild each value list rather than appending in place: `.copy()` above
    # is a shallow dict copy, so appending would mutate the caller's history
    # lists. Using `.get` also tolerates metrics with no prior history.
    for metric_name, metric_value in new_metrics.items():
      metrics_history[metric_name] = list(
          metrics_history.get(metric_name, [])) + [metric_value]

    metric_name_to_visual_bounds = {}
    # An empty subset means "compute bounds and alert for every metric".
    metric_subset_to_report = set(
        self.regression_test_config.get('metric_subset_to_alert', []))
    for metric_name, value_history in metrics_history.items():
      if metric_subset_to_report and metric_name not in metric_subset_to_report:
        self.logger.info(
            'Skipping alerts and bounds for metric `{}` since '
            'it does not appear in `metric_subset_to_alert` in your '
            'regression test config.'.format(metric_name))
        continue
      # Fall back to the 'default' success condition if no per-metric
      # condition is configured.
      success_condition = success_conditions.get(metric_name) or \
        success_conditions.get('default')
      if not success_condition:
        self.logger.warning(
            'metric: `{}` has an empty success condition in the '
            '`metric_success_conditions` dict in the regression_test_config '
            'but there is no default condition provided. No bounds or '
            'alerts will be computed. See README for config details.'.format(
                metric_name))
        continue
      elif len(value_history) <= success_condition.get(
        'wait_for_n_points_of_history', -1):
        self.logger.info(
            'Metric: {} had only {} points of history. Skipping bounds '
            'enforcement. Success condition: {}'.format(
                metric_name, len(value_history), success_condition))
        continue

      # `success_threshold` is a single-entry dict mapping threshold type to
      # value, e.g. {'stddevs_from_mean': 2.0}.
      threshold_type, threshold_value = next(
          iter(success_condition.get('success_threshold').items()))
      threshold = metrics.Threshold(threshold_type, threshold_value)
      comparison = success_condition.get('comparison')
      lower_bound, upper_bound = metrics.metric_bounds(
          value_history, threshold, comparison)
      metric_name_to_visual_bounds[metric_name] = (lower_bound, upper_bound)

      # The latest run's value is the last element appended above.
      metric_value = value_history[-1].metric_value
      within_bounds = metrics.within_bounds(
          metric_value, lower_bound, upper_bound,
          inclusive=('equal' in comparison))

      # Generate an alert unless one of these is True:
      #   1. metrics are within bounds.
      #   2. alerting is disabled by config.
      #   3. the job failed and therefore metrics are unreliable.
      if within_bounds or not self.regression_test_config.get(
          'alert_for_oob_metrics', True) or \
              job_status != job_status_handler.SUCCESS:
        continue
      self.logger.error(
          'Metric `{}` was out of bounds for test `{}`. Bounds were '
          '({}, {}) and value was {:.2f}'.format(
              metric_name, self.test_name, lower_bound, upper_bound,
              metric_value),
          debug_info=self.debug_info)

    return metric_name_to_visual_bounds
Example #4
0
    def test_skip_oob_alerting(self):
        base_kwargs = {
            'test_name': 'test',
            'events_dir': self.temp_dir,
            'debug_info': None,
            'metric_collection_config': {},
            'regression_test_config': {
                'alert_after_second_test_failure': True,
            },
            'test_type': None,
            'accelerator': None,
            'framework_version': None,
            'logger': self.logger,
        }
        threshold = metrics.Threshold('fixed_value', 0.9)

        def make_points(pairs):
            # Build a metric history from (value, wall_time) pairs.
            return [metrics.MetricPoint(v, t) for v, t in pairs]

        handler = main.CloudMetricsHandler(**base_kwargs)
        # Both current and previous runs were OOB. Should alert.
        self.assertFalse(
            handler.skip_oob_alerting(
                job_status_handler.SUCCESS,
                make_points([(0.8, 111), (0.8, 112), (1.0, 113)]),
                threshold, 'greater'))
        # Job was FAILURE; should skip metrics alerting.
        self.assertTrue(
            handler.skip_oob_alerting(
                job_status_handler.FAILURE,
                make_points([(1.0, 111), (1.0, 112), (1.0, 113)]),
                threshold, 'greater'))
        # Job was TIMEOUT; should skip metrics alerting.
        self.assertTrue(
            handler.skip_oob_alerting(
                job_status_handler.TIMEOUT,
                make_points([(1.0, 111), (1.0, 112), (1.0, 113)]),
                threshold, 'greater'))
        # Latest run was OOB but previous run was not; should skip alerting.
        self.assertTrue(
            handler.skip_oob_alerting(
                job_status_handler.SUCCESS,
                make_points([(0.8, 110), (1.0, 112), (1.0, 113)]),
                threshold, 'greater'))

        base_kwargs['regression_test_config'] = {
            'alert_after_second_test_failure': False,
        }
        handler = main.CloudMetricsHandler(**base_kwargs)
        # Latest run was OOB but previous run was not; should alert since now the
        # config has 'alert_after_second_test_failure': False.
        self.assertFalse(
            handler.skip_oob_alerting(
                job_status_handler.SUCCESS,
                make_points([(0.8, 110), (1.0, 112), (1.0, 113)]),
                threshold, 'greater'))