Example #1
    def test_skip_job_status_alerting(self):
        handler_base_args = {
            'test_name': 'test',
            'events_dir': self.temp_dir,
            'debug_info': None,
            'metric_collection_config': {},
            'regression_test_config': {
                'alert_after_second_test_failure': False,
            },
            'test_type': None,
            'accelerator': None,
            'framework_version': None,
            'logger': self.logger,
        }
        metrics_handler = main.CloudMetricsHandler(**handler_base_args)
        # Job succeeded, should not alert for job status.
        self.assertTrue(
            metrics_handler.skip_job_status_alerting(
                job_status_handler.SUCCESS,
                'tf-nightly-keras-api-save-and-load-v2-8'))
        # Job failed and alerts not disabled by config. Should alert.
        self.assertFalse(
            metrics_handler.skip_job_status_alerting(
                job_status_handler.FAILURE,
                'tf-nightly-keras-api-save-and-load-v2-8'))

        # regression_test_config is empty; should skip alerting.
        handler_base_args['regression_test_config'] = {}
        metrics_handler = main.CloudMetricsHandler(**handler_base_args)
        self.assertTrue(
            metrics_handler.skip_job_status_alerting(
                job_status_handler.FAILURE,
                'tf-nightly-keras-api-save-and-load-v2-8'))

        # alert_for_failed_jobs set to False, should not alert for job status.
        handler_base_args['regression_test_config'] = {
            'alert_for_failed_jobs': False,
        }
        metrics_handler = main.CloudMetricsHandler(**handler_base_args)
        self.assertTrue(
            metrics_handler.skip_job_status_alerting(
                job_status_handler.FAILURE,
                'tf-nightly-keras-api-save-and-load-v2-8'))

        # alert_after_second_test_failure set to True but BigQuery unreachable.
        # Should default to not skipping alerts.
        handler_base_args['regression_test_config'] = {
            'alert_after_second_test_failure': True,
        }
        metrics_handler = main.CloudMetricsHandler(**handler_base_args)
        self.assertFalse(
            metrics_handler.skip_job_status_alerting(
                job_status_handler.FAILURE,
                'tf-nightly-keras-api-save-and-load-v2-8'))
Example #2
  def test_compute_bounds_and_report_errors_stddevs(self):
    metrics_handler = main.CloudMetricsHandler(
      test_name="test",
      events_dir=self.temp_dir,
      debug_info=None,
      metric_collection_config={
        'default_aggregation_strategies': ['final'],
        'tags_to_ignore': ['foo'],
      },
      regression_test_config={
        'metric_subset_to_alert': ['bar_final'],
        'metric_success_conditions': {
          'bar_final': {
            'success_threshold': {
              'stddevs_from_mean': 1.
            },
            'comparison': 'greater_or_equal',
            'wait_for_n_points_of_history': 0,
          },
        },
      },
      test_type=None,
      accelerator=None,
      framework_version=None,
      logger=self.logger,
    )

    _, aggregated_metrics = metrics_handler.get_metrics_from_events_dir()
    # Average is higher than current value - this should trigger an alert.
    with self.assertLogs(level='ERROR'):
      metrics_handler.compute_bounds_and_report_errors(
          {'bar_final': [metrics.MetricPoint(10.0, 111),
                         metrics.MetricPoint(10.0, 112),
                         metrics.MetricPoint(10.0, 113)],
           'total_wall_time': []},
          aggregated_metrics, job_status_handler.SUCCESS
      )
    # No error should be logged for out-of-bounds metrics if the job failed.
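    # (assertLogs raises AssertionError if nothing is logged at the given
    # level, so assertRaises here verifies that no ERROR record was emitted.)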
    with self.assertRaises(AssertionError):
      with self.assertLogs(level='ERROR'):
        metrics_handler.compute_bounds_and_report_errors(
            {'bar_final': [metrics.MetricPoint(10.0, 111),
                           metrics.MetricPoint(10.0, 112),
                           metrics.MetricPoint(10.0, 113)],
             'total_wall_time': []},
            aggregated_metrics, job_status_handler.FAILURE
        )
    # Average == current value - this should not trigger an alert since
    # we are using `greater_or_equal`.
    with self.assertRaises(AssertionError):
      with self.assertLogs(level='ERROR'):
        metrics_handler.compute_bounds_and_report_errors(
            {'bar_final': [metrics.MetricPoint(1.0, 111),
                           metrics.MetricPoint(1.0, 112),
                           metrics.MetricPoint(1.0, 113)],
             'total_wall_time': []},
            aggregated_metrics, job_status_handler.SUCCESS
        )
Example #3
    def test_get_metrics_from_perfzero_summary_not_found(self):
        metrics_handler = main.CloudMetricsHandler(
            test_name="test",
            events_dir=self.temp_dir,
            debug_info=None,
            metric_collection_config={},
            regression_test_config={},
            test_type=None,
            accelerator=None,
            framework_version=None,
            logger=self.logger,
        )

        aggregated_metrics = metrics_handler.get_metrics_from_perfzero_summary(
        )
        self.assertEmpty(aggregated_metrics)
Example #4
    def test_compute_bounds_and_report_errors_missing_metric(self):
        metrics_handler = main.CloudMetricsHandler(
            test_name="test",
            events_dir=self.temp_dir,
            debug_info=None,
            metric_collection_config={
                'default_aggregation_strategies': ['final'],
                'tags_to_ignore': ['bar'],
            },
            regression_test_config={
                'metric_subset_to_alert': ['foo_final'],
                'required_metrics': ['foo_final', 'fake_metric'],
                'metric_success_conditions': {
                    'foo_final': {
                        # foo_final=2.0 from the setUp() method above.
                        'success_threshold': {
                            'fixed_value': 1.
                        },
                        'comparison': 'greater',
                        'wait_for_n_points_of_history': 0,
                    },
                },
            },
            test_type=None,
            accelerator=None,
            framework_version=None,
            logger=self.logger,
        )

        _, aggregated_metrics = metrics_handler.get_metrics_from_events_dir()
        # Metrics are within bounds but are missing `fake_metric`, which is in
        # `required_metrics`, so an error should be logged.
        with self.assertLogs(level='ERROR'):
            metrics_handler.compute_bounds_and_report_errors(
                {
                    'foo_final': [],
                    'total_wall_time': []
                }, aggregated_metrics, job_status_handler.SUCCESS)
        # Error should be logged for missing required_metrics even if the test
        # run ended in failure.
        with self.assertLogs(level='ERROR'):
            metrics_handler.compute_bounds_and_report_errors(
                {
                    'foo_final': [],
                    'total_wall_time': []
                }, aggregated_metrics, job_status_handler.FAILURE)
Example #5
    def test_compute_bounds_and_report_errors_fixed_value(self):
        metrics_handler = main.CloudMetricsHandler(
            test_name="test",
            events_dir=self.temp_dir,
            debug_info=None,
            metric_collection_config={
                'default_aggregation_strategies': ['final'],
                'tags_to_ignore': ['bar'],
            },
            regression_test_config={
                'alert_after_second_test_failure': False,
                'metric_subset_to_alert': ['foo_final'],
                'required_metrics': ['foo_final'],
                'metric_success_conditions': {
                    'foo_final': {
                        'success_threshold': {
                            'fixed_value': 3.
                        },
                        'comparison': 'greater',
                        'wait_for_n_points_of_history': 0,
                    },
                },
            },
            test_type=None,
            accelerator=None,
            framework_version=None,
            logger=self.logger,
        )

        _, aggregated_metrics = metrics_handler.get_metrics_from_events_dir()
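        # foo_final from setUp() is not greater than the fixed_value threshold
        # of 3.0, so an error should be logged for the successful run.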
        with self.assertLogs(level='ERROR'):
            metrics_handler.compute_bounds_and_report_errors(
                {
                    'foo_final': [],
                    'total_wall_time': []
                }, aggregated_metrics, job_status_handler.SUCCESS)
        # No error should be logged for out-of-bounds metrics if the job failed.
        with self.assertRaises(AssertionError):
            with self.assertLogs(level='ERROR'):
                metrics_handler.compute_bounds_and_report_errors(
                    {
                        'foo_final': [],
                        'total_wall_time': []
                    }, aggregated_metrics, job_status_handler.FAILURE)
Example #6
  def test_get_metrics_from_event_dir(self):
    metrics_handler = main.CloudMetricsHandler(
      test_name="test",
      events_dir=self.temp_dir,
      debug_info=None,
      metric_collection_config={
        'default_aggregation_strategies': ['final', 'min',]
      },
      regression_test_config={},
      test_type=None,
      accelerator=None,
      framework_version=None,
      logger=self.logger,
    )

    # Discard the raw per-step metrics; the 'final' and 'min' aggregation
    # strategies yield '<tag>_final' and '<tag>_min' keys.
    _, final_metrics = metrics_handler.get_metrics_from_events_dir()
    self.assertContainsSubset(
        ['foo_final', 'foo_min', 'bar_final', 'bar_min'],
        final_metrics.keys())
Example #7
  def test_add_computed_metrics(self):
    metrics_handler = main.CloudMetricsHandler(
      test_name="test",
      events_dir=self.temp_dir,
      debug_info=None,
      metric_collection_config={
        'default_aggregation_strategies': ['final', 'min',]
      },
      regression_test_config={},
      test_type=None,
      accelerator=None,
      framework_version=None,
      logger=self.logger,
    )

    final_metrics = {'foo_final': 1}
    metrics_handler.add_computed_metrics(final_metrics, self.job_status_dict,
                                         find_memory_metrics=False)
    self.assertContainsSubset(
        ['foo_final', 'total_wall_time'],
        final_metrics.keys())
Example #8
    def test_get_metrics_from_perfzero_summary(self):
        summary_dir = os.path.join(self.temp_dir, 'date_and_time')
        pathlib.Path(summary_dir).mkdir(parents=True, exist_ok=True)
        summary_path = os.path.join(summary_dir, 'perfzero_summary.json')
        with open(summary_path, 'w') as f:
            json.dump(
                {
                    "execution_id": "execution_id",
                    "execution_timestamp": 1234567890.1,
                    "benchmark_result": {
                        "wall_time": 1234,
                        "metrics": [{
                            "name": "exp_per_second",
                            "value": 1.1,
                        }, {
                            "name": "avg_exp_per_second",
                            "value": 2.2,
                        }, {
                            "name": "startup_time",
                            "value": 3.3
                        }],
                    },
                    "benchmark_info": {
                        "not": "important",
                    },
                    "setup_info": {},
                    "ml_framework_info": {
                        "not": "important",
                    },
                    "system_info": {
                        "not": "important"
                    },
                    "process_info": {
                        "max_rss": 4.4,
                        "max_vms": 5.5,
                        "max_cpu_percent": 6.6,
                    }
                }, f)

        metrics_handler = main.CloudMetricsHandler(
            test_name="test",
            events_dir=self.temp_dir,
            debug_info=None,
            metric_collection_config={},
            regression_test_config={},
            test_type=None,
            accelerator=None,
            framework_version=None,
            logger=self.logger,
        )

        aggregated_metrics = metrics_handler.get_metrics_from_perfzero_summary(
        )
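        # Values from benchmark_result and process_info are flattened into
        # MetricPoints stamped with the summary's execution_timestamp.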
        self.assertDictEqual(
            {
                "total_wall_time": metrics.MetricPoint(1234, 1234567890.1),
                "exp_per_second": metrics.MetricPoint(1.1, 1234567890.1),
                "avg_exp_per_second": metrics.MetricPoint(2.2, 1234567890.1),
                "startup_time": metrics.MetricPoint(3.3, 1234567890.1),
                "max_rss": metrics.MetricPoint(4.4, 1234567890.1),
                "max_vms": metrics.MetricPoint(5.5, 1234567890.1),
                "max_cpu_percent": metrics.MetricPoint(6.6, 1234567890.1),
            }, aggregated_metrics)
Example #9
    def test_skip_oob_alerting(self):
        handler_base_args = {
            'test_name': 'test',
            'events_dir': self.temp_dir,
            'debug_info': None,
            'metric_collection_config': {},
            'regression_test_config': {
                'alert_after_second_test_failure': True,
            },
            'test_type': None,
            'accelerator': None,
            'framework_version': None,
            'logger': self.logger,
        }
        metrics_handler = main.CloudMetricsHandler(**handler_base_args)
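        # Threshold('fixed_value', 0.9) with 'greater' means a run is within
        # bounds only when its metric value exceeds 0.9, so 0.8 is out of
        # bounds (OOB) and 1.0 is not.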
        # Both current and previous runs were OOB. Should alert.
        self.assertFalse(
            metrics_handler.skip_oob_alerting(
                job_status_handler.SUCCESS, [
                    metrics.MetricPoint(0.8, 111),
                    metrics.MetricPoint(0.8, 112),
                    metrics.MetricPoint(1.0, 113)
                ], metrics.Threshold('fixed_value', 0.9), 'greater'))
        # Job was FAILURE; should skip metrics alerting.
        self.assertTrue(
            metrics_handler.skip_oob_alerting(
                job_status_handler.FAILURE, [
                    metrics.MetricPoint(1.0, 111),
                    metrics.MetricPoint(1.0, 112),
                    metrics.MetricPoint(1.0, 113)
                ], metrics.Threshold('fixed_value', 0.9), 'greater'))
        # Job was TIMEOUT; should skip metrics alerting.
        self.assertTrue(
            metrics_handler.skip_oob_alerting(
                job_status_handler.TIMEOUT, [
                    metrics.MetricPoint(1.0, 111),
                    metrics.MetricPoint(1.0, 112),
                    metrics.MetricPoint(1.0, 113)
                ], metrics.Threshold('fixed_value', 0.9), 'greater'))
        # Latest run was OOB but previous run was not; should skip alerting.
        self.assertTrue(
            metrics_handler.skip_oob_alerting(
                job_status_handler.SUCCESS, [
                    metrics.MetricPoint(0.8, 110),
                    metrics.MetricPoint(1.0, 112),
                    metrics.MetricPoint(1.0, 113)
                ], metrics.Threshold('fixed_value', 0.9), 'greater'))

        handler_base_args['regression_test_config'] = {
            'alert_after_second_test_failure': False,
        }
        metrics_handler = main.CloudMetricsHandler(**handler_base_args)
        # Latest run was OOB but previous run was not; should alert since now the
        # config has 'alert_after_second_test_failure': False.
        self.assertFalse(
            metrics_handler.skip_oob_alerting(
                job_status_handler.SUCCESS, [
                    metrics.MetricPoint(0.8, 110),
                    metrics.MetricPoint(1.0, 112),
                    metrics.MetricPoint(1.0, 113)
                ], metrics.Threshold('fixed_value', 0.9), 'greater'))
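
The examples above all rely on a shared test fixture that the listing does not show: `self.temp_dir`, `self.logger`, `self.job_status_dict`, and a `setUp()` that writes `foo`/`bar` summaries into the events directory. The sketch below is a minimal, hypothetical reconstruction of the pieces the excerpts reference; it is not the project's actual `setUp()`, and the summary values and the `job_status_dict` field names are assumptions (the `foo` values are chosen to match the comment in Example #4 that foo_final is 2.0).

import logging
import tempfile
import unittest

import tensorflow as tf


class CloudMetricsHandlerTest(unittest.TestCase):

  def setUp(self):
    # Directory scanned by get_metrics_from_events_dir() and
    # get_metrics_from_perfzero_summary(); Example #8 writes a
    # perfzero_summary.json into a subdirectory of it.
    self.temp_dir = tempfile.mkdtemp()
    self.logger = logging.getLogger('metrics_handler_test')

    # Write TensorBoard summaries so the `foo` and `bar` tags exist.
    # foo_final becomes 2.0 (the last step); the `bar` value is an assumption.
    writer = tf.summary.create_file_writer(self.temp_dir)
    with writer.as_default():
      tf.summary.scalar('foo', 1.0, step=0)
      tf.summary.scalar('foo', 2.0, step=1)
      tf.summary.scalar('bar', 1.0, step=0)
    writer.close()

    # Consumed by add_computed_metrics() in Example #7 to derive
    # `total_wall_time`; these field names are assumptions.
    self.job_status_dict = {
        'final_status': 'success',
        'start_time': 1000.0,
        'stop_time': 1300.0,
    }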