def test_skip_job_status_alerting(self):
  handler_base_args = {
      'test_name': 'test',
      'events_dir': self.temp_dir,
      'debug_info': None,
      'metric_collection_config': {},
      'regression_test_config': {
          'alert_after_second_test_failure': False,
      },
      'test_type': None,
      'accelerator': None,
      'framework_version': None,
      'logger': self.logger,
  }
  metrics_handler = main.CloudMetricsHandler(**handler_base_args)

  # Job succeeded; should not alert for job status.
  self.assertTrue(
      metrics_handler.skip_job_status_alerting(
          job_status_handler.SUCCESS,
          'tf-nightly-keras-api-save-and-load-v2-8'))

  # Job failed and alerts not disabled by config; should alert.
  self.assertFalse(
      metrics_handler.skip_job_status_alerting(
          job_status_handler.FAILURE,
          'tf-nightly-keras-api-save-and-load-v2-8'))

  # regression_test_config is empty; should skip alerting.
  handler_base_args['regression_test_config'] = {}
  metrics_handler = main.CloudMetricsHandler(**handler_base_args)
  self.assertTrue(
      metrics_handler.skip_job_status_alerting(
          job_status_handler.FAILURE,
          'tf-nightly-keras-api-save-and-load-v2-8'))

  # alert_for_failed_jobs set to False; should not alert for job status.
  handler_base_args['regression_test_config'] = {
      'alert_for_failed_jobs': False,
  }
  metrics_handler = main.CloudMetricsHandler(**handler_base_args)
  self.assertTrue(
      metrics_handler.skip_job_status_alerting(
          job_status_handler.FAILURE,
          'tf-nightly-keras-api-save-and-load-v2-8'))

  # alert_after_second_test_failure set to True but BigQuery is unreachable.
  # Should default to not skipping alerts.
  handler_base_args['regression_test_config'] = {
      'alert_after_second_test_failure': True,
  }
  metrics_handler = main.CloudMetricsHandler(**handler_base_args)
  self.assertFalse(
      metrics_handler.skip_job_status_alerting(
          job_status_handler.FAILURE,
          'tf-nightly-keras-api-save-and-load-v2-8'))
def test_compute_bounds_and_report_errors_stddevs(self):
  metrics_handler = main.CloudMetricsHandler(
      test_name="test",
      events_dir=self.temp_dir,
      debug_info=None,
      metric_collection_config={
          'default_aggregation_strategies': ['final'],
          'tags_to_ignore': ['foo'],
      },
      regression_test_config={
          'metric_subset_to_alert': ['bar_final'],
          'metric_success_conditions': {
              'bar_final': {
                  'success_threshold': {
                      'stddevs_from_mean': 1.
                  },
                  'comparison': 'greater_or_equal',
                  'wait_for_n_points_of_history': 0,
              },
          },
      },
      test_type=None,
      accelerator=None,
      framework_version=None,
      logger=self.logger,
  )
  _, aggregated_metrics = metrics_handler.get_metrics_from_events_dir()

  # Average is higher than the current value; this should trigger an alert.
  with self.assertLogs(level='ERROR'):
    metrics_handler.compute_bounds_and_report_errors(
        {
            'bar_final': [
                metrics.MetricPoint(10.0, 111),
                metrics.MetricPoint(10.0, 112),
                metrics.MetricPoint(10.0, 113)
            ],
            'total_wall_time': []
        },
        aggregated_metrics,
        job_status_handler.SUCCESS)

  # No error should be logged for out-of-bounds metrics if the job failed.
  with self.assertRaises(AssertionError):
    with self.assertLogs(level='ERROR'):
      metrics_handler.compute_bounds_and_report_errors(
          {
              'bar_final': [
                  metrics.MetricPoint(10.0, 111),
                  metrics.MetricPoint(10.0, 112),
                  metrics.MetricPoint(10.0, 113)
              ],
              'total_wall_time': []
          },
          aggregated_metrics,
          job_status_handler.FAILURE)

  # Average == current value; this should not trigger an alert since we are
  # using `greater_or_equal`.
  with self.assertRaises(AssertionError):
    with self.assertLogs(level='ERROR'):
      metrics_handler.compute_bounds_and_report_errors(
          {
              'bar_final': [
                  metrics.MetricPoint(1.0, 111),
                  metrics.MetricPoint(1.0, 112),
                  metrics.MetricPoint(1.0, 113)
              ],
              'total_wall_time': []
          },
          aggregated_metrics,
          job_status_handler.SUCCESS)
def test_get_metrics_from_perfzero_summary_not_found(self):
  metrics_handler = main.CloudMetricsHandler(
      test_name="test",
      events_dir=self.temp_dir,
      debug_info=None,
      metric_collection_config={},
      regression_test_config={},
      test_type=None,
      accelerator=None,
      framework_version=None,
      logger=self.logger,
  )
  aggregated_metrics = metrics_handler.get_metrics_from_perfzero_summary()
  self.assertEmpty(aggregated_metrics)
def test_compute_bounds_and_report_errors_missing_metric(self):
  metrics_handler = main.CloudMetricsHandler(
      test_name="test",
      events_dir=self.temp_dir,
      debug_info=None,
      metric_collection_config={
          'default_aggregation_strategies': ['final'],
          'tags_to_ignore': ['bar'],
      },
      regression_test_config={
          'metric_subset_to_alert': ['foo_final'],
          'required_metrics': ['foo_final', 'fake_metric'],
          'metric_success_conditions': {
              'foo_final': {
                  # foo_final=2.0 from the setUp() method above.
                  'success_threshold': {
                      'fixed_value': 1.
                  },
                  'comparison': 'greater',
                  'wait_for_n_points_of_history': 0,
              },
          },
      },
      test_type=None,
      accelerator=None,
      framework_version=None,
      logger=self.logger,
  )
  _, aggregated_metrics = metrics_handler.get_metrics_from_events_dir()

  # Metrics are within bounds but are missing `fake_metric`, which is in
  # `required_metrics`, so an error should be created.
  with self.assertLogs(level='ERROR'):
    metrics_handler.compute_bounds_and_report_errors(
        {
            'foo_final': [],
            'total_wall_time': []
        },
        aggregated_metrics,
        job_status_handler.SUCCESS)

  # An error should be logged for missing required_metrics even if the test
  # run ended in failure.
  with self.assertLogs(level='ERROR'):
    metrics_handler.compute_bounds_and_report_errors(
        {
            'foo_final': [],
            'total_wall_time': []
        },
        aggregated_metrics,
        job_status_handler.FAILURE)
def test_compute_bounds_and_report_errors_fixed_value(self):
  metrics_handler = main.CloudMetricsHandler(
      test_name="test",
      events_dir=self.temp_dir,
      debug_info=None,
      metric_collection_config={
          'default_aggregation_strategies': ['final'],
          'tags_to_ignore': ['bar'],
      },
      regression_test_config={
          'alert_after_second_test_failure': False,
          'metric_subset_to_alert': ['foo_final'],
          'required_metrics': ['foo_final'],
          'metric_success_conditions': {
              'foo_final': {
                  'success_threshold': {
                      'fixed_value': 3.
                  },
                  'comparison': 'greater',
                  'wait_for_n_points_of_history': 0,
              },
          },
      },
      test_type=None,
      accelerator=None,
      framework_version=None,
      logger=self.logger,
  )
  _, aggregated_metrics = metrics_handler.get_metrics_from_events_dir()

  # foo_final (2.0 from the setUp() method above) is not greater than the
  # fixed_value of 3.0, so an error should be logged.
  with self.assertLogs(level='ERROR'):
    metrics_handler.compute_bounds_and_report_errors(
        {
            'foo_final': [],
            'total_wall_time': []
        },
        aggregated_metrics,
        job_status_handler.SUCCESS)

  # No error should be logged for out-of-bounds metrics if the job failed.
  with self.assertRaises(AssertionError):
    with self.assertLogs(level='ERROR'):
      metrics_handler.compute_bounds_and_report_errors(
          {
              'foo_final': [],
              'total_wall_time': []
          },
          aggregated_metrics,
          job_status_handler.FAILURE)
def test_get_metrics_from_event_dir(self):
  metrics_handler = main.CloudMetricsHandler(
      test_name="test",
      events_dir=self.temp_dir,
      debug_info=None,
      metric_collection_config={
          'default_aggregation_strategies': ['final', 'min'],
      },
      regression_test_config={},
      test_type=None,
      accelerator=None,
      framework_version=None,
      logger=self.logger,
  )
  # get_metrics_from_events_dir() returns (raw_metrics, aggregated_metrics).
  _, final_metrics = metrics_handler.get_metrics_from_events_dir()
  self.assertContainsSubset(
      ['foo_final', 'foo_min', 'bar_final', 'bar_min'],
      final_metrics.keys())
def test_add_computed_metrics(self):
  metrics_handler = main.CloudMetricsHandler(
      test_name="test",
      events_dir=self.temp_dir,
      debug_info=None,
      metric_collection_config={
          'default_aggregation_strategies': ['final', 'min'],
      },
      regression_test_config={},
      test_type=None,
      accelerator=None,
      framework_version=None,
      logger=self.logger,
  )
  final_metrics = {'foo_final': 1}
  metrics_handler.add_computed_metrics(
      final_metrics, self.job_status_dict, find_memory_metrics=False)
  self.assertContainsSubset(
      ['foo_final', 'total_wall_time'],
      final_metrics.keys())
def test_get_metrics_from_perfzero_summary(self):
  summary_dir = os.path.join(self.temp_dir, 'date_and_time')
  pathlib.Path(summary_dir).mkdir(parents=True, exist_ok=True)
  summary_path = os.path.join(summary_dir, 'perfzero_summary.json')
  with open(summary_path, 'w') as f:
    json.dump(
        {
            "execution_id": "execution_id",
            "execution_timestamp": 1234567890.1,
            "benchmark_result": {
                "wall_time": 1234,
                "metrics": [{
                    "name": "exp_per_second",
                    "value": 1.1,
                }, {
                    "name": "avg_exp_per_second",
                    "value": 2.2,
                }, {
                    "name": "startup_time",
                    "value": 3.3
                }],
            },
            "benchmark_info": {
                "not": "important",
            },
            "setup_info": {},
            "ml_framework_info": {
                "not": "important",
            },
            "system_info": {
                "not": "important"
            },
            "process_info": {
                "max_rss": 4.4,
                "max_vms": 5.5,
                "max_cpu_percent": 6.6,
            }
        }, f)

  metrics_handler = main.CloudMetricsHandler(
      test_name="test",
      events_dir=self.temp_dir,
      debug_info=None,
      metric_collection_config={},
      regression_test_config={},
      test_type=None,
      accelerator=None,
      framework_version=None,
      logger=self.logger,
  )
  aggregated_metrics = metrics_handler.get_metrics_from_perfzero_summary()
  self.assertDictEqual(
      {
          "total_wall_time": metrics.MetricPoint(1234, 1234567890.1),
          "exp_per_second": metrics.MetricPoint(1.1, 1234567890.1),
          "avg_exp_per_second": metrics.MetricPoint(2.2, 1234567890.1),
          "startup_time": metrics.MetricPoint(3.3, 1234567890.1),
          "max_rss": metrics.MetricPoint(4.4, 1234567890.1),
          "max_vms": metrics.MetricPoint(5.5, 1234567890.1),
          "max_cpu_percent": metrics.MetricPoint(6.6, 1234567890.1),
      }, aggregated_metrics)
def test_skip_oob_alerting(self):
  handler_base_args = {
      'test_name': 'test',
      'events_dir': self.temp_dir,
      'debug_info': None,
      'metric_collection_config': {},
      'regression_test_config': {
          'alert_after_second_test_failure': True,
      },
      'test_type': None,
      'accelerator': None,
      'framework_version': None,
      'logger': self.logger,
  }
  metrics_handler = main.CloudMetricsHandler(**handler_base_args)

  # Both current and previous runs were OOB. Should alert.
  self.assertFalse(
      metrics_handler.skip_oob_alerting(
          job_status_handler.SUCCESS, [
              metrics.MetricPoint(0.8, 111),
              metrics.MetricPoint(0.8, 112),
              metrics.MetricPoint(1.0, 113)
          ],
          metrics.Threshold('fixed_value', 0.9), 'greater'))

  # Job was FAILURE; should skip metrics alerting.
  self.assertTrue(
      metrics_handler.skip_oob_alerting(
          job_status_handler.FAILURE, [
              metrics.MetricPoint(1.0, 111),
              metrics.MetricPoint(1.0, 112),
              metrics.MetricPoint(1.0, 113)
          ],
          metrics.Threshold('fixed_value', 0.9), 'greater'))

  # Job was TIMEOUT; should skip metrics alerting.
  self.assertTrue(
      metrics_handler.skip_oob_alerting(
          job_status_handler.TIMEOUT, [
              metrics.MetricPoint(1.0, 111),
              metrics.MetricPoint(1.0, 112),
              metrics.MetricPoint(1.0, 113)
          ],
          metrics.Threshold('fixed_value', 0.9), 'greater'))

  # Latest run was OOB but previous run was not; should skip alerting.
  self.assertTrue(
      metrics_handler.skip_oob_alerting(
          job_status_handler.SUCCESS, [
              metrics.MetricPoint(0.8, 110),
              metrics.MetricPoint(1.0, 112),
              metrics.MetricPoint(1.0, 113)
          ],
          metrics.Threshold('fixed_value', 0.9), 'greater'))

  handler_base_args['regression_test_config'] = {
      'alert_after_second_test_failure': False,
  }
  metrics_handler = main.CloudMetricsHandler(**handler_base_args)

  # Latest run was OOB but previous run was not; should alert since the
  # config now has 'alert_after_second_test_failure': False.
  self.assertFalse(
      metrics_handler.skip_oob_alerting(
          job_status_handler.SUCCESS, [
              metrics.MetricPoint(0.8, 110),
              metrics.MetricPoint(1.0, 112),
              metrics.MetricPoint(1.0, 113)
          ],
          metrics.Threshold('fixed_value', 0.9), 'greater'))