  def test_compute_bounds_stddevs_from_mean(self, comparison, std_devs,
                                            expected_bounds, inclusive=False):
    # mean = 3, stddev = ~1.414
    value_history = range(1, 6)

    class _FakeMetricStore:

      def get_metric_history(*args, **kwargs):
        return [
            bigquery_client.MetricHistoryRow('', '', datetime.datetime.now(),
                                             '', v) for v in value_history
        ]

    assertion = metrics_pb2.Assertion(
        std_devs_from_mean=metrics_pb2.Assertion.StdDevsFromMean(
            comparison=metrics_pb2.Assertion.Comparison.Value(comparison),
            std_devs=std_devs,
        ),
        inclusive_bounds=inclusive,
    )
    collector = base.BaseCollector(
        metrics_pb2.TestCompletedEvent(benchmark_id="test_benchmark"), None,
        _FakeMetricStore())
    bounds = collector.compute_bounds("metric_key", assertion)
    self.assertSequenceAlmostEqual(
        dataclasses.astuple(bounds),
        # EQUAL is always inclusive
        dataclasses.astuple(
            utils.Bounds(*expected_bounds, inclusive or comparison == 'EQUAL')),
        places=3)
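  # NOTE: a worked sketch of the expected arithmetic. For value_history =
  # range(1, 6), mean = 3 and the population stddev = sqrt(2) ~= 1.414.
  # Assuming compute_bounds centers the bounds on the historical mean, a
  # WITHIN assertion with std_devs=2 would give roughly
  # (3 - 2 * 1.414, 3 + 2 * 1.414) ~= (0.172, 5.828). The exact parameter
  # sets come from a parameterized decorator not shown in this excerpt.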
  def test_compute_bounds_percent_difference_with_mean_value(
      self, comparison, pct_diff, expected_bounds, inclusive=False):
    # mean = 3
    value_history = range(1, 6)

    class _FakeMetricStore:

      def get_metric_history(*args, **kwargs):
        return [
            bigquery_client.MetricHistoryRow('', '', datetime.datetime.now(),
                                             '', v) for v in value_history
        ]

    assertion = metrics_pb2.Assertion(
        percent_difference=metrics_pb2.Assertion.PercentDifference(
            comparison=metrics_pb2.Assertion.Comparison.Value(comparison),
            use_historical_mean=True,
            percent=pct_diff,
        ),
        inclusive_bounds=inclusive,
    )
    collector = base.BaseCollector(
        metrics_pb2.TestCompletedEvent(benchmark_id="test_benchmark"), None,
        _FakeMetricStore())
    bounds = collector.compute_bounds("metric_key", assertion)
    self.assertSequenceAlmostEqual(
        dataclasses.astuple(bounds),
        dataclasses.astuple(utils.Bounds(*expected_bounds, inclusive)),
        places=3)
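  # NOTE: with use_historical_mean=True, the reference value is the mean of
  # the fake history (3) rather than an explicit target. Assuming the bounds
  # scale as mean * percent, a WITHIN assertion with percent=0.1 would
  # presumably yield bounds of (3 - 0.3, 3 + 0.3) = (2.7, 3.3).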
  def test_assert_duration(self):
    metric_source = metrics_pb2.MetricSource(
        literals=metrics_pb2.LiteralSource(
            assertions={
                "duration":
                    metrics_pb2.Assertion(
                        within_bounds=metrics_pb2.Assertion.WithinBounds(
                            lower_bound=100,
                            upper_bound=200,
                        ),
                        inclusive_bounds=False,
                    )
            }))
    event = metrics_pb2.TestCompletedEvent(
        benchmark_id="test_benchmark",
        duration=duration_pb2.Duration(seconds=150),
        metric_collection_config=metrics_pb2.MetricCollectionConfig(
            sources=[metric_source]))
    collector = literal_collector.LiteralCollector(
        event=event, raw_source=metric_source)
    points = collector.metric_points()
    self.assertLen(points, 1)
    self.assertEqual(points[0].metric_key, 'duration')
    self.assertEqual(points[0].metric_value, 150)
    self.assertEqual(points[0].bounds, utils.Bounds(100, 200, False))
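  # The LiteralCollector reads the metric straight off the event proto:
  # duration_pb2.Duration(seconds=150) surfaces as a single 'duration' point
  # with value 150, carrying the exclusive (100, 200) bounds configured above.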
  def test_compute_bounds_within_bounds(self, lower, upper, expected_bounds,
                                        inclusive=False):
    assertion = metrics_pb2.Assertion(
        within_bounds=metrics_pb2.Assertion.WithinBounds(
            lower_bound=lower,
            upper_bound=upper,
        ),
        inclusive_bounds=inclusive,
    )
    collector = base.BaseCollector(
        metrics_pb2.TestCompletedEvent(benchmark_id="test_benchmark"), None)
    bounds = collector.compute_bounds("metric_key", assertion)
    self.assertEqual(bounds, utils.Bounds(*expected_bounds, inclusive))
  def test_aggregate_metrics_with_assertion(self):
    metric_source = metrics_pb2.MetricSource(
        tensorboard=metrics_pb2.TensorBoardSource(
            include_tags=[
                metrics_pb2.TensorBoardSource.TagStrategy(
                    tag_pattern="eval/*",
                    strategies=[
                        metrics_pb2.TensorBoardSource.FINAL,
                        metrics_pb2.TensorBoardSource.MAX,
                        metrics_pb2.TensorBoardSource.MIN,
                    ])
            ],
            aggregate_assertions=[
                metrics_pb2.TensorBoardSource.AggregateAssertion(
                    tag='eval/accuracy',
                    strategy=metrics_pb2.TensorBoardSource.MAX,
                    assertion=metrics_pb2.Assertion(
                        within_bounds=metrics_pb2.Assertion.WithinBounds(
                            lower_bound=.4,
                            upper_bound=1.0,
                        ),
                        inclusive_bounds=True,
                    ))
            ]))
    event = metrics_pb2.TestCompletedEvent(
        benchmark_id="test_benchmark",
        output_path=self.temp_dir,
        metric_collection_config=metrics_pb2.MetricCollectionConfig(
            sources=[metric_source]))
    collector = tensorboard_collector.TensorBoardCollector(
        event=event, raw_source=metric_source)
    points = list(collector.metric_points())
    self.assertCountEqual(
        points,
        [
            utils.MetricPoint('eval/accuracy_max', .5,
                              utils.Bounds(.4, 1.0, True)),
            utils.MetricPoint('eval/accuracy_min', .125, utils.NO_BOUNDS),
            utils.MetricPoint('eval/accuracy_final', .25, utils.NO_BOUNDS),
        ],
    )
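  # The eval/accuracy series itself is written to self.temp_dir by fixture
  # code outside this excerpt. The _max/_min/_final suffixes come from the
  # MAX/MIN/FINAL strategies, and only the MAX aggregate carries the
  # (.4, 1.0) inclusive bounds from its AggregateAssertion; the other
  # strategies fall back to utils.NO_BOUNDS.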
  def test_compute_bounds_percent_difference_with_target_value(
      self, comparison, target, pct_diff, expected_bounds, inclusive=False):
    assertion = metrics_pb2.Assertion(
        percent_difference=metrics_pb2.Assertion.PercentDifference(
            comparison=metrics_pb2.Assertion.Comparison.Value(comparison),
            value=target,
            percent=pct_diff,
        ),
        inclusive_bounds=inclusive,
    )
    collector = base.BaseCollector(
        metrics_pb2.TestCompletedEvent(benchmark_id="test_benchmark"), None)
    bounds = collector.compute_bounds("metric_key", assertion)
    self.assertEqual(bounds, utils.Bounds(*expected_bounds, inclusive))
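  # NOTE: here the reference is the explicit target value rather than the
  # historical mean, so no metric store is passed to the collector. Assuming
  # the same scaling as the mean-based variant above, WITHIN with value=100
  # and percent=0.05 would presumably yield bounds of (95, 105).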
  def test_include_and_exclude(self):
    metric_source = metrics_pb2.MetricSource(
        tensorboard=metrics_pb2.TensorBoardSource(
            include_tags=[
                metrics_pb2.TensorBoardSource.TagStrategy(
                    tag_pattern="*",
                    strategies=[
                        metrics_pb2.TensorBoardSource.FINAL,
                    ])
            ],
            exclude_tags=[
                'foo',
                'train/*',
            ],
            aggregate_assertions=[
                metrics_pb2.TensorBoardSource.AggregateAssertion(
                    tag='foo',
                    strategy=metrics_pb2.TensorBoardSource.MIN,
                    assertion=metrics_pb2.Assertion(
                        within_bounds=metrics_pb2.Assertion.WithinBounds(
                            lower_bound=0.,
                            upper_bound=2.,
                        )))
            ]))
    event = metrics_pb2.TestCompletedEvent(
        benchmark_id="test_benchmark",
        output_path=self.temp_dir,
        metric_collection_config=metrics_pb2.MetricCollectionConfig(
            sources=[metric_source]))
    collector = tensorboard_collector.TensorBoardCollector(
        event=event, raw_source=metric_source)
    points = list(collector.metric_points())
    self.assertCountEqual(
        points,
        [
            utils.MetricPoint('eval/accuracy_final', .25, utils.NO_BOUNDS),
            utils.MetricPoint('foo_min', 1, utils.Bounds(0., 2., False)),
        ],
    )
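  # exclude_tags wins over include_tags for plain points: 'foo' and the
  # 'train/*' tags are dropped from the FINAL strategy of the "*" pattern,
  # but an AggregateAssertion still emits its own point ('foo_min') even
  # for an excluded tag.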
  def test_min_time(self, window, timestamp, expected_min):
    start_time = timestamp_pb2.Timestamp()
    start_time.FromDatetime(datetime.datetime(2021, 2, 16, 0, 0, 0))
    min_timestamp = timestamp_pb2.Timestamp()
    if timestamp:
      min_timestamp.FromDatetime(timestamp)
    time_window = duration_pb2.Duration()
    if window:
      time_window.FromTimedelta(window)
    assertion = metrics_pb2.Assertion(
        std_devs_from_mean=metrics_pb2.Assertion.StdDevsFromMean(
            comparison=metrics_pb2.Assertion.Comparison.WITHIN,
            std_devs=1,
        ),
        time_window=time_window,
        min_timestamp=min_timestamp,
    )
    metric_store = bigquery_client.BigQueryMetricStore(
        'fake_dataset', 'fake_project')
    metric_store.get_metric_history = mock.Mock(return_value=[])
    collector = base.BaseCollector(
        metrics_pb2.TestCompletedEvent(
            benchmark_id="test_benchmark",
            start_time=start_time,
        ), None, metric_store)
    collector.compute_bounds("metric_key", assertion)
    metric_store.get_metric_history.assert_called_with(
        benchmark_id="test_benchmark",
        metric_key="metric_key",
        min_time=expected_min)
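  # NOTE: expected_min presumably falls out of start_time and the assertion's
  # time_window / min_timestamp fields (e.g. start_time - time_window when
  # only a window is set). The parameterized cases, supplied by a decorator
  # not shown here, pin the exact value passed to get_metric_history.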
  def test_compute_bounds_fixed_value(self, comparison, threshold_value,
                                      expected_bounds, inclusive=False):
    assertion = metrics_pb2.Assertion(
        fixed_value=metrics_pb2.Assertion.FixedValue(
            comparison=metrics_pb2.Assertion.Comparison.Value(comparison),
            value=threshold_value,
        ),
        inclusive_bounds=inclusive,
    )
    collector = base.BaseCollector(
        metrics_pb2.TestCompletedEvent(benchmark_id="test_benchmark"), None)
    bounds = collector.compute_bounds("metric_key", assertion)
    self.assertSequenceAlmostEqual(
        dataclasses.astuple(bounds),
        # EQUAL is always inclusive
        dataclasses.astuple(
            utils.Bounds(*expected_bounds, inclusive or comparison == 'EQUAL')),
        places=3)
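  # NOTE: assuming the usual one-sided semantics, LESS maps to an upper bound
  # at threshold_value, GREATER to a lower bound, and EQUAL to a degenerate
  # (value, value) range, which is why EQUAL forces inclusive bounds
  # regardless of the inclusive flag.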
  def test_metric_collection_config(self, gcs_subdir):
    job = _job_from_dict({
        'metadata': {
            'name': 'job-name',
            'namespace': 'namespace',
            'labels': {
                'benchmarkId': 'test-job',
            },
            'annotations': {
                'ml-testing-accelerators/metric-config':
                    json.dumps({
                        'sources': [{
                            'literals': {
                                'assertions': {
                                    'duration': {
                                        'within_bounds': {
                                            'lower_bound': 1,
                                            'upper_bound': 2,
                                        }
                                    }
                                }
                            }
                        }]
                    })
            }
        },
        'status': {
            'startTime': _START_TIME,
            'completionTime': _END_TIME,
            'succeeded': 1,
            'conditions': [{
                'status': True,
                'type': 'Complete',
                'lastTransitionTime': _END_TIME,
            }]
        }
    })
    if gcs_subdir:
      job.metadata.annotations['ml-testing-accelerators/gcs-subdir'] = gcs_subdir
    actual_event = event_publisher.create_test_completed_event(
        job,
        model_output_bucket='gs://fake-bucket',
        cluster_name='cluster-name',
        cluster_location='cluster-location',
        project='project-id')
    actual_mcc = actual_event.metric_collection_config
    expected_mcc = metrics_pb2.MetricCollectionConfig(
        sources=[
            metrics_pb2.MetricSource(
                literals=metrics_pb2.LiteralSource(
                    assertions={
                        'duration':
                            metrics_pb2.Assertion(
                                within_bounds=metrics_pb2.Assertion.WithinBounds(
                                    lower_bound=1,
                                    upper_bound=2,
                                ))
                    }))
        ])
    self.assertEqual(
        actual_event.output_path,
        os.path.join('gs://fake-bucket', gcs_subdir or '', 'job-name'))
    self.assertProtoEqual(expected_mcc, actual_mcc)
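  # The 'ml-testing-accelerators/metric-config' annotation is parsed from
  # JSON into an equivalent MetricCollectionConfig proto, and the optional
  # 'ml-testing-accelerators/gcs-subdir' annotation is spliced into
  # output_path between the bucket and the job name.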
  def test_get_metrics_from_perfzero_summary(self):
    temp_dir = self.create_tempdir().full_path
    summary_dir = os.path.join(temp_dir, 'date_and_time')
    pathlib.Path(summary_dir).mkdir(parents=True, exist_ok=True)
    summary_path = os.path.join(summary_dir, 'perfzero_summary.json')
    with open(summary_path, 'w') as f:
      json.dump(
          {
              "execution_id": "execution_id",
              "execution_timestamp": 1234567890.1,
              "benchmark_result": {
                  "wall_time": 1234,
                  "metrics": [{
                      "name": "exp_per_second",
                      "value": 1.1,
                  }, {
                      "name": "avg_exp_per_second",
                      "value": 2.2,
                  }, {
                      "name": "startup_time",
                      "value": 3.3
                  }],
              },
              "benchmark_info": {
                  "not": "important",
              },
              "setup_info": {},
              "ml_framework_info": {
                  "not": "important",
              },
              "system_info": {
                  "not": "important"
              },
              "process_info": {
                  "max_rss": 4.4,
                  "max_vms": 5.5,
                  "max_cpu_percent": 6.6,
              }
          }, f)
    metric_source = metrics_pb2.MetricSource(
        perfzero=metrics_pb2.PerfZeroSource(
            assertions={
                'total_wall_time':
                    metrics_pb2.Assertion(
                        within_bounds=metrics_pb2.Assertion.WithinBounds(
                            lower_bound=1230,
                            upper_bound=1240,
                        )),
                'exp_per_second':
                    metrics_pb2.Assertion(
                        within_bounds=metrics_pb2.Assertion.WithinBounds(
                            lower_bound=1,
                            upper_bound=100,
                        ),
                    )
            }))
    event = metrics_pb2.TestCompletedEvent(
        benchmark_id="test_benchmark",
        output_path=temp_dir,
        metric_collection_config=metrics_pb2.MetricCollectionConfig(
            sources=[metric_source]))
    collector = perfzero_collector.PerfZeroCollector(
        event=event, raw_source=metric_source)
    points = list(collector.metric_points())
    self.assertCountEqual(
        points,
        {
            utils.MetricPoint("total_wall_time", 1234,
                              utils.Bounds(1230., 1240., False)),
            utils.MetricPoint("exp_per_second", 1.1,
                              utils.Bounds(1., 100., False)),
            utils.MetricPoint("avg_exp_per_second", 2.2, utils.NO_BOUNDS),
            utils.MetricPoint("startup_time", 3.3, utils.NO_BOUNDS),
            utils.MetricPoint("process_info/max_rss", 4.4, utils.NO_BOUNDS),
            utils.MetricPoint("process_info/max_vms", 5.5, utils.NO_BOUNDS),
            utils.MetricPoint("process_info/max_cpu_percent", 6.6,
                              utils.NO_BOUNDS),
        },
    )
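  # The collector appears to flatten the PerfZero summary: wall_time is
  # reported as 'total_wall_time', each entry in benchmark_result.metrics
  # becomes a point under its own name, and process_info fields are prefixed
  # with 'process_info/'. Only keys with a configured assertion get
  # non-trivial bounds; everything else falls back to utils.NO_BOUNDS.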