def test_compute_bounds_stddevs_from_mean(self, comparison, std_devs,
                                          expected_bounds, inclusive=False):
  # mean = 3, stddev = ~1.414
  value_history = range(1, 6)

  class _FakeMetricStore:

    def get_metric_history(*args, **kwargs):
      return [
          bigquery_client.MetricHistoryRow('', '', datetime.datetime.now(),
                                           '', v) for v in value_history
      ]

  assertion = metrics_pb2.Assertion(
      std_devs_from_mean=metrics_pb2.Assertion.StdDevsFromMean(
          comparison=metrics_pb2.Assertion.Comparison.Value(comparison),
          std_devs=std_devs,
      ),
      inclusive_bounds=inclusive,
  )
  collector = base.BaseCollector(
      metrics_pb2.TestCompletedEvent(benchmark_id="test_benchmark"), None,
      _FakeMetricStore())
  bounds = collector.compute_bounds("metric_key", assertion)
  self.assertSequenceAlmostEqual(
      dataclasses.astuple(bounds),
      # EQUAL is always inclusive
      dataclasses.astuple(
          utils.Bounds(*expected_bounds, inclusive or comparison == 'EQUAL')),
      places=3)
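# Illustrative arithmetic for the test above (a sketch, not part of the
# original suite), assuming np.std's default population standard deviation
# as used in BaseCollector.compute_bounds:
#   values = [1, 2, 3, 4, 5]  ->  mean = 3.0, stddev = sqrt(2) ~= 1.414
#   e.g. a WITHIN assertion with std_devs=2 yields bounds of roughly
#   (3 - 2 * 1.414, 3 + 2 * 1.414) = (0.172, 5.828).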
def test_compute_bounds_percent_difference_with_mean_value(
    self, comparison, pct_diff, expected_bounds, inclusive=False):
  # mean = 3
  value_history = range(1, 6)

  class _FakeMetricStore:

    def get_metric_history(*args, **kwargs):
      return [
          bigquery_client.MetricHistoryRow('', '', datetime.datetime.now(),
                                           '', v) for v in value_history
      ]

  assertion = metrics_pb2.Assertion(
      percent_difference=metrics_pb2.Assertion.PercentDifference(
          comparison=metrics_pb2.Assertion.Comparison.Value(comparison),
          use_historical_mean=True,
          percent=pct_diff,
      ),
      inclusive_bounds=inclusive,
  )
  collector = base.BaseCollector(
      metrics_pb2.TestCompletedEvent(benchmark_id="test_benchmark"), None,
      _FakeMetricStore())
  bounds = collector.compute_bounds("metric_key", assertion)
  self.assertSequenceAlmostEqual(
      dataclasses.astuple(bounds),
      dataclasses.astuple(utils.Bounds(*expected_bounds, inclusive)),
      places=3)
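# Illustrative arithmetic for the test above (a sketch, not part of the
# original suite): with the historical mean of 3 and, say, pct_diff=0.2,
# a WITHIN assertion yields bounds of
#   (3 - 0.2 * 3, 3 + 0.2 * 3) = (2.4, 3.6).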
def test_assert_duration(self):
  metric_source = metrics_pb2.MetricSource(
      literals=metrics_pb2.LiteralSource(
          assertions={
              "duration":
                  metrics_pb2.Assertion(
                      within_bounds=metrics_pb2.Assertion.WithinBounds(
                          lower_bound=100,
                          upper_bound=200,
                      ),
                      inclusive_bounds=False,
                  )
          }))
  event = metrics_pb2.TestCompletedEvent(
      benchmark_id="test_benchmark",
      duration=duration_pb2.Duration(seconds=150),
      metric_collection_config=metrics_pb2.MetricCollectionConfig(
          sources=[metric_source]))
  collector = literal_collector.LiteralCollector(
      event=event, raw_source=metric_source)
  points = collector.metric_points()

  self.assertLen(points, 1)
  self.assertEqual(points[0].metric_key, 'duration')
  self.assertEqual(points[0].metric_value, 150)
  self.assertEqual(points[0].bounds, utils.Bounds(100, 200, False))
def test_compute_bounds_within_bounds(self, lower, upper, expected_bounds,
                                      inclusive=False):
  assertion = metrics_pb2.Assertion(
      within_bounds=metrics_pb2.Assertion.WithinBounds(
          lower_bound=lower,
          upper_bound=upper,
      ),
      inclusive_bounds=inclusive,
  )
  collector = base.BaseCollector(
      metrics_pb2.TestCompletedEvent(benchmark_id="test_benchmark"), None)
  bounds = collector.compute_bounds("metric_key", assertion)
  self.assertEqual(bounds, utils.Bounds(*expected_bounds, inclusive))
def test_aggregate_metrics_with_assertion(self):
  metric_source = metrics_pb2.MetricSource(
      tensorboard=metrics_pb2.TensorBoardSource(
          include_tags=[
              metrics_pb2.TensorBoardSource.TagStrategy(
                  tag_pattern="eval/*",
                  strategies=[
                      metrics_pb2.TensorBoardSource.FINAL,
                      metrics_pb2.TensorBoardSource.MAX,
                      metrics_pb2.TensorBoardSource.MIN,
                  ])
          ],
          aggregate_assertions=[
              metrics_pb2.TensorBoardSource.AggregateAssertion(
                  tag='eval/accuracy',
                  strategy=metrics_pb2.TensorBoardSource.MAX,
                  assertion=metrics_pb2.Assertion(
                      within_bounds=metrics_pb2.Assertion.WithinBounds(
                          lower_bound=.4,
                          upper_bound=1.0,
                      ),
                      inclusive_bounds=True,
                  ))
          ]))
  event = metrics_pb2.TestCompletedEvent(
      benchmark_id="test_benchmark",
      output_path=self.temp_dir,
      metric_collection_config=metrics_pb2.MetricCollectionConfig(
          sources=[metric_source]))
  collector = tensorboard_collector.TensorBoardCollector(
      event=event, raw_source=metric_source)
  points = list(collector.metric_points())
  self.assertCountEqual(
      points,
      [
          utils.MetricPoint('eval/accuracy_max', .5,
                            utils.Bounds(.4, 1.0, True)),
          utils.MetricPoint('eval/accuracy_min', .125, utils.NO_BOUNDS),
          utils.MetricPoint('eval/accuracy_final', .25, utils.NO_BOUNDS),
      ],
  )
def test_compute_bounds_percent_difference_with_target_value(
    self, comparison, target, pct_diff, expected_bounds, inclusive=False):
  assertion = metrics_pb2.Assertion(
      percent_difference=metrics_pb2.Assertion.PercentDifference(
          comparison=metrics_pb2.Assertion.Comparison.Value(comparison),
          value=target,
          percent=pct_diff,
      ),
      inclusive_bounds=inclusive,
  )
  collector = base.BaseCollector(
      metrics_pb2.TestCompletedEvent(benchmark_id="test_benchmark"), None)
  bounds = collector.compute_bounds("metric_key", assertion)
  self.assertEqual(bounds, utils.Bounds(*expected_bounds, inclusive))
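# Illustrative arithmetic (a sketch with assumed example values): with, say,
# target=100 and pct_diff=0.1, a WITHIN assertion yields bounds of
#   (100 - 0.1 * 100, 100 + 0.1 * 100) = (90.0, 110.0).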
def test_include_and_exclude(self):
  metric_source = metrics_pb2.MetricSource(
      tensorboard=metrics_pb2.TensorBoardSource(
          include_tags=[
              metrics_pb2.TensorBoardSource.TagStrategy(
                  tag_pattern="*",
                  strategies=[
                      metrics_pb2.TensorBoardSource.FINAL,
                  ])
          ],
          exclude_tags=[
              'foo',
              'train/*',
          ],
          aggregate_assertions=[
              metrics_pb2.TensorBoardSource.AggregateAssertion(
                  tag='foo',
                  strategy=metrics_pb2.TensorBoardSource.MIN,
                  assertion=metrics_pb2.Assertion(
                      within_bounds=metrics_pb2.Assertion.WithinBounds(
                          lower_bound=0.,
                          upper_bound=2.,
                      )))
          ]))
  event = metrics_pb2.TestCompletedEvent(
      benchmark_id="test_benchmark",
      output_path=self.temp_dir,
      metric_collection_config=metrics_pb2.MetricCollectionConfig(
          sources=[metric_source]))
  collector = tensorboard_collector.TensorBoardCollector(
      event=event, raw_source=metric_source)
  points = list(collector.metric_points())
  self.assertCountEqual(
      points,
      [
          utils.MetricPoint('eval/accuracy_final', .25, utils.NO_BOUNDS),
          utils.MetricPoint('foo_min', 1, utils.Bounds(0., 2., False)),
      ],
  )
def test_compute_bounds_fixed_value(self, comparison, threshold_value,
                                    expected_bounds, inclusive=False):
  assertion = metrics_pb2.Assertion(
      fixed_value=metrics_pb2.Assertion.FixedValue(
          comparison=metrics_pb2.Assertion.Comparison.Value(comparison),
          value=threshold_value,
      ),
      inclusive_bounds=inclusive,
  )
  collector = base.BaseCollector(
      metrics_pb2.TestCompletedEvent(benchmark_id="test_benchmark"), None)
  bounds = collector.compute_bounds("metric_key", assertion)
  self.assertSequenceAlmostEqual(
      dataclasses.astuple(bounds),
      # EQUAL is always inclusive
      dataclasses.astuple(
          utils.Bounds(*expected_bounds, inclusive or comparison == 'EQUAL')),
      places=3)
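# For reference, the comparison semantics exercised here, as implemented in
# BaseCollector.compute_bounds below:
#   LESS    -> Bounds(-inf, value, inclusive)
#   GREATER -> Bounds(value, inf, inclusive)
#   EQUAL   -> Bounds(value, value, True)  # always inclusive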
def compute_bounds(self, metric_key: str,
                   assertion: metrics_pb2.Assertion) -> utils.Bounds:
  """Returns the bounds for a given metric, based on the given assertion.

  This method may result in database calls to gather historical data for
  some types of assertions.

  Args:
    metric_key: Unique string identifying the name of the metric.
    assertion: The assertion that will be used to define the bounds.

  Returns:
    An instance of utils.Bounds representing the metric bounds.
  """
  if assertion is None:
    return utils.NO_BOUNDS

  lower_bound = -math.inf
  upper_bound = math.inf
  inclusive = assertion.inclusive_bounds

  assertion_type = assertion.WhichOneof('assertion_type')
  if assertion_type == 'fixed_value':
    c = assertion.fixed_value.comparison
    if c == metrics_pb2.Assertion.LESS:
      upper_bound = assertion.fixed_value.value
    elif c == metrics_pb2.Assertion.GREATER:
      lower_bound = assertion.fixed_value.value
    elif c == metrics_pb2.Assertion.EQUAL:
      lower_bound = assertion.fixed_value.value
      upper_bound = assertion.fixed_value.value
      inclusive = True
  elif assertion_type == 'within_bounds':
    lower_bound = assertion.within_bounds.lower_bound
    upper_bound = assertion.within_bounds.upper_bound
  elif assertion_type == 'std_devs_from_mean':
    values = self.get_metric_history(metric_key, assertion.time_window,
                                     assertion.min_timestamp)

    # Standard deviation is not defined for n < 2.
    min_num_points = max(assertion.wait_for_n_data_points, 2)
    if len(values) < min_num_points:
      logging.info(
          'Not enough data points to compute bounds for %s. '
          'Need %d points, have %d.', metric_key, min_num_points,
          len(values))
      return utils.NO_BOUNDS

    mean = np.mean(values)
    stddev = np.std(values)
    c = assertion.std_devs_from_mean.comparison
    if c in (metrics_pb2.Assertion.LESS, metrics_pb2.Assertion.WITHIN):
      upper_bound = mean + (stddev * assertion.std_devs_from_mean.std_devs)
    if c in (metrics_pb2.Assertion.GREATER, metrics_pb2.Assertion.WITHIN):
      lower_bound = mean - (stddev * assertion.std_devs_from_mean.std_devs)

    if upper_bound == math.inf and lower_bound == -math.inf:
      logging.error(
          '%s: comparison %s is not implemented for assertion type `%s`',
          metric_key, metrics_pb2.Assertion.Comparison.Name(c),
          assertion_type)
      return utils.NO_BOUNDS
  elif assertion_type == 'percent_difference':
    target_type = assertion.percent_difference.WhichOneof('target_type')
    if target_type == 'use_historical_mean':
      values = self.get_metric_history(metric_key, assertion.time_window,
                                       assertion.min_timestamp)

      # Mean is not defined for n < 1.
      min_num_points = max(assertion.wait_for_n_data_points, 1)
      if len(values) < min_num_points:
        logging.info(
            'Not enough data points to compute bounds for %s. '
            'Need %d points, have %d.', metric_key, min_num_points,
            len(values))
        return utils.NO_BOUNDS
      target = np.mean(values)
    elif target_type == 'value':
      target = assertion.percent_difference.value
    else:
      logging.error(
          '%s: No `target_type` defined for assertion type `%s`.',
          metric_key, assertion_type)
      return utils.NO_BOUNDS

    c = assertion.percent_difference.comparison
    if c in (metrics_pb2.Assertion.LESS, metrics_pb2.Assertion.WITHIN):
      upper_bound = target + (assertion.percent_difference.percent * target)
    if c in (metrics_pb2.Assertion.GREATER, metrics_pb2.Assertion.WITHIN):
      lower_bound = target - (assertion.percent_difference.percent * target)

    if upper_bound == math.inf and lower_bound == -math.inf:
      logging.error(
          '%s: comparison %s is not implemented for assertion type `%s`',
          metric_key, metrics_pb2.Assertion.Comparison.Name(c),
          assertion_type)
      return utils.NO_BOUNDS

  return utils.Bounds(lower_bound, upper_bound, inclusive)
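# Usage sketch (illustrative only; mirrors test_compute_bounds_within_bounds
# in the test suite above):
#
#   collector = base.BaseCollector(
#       metrics_pb2.TestCompletedEvent(benchmark_id="test_benchmark"), None)
#   assertion = metrics_pb2.Assertion(
#       within_bounds=metrics_pb2.Assertion.WithinBounds(
#           lower_bound=100, upper_bound=200),
#       inclusive_bounds=True)
#   collector.compute_bounds("metric_key", assertion)
#   # -> utils.Bounds(100.0, 200.0, True)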
def test_get_metrics_from_perfzero_summary(self):
  temp_dir = self.create_tempdir().full_path
  summary_dir = os.path.join(temp_dir, 'date_and_time')
  pathlib.Path(summary_dir).mkdir(parents=True, exist_ok=True)
  summary_path = os.path.join(summary_dir, 'perfzero_summary.json')
  with open(summary_path, 'w') as f:
    json.dump(
        {
            "execution_id": "execution_id",
            "execution_timestamp": 1234567890.1,
            "benchmark_result": {
                "wall_time": 1234,
                "metrics": [{
                    "name": "exp_per_second",
                    "value": 1.1,
                }, {
                    "name": "avg_exp_per_second",
                    "value": 2.2,
                }, {
                    "name": "startup_time",
                    "value": 3.3
                }],
            },
            "benchmark_info": {
                "not": "important",
            },
            "setup_info": {},
            "ml_framework_info": {
                "not": "important",
            },
            "system_info": {
                "not": "important"
            },
            "process_info": {
                "max_rss": 4.4,
                "max_vms": 5.5,
                "max_cpu_percent": 6.6,
            }
        }, f)

  metric_source = metrics_pb2.MetricSource(
      perfzero=metrics_pb2.PerfZeroSource(
          assertions={
              'total_wall_time':
                  metrics_pb2.Assertion(
                      within_bounds=metrics_pb2.Assertion.WithinBounds(
                          lower_bound=1230,
                          upper_bound=1240,
                      )),
              'exp_per_second':
                  metrics_pb2.Assertion(
                      within_bounds=metrics_pb2.Assertion.WithinBounds(
                          lower_bound=1,
                          upper_bound=100,
                      ),
                  )
          }))
  event = metrics_pb2.TestCompletedEvent(
      benchmark_id="test_benchmark",
      output_path=temp_dir,
      metric_collection_config=metrics_pb2.MetricCollectionConfig(
          sources=[metric_source]))
  collector = perfzero_collector.PerfZeroCollector(
      event=event, raw_source=metric_source)
  points = list(collector.metric_points())
  self.assertCountEqual(
      points,
      {
          utils.MetricPoint("total_wall_time", 1234,
                            utils.Bounds(1230., 1240., False)),
          utils.MetricPoint("exp_per_second", 1.1,
                            utils.Bounds(1., 100., False)),
          utils.MetricPoint("avg_exp_per_second", 2.2, utils.NO_BOUNDS),
          utils.MetricPoint("startup_time", 3.3, utils.NO_BOUNDS),
          utils.MetricPoint("process_info/max_rss", 4.4, utils.NO_BOUNDS),
          utils.MetricPoint("process_info/max_vms", 5.5, utils.NO_BOUNDS),
          utils.MetricPoint("process_info/max_cpu_percent", 6.6,
                            utils.NO_BOUNDS),
      },
  )