def test_compute_bounds_stddevs_from_mean(self, comparison, std_devs,
                                          expected_bounds, inclusive=False):
  # mean = 3, stddev = ~1.414
  value_history = range(1, 6)

  class _FakeMetricStore:
    def get_metric_history(*args, **kwargs):
      return [
          bigquery_client.MetricHistoryRow('', '', datetime.datetime.now(),
                                           '', v) for v in value_history
      ]

  assertion = metrics_pb2.Assertion(
      std_devs_from_mean=metrics_pb2.Assertion.StdDevsFromMean(
          comparison=metrics_pb2.Assertion.Comparison.Value(comparison),
          std_devs=std_devs,
      ),
      inclusive_bounds=inclusive,
  )
  collector = base.BaseCollector(
      metrics_pb2.TestCompletedEvent(benchmark_id="test_benchmark"), None,
      _FakeMetricStore())
  bounds = collector.compute_bounds("metric_key", assertion)
  self.assertSequenceAlmostEqual(
      dataclasses.astuple(bounds),
      # EQUAL is always inclusive
      dataclasses.astuple(
          utils.Bounds(*expected_bounds, inclusive or comparison == 'EQUAL')),
      places=3)
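# For reference, the fixture's inline comment checks out: range(1, 6) has
# mean 3 and population standard deviation sqrt(2) ~= 1.414 (the sample
# standard deviation would be ~1.581, so the comment implies the population
# form). A minimal sketch, assuming compute_bounds derives the interval as
# mean +/- std_devs * sigma -- an assumption based on this test's
# expectations, not taken from the implementation:
import statistics

history = [1, 2, 3, 4, 5]
mean = statistics.mean(history)     # 3.0
sigma = statistics.pstdev(history)  # sqrt(2) ~= 1.4142

std_devs = 2  # hypothetical parameter value
lower, upper = mean - std_devs * sigma, mean + std_devs * sigma
print(lower, upper)  # ~0.172, ~5.828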
def test_compute_bounds_percent_difference_with_mean_value(
    self, comparison, pct_diff, expected_bounds, inclusive=False):
  # mean = 3
  value_history = range(1, 6)

  class _FakeMetricStore:
    def get_metric_history(*args, **kwargs):
      return [
          bigquery_client.MetricHistoryRow('', '', datetime.datetime.now(),
                                           '', v) for v in value_history
      ]

  assertion = metrics_pb2.Assertion(
      percent_difference=metrics_pb2.Assertion.PercentDifference(
          comparison=metrics_pb2.Assertion.Comparison.Value(comparison),
          use_historical_mean=True,
          percent=pct_diff,
      ),
      inclusive_bounds=inclusive,
  )
  collector = base.BaseCollector(
      metrics_pb2.TestCompletedEvent(benchmark_id="test_benchmark"), None,
      _FakeMetricStore())
  bounds = collector.compute_bounds("metric_key", assertion)
  self.assertSequenceAlmostEqual(
      dataclasses.astuple(bounds),
      dataclasses.astuple(utils.Bounds(*expected_bounds, inclusive)),
      places=3)
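# The same fixture yields mean = 3. A plausible reading of PercentDifference
# with use_historical_mean=True is a band of mean * (1 +/- pct); whether
# `percent` is a fraction (0.1) or a whole percentage (10) is fixed by the
# test parameterization, which is not shown here. The sketch below assumes a
# fraction:
mean = 3.0
pct = 0.1  # hypothetical percent_difference value
lower, upper = mean * (1 - pct), mean * (1 + pct)
print(lower, upper)  # 2.7, 3.3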
def test_assert_duration(self):
  metric_source = metrics_pb2.MetricSource(
      literals=metrics_pb2.LiteralSource(
          assertions={
              "duration":
                  metrics_pb2.Assertion(
                      within_bounds=metrics_pb2.Assertion.WithinBounds(
                          lower_bound=100,
                          upper_bound=200,
                      ),
                      inclusive_bounds=False,
                  )
          }))
  event = metrics_pb2.TestCompletedEvent(
      benchmark_id="test_benchmark",
      duration=duration_pb2.Duration(seconds=150),
      metric_collection_config=metrics_pb2.MetricCollectionConfig(
          sources=[metric_source]))
  collector = literal_collector.LiteralCollector(
      event=event, raw_source=metric_source)
  points = collector.metric_points()
  self.assertLen(points, 1)
  self.assertEqual(points[0].metric_key, 'duration')
  self.assertEqual(points[0].metric_value, 150)
  self.assertEqual(points[0].bounds, utils.Bounds(100, 200, False))
def receive_test_event(data: dict, context) -> bool:
  """Entrypoint for Cloud Function.

  Args:
    data: dict containing base64-encoded proto message.
    context: event metadata object, including an `event_id` attribute.

  Returns:
    True if message should be ack-ed, else False.
  """
  logging.set_verbosity(logging.INFO)
  dataset = DATASET
  project = PROJECT or google.auth.default()[1]
  try:
    message_bytes = base64.b64decode(data['data'])
    event = metrics_pb2.TestCompletedEvent()
    event.ParseFromString(message_bytes)
  except Exception as e:
    logging.fatal(
        'Failed to parse PubSub message. Will ack message to prevent '
        'more crashes.', exc_info=e)
    return True

  alert_handler = alerts.AlertHandler(
      project, event.benchmark_id, event.debug_info, level='ERROR')
  logging.get_absl_logger().addHandler(alert_handler)

  metric_store = bigquery_client.BigQueryMetricStore(
      project=project,
      dataset=dataset,
  )
  try:
    logging.info('Processing test event: %s', str(event))
    job_row, metric_rows = process_proto_message(event, metric_store,
                                                 context.event_id)
    metric_store.insert_status_and_metrics(job_row, metric_rows)
  except Exception as e:
    logging.fatal(
        'Encountered exception while attempting to process message.',
        exc_info=e)

  if alert_handler.has_errors:
    logging.info('Alerts: %s', str(alert_handler._records))
    if SEND_EMAIL_ALERTS:
      _send_email(project, *alert_handler.generate_email_content)
    else:
      logging.info('E-mail alerts disabled.')
  else:
    logging.info('No alerts found.')

  return True
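# A minimal sketch of exercising the entrypoint locally. The SimpleNamespace
# below stands in for the event-metadata object Cloud Functions passes in;
# only its `event_id` attribute is read by the handler. Assumes PROJECT,
# DATASET, and BigQuery credentials are configured in the environment.
import base64
from types import SimpleNamespace

event = metrics_pb2.TestCompletedEvent(benchmark_id='my_benchmark')
payload = {'data': base64.b64encode(event.SerializeToString())}
fake_context = SimpleNamespace(event_id='0000001')

receive_test_event(payload, fake_context)  # True means the message is ack-ed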
def test_create_test_completed_event(self, succeeded_count, failed_count,
                                     conditions, expected_status):
  job = _job_from_dict({
      'metadata': {
          'name': 'job-name',
          'namespace': 'namespace',
          'labels': {
              'benchmarkId': 'test-job',
          },
      },
      'status': {
          'startTime': _START_TIME,
          'succeeded': succeeded_count,
          'failed': failed_count,
          'conditions': [
              {
                  'status': True,
                  'reason': reason,
                  'type': cond_type,
                  'lastTransitionTime': _END_TIME,
              } for cond_type, reason in conditions
          ]
      }
  })
  actual_event = event_publisher.create_test_completed_event(
      job,
      model_output_bucket='gs://fake-bucket',
      cluster_name='cluster-name',
      cluster_location='cluster-location',
      project='project-id')

  start_time = timestamp_pb2.Timestamp()
  start_time.FromDatetime(_START_TIME)
  duration = duration_pb2.Duration()
  duration.FromTimedelta(_END_TIME - _START_TIME)
  expected_event = metrics_pb2.TestCompletedEvent(
      benchmark_id='test-job',
      output_path='gs://fake-bucket/job-name',
      status=metrics_pb2.TestCompletedEvent.TestStatus.Value(expected_status),
      num_attempts=succeeded_count + failed_count,
      start_time=start_time,
      duration=duration,
      labels={'benchmarkId': 'test-job'},
      debug_info=metrics_pb2.DebugInfo(
          logs_link='https://console.cloud.google.com/logs?project=project-id&advancedFilter=resource.type%3Dk8s_container%0Aresource.labels.project_id%3Dproject-id%0Aresource.labels.cluster_name%3Dcluster-name%0Aresource.labels.namespace_name%3Dnamespace%0Aresource.labels.pod_name%3Ajob-name%0Aresource.labels.location%3Acluster-location%0A',
          details_link='https://console.cloud.google.com/kubernetes/job/cluster-location/cluster-name/namespace/job-name?project=project-id'),
      metric_collection_config=metrics_pb2.MetricCollectionConfig(),
  )
  self.assertProtoEqual(expected_event, actual_event)
def test_compute_bounds_within_bounds(self, lower, upper, expected_bounds,
                                      inclusive=False):
  assertion = metrics_pb2.Assertion(
      within_bounds=metrics_pb2.Assertion.WithinBounds(
          lower_bound=lower,
          upper_bound=upper,
      ),
      inclusive_bounds=inclusive,
  )
  collector = base.BaseCollector(
      metrics_pb2.TestCompletedEvent(benchmark_id="test_benchmark"), None)
  bounds = collector.compute_bounds("metric_key", assertion)
  self.assertEqual(bounds, utils.Bounds(*expected_bounds, inclusive))
def test_aggregate_metrics_include_all_strategies(self):
  metric_source = metrics_pb2.MetricSource(
      tensorboard=metrics_pb2.TensorBoardSource(include_tags=[
          metrics_pb2.TensorBoardSource.TagStrategy(
              tag_pattern="*",
              strategies=[
                  metrics_pb2.TensorBoardSource.FINAL,
                  metrics_pb2.TensorBoardSource.MAX,
                  metrics_pb2.TensorBoardSource.MIN,
                  metrics_pb2.TensorBoardSource.AVERAGE,
                  metrics_pb2.TensorBoardSource.MEDIAN,
              ])
      ]))
  event = metrics_pb2.TestCompletedEvent(
      benchmark_id="test_benchmark",
      output_path=self.temp_dir,
      metric_collection_config=metrics_pb2.MetricCollectionConfig(
          sources=[metric_source]))
  collector = tensorboard_collector.TensorBoardCollector(
      event=event, raw_source=metric_source)
  points = list(collector.metric_points())
  metric_to_value = {key: value for key, value, _ in points}
  self.assertDictEqual(
      metric_to_value, {
          'foo_final': 2,
          'foo_min': 1,
          'foo_max': 2,
          'foo_average': 1.5,
          'foo_median': 1.5,
          'eval/accuracy_final': .25,
          'eval/accuracy_min': .125,
          'eval/accuracy_max': .5,
          'eval/accuracy_average': np.mean([.125, .25, .5]),
          'eval/accuracy_median': np.median([.125, .25, .5]),
          'train/bar_final': 100,
          'train/bar_min': 10,
          'train/bar_max': 100,
          'train/bar_average': np.mean([10, 100, 100]),
          'train/bar_median': np.median([10, 100, 100]),
      })
  for _, _, bounds in points:
    self.assertEqual(bounds, utils.NO_BOUNDS)
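# Judging from the expected values, each strategy is a simple reduction over
# the scalar series logged for a tag. A sketch of that implied mapping (the
# dict below is illustrative, not the collector's actual lookup table); the
# step order [.125, .5, .25] for eval/accuracy is hypothetical but consistent
# with final=.25, min=.125, max=.5:
import numpy as np

STRATEGY_FNS = {
    'final': lambda v: v[-1],  # value at the last logged step
    'max': np.max,
    'min': np.min,
    'average': np.mean,
    'median': np.median,
}

series = np.array([.125, .5, .25])
assert STRATEGY_FNS['final'](series) == .25
assert STRATEGY_FNS['max'](series) == .5
assert STRATEGY_FNS['average'](series) == np.mean([.125, .25, .5])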
def test_aggregate_metrics_with_assertion(self):
  metric_source = metrics_pb2.MetricSource(
      tensorboard=metrics_pb2.TensorBoardSource(
          include_tags=[
              metrics_pb2.TensorBoardSource.TagStrategy(
                  tag_pattern="eval/*",
                  strategies=[
                      metrics_pb2.TensorBoardSource.FINAL,
                      metrics_pb2.TensorBoardSource.MAX,
                      metrics_pb2.TensorBoardSource.MIN,
                  ])
          ],
          aggregate_assertions=[
              metrics_pb2.TensorBoardSource.AggregateAssertion(
                  tag='eval/accuracy',
                  strategy=metrics_pb2.TensorBoardSource.MAX,
                  assertion=metrics_pb2.Assertion(
                      within_bounds=metrics_pb2.Assertion.WithinBounds(
                          lower_bound=.4,
                          upper_bound=1.0,
                      ),
                      inclusive_bounds=True,
                  ))
          ]))
  event = metrics_pb2.TestCompletedEvent(
      benchmark_id="test_benchmark",
      output_path=self.temp_dir,
      metric_collection_config=metrics_pb2.MetricCollectionConfig(
          sources=[metric_source]))
  collector = tensorboard_collector.TensorBoardCollector(
      event=event, raw_source=metric_source)
  points = list(collector.metric_points())
  self.assertCountEqual(
      points,
      [
          utils.MetricPoint('eval/accuracy_max', .5,
                            utils.Bounds(.4, 1.0, True)),
          utils.MetricPoint('eval/accuracy_min', .125, utils.NO_BOUNDS),
          utils.MetricPoint('eval/accuracy_final', .25, utils.NO_BOUNDS),
      ],
  )
def test_compute_bounds_percent_difference_with_target_value(
    self, comparison, target, pct_diff, expected_bounds, inclusive=False):
  assertion = metrics_pb2.Assertion(
      percent_difference=metrics_pb2.Assertion.PercentDifference(
          comparison=metrics_pb2.Assertion.Comparison.Value(comparison),
          value=target,
          percent=pct_diff,
      ),
      inclusive_bounds=inclusive,
  )
  collector = base.BaseCollector(
      metrics_pb2.TestCompletedEvent(benchmark_id="test_benchmark"), None)
  bounds = collector.compute_bounds("metric_key", assertion)
  self.assertEqual(bounds, utils.Bounds(*expected_bounds, inclusive))
def test_include_and_exclude(self):
  metric_source = metrics_pb2.MetricSource(
      tensorboard=metrics_pb2.TensorBoardSource(
          include_tags=[
              metrics_pb2.TensorBoardSource.TagStrategy(
                  tag_pattern="*",
                  strategies=[
                      metrics_pb2.TensorBoardSource.FINAL,
                  ])
          ],
          exclude_tags=[
              'foo',
              'train/*',
          ],
          aggregate_assertions=[
              metrics_pb2.TensorBoardSource.AggregateAssertion(
                  tag='foo',
                  strategy=metrics_pb2.TensorBoardSource.MIN,
                  assertion=metrics_pb2.Assertion(
                      within_bounds=metrics_pb2.Assertion.WithinBounds(
                          lower_bound=0.,
                          upper_bound=2.,
                      )))
          ]))
  event = metrics_pb2.TestCompletedEvent(
      benchmark_id="test_benchmark",
      output_path=self.temp_dir,
      metric_collection_config=metrics_pb2.MetricCollectionConfig(
          sources=[metric_source]))
  collector = tensorboard_collector.TensorBoardCollector(
      event=event, raw_source=metric_source)
  points = list(collector.metric_points())
  self.assertCountEqual(
      points,
      [
          utils.MetricPoint('eval/accuracy_final', .25, utils.NO_BOUNDS),
          utils.MetricPoint('foo_min', 1, utils.Bounds(0., 2., False)),
      ],
  )
def test_min_time(self, window, timestamp, expected_min):
  start_time = timestamp_pb2.Timestamp()
  start_time.FromDatetime(datetime.datetime(2021, 2, 16, 0, 0, 0))
  min_timestamp = timestamp_pb2.Timestamp()
  if timestamp:
    min_timestamp.FromDatetime(timestamp)
  time_window = duration_pb2.Duration()
  if window:
    time_window.FromTimedelta(window)
  assertion = metrics_pb2.Assertion(
      std_devs_from_mean=metrics_pb2.Assertion.StdDevsFromMean(
          comparison=metrics_pb2.Assertion.Comparison.WITHIN,
          std_devs=1,
      ),
      time_window=time_window,
      min_timestamp=min_timestamp,
  )
  metric_store = bigquery_client.BigQueryMetricStore('fake_dataset',
                                                     'fake_project')
  metric_store.get_metric_history = mock.Mock(return_value=[])
  collector = base.BaseCollector(
      metrics_pb2.TestCompletedEvent(
          benchmark_id="test_benchmark",
          start_time=start_time,
      ), None, metric_store)
  collector.compute_bounds("metric_key", assertion)
  metric_store.get_metric_history.assert_called_with(
      benchmark_id="test_benchmark",
      metric_key="metric_key",
      min_time=expected_min)
def test_compute_bounds_fixed_value(self, comparison, threshold_value,
                                    expected_bounds, inclusive=False):
  assertion = metrics_pb2.Assertion(
      fixed_value=metrics_pb2.Assertion.FixedValue(
          comparison=metrics_pb2.Assertion.Comparison.Value(comparison),
          value=threshold_value,
      ),
      inclusive_bounds=inclusive,
  )
  collector = base.BaseCollector(
      metrics_pb2.TestCompletedEvent(benchmark_id="test_benchmark"), None)
  bounds = collector.compute_bounds("metric_key", assertion)
  self.assertSequenceAlmostEqual(
      dataclasses.astuple(bounds),
      # EQUAL is always inclusive
      dataclasses.astuple(
          utils.Bounds(*expected_bounds, inclusive or comparison == 'EQUAL')),
      places=3)
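# The `inclusive or comparison == 'EQUAL'` expression encodes the one rule
# this test guarantees: an EQUAL comparison is always inclusive. A plausible
# mapping from a fixed-value comparison to a Bounds interval (the LESS and
# GREATER cases are assumptions for illustration):
import math

def fixed_value_bounds(comparison: str, value: float, inclusive: bool):
  if comparison == 'LESS':
    return (-math.inf, value, inclusive)
  if comparison == 'GREATER':
    return (value, math.inf, inclusive)
  if comparison == 'EQUAL':
    return (value, value, True)  # EQUAL is always inclusive
  raise ValueError(f'Unsupported comparison: {comparison}')

print(fixed_value_bounds('GREATER', 100.0, False))  # (100.0, inf, False)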
def test_get_metrics_from_perfzero_summary(self):
  temp_dir = self.create_tempdir().full_path
  summary_dir = os.path.join(temp_dir, 'date_and_time')
  pathlib.Path(summary_dir).mkdir(parents=True, exist_ok=True)
  summary_path = os.path.join(summary_dir, 'perfzero_summary.json')
  with open(summary_path, 'w') as f:
    json.dump(
        {
            "execution_id": "execution_id",
            "execution_timestamp": 1234567890.1,
            "benchmark_result": {
                "wall_time": 1234,
                "metrics": [{
                    "name": "exp_per_second",
                    "value": 1.1,
                }, {
                    "name": "avg_exp_per_second",
                    "value": 2.2,
                }, {
                    "name": "startup_time",
                    "value": 3.3
                }],
            },
            "benchmark_info": {
                "not": "important",
            },
            "setup_info": {},
            "ml_framework_info": {
                "not": "important",
            },
            "system_info": {
                "not": "important"
            },
            "process_info": {
                "max_rss": 4.4,
                "max_vms": 5.5,
                "max_cpu_percent": 6.6,
            }
        }, f)

  metric_source = metrics_pb2.MetricSource(
      perfzero=metrics_pb2.PerfZeroSource(
          assertions={
              'total_wall_time':
                  metrics_pb2.Assertion(
                      within_bounds=metrics_pb2.Assertion.WithinBounds(
                          lower_bound=1230,
                          upper_bound=1240,
                      )),
              'exp_per_second':
                  metrics_pb2.Assertion(
                      within_bounds=metrics_pb2.Assertion.WithinBounds(
                          lower_bound=1,
                          upper_bound=100,
                      ),
                  )
          }))
  event = metrics_pb2.TestCompletedEvent(
      benchmark_id="test_benchmark",
      output_path=temp_dir,
      metric_collection_config=metrics_pb2.MetricCollectionConfig(
          sources=[metric_source]))
  collector = perfzero_collector.PerfZeroCollector(
      event=event, raw_source=metric_source)
  points = list(collector.metric_points())
  self.assertCountEqual(
      points,
      {
          utils.MetricPoint("total_wall_time", 1234,
                            utils.Bounds(1230., 1240., False)),
          utils.MetricPoint("exp_per_second", 1.1,
                            utils.Bounds(1., 100., False)),
          utils.MetricPoint("avg_exp_per_second", 2.2, utils.NO_BOUNDS),
          utils.MetricPoint("startup_time", 3.3, utils.NO_BOUNDS),
          utils.MetricPoint("process_info/max_rss", 4.4, utils.NO_BOUNDS),
          utils.MetricPoint("process_info/max_vms", 5.5, utils.NO_BOUNDS),
          utils.MetricPoint("process_info/max_cpu_percent", 6.6,
                            utils.NO_BOUNDS),
      },
  )
def create_test_completed_event(
    job: kubernetes.client.V1Job, model_output_bucket: str, cluster_name: str,
    cluster_location: str, project: str) -> metrics_pb2.TestCompletedEvent:
  """Returns a TestCompletedEvent to publish to PubSub.

  Args:
    job: A Kubernetes Job resource.
    model_output_bucket: Path to GCS bucket with model outputs.
    cluster_name: Name of the current Kubernetes cluster.
    cluster_location: Location (region or zone) of the current Kubernetes
      cluster.
    project: The project ID of the current project.

  Returns:
    A TestCompletedEvent with the information from job.
  """
  # job.status.conditions _usually_ has length 1, but it can have both
  # passing and failing conditions. Give precedence to failing conditions.
  if len(job.status.conditions) == 1:
    condition = job.status.conditions[0]
  elif len(job.status.conditions) == 0:
    logging.error('Job %s has no conditions.', job.metadata.name)
    return
  else:
    condition = next(
        (c for c in job.status.conditions if c.type == 'Failed'), None)
    if not condition:
      logging.error('This should never happen. Conditions: %s',
                    str(job.status.conditions))
      return

  if condition.reason == 'DeadlineExceeded':
    job_status = metrics_pb2.TestCompletedEvent.TIMEOUT
  elif condition.reason == 'BackoffLimitExceeded':
    job_status = metrics_pb2.TestCompletedEvent.FAILED
  elif condition.type == 'Complete':
    job_status = metrics_pb2.TestCompletedEvent.COMPLETED
  else:
    logging.error('Unknown condition for Job %s: %s', job.metadata.name,
                  str(condition))
    return

  annotations = job.metadata.annotations or {}
  gcs_subdir = annotations.get('ml-testing-accelerators/gcs-subdir', '')
  output_path = os.path.join(model_output_bucket, gcs_subdir,
                             job.metadata.name)

  metric_config = metrics_pb2.MetricCollectionConfig()
  mcc_json = annotations.get('ml-testing-accelerators/metric-config', '{}')
  json_format.Parse(mcc_json, metric_config)

  stackdriver_query = textwrap.dedent(f"""\
      resource.type=k8s_container
      resource.labels.project_id={project}
      resource.labels.cluster_name={cluster_name}
      resource.labels.namespace_name={job.metadata.namespace}
      resource.labels.pod_name:{job.metadata.name}
      resource.labels.location:{cluster_location}
      """)
  stackdriver_link = "https://console.cloud.google.com/logs?{}".format(
      urllib.parse.urlencode({
          'project': project,
          'advancedFilter': stackdriver_query
      }))

  start_time = timestamp_pb2.Timestamp()
  start_time.FromDatetime(job.status.start_time)
  duration = duration_pb2.Duration()
  duration.FromTimedelta(condition.last_transition_time -
                         job.status.start_time)

  return metrics_pb2.TestCompletedEvent(
      benchmark_id=job.metadata.labels['benchmarkId'],
      output_path=output_path,
      status=job_status,
      num_attempts=(job.status.succeeded or 0) + (job.status.failed or 0),
      start_time=start_time,
      duration=duration,
      metric_collection_config=metric_config,
      labels=job.metadata.labels,
      debug_info=metrics_pb2.DebugInfo(
          logs_link=stackdriver_link,
          # TODO: fix hard-coded region and cluster name
          details_link=f'https://console.cloud.google.com/kubernetes/job/{cluster_location}/{cluster_name}/{job.metadata.namespace}/{job.metadata.name}?project={project}'
      ))
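# A quick illustration of the precedence rule above: a Job that records both
# a Complete and a Failed condition is treated as failed. SimpleNamespace
# objects stand in for the Kubernetes client's condition type:
from types import SimpleNamespace

conditions = [
    SimpleNamespace(type='Complete', reason=''),
    SimpleNamespace(type='Failed', reason='BackoffLimitExceeded'),
]

# Same selection as in the function: failing conditions win when present.
condition = next((c for c in conditions if c.type == 'Failed'), None)
assert condition is not None and condition.reason == 'BackoffLimitExceeded'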