def test_assert_duration(self):
    """A literal duration assertion yields exactly one bounded metric point."""
    duration_assertion = metrics_pb2.Assertion(
        within_bounds=metrics_pb2.Assertion.WithinBounds(
            lower_bound=100,
            upper_bound=200,
        ),
        inclusive_bounds=False,
    )
    source = metrics_pb2.MetricSource(
        literals=metrics_pb2.LiteralSource(
            assertions={"duration": duration_assertion}))
    completed_event = metrics_pb2.TestCompletedEvent(
        benchmark_id="test_benchmark",
        duration=duration_pb2.Duration(seconds=150),
        metric_collection_config=metrics_pb2.MetricCollectionConfig(
            sources=[source]))
    collector = literal_collector.LiteralCollector(
        event=completed_event, raw_source=source)
    points = collector.metric_points()
    # One point: the 150s duration, carrying the (100, 200) exclusive bounds.
    self.assertLen(points, 1)
    self.assertEqual(points[0].metric_key, 'duration')
    self.assertEqual(points[0].metric_value, 150)
    self.assertEqual(points[0].bounds, utils.Bounds(100, 200, False))
  def test_create_test_completed_event(self, succeeded_count, failed_count, conditions, expected_status):
    """End-to-end check: a finished Job maps to the expected TestCompletedEvent."""
    condition_dicts = [
        {
            'status': True,
            'reason': reason,
            'type': cond_type,
            'lastTransitionTime': _END_TIME,
        }
        for cond_type, reason in conditions
    ]
    job = _job_from_dict({
        'metadata': {
            'name': 'job-name',
            'namespace': 'namespace',
            'labels': {'benchmarkId': 'test-job'},
        },
        'status': {
            'startTime': _START_TIME,
            'succeeded': succeeded_count,
            'failed': failed_count,
            'conditions': condition_dicts,
        },
    })

    actual_event = event_publisher.create_test_completed_event(
        job,
        model_output_bucket='gs://fake-bucket',
        cluster_name='cluster-name',
        cluster_location='cluster-location',
        project='project-id')

    # Expected timestamps mirror the Job's start time and last transition time.
    expected_start = timestamp_pb2.Timestamp()
    expected_start.FromDatetime(_START_TIME)
    expected_duration = duration_pb2.Duration()
    expected_duration.FromTimedelta(_END_TIME - _START_TIME)
    expected_event = metrics_pb2.TestCompletedEvent(
        benchmark_id='test-job',
        output_path='gs://fake-bucket/job-name',
        status=metrics_pb2.TestCompletedEvent.TestStatus.Value(expected_status),
        num_attempts=succeeded_count + failed_count,
        start_time=expected_start,
        duration=expected_duration,
        labels={'benchmarkId': 'test-job'},
        debug_info=metrics_pb2.DebugInfo(
            logs_link='https://console.cloud.google.com/logs?project=project-id&advancedFilter=resource.type%3Dk8s_container%0Aresource.labels.project_id%3Dproject-id%0Aresource.labels.cluster_name%3Dcluster-name%0Aresource.labels.namespace_name%3Dnamespace%0Aresource.labels.pod_name%3Ajob-name%0Aresource.labels.location%3Acluster-location%0A',
            details_link='https://console.cloud.google.com/kubernetes/job/cluster-location/cluster-name/namespace/job-name?project=project-id',
        ),
        metric_collection_config=metrics_pb2.MetricCollectionConfig(),
    )

    self.assertProtoEqual(expected_event, actual_event)
    def test_aggregate_metrics_include_all_strategies(self):
        """Each of the five aggregation strategies yields a point per tag."""
        source = metrics_pb2.MetricSource(
            tensorboard=metrics_pb2.TensorBoardSource(include_tags=[
                metrics_pb2.TensorBoardSource.TagStrategy(
                    tag_pattern="*",
                    strategies=[
                        metrics_pb2.TensorBoardSource.FINAL,
                        metrics_pb2.TensorBoardSource.MAX,
                        metrics_pb2.TensorBoardSource.MIN,
                        metrics_pb2.TensorBoardSource.AVERAGE,
                        metrics_pb2.TensorBoardSource.MEDIAN,
                    ])
            ]))
        event = metrics_pb2.TestCompletedEvent(
            benchmark_id="test_benchmark",
            output_path=self.temp_dir,
            metric_collection_config=metrics_pb2.MetricCollectionConfig(
                sources=[source]))
        collector = tensorboard_collector.TensorBoardCollector(
            event=event, raw_source=source)
        points = list(collector.metric_points())

        actual_values = {key: value for key, value, _ in points}
        expected_values = {
            'foo_final': 2,
            'foo_min': 1,
            'foo_max': 2,
            'foo_average': 1.5,
            'foo_median': 1.5,
            'eval/accuracy_final': .25,
            'eval/accuracy_min': .125,
            'eval/accuracy_max': .5,
            'eval/accuracy_average': np.mean([.125, .25, .5]),
            'eval/accuracy_median': np.median([.125, .25, .5]),
            'train/bar_final': 100,
            'train/bar_min': 10,
            'train/bar_max': 100,
            'train/bar_average': np.mean([10, 100, 100]),
            'train/bar_median': np.median([10, 100, 100]),
        }
        self.assertDictEqual(actual_values, expected_values)

        # No aggregate assertions were configured, so every point is unbounded.
        for point in points:
            self.assertEqual(point.bounds, utils.NO_BOUNDS)
    def test_aggregate_metrics_with_assertion(self):
        """Bounds attach only to the (tag, strategy) pair that has an assertion."""
        accuracy_assertion = metrics_pb2.TensorBoardSource.AggregateAssertion(
            tag='eval/accuracy',
            strategy=metrics_pb2.TensorBoardSource.MAX,
            assertion=metrics_pb2.Assertion(
                within_bounds=metrics_pb2.Assertion.WithinBounds(
                    lower_bound=.4,
                    upper_bound=1.0,
                ),
                inclusive_bounds=True,
            ))
        source = metrics_pb2.MetricSource(
            tensorboard=metrics_pb2.TensorBoardSource(
                include_tags=[
                    metrics_pb2.TensorBoardSource.TagStrategy(
                        tag_pattern="eval/*",
                        strategies=[
                            metrics_pb2.TensorBoardSource.FINAL,
                            metrics_pb2.TensorBoardSource.MAX,
                            metrics_pb2.TensorBoardSource.MIN,
                        ])
                ],
                aggregate_assertions=[accuracy_assertion]))
        event = metrics_pb2.TestCompletedEvent(
            benchmark_id="test_benchmark",
            output_path=self.temp_dir,
            metric_collection_config=metrics_pb2.MetricCollectionConfig(
                sources=[source]))
        collector = tensorboard_collector.TensorBoardCollector(
            event=event, raw_source=source)
        points = list(collector.metric_points())

        # Only eval/accuracy_max carries bounds; the other strategies stay free.
        expected_points = [
            utils.MetricPoint('eval/accuracy_max', .5,
                              utils.Bounds(.4, 1.0, True)),
            utils.MetricPoint('eval/accuracy_min', .125, utils.NO_BOUNDS),
            utils.MetricPoint('eval/accuracy_final', .25, utils.NO_BOUNDS),
        ]
        self.assertCountEqual(points, expected_points)
    def test_include_and_exclude(self):
        """Excluded tags are dropped unless an explicit assertion names them."""
        foo_min_assertion = metrics_pb2.TensorBoardSource.AggregateAssertion(
            tag='foo',
            strategy=metrics_pb2.TensorBoardSource.MIN,
            assertion=metrics_pb2.Assertion(
                within_bounds=metrics_pb2.Assertion.WithinBounds(
                    lower_bound=0.,
                    upper_bound=2.,
                )))
        source = metrics_pb2.MetricSource(
            tensorboard=metrics_pb2.TensorBoardSource(
                include_tags=[
                    metrics_pb2.TensorBoardSource.TagStrategy(
                        tag_pattern="*",
                        strategies=[metrics_pb2.TensorBoardSource.FINAL])
                ],
                exclude_tags=['foo', 'train/*'],
                aggregate_assertions=[foo_min_assertion]))
        event = metrics_pb2.TestCompletedEvent(
            benchmark_id="test_benchmark",
            output_path=self.temp_dir,
            metric_collection_config=metrics_pb2.MetricCollectionConfig(
                sources=[source]))
        collector = tensorboard_collector.TensorBoardCollector(
            event=event, raw_source=source)
        points = list(collector.metric_points())

        # 'foo' is excluded from the FINAL sweep but still surfaces through its
        # explicit MIN assertion; 'train/*' is dropped entirely.
        self.assertCountEqual(
            points,
            [
                utils.MetricPoint('eval/accuracy_final', .25, utils.NO_BOUNDS),
                utils.MetricPoint('foo_min', 1, utils.Bounds(0., 2., False)),
            ])
  def test_metric_collection_config(self, gcs_subdir):
    """The metric-config annotation is parsed into a MetricCollectionConfig."""
    metric_config_json = json.dumps({
        'sources': [{
            'literals': {
                'assertions': {
                    'duration': {
                        'within_bounds': {
                            'lower_bound': 1,
                            'upper_bound': 2,
                        }
                    }
                }
            }
        }]
    })
    job = _job_from_dict({
        'metadata': {
            'name': 'job-name',
            'namespace': 'namespace',
            'labels': {'benchmarkId': 'test-job'},
            'annotations': {
                'ml-testing-accelerators/metric-config': metric_config_json,
            },
        },
        'status': {
            'startTime': _START_TIME,
            'completionTime': _END_TIME,
            'succeeded': 1,
            'conditions': [{
                'status': True,
                'type': 'Complete',
                'lastTransitionTime': _END_TIME,
            }],
        },
    })
    # The gcs-subdir annotation is optional; only set it when parameterized.
    if gcs_subdir:
      job.metadata.annotations['ml-testing-accelerators/gcs-subdir'] = gcs_subdir

    actual_event = event_publisher.create_test_completed_event(
        job,
        model_output_bucket='gs://fake-bucket',
        cluster_name='cluster-name',
        cluster_location='cluster-location',
        project='project-id')

    expected_mcc = metrics_pb2.MetricCollectionConfig(
        sources=[
            metrics_pb2.MetricSource(
                literals=metrics_pb2.LiteralSource(
                    assertions={
                        'duration':
                            metrics_pb2.Assertion(
                                within_bounds=metrics_pb2.Assertion.WithinBounds(
                                    lower_bound=1,
                                    upper_bound=2,
                                ))
                    }))
        ])
    expected_path = os.path.join('gs://fake-bucket', gcs_subdir or '', 'job-name')
    self.assertEqual(actual_event.output_path, expected_path)
    self.assertProtoEqual(expected_mcc, actual_event.metric_collection_config)
# Example 7
    def test_get_metrics_from_perfzero_summary(self):
        """A PerfZero summary file is flattened into points, some with bounds."""
        temp_dir = self.create_tempdir().full_path
        summary_dir = os.path.join(temp_dir, 'date_and_time')
        pathlib.Path(summary_dir).mkdir(parents=True, exist_ok=True)
        summary = {
            "execution_id": "execution_id",
            "execution_timestamp": 1234567890.1,
            "benchmark_result": {
                "wall_time": 1234,
                "metrics": [
                    {"name": "exp_per_second", "value": 1.1},
                    {"name": "avg_exp_per_second", "value": 2.2},
                    {"name": "startup_time", "value": 3.3},
                ],
            },
            "benchmark_info": {"not": "important"},
            "setup_info": {},
            "ml_framework_info": {"not": "important"},
            "system_info": {"not": "important"},
            "process_info": {
                "max_rss": 4.4,
                "max_vms": 5.5,
                "max_cpu_percent": 6.6,
            },
        }
        with open(os.path.join(summary_dir, 'perfzero_summary.json'), 'w') as f:
            json.dump(summary, f)

        source = metrics_pb2.MetricSource(
            perfzero=metrics_pb2.PerfZeroSource(
                assertions={
                    'total_wall_time':
                        metrics_pb2.Assertion(
                            within_bounds=metrics_pb2.Assertion.WithinBounds(
                                lower_bound=1230,
                                upper_bound=1240,
                            )),
                    'exp_per_second':
                        metrics_pb2.Assertion(
                            within_bounds=metrics_pb2.Assertion.WithinBounds(
                                lower_bound=1,
                                upper_bound=100,
                            )),
                }))
        event = metrics_pb2.TestCompletedEvent(
            benchmark_id="test_benchmark",
            output_path=temp_dir,
            metric_collection_config=metrics_pb2.MetricCollectionConfig(
                sources=[source]))

        collector = perfzero_collector.PerfZeroCollector(
            event=event, raw_source=source)
        points = list(collector.metric_points())
        # Asserted keys get bounds; everything else is reported unbounded.
        self.assertCountEqual(
            points,
            {
                utils.MetricPoint("total_wall_time", 1234,
                                  utils.Bounds(1230., 1240., False)),
                utils.MetricPoint("exp_per_second", 1.1,
                                  utils.Bounds(1., 100., False)),
                utils.MetricPoint("avg_exp_per_second", 2.2, utils.NO_BOUNDS),
                utils.MetricPoint("startup_time", 3.3, utils.NO_BOUNDS),
                utils.MetricPoint("process_info/max_rss", 4.4,
                                  utils.NO_BOUNDS),
                utils.MetricPoint("process_info/max_vms", 5.5,
                                  utils.NO_BOUNDS),
                utils.MetricPoint("process_info/max_cpu_percent", 6.6,
                                  utils.NO_BOUNDS),
            })
def create_test_completed_event(
        job: kubernetes.client.V1Job, model_output_bucket: str,
        cluster_name: str, cluster_location: str,
        project: str) -> metrics_pb2.TestCompletedEvent:
    """Returns a TestCompletedEvent to publish to PubSub.

  Args:
    job: A Kubernetes Job resource.
    model_output_bucket: Path to GCS bucket with model outputs.
    cluster_name: Name of the current Kubernetes cluster.
    cluster_location: Location (region or zone) of the current Kubernetes cluster.
    project: The project ID of the current project.

  Returns:
    A TestCompletedEvent with the information from job, or None if the Job's
    conditions do not describe a recognizable terminal state.
  """
    # The Kubernetes client can report `conditions` as None rather than an
    # empty list; normalize so len() below cannot raise TypeError.
    conditions = job.status.conditions or []
    if len(conditions) == 1:
        condition = conditions[0]
    elif not conditions:
        logging.error('Job %s has no conditions.', job.metadata.name)
        return
    else:
        # job.status.conditions _usually_ has length 1, but it can have both
        # passing and failing conditions. Give precedence to failing conditions.
        condition = next(
            (c for c in conditions if c.type == 'Failed'), None)

    if not condition:
        logging.error('This should never happen. Conditions: %s',
                      str(job.status.conditions))
        return
    elif condition.reason == 'DeadlineExceeded':
        job_status = metrics_pb2.TestCompletedEvent.TIMEOUT
    elif condition.reason == 'BackoffLimitExceeded':
        job_status = metrics_pb2.TestCompletedEvent.FAILED
    elif condition.type == 'Complete':
        job_status = metrics_pb2.TestCompletedEvent.COMPLETED
    else:
        logging.error('Unknown condition for Job %s: %s', job.metadata.name,
                      str(condition))
        return

    # Optional per-job GCS subdirectory, supplied via annotation.
    annotations = job.metadata.annotations or {}
    gcs_subdir = annotations.get('ml-testing-accelerators/gcs-subdir', '')
    output_path = os.path.join(model_output_bucket, gcs_subdir,
                               job.metadata.name)

    # Metric collection config is attached to the Job as a JSON annotation;
    # an absent annotation parses as an empty MetricCollectionConfig.
    metric_config = metrics_pb2.MetricCollectionConfig()
    mcc_json = annotations.get('ml-testing-accelerators/metric-config', '{}')
    json_format.Parse(mcc_json, metric_config)

    # Stackdriver filter matching this Job's containers. Note `pod_name` and
    # `location` use substring match (':'), the rest exact match ('=').
    stackdriver_query = textwrap.dedent(f"""\
    resource.type=k8s_container
    resource.labels.project_id={project}
    resource.labels.cluster_name={cluster_name}
    resource.labels.namespace_name={job.metadata.namespace}
    resource.labels.pod_name:{job.metadata.name}
    resource.labels.location:{cluster_location}
  """)
    stackdriver_link = "https://console.cloud.google.com/logs?{}".format(
        urllib.parse.urlencode({
            'project': project,
            'advancedFilter': stackdriver_query
        }))

    start_time = timestamp_pb2.Timestamp()
    start_time.FromDatetime(job.status.start_time)
    duration = duration_pb2.Duration()
    # Duration spans from Job start to the terminal condition's transition.
    duration.FromTimedelta(condition.last_transition_time -
                           job.status.start_time)

    return metrics_pb2.TestCompletedEvent(
        benchmark_id=job.metadata.labels['benchmarkId'],
        output_path=output_path,
        status=job_status,
        num_attempts=(job.status.succeeded or 0) + (job.status.failed or 0),
        start_time=start_time,
        duration=duration,
        metric_collection_config=metric_config,
        labels=job.metadata.labels,
        debug_info=metrics_pb2.DebugInfo(
            logs_link=stackdriver_link,
            # TODO: fix hard-coded region and cluster name
            details_link=
            f'https://console.cloud.google.com/kubernetes/job/{cluster_location}/{cluster_name}/{job.metadata.namespace}/{job.metadata.name}?project={project}'
        ))