def load_eval_config(output_path: Text) -> config.EvalConfig:
    """Loads eval config."""
    path = os.path.join(output_path, _EVAL_CONFIG_FILE)
    if tf.io.gfile.exists(path):
        with tf.io.gfile.GFile(path, 'r') as f:
            pb = json_format.Parse(f.read(), config_pb2.EvalConfigAndVersion())
            _check_version(pb.version, output_path)
            return pb.eval_config
    else:
        # Legacy support (to be removed in the future).
        # The previous version did not include file extension.
        path = os.path.splitext(path)[0]
        serialized_record = six.next(
            tf.compat.v1.python_io.tf_record_iterator(path))
        final_dict = pickle.loads(serialized_record)
        _check_version(final_dict, output_path)
        old_config = final_dict['eval_config']
        slicing_specs = None
        if old_config.slice_spec:
            slicing_specs = [s.to_proto() for s in old_config.slice_spec]
        options = config.Options()
        options.compute_confidence_intervals.value = (
            old_config.compute_confidence_intervals)
        options.k_anonymization_count.value = old_config.k_anonymization_count
        return config.EvalConfig(
            input_data_specs=[
                config.InputDataSpec(location=old_config.data_location)
            ],
            model_specs=[config.ModelSpec(location=old_config.model_location)],
            output_data_specs=[
                config.OutputDataSpec(default_location=output_path)
            ],
            slicing_specs=slicing_specs,
            options=options)
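# A minimal usage sketch, not part of the library source: it assumes
# `output_path` is a directory previously written by run_model_analysis, so
# that an eval_config.json (or the legacy pickled record) already exists.
def _example_load_eval_config(output_path: Text) -> None:
    eval_config = load_eval_config(output_path)
    # EvalConfig is a proto; model and data locations live on its nested specs.
    print(eval_config.model_specs[0].location)
    print(eval_config.input_data_specs[0].location)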
def default_writers(
    eval_shared_model: Optional[types.EvalSharedModel] = None,
    eval_shared_models: Optional[List[types.EvalSharedModel]] = None,
    output_path: Optional[Text] = None,
    eval_config: Optional[config.EvalConfig] = None,
) -> List[writer.Writer]:  # pylint: disable=invalid-name
    """Returns the default writers for use in WriteResults.

  Args:
    eval_shared_model: Shared model (single-model evaluation).
    eval_shared_models: Shared models (multi-model evaluation).
    output_path: Deprecated (use EvalConfig).
    eval_config: Eval config.
  """
    # TODO(b/141016373): Add support for multiple models.
    if eval_config is not None:
        output_spec = eval_config.output_data_specs[0]
    elif output_path is not None:
        output_spec = config.OutputDataSpec(default_location=output_path)
    else:
        raise ValueError('eval_config or output_path must be provided')
    if eval_shared_model is not None:
        eval_shared_models = [eval_shared_model]
    output_paths = {
        constants.METRICS_KEY: output_filename(output_spec,
                                               constants.METRICS_KEY),
        constants.PLOTS_KEY: output_filename(output_spec, constants.PLOTS_KEY)
    }
    return [
        metrics_and_plots_writer.MetricsAndPlotsWriter(
            eval_shared_model=eval_shared_models[0], output_paths=output_paths)
    ]
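# A minimal sketch, not from the library source, assuming eval_config and
# eval_shared_model are constructed as in the surrounding examples: the
# writers returned here are what WriteResults (used further below) fans the
# evaluation output into.
def _example_default_writers(
        eval_config: config.EvalConfig,
        eval_shared_model: types.EvalSharedModel) -> List[writer.Writer]:
    return default_writers(eval_config=eval_config,
                           eval_shared_models=[eval_shared_model])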
 def testRunModelAnalysisForCSVText(self):
   model_location = self._exportEvalSavedModel(
       csv_linear_classifier.simple_csv_linear_classifier)
   examples = [
       '3.0,english,1.0', '3.0,chinese,0.0', '4.0,english,1.0',
       '5.0,chinese,1.0'
   ]
   data_location = self._writeCSVToTextFile(examples)
   eval_config = config.EvalConfig(
       input_data_specs=[
           config.InputDataSpec(location=data_location, file_format='text')
       ],
       model_specs=[config.ModelSpec(location=model_location)],
       output_data_specs=[
           config.OutputDataSpec(default_location=self._getTempDir())
       ])
   eval_result = model_eval_lib.run_model_analysis(
       eval_config=eval_config,
       eval_shared_models=[
           model_eval_lib.default_eval_shared_model(
               eval_saved_model_path=model_location)
       ])
   # We only check some of the metrics to ensure that the end-to-end
   # pipeline works.
   expected = {
       (): {
           'accuracy': {
               'doubleValue': 0.75
           },
           metric_keys.EXAMPLE_COUNT: {
               'doubleValue': 4.0
           }
       }
   }
   self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
 def testSerializeDeserializeEvalConfig(self):
     output_path = self._getTempDir()
     options = config.Options()
     options.compute_confidence_intervals.value = False
     options.k_anonymization_count.value = 1
     eval_config = config.EvalConfig(
         input_data_specs=[config.InputDataSpec(location='/path/to/data')],
         model_specs=[config.ModelSpec(location='/path/to/model')],
         output_data_specs=[
             config.OutputDataSpec(default_location=output_path)
         ],
         slicing_specs=[
             config.SlicingSpec(feature_keys=['country'],
                                feature_values={
                                    'age': '5',
                                    'gender': 'f'
                                }),
             config.SlicingSpec(feature_keys=['interest'],
                                feature_values={
                                    'age': '6',
                                    'gender': 'm'
                                })
         ],
         options=options)
     with tf.io.gfile.GFile(os.path.join(output_path, 'eval_config.json'),
                            'w') as f:
         f.write(model_eval_lib._serialize_eval_config(eval_config))
     got_eval_config = model_eval_lib.load_eval_config(output_path)
     self.assertEqual(eval_config, got_eval_config)
    def testRunModelAnalysisWithMultiplePlots(self):
        model_location = self._exportEvalSavedModel(
            fixed_prediction_estimator.simple_fixed_prediction_estimator)
        examples = [
            self._makeExample(prediction=0.0, label=1.0),
            self._makeExample(prediction=0.7, label=0.0),
            self._makeExample(prediction=0.8, label=1.0),
            self._makeExample(prediction=1.0, label=1.0),
            self._makeExample(prediction=1.0, label=1.0)
        ]
        data_location = self._writeTFExamplesToTFRecords(examples)
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec(location=data_location)],
            model_specs=[config.ModelSpec(location=model_location)],
            output_data_specs=[
                config.OutputDataSpec(default_location=self._getTempDir())
            ])
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=model_location,
            add_metrics_callbacks=[
                post_export_metrics.auc_plots(),
                post_export_metrics.auc_plots(metric_tag='test')
            ])
        eval_result = model_eval_lib.run_model_analysis(
            eval_config=eval_config, eval_shared_models=[eval_shared_model])

        # We only check some of the metrics to ensure that the end-to-end
        # pipeline works.
        expected_metrics = {
            (): {
                metric_keys.EXAMPLE_COUNT: {
                    'doubleValue': 5.0
                },
            }
        }
        expected_matrix = {
            'threshold': 0.8,
            'falseNegatives': 2.0,
            'trueNegatives': 1.0,
            'truePositives': 2.0,
            'precision': 1.0,
            'recall': 0.5
        }
        self.assertMetricsAlmostEqual(eval_result.slicing_metrics,
                                      expected_metrics)
        self.assertEqual(len(eval_result.plots), 1)
        slice_key, plots = eval_result.plots[0]
        self.assertEqual((), slice_key)
        tf.compat.v1.logging.info(plots.keys())
        self.assertDictElementsAlmostEqual(
            plots['']['']['post_export_metrics']['confusionMatrixAtThresholds']
            ['matrices'][8001], expected_matrix)
        self.assertDictElementsAlmostEqual(
            plots['']['']['post_export_metrics/test']
            ['confusionMatrixAtThresholds']['matrices'][8001], expected_matrix)
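        # Note on the nesting above: plot values are keyed as
        # plots[output_name][sub_key][metric_group]. The empty-string keys are
        # the default single-output, no-sub-key case, and the
        # 'post_export_metrics' vs 'post_export_metrics/test' groups come from
        # the metric_tag passed to auc_plots().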
    def _runTestWithCustomCheck(self,
                                examples,
                                eval_export_dir,
                                metrics_callbacks,
                                slice_spec=None,
                                custom_metrics_check=None,
                                custom_plots_check=None,
                                custom_result_check=None):
        # make sure we are doing some checks
        self.assertTrue(custom_metrics_check is not None
                        or custom_plots_check is not None
                        or custom_result_check is not None)
        serialized_examples = [ex.SerializeToString() for ex in examples]
        slicing_specs = None
        if slice_spec:
            slicing_specs = [s.to_proto() for s in slice_spec]
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec()],
            model_specs=[config.ModelSpec(location=eval_export_dir)],
            output_data_specs=[config.OutputDataSpec()],
            slicing_specs=slicing_specs)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=eval_export_dir,
            add_metrics_callbacks=metrics_callbacks)
        extractors = model_eval_lib.default_extractors(
            eval_config=eval_config, eval_shared_models=[eval_shared_model])
        with beam.Pipeline() as pipeline:
            (metrics, plots), _ = (
                pipeline
                | 'Create' >> beam.Create(serialized_examples)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
                | 'ComputeMetricsAndPlots' >>
                metrics_and_plots_evaluator.ComputeMetricsAndPlots(
                    eval_shared_model=eval_shared_model,
                    compute_confidence_intervals=(
                        self.compute_confidence_intervals),
                    random_seed_for_testing=self.deterministic_test_seed))
            if custom_metrics_check is not None:
                util.assert_that(metrics,
                                 custom_metrics_check,
                                 label='metrics')
            if custom_plots_check is not None:
                util.assert_that(plots, custom_plots_check, label='plot')

        # Only re-run the pipeline when the caller needs the PipelineResult.
        if custom_result_check is not None:
            result = pipeline.run()
            custom_result_check(result)
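    # A hedged illustration, not part of the original test suite: a typical
    # custom_metrics_check callback mirrors the check_metrics helpers defined
    # later in this file and receives the list of (slice_key, metrics_dict)
    # pairs produced by ComputeMetricsAndPlots. The 'average_loss' key is an
    # assumption about the model under test.
    def _example_metrics_check(self, got):
        try:
            (slice_key, value) = got[0]
            self.assertEqual((), slice_key)
            self.assertIn('average_loss', value)
        except AssertionError as err:
            raise util.BeamAssertException(err)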
 def testSerializeDeserializeLegacyEvalConfig(self):
     output_path = self._getTempDir()
     old_config = LegacyConfig(
         model_location='/path/to/model',
         data_location='/path/to/data',
         slice_spec=[
             slicer.SingleSliceSpec(columns=['country'],
                                    features=[('age', 5), ('gender', 'f')]),
             slicer.SingleSliceSpec(columns=['interest'],
                                    features=[('age', 6), ('gender', 'm')])
         ],
         example_count_metric_key=None,
         example_weight_metric_key='key',
         compute_confidence_intervals=False,
         k_anonymization_count=1)
     final_dict = {}
     final_dict['tfma_version'] = tfma_version.VERSION_STRING
     final_dict['eval_config'] = old_config
     with tf.io.TFRecordWriter(os.path.join(output_path,
                                            'eval_config')) as w:
         w.write(pickle.dumps(final_dict))
     got_eval_config = model_eval_lib.load_eval_config(output_path)
     options = config.Options()
     options.compute_confidence_intervals.value = (
         old_config.compute_confidence_intervals)
     options.k_anonymization_count.value = old_config.k_anonymization_count
     eval_config = config.EvalConfig(
         input_data_specs=[
             config.InputDataSpec(location=old_config.data_location)
         ],
         model_specs=[config.ModelSpec(location=old_config.model_location)],
         output_data_specs=[
             config.OutputDataSpec(default_location=output_path)
         ],
         slicing_specs=[
             config.SlicingSpec(feature_keys=['country'],
                                feature_values={
                                    'age': '5',
                                    'gender': 'f'
                                }),
             config.SlicingSpec(feature_keys=['interest'],
                                feature_values={
                                    'age': '6',
                                    'gender': 'm'
                                })
         ],
         options=options)
     self.assertEqual(eval_config, got_eval_config)
def single_model_analysis(
        model_location: Text,
        data_location: Text,
        output_path: Optional[Text] = None,
        slice_spec: Optional[List[slicer.SingleSliceSpec]] = None
) -> EvalResult:
    """Run model analysis for a single model on a single data set.

  This is a convenience wrapper around run_model_analysis for a single model
  with a single data set. For more complex use cases, use
  tfma.run_model_analysis.

  Args:
    model_location: Path to the export eval saved model.
    data_location: The location of the data files.
    output_path: The directory to output metrics and results to. If None, we use
      a temporary directory.
    slice_spec: A list of tfma.slicer.SingleSliceSpec.

  Returns:
    An EvalResult that can be used with the TFMA visualization functions.
  """
    # Get working_dir ready.
    if output_path is None:
        output_path = tempfile.mkdtemp()
    if not tf.io.gfile.exists(output_path):
        tf.io.gfile.makedirs(output_path)

    eval_config = config.EvalConfig(
        input_data_specs=[config.InputDataSpec(location=data_location)],
        model_specs=[config.ModelSpec(location=model_location)],
        output_data_specs=[
            config.OutputDataSpec(default_location=output_path)
        ],
        slicing_specs=(
            [s.to_proto() for s in slice_spec] if slice_spec else None))

    return run_model_analysis(
        eval_config=eval_config,
        eval_shared_models=[
            default_eval_shared_model(eval_saved_model_path=model_location)
        ])
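# A minimal usage sketch with placeholder paths (not from the source): a
# single-model, single-dataset run whose EvalResult can be passed to the TFMA
# visualization helpers.
def _example_single_model_analysis() -> EvalResult:
    return single_model_analysis(
        model_location='/path/to/eval_saved_model',
        data_location='/path/to/examples.tfrecord',
        slice_spec=[slicer.SingleSliceSpec(columns=['language'])])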
  def testNoConstructFn(self):
    model_location = self._exportEvalSavedModel(
        linear_classifier.simple_linear_classifier)
    examples = [self._makeExample(age=3.0, language='english', label=1.0)]
    data_location = self._writeTFExamplesToTFRecords(examples)
    eval_config = config.EvalConfig(
        input_data_specs=[config.InputDataSpec(location=data_location)],
        model_specs=[config.ModelSpec(location=model_location)],
        output_data_specs=[
            config.OutputDataSpec(default_location=self._getTempDir())
        ])
    # No construct_fn should fail when Beam attempts to call the construct_fn.
    eval_shared_model = types.EvalSharedModel(model_path=model_location)
    with self.assertRaisesRegexp(AttributeError,
                                 '\'NoneType\' object has no attribute'):
      model_eval_lib.run_model_analysis(
          eval_config=eval_config, eval_shared_models=[eval_shared_model])

    # Using the default_eval_shared_model should pass as it has a construct_fn.
    eval_shared_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=model_location)
    model_eval_lib.run_model_analysis(
        eval_config=eval_config, eval_shared_models=[eval_shared_model])
    def assertGeneralMetricsComputedWithBeamAre(
            self, eval_saved_model_path: Text,
            examples_pcollection: beam.pvalue.PCollection,
            slice_spec: List[slicer.SingleSliceSpec],
            add_metrics_callbacks: List[types.AddMetricsCallbackType],
            expected_slice_metrics: Dict[Any, Dict[Text, Any]]):
        """Checks metrics computed using Beam.

    A more general version of assertMetricsComputedWithBeamAre. Note that the
    caller is responsible for setting up and running the Beam pipeline.

    Example usage:
      def add_metrics(features, predictions, labels):
        metric_ops = {
          'mse': tf.metrics.mean_squared_error(labels, predictions['logits']),
          'mae': tf.metrics.mean_absolute_error(labels, predictions['logits']),
        }
        return metric_ops

      with beam.Pipeline() as pipeline:
        expected_slice_metrics = {
            (): {
              'mae': 0.1,
              'mse': 0.2,
              tfma.post_export_metrics.metric_keys.AUC:
                tfma.test.BoundedValue(lower_bound=0.5)
            },
            (('age', 10),): {
              'mae': 0.2,
              'mse': 0.3,
              tfma.post_export_metrics.metric_keys.AUC:
                tfma.test.BoundedValue(lower_bound=0.5)
            },
        }
        examples = pipeline | 'ReadExamples' >> beam.io.ReadFromTFRecord(path)
        self.assertGeneralMetricsComputedWithBeamAre(
          eval_saved_model_path=path,
          examples_pcollection=examples,
          slice_spec=[tfma.slicer.SingleSliceSpec(),
                      tfma.slicer.SingleSliceSpec(columns=['age'])],
          add_metrics_callbacks=[
            add_metrics, tfma.post_export_metrics.auc()],
          expected_slice_metrics=expected_slice_metrics)

    Args:
      eval_saved_model_path: Path to the directory containing the
        EvalSavedModel.
      examples_pcollection: A PCollection of serialized example bytes.
      slice_spec: List of slice specifications.
      add_metrics_callbacks: Callbacks for adding additional metrics.
      expected_slice_metrics: Dictionary of dictionaries describing the expected
        metrics for each slice. The outer dictionary maps slice keys to the
        expected metrics for that slice.
    """
        def check_metrics(got):
            """Check metrics callback."""
            try:
                slices = {}
                for slice_key, value in got:
                    slices[slice_key] = value
                self.assertItemsEqual(list(slices.keys()),
                                      list(expected_slice_metrics.keys()))
                for slice_key, expected_metrics in expected_slice_metrics.items(
                ):
                    self.assertDictElementsWithinBounds(
                        got_values_dict=slices[slice_key],
                        expected_values_dict=expected_metrics)
            except AssertionError as err:
                raise beam_util.BeamAssertException(err)

        slicing_specs = None
        if slice_spec:
            slicing_specs = [s.to_proto() for s in slice_spec]
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec()],
            model_specs=[config.ModelSpec(location=eval_saved_model_path)],
            output_data_specs=[config.OutputDataSpec()],
            slicing_specs=slicing_specs)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=eval_saved_model_path,
            add_metrics_callbacks=add_metrics_callbacks)
        extractors = model_eval_lib.default_extractors(
            eval_config=eval_config, eval_shared_model=eval_shared_model)

        # pylint: disable=no-value-for-parameter
        (metrics,
         _), _ = (examples_pcollection
                  | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                  | 'Extract' >> Extract(extractors=extractors)
                  | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator.
                  ComputeMetricsAndPlots(eval_shared_model=eval_shared_model))
        # pylint: enable=no-value-for-parameter

        beam_util.assert_that(metrics, check_metrics)
 def testRunModelAnalysisWithUncertainty(self):
     model_location = self._exportEvalSavedModel(
         linear_classifier.simple_linear_classifier)
     examples = [
         self._makeExample(age=3.0, language='english', label=1.0),
         self._makeExample(age=3.0, language='chinese', label=0.0),
         self._makeExample(age=4.0, language='english', label=1.0),
         self._makeExample(age=5.0, language='chinese', label=1.0),
         self._makeExample(age=5.0, language='hindi', label=1.0)
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)
     slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
     options = config.Options()
     options.compute_confidence_intervals.value = True
     options.k_anonymization_count.value = 2
     eval_config = config.EvalConfig(
         input_data_specs=[config.InputDataSpec(location=data_location)],
         model_specs=[config.ModelSpec(location=model_location)],
         output_data_specs=[
             config.OutputDataSpec(default_location=self._getTempDir())
         ],
         slicing_specs=slicing_specs,
         options=options)
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config,
         eval_shared_models=[
             model_eval_lib.default_eval_shared_model(
                 eval_saved_model_path=model_location,
                 example_weight_key='age')
         ])
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     expected = {
         (('language', 'hindi'), ): {
             u'__ERROR__': {
                 'debugMessage':
                 u'Example count for this slice key is lower than the '
                 u'minimum required value: 2. No data is aggregated for '
                 u'this slice.'
             },
         },
         (('language', 'chinese'), ): {
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 8.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 2.0
             },
         },
         (('language', 'english'), ): {
             'accuracy': {
                 'boundedValue': {
                     'value': 1.0,
                     'lowerBound': 1.0,
                     'upperBound': 1.0,
                     'methodology': 'POISSON_BOOTSTRAP'
                 }
             },
             'my_mean_label': {
                 'boundedValue': {
                     'value': 1.0,
                     'lowerBound': 1.0,
                     'upperBound': 1.0,
                     'methodology': 'POISSON_BOOTSTRAP'
                 }
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 7.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 2.0
             },
         }
     }
     self.assertEqual(eval_result.config.model_specs[0].location,
                      model_location.decode())
     self.assertEqual(eval_result.config.input_data_specs[0].location,
                      data_location)
     self.assertEqual(eval_result.config.slicing_specs[0],
                      config.SlicingSpec(feature_keys=['language']))
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
     self.assertFalse(eval_result.plots)
def ExtractEvaluateAndWriteResults(  # pylint: disable=invalid-name
        examples: beam.pvalue.PCollection,
        eval_shared_model: Optional[types.EvalSharedModel] = None,
        eval_shared_models: Optional[List[types.EvalSharedModel]] = None,
        eval_config: Optional[config.EvalConfig] = None,
        extractors: Optional[List[extractor.Extractor]] = None,
        evaluators: Optional[List[evaluator.Evaluator]] = None,
        writers: Optional[List[writer.Writer]] = None,
        output_path: Optional[Text] = None,
        display_only_data_location: Optional[Text] = None,
        slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
        desired_batch_size: Optional[int] = None,
        write_config: Optional[bool] = True,
        compute_confidence_intervals: Optional[bool] = False,
        k_anonymization_count: int = 1) -> beam.pvalue.PDone:
    """PTransform for performing extraction, evaluation, and writing results.

  Users who want to construct their own Beam pipelines instead of using the
  lightweight run_model_analysis functions should use this PTransform.

  Example usage:
    eval_config = tfma.EvalConfig(
        input_data_specs=[tfma.InputDataSpec(location=data_location)],
        model_specs=[tfma.ModelSpec(location=model_location)],
        output_data_specs=[tfma.OutputDataSpec(default_location=output_path)],
        slicing_specs=[...],
        metrics_specs=[...])
    eval_shared_model = tfma.default_eval_shared_model(
        eval_saved_model_path=model_location,
        add_metrics_callbacks=[...])
    with beam.Pipeline(runner=...) as p:
      _ = (p
           | 'ReadData' >> beam.io.ReadFromTFRecord(data_location)
           | 'ExtractEvaluateAndWriteResults' >>
           tfma.ExtractEvaluateAndWriteResults(
               eval_config=eval_config,
               eval_shared_models=[eval_shared_model],
               ...))
    result = tfma.load_eval_result(output_path=output_path)
    tfma.view.render_slicing_metrics(result)

  Note that the exact serialization format is an internal implementation detail
  and subject to change. Users should only use the TFMA functions to write and
  read the results.

  Args:
    examples: PCollection of input examples. Can be any format the model accepts
      (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_shared_model: Shared model (single-model evaluation).
    eval_shared_models: Shared models (multi-model evaluation).
    eval_config: Eval config.
    extractors: Optional list of Extractors to apply to Extracts. Typically
      these will be added by calling the default_extractors function. If no
      extractors are provided, default_extractors (non-materialized) will be
      used.
    evaluators: Optional list of Evaluators for evaluating Extracts. Typically
      these will be added by calling the default_evaluators function. If no
      evaluators are provided, default_evaluators will be used.
    writers: Optional list of Writers for writing Evaluation output. Typically
      these will be added by calling the default_writers function. If no writers
      are provided, default_writers will be used.
    output_path: Deprecated (use EvalConfig).
    display_only_data_location: Deprecated (use EvalConfig).
    slice_spec: Deprecated (use EvalConfig).
    desired_batch_size: Deprecated (use EvalConfig).
    write_config: Deprecated (use EvalConfig).
    compute_confidence_intervals: Deprecated (use EvalConfig).
    k_anonymization_count: Deprecated (use EvalConfig).

  Raises:
    ValueError: If matching Extractor not found for an Evaluator.

  Returns:
    PDone.
  """
    if eval_shared_model is not None:
        eval_shared_models = [eval_shared_model]

    if eval_config is None:
        data_location = '<user provided PCollection>'
        if display_only_data_location is not None:
            data_location = display_only_data_location
        disabled_outputs = None
        if not write_config:
            disabled_outputs = [_EVAL_CONFIG_FILE]
        model_specs = []
        for m in eval_shared_models:
            example_weight_key = m.example_weight_key
            example_weight_keys = {}
            if example_weight_key and isinstance(example_weight_key, dict):
                example_weight_keys = example_weight_key
                example_weight_key = ''
            model_specs.append(
                config.ModelSpec(location=m.model_path,
                                 example_weight_key=example_weight_key,
                                 example_weight_keys=example_weight_keys))
        slicing_specs = None
        if slice_spec:
            slicing_specs = [s.to_proto() for s in slice_spec]
        options = config.Options()
        options.compute_confidence_intervals.value = compute_confidence_intervals
        options.k_anonymization_count.value = k_anonymization_count
        if desired_batch_size:
            options.desired_batch_size.value = desired_batch_size
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec(location=data_location)],
            model_specs=model_specs,
            output_data_specs=[
                config.OutputDataSpec(default_location=output_path,
                                      disabled_outputs=disabled_outputs)
            ],
            slicing_specs=slicing_specs,
            options=options)

    if not extractors:
        extractors = default_extractors(eval_config=eval_config,
                                        eval_shared_models=eval_shared_models,
                                        materialize=False)

    if not evaluators:
        evaluators = default_evaluators(eval_config=eval_config,
                                        eval_shared_models=eval_shared_models)

    for v in evaluators:
        evaluator.verify_evaluator(v, extractors)

    if not writers:
        writers = default_writers(eval_config=eval_config,
                                  eval_shared_models=eval_shared_models)

    # pylint: disable=no-value-for-parameter
    _ = (examples
         | 'InputsToExtracts' >> InputsToExtracts()
         | 'ExtractAndEvaluate' >> ExtractAndEvaluate(extractors=extractors,
                                                      evaluators=evaluators)
         | 'WriteResults' >> WriteResults(writers=writers))

    # TODO(b/141016373): Add support for multiple models.
    if _EVAL_CONFIG_FILE not in eval_config.output_data_specs[
            0].disabled_outputs:
        _ = examples.pipeline | WriteEvalConfig(eval_config)
    # pylint: enable=no-value-for-parameter

    return beam.pvalue.PDone(examples.pipeline)
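# A hedged sketch, assuming eval_config and eval_shared_model are built as in
# the docstring example above: supplying custom writers while keeping the
# default extractors and evaluators, mirroring the MetricsAndPlotsWriter usage
# in the tests further below. The metrics/plots paths are placeholders.
def _example_custom_writers(
        examples: beam.pvalue.PCollection,
        eval_config: config.EvalConfig,
        eval_shared_model: types.EvalSharedModel) -> beam.pvalue.PDone:
    writers = [
        metrics_and_plots_writer.MetricsAndPlotsWriter(
            eval_shared_model=eval_shared_model,
            output_paths={
                constants.METRICS_KEY: '/path/to/metrics',
                constants.PLOTS_KEY: '/path/to/plots'
            })
    ]
    return (examples
            | 'ExtractEvaluateAndWriteResults' >>
            ExtractEvaluateAndWriteResults(
                eval_config=eval_config,
                eval_shared_models=[eval_shared_model],
                writers=writers))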
    def testRunModelAnalysisWithQueryBasedMetrics(self):
        input_layer = tf.keras.layers.Input(shape=(1, ), name='age')
        output_layer = tf.keras.layers.Dense(
            1, activation=tf.nn.sigmoid)(input_layer)
        model = tf.keras.models.Model(input_layer, output_layer)
        model.compile(optimizer=tf.keras.optimizers.Adam(lr=.001),
                      loss=tf.keras.losses.binary_crossentropy)

        features = {'age': [[20.0]]}
        labels = [[1]]
        example_weights = [1.0]
        dataset = tf.data.Dataset.from_tensor_slices(
            (features, labels, example_weights))
        dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
        model.fit(dataset, steps_per_epoch=1)

        model_location = os.path.join(self._getTempDir(), 'export_dir')
        model.save(model_location, save_format='tf')

        examples = [
            self._makeExample(age=3.0, language='english', label=1.0),
            self._makeExample(age=5.0, language='chinese', label=0.0),
            self._makeExample(age=3.0, language='english', label=0.0),
            self._makeExample(age=5.0, language='chinese', label=1.0)
        ]
        data_location = self._writeTFExamplesToTFRecords(examples)
        slicing_specs = [config.SlicingSpec()]
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec(location=data_location)],
            model_specs=[
                config.ModelSpec(location=model_location, label_key='label')
            ],
            output_data_specs=[
                config.OutputDataSpec(default_location=self._getTempDir())
            ],
            slicing_specs=slicing_specs,
            metrics_specs=metric_specs.specs_from_metrics(
                [ndcg.NDCG(gain_key='age', name='ndcg')],
                binarize=config.BinarizationOptions(top_k_list=[1]),
                query_key='language'))
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=model_location,
            tags=[tf.saved_model.SERVING])
        eval_result = model_eval_lib.run_model_analysis(
            eval_config=eval_config,
            eval_shared_models=[eval_shared_model],
            evaluators=[
                metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                    eval_config=eval_config,
                    eval_shared_models=[eval_shared_model])
            ])

        self.assertEqual(eval_result.config.model_specs[0].location,
                         model_location)
        self.assertEqual(eval_result.config.input_data_specs[0].location,
                         data_location)
        self.assertLen(eval_result.slicing_metrics, 1)
        got_slice_key, got_metrics = eval_result.slicing_metrics[0]
        self.assertEqual(got_slice_key, ())
        self.assertIn('', got_metrics)  # output_name
        got_metrics = got_metrics['']
        expected_metrics = {
            '': {
                'example_count': True,
                'weighted_example_count': True,
            },
            'topK:1': {
                'ndcg': True,
            },
        }
        for group in expected_metrics:
            self.assertIn(group, got_metrics)
            for k in expected_metrics[group]:
                self.assertIn(k, got_metrics[group])
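        # The 'topK:1' group above is the sub-key produced by
        # BinarizationOptions(top_k_list=[1]) in specs_from_metrics; the ''
        # group holds the unbinarized example_count and weighted_example_count.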
 def testRunModelAnalysisWithLegacyQueryExtractor(self):
     model_location = self._exportEvalSavedModel(
         linear_classifier.simple_linear_classifier)
     examples = [
         self._makeExample(age=3.0, language='english', label=1.0),
         self._makeExample(age=3.0, language='chinese', label=0.0),
         self._makeExample(age=4.0, language='english', label=0.0),
         self._makeExample(age=5.0, language='chinese', label=1.0)
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)
     slicing_specs = [config.SlicingSpec()]
     eval_config = config.EvalConfig(
         input_data_specs=[config.InputDataSpec(location=data_location)],
         model_specs=[config.ModelSpec(location=model_location)],
         output_data_specs=[
             config.OutputDataSpec(default_location=self._getTempDir())
         ],
         slicing_specs=slicing_specs)
     eval_shared_model = model_eval_lib.default_eval_shared_model(
         eval_saved_model_path=model_location, example_weight_key='age')
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config,
         eval_shared_models=[eval_shared_model],
         evaluators=[
             metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
                 eval_shared_model),
             query_based_metrics_evaluator.QueryBasedMetricsEvaluator(
                 query_id='language',
                 prediction_key='logistic',
                 combine_fns=[
                     query_statistics.QueryStatisticsCombineFn(),
                     legacy_ndcg.NdcgMetricCombineFn(at_vals=[1],
                                                     gain_key='label',
                                                     weight_key='')
                 ]),
         ])
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     expected = {
         (): {
             'post_export_metrics/total_queries': {
                 'doubleValue': 2.0
             },
             'post_export_metrics/min_documents': {
                 'doubleValue': 2.0
             },
             'post_export_metrics/max_documents': {
                 'doubleValue': 2.0
             },
             'post_export_metrics/total_documents': {
                 'doubleValue': 4.0
             },
             'post_export_metrics/ndcg@1': {
                 'doubleValue': 0.5
             },
             'post_export_metrics/example_weight': {
                 'doubleValue': 15.0
             },
             'post_export_metrics/example_count': {
                 'doubleValue': 4.0
             },
         }
     }
     self.assertEqual(eval_result.config.model_specs[0].location,
                      model_location.decode())
     self.assertEqual(eval_result.config.input_data_specs[0].location,
                      data_location)
     self.assertEqual(eval_result.config.slicing_specs[0],
                      config.SlicingSpec())
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
     self.assertFalse(eval_result.plots)
    def testRunModelAnalysisWithKerasModel(self):
        input_layer = tf.keras.layers.Input(shape=(28 * 28, ), name='data')
        output_layer = tf.keras.layers.Dense(
            10, activation=tf.nn.softmax)(input_layer)
        model = tf.keras.models.Model(input_layer, output_layer)
        model.compile(optimizer=tf.keras.optimizers.Adam(lr=.001),
                      loss=tf.keras.losses.categorical_crossentropy)

        features = {'data': [[0.0] * 28 * 28]}
        labels = [[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]]
        example_weights = [1.0]
        dataset = tf.data.Dataset.from_tensor_slices(
            (features, labels, example_weights))
        dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
        model.fit(dataset, steps_per_epoch=1)

        model_location = os.path.join(self._getTempDir(), 'export_dir')
        model.save(model_location, save_format='tf')

        examples = [
            self._makeExample(data=[0.0] * 28 * 28, label=1.0),
            self._makeExample(data=[1.0] * 28 * 28, label=5.0),
            self._makeExample(data=[1.0] * 28 * 28, label=9.0),
        ]
        data_location = self._writeTFExamplesToTFRecords(examples)
        metrics_spec = config.MetricsSpec()
        for metric in (tf.keras.metrics.AUC(), ):
            cfg = tf.keras.utils.serialize_keras_object(metric)
            metrics_spec.metrics.append(
                config.MetricConfig(class_name=cfg['class_name'],
                                    config=json.dumps(cfg['config'])))
        for class_id in (0, 5, 9):
            metrics_spec.binarize.class_ids.append(class_id)
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec(location=data_location)],
            model_specs=[
                config.ModelSpec(location=model_location, label_key='label')
            ],
            output_data_specs=[
                config.OutputDataSpec(default_location=self._getTempDir())
            ],
            metrics_specs=[metrics_spec])
        eval_result = model_eval_lib.run_model_analysis(
            eval_config=eval_config,
            eval_shared_models=[
                model_eval_lib.default_eval_shared_model(
                    eval_saved_model_path=model_location,
                    tags=[tf.saved_model.SERVING])
            ])
        self.assertEqual(eval_result.config.model_specs[0].location,
                         model_location)
        self.assertEqual(eval_result.config.input_data_specs[0].location,
                         data_location)
        self.assertLen(eval_result.slicing_metrics, 1)
        got_slice_key, got_metrics = eval_result.slicing_metrics[0]
        self.assertEqual(got_slice_key, ())
        self.assertIn('', got_metrics)  # output_name
        got_metrics = got_metrics['']
        expected_metrics = {
            'classId:0': {
                'auc': True,
            },
            'classId:5': {
                'auc': True,
            },
            'classId:9': {
                'auc': True,
            },
        }
        for class_id in expected_metrics:
            self.assertIn(class_id, got_metrics)
            for k in expected_metrics[class_id]:
                self.assertIn(k, got_metrics[class_id])
 def testRunModelAnalysisExtraFieldsPlusFeatureExtraction(self):
     model_location = self._exportEvalSavedModel(
         linear_classifier.simple_linear_classifier)
     examples = [
         self._makeExample(age=3.0,
                           language='english',
                           label=1.0,
                           my_slice='a'),
         self._makeExample(age=3.0,
                           language='chinese',
                           label=0.0,
                           my_slice='a'),
         self._makeExample(age=4.0,
                           language='english',
                           label=1.0,
                           my_slice='b'),
         self._makeExample(age=5.0,
                           language='chinese',
                           label=1.0,
                           my_slice='c'),
         self._makeExample(age=5.0, language='hindi', label=1.0)
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)
     slicing_specs = [config.SlicingSpec(feature_keys=['my_slice'])]
     eval_config = config.EvalConfig(
         input_data_specs=[config.InputDataSpec(location=data_location)],
         model_specs=[config.ModelSpec(location=model_location)],
         output_data_specs=[
             config.OutputDataSpec(default_location=self._getTempDir())
         ],
         slicing_specs=slicing_specs)
     eval_shared_model = model_eval_lib.default_eval_shared_model(
         eval_saved_model_path=model_location, example_weight_key='age')
     slice_spec = [slicer.SingleSliceSpec(spec=slicing_specs[0])]
     extractors_with_feature_extraction = [
         predict_extractor.PredictExtractor(eval_shared_model,
                                            desired_batch_size=3,
                                            materialize=False),
         feature_extractor.FeatureExtractor(
             extract_source=constants.INPUT_KEY,
             extract_dest=constants.FEATURES_PREDICTIONS_LABELS_KEY),
         slice_key_extractor.SliceKeyExtractor(slice_spec,
                                               materialize=False)
     ]
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config,
         eval_shared_models=[
             model_eval_lib.default_eval_shared_model(
                 eval_saved_model_path=model_location,
                 example_weight_key='age')
         ],
         extractors=extractors_with_feature_extraction)
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     expected = {
         (('my_slice', 'a'), ): {
             'accuracy': {
                 'doubleValue': 1.0
             },
             'my_mean_label': {
                 'doubleValue': 0.5
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 6.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 2.0
             },
         },
         (('my_slice', 'b'), ): {
             'accuracy': {
                 'doubleValue': 1.0
             },
             'my_mean_label': {
                 'doubleValue': 1.0
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 4.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 1.0
             },
         },
         (('my_slice', 'c'), ): {
             'accuracy': {
                 'doubleValue': 0.0
             },
             'my_mean_label': {
                 'doubleValue': 1.0
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 5.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 1.0
             },
         },
     }
     self.assertEqual(eval_result.config.model_specs[0].location,
                      model_location.decode())
     self.assertEqual(eval_result.config.input_data_specs[0].location,
                      data_location)
     self.assertEqual(eval_result.config.slicing_specs[0],
                      config.SlicingSpec(feature_keys=['my_slice']))
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
     self.assertFalse(eval_result.plots)
def run_model_analysis(
        eval_shared_model: Optional[types.EvalSharedModel] = None,
        eval_shared_models: Optional[List[types.EvalSharedModel]] = None,
        eval_config: Optional[config.EvalConfig] = None,
        extractors: Optional[List[extractor.Extractor]] = None,
        evaluators: Optional[List[evaluator.Evaluator]] = None,
        writers: Optional[List[writer.Writer]] = None,
        pipeline_options: Optional[Any] = None,
        data_location: Optional[Text] = None,
        file_format: Optional[Text] = 'tfrecords',
        slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
        output_path: Optional[Text] = None,
        write_config: Optional[bool] = True,
        desired_batch_size: Optional[int] = None,
        compute_confidence_intervals: Optional[bool] = False,
        k_anonymization_count: int = 1) -> EvalResult:
    """Runs TensorFlow model analysis.

  It runs a Beam pipeline to compute the slicing metrics exported in TensorFlow
  Eval SavedModel and returns the results.

  This is a simplified API for users who want to quickly get something running
  locally. Users who wish to create their own Beam pipelines can use the
  Evaluate PTransform instead.

  Args:
    eval_shared_model: Shared model (single-model evaluation).
    eval_shared_models: Shared models (multi-model evaluation).
    eval_config: Eval config.
    extractors: Optional list of Extractors to apply to Extracts. Typically
      these will be added by calling the default_extractors function. If no
      extractors are provided, default_extractors (non-materialized) will be
      used.
    evaluators: Optional list of Evaluators for evaluating Extracts. Typically
      these will be added by calling the default_evaluators function. If no
      evaluators are provided, default_evaluators will be used.
    writers: Optional list of Writers for writing Evaluation output. Typically
      these will be added by calling the default_writers function. If no writers
      are provided, default_writers will be used.
    pipeline_options: Optional arguments to run the Pipeline, for instance
      whether to run directly.
    data_location: Deprecated (use EvalConfig).
    file_format: Deprecated (use EvalConfig).
    slice_spec: Deprecated (use EvalConfig).
    output_path: Deprecated (use EvalConfig).
    write_config: Deprecated (use EvalConfig).
    desired_batch_size: Deprecated (use EvalConfig).
    compute_confidence_intervals: Deprecated (use EvalConfig).
    k_anonymization_count: Deprecated (use EvalConfig).

  Returns:
    An EvalResult that can be used with the TFMA visualization functions.

  Raises:
    ValueError: If the file_format is unknown to us.
  """
    _assert_tensorflow_version()

    if eval_shared_model is not None:
        eval_shared_models = [eval_shared_model]

    if eval_config is None:
        if output_path is None:
            output_path = tempfile.mkdtemp()
        if not tf.io.gfile.exists(output_path):
            tf.io.gfile.makedirs(output_path)
        disabled_outputs = None
        if not write_config:
            disabled_outputs = [_EVAL_CONFIG_FILE]
        model_specs = []
        for m in eval_shared_models:
            example_weight_key = m.example_weight_key
            example_weight_keys = {}
            if example_weight_key and isinstance(example_weight_key, dict):
                example_weight_keys = example_weight_key
                example_weight_key = ''
            model_specs.append(
                config.ModelSpec(location=m.model_path,
                                 example_weight_key=example_weight_key,
                                 example_weight_keys=example_weight_keys))
        slicing_specs = None
        if slice_spec:
            slicing_specs = [s.to_proto() for s in slice_spec]
        options = config.Options()
        options.compute_confidence_intervals.value = compute_confidence_intervals
        options.k_anonymization_count.value = k_anonymization_count
        if desired_batch_size:
            options.desired_batch_size.value = desired_batch_size
        eval_config = config.EvalConfig(
            input_data_specs=[
                config.InputDataSpec(location=data_location,
                                     file_format=file_format)
            ],
            model_specs=model_specs,
            output_data_specs=[
                config.OutputDataSpec(default_location=output_path,
                                      disabled_outputs=disabled_outputs)
            ],
            slicing_specs=slicing_specs,
            options=options)

    if len(eval_config.input_data_specs) != 1:
        raise NotImplementedError(
            'multiple input_data_specs are not yet supported.')
    if len(eval_config.model_specs) != 1:
        raise NotImplementedError(
            'multiple model_specs are not yet supported.')
    if len(eval_config.output_data_specs) != 1:
        raise NotImplementedError(
            'multiple output_data_specs are not yet supported.')

    with beam.Pipeline(options=pipeline_options) as p:
        if (not eval_config.input_data_specs[0].file_format
                or eval_config.input_data_specs[0].file_format == 'tfrecords'):
            data = p | 'ReadFromTFRecord' >> beam.io.ReadFromTFRecord(
                file_pattern=eval_config.input_data_specs[0].location,
                compression_type=beam.io.filesystem.CompressionTypes.AUTO)
        elif eval_config.input_data_specs[0].file_format == 'text':
            data = p | 'ReadFromText' >> beam.io.textio.ReadFromText(
                eval_config.input_data_specs[0].location)
        else:
            raise ValueError('unknown file_format: {}'.format(
                eval_config.input_data_specs[0].file_format))

        # pylint: disable=no-value-for-parameter
        _ = (
            data
            | 'ExtractEvaluateAndWriteResults' >>
            ExtractEvaluateAndWriteResults(
                eval_config=eval_config,
                eval_shared_models=eval_shared_models,
                extractors=extractors,
                evaluators=evaluators,
                writers=writers))
        # pylint: enable=no-value-for-parameter

    # TODO(b/141016373): Add support for multiple models.
    return load_eval_result(eval_config.output_data_specs[0].default_location)
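# A minimal end-to-end sketch with placeholder paths (not from the source):
# build an EvalConfig, wrap the exported model with default_eval_shared_model,
# and run the analysis locally with the default extractors/evaluators/writers.
def _example_run_model_analysis() -> EvalResult:
    eval_config = config.EvalConfig(
        input_data_specs=[
            config.InputDataSpec(location='/path/to/examples.tfrecord')
        ],
        model_specs=[config.ModelSpec(location='/path/to/eval_saved_model')],
        output_data_specs=[
            config.OutputDataSpec(default_location='/path/to/output')
        ],
        slicing_specs=[config.SlicingSpec()])
    eval_shared_model = default_eval_shared_model(
        eval_saved_model_path='/path/to/eval_saved_model')
    return run_model_analysis(eval_config=eval_config,
                              eval_shared_models=[eval_shared_model])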
    def testWriteMetricsAndPlots(self):
        metrics_file = os.path.join(self._getTempDir(), 'metrics')
        plots_file = os.path.join(self._getTempDir(), 'plots')
        temp_eval_export_dir = os.path.join(self._getTempDir(),
                                            'eval_export_dir')

        _, eval_export_dir = (
            fixed_prediction_estimator.simple_fixed_prediction_estimator(
                None, temp_eval_export_dir))
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec()],
            model_specs=[config.ModelSpec()],
            output_data_specs=[
                config.OutputDataSpec(disabled_outputs=['eval_config.json'])
            ])
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=eval_export_dir,
            add_metrics_callbacks=[
                post_export_metrics.example_count(),
                post_export_metrics.calibration_plot_and_prediction_histogram(
                    num_buckets=2)
            ])
        extractors = [
            predict_extractor.PredictExtractor(eval_shared_model),
            slice_key_extractor.SliceKeyExtractor()
        ]
        evaluators = [
            metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
                eval_shared_model)
        ]
        output_paths = {
            constants.METRICS_KEY: metrics_file,
            constants.PLOTS_KEY: plots_file
        }
        writers = [
            metrics_and_plots_writer.MetricsAndPlotsWriter(
                eval_shared_model, output_paths)
        ]

        with beam.Pipeline() as pipeline:
            example1 = self._makeExample(prediction=0.0, label=1.0)
            example2 = self._makeExample(prediction=1.0, label=1.0)

            # pylint: disable=no-value-for-parameter
            _ = (pipeline
                 | 'Create' >> beam.Create([
                     example1.SerializeToString(),
                     example2.SerializeToString(),
                 ])
                 | 'ExtractEvaluateAndWriteResults' >>
                 model_eval_lib.ExtractEvaluateAndWriteResults(
                     eval_config=eval_config,
                     eval_shared_models=[eval_shared_model],
                     extractors=extractors,
                     evaluators=evaluators,
                     writers=writers))
            # pylint: enable=no-value-for-parameter

        expected_metrics_for_slice = text_format.Parse(
            """
        slice_key {}
        metrics {
          key: "average_loss"
          value {
            double_value {
              value: 0.5
            }
          }
        }
        metrics {
          key: "post_export_metrics/example_count"
          value {
            double_value {
              value: 2.0
            }
          }
        }
        """, metrics_for_slice_pb2.MetricsForSlice())

        metric_records = []
        for record in tf.compat.v1.python_io.tf_record_iterator(metrics_file):
            metric_records.append(
                metrics_for_slice_pb2.MetricsForSlice.FromString(record))
        self.assertEqual(1, len(metric_records),
                         'metrics: %s' % metric_records)
        self.assertProtoEquals(expected_metrics_for_slice, metric_records[0])

        expected_plots_for_slice = text_format.Parse(
            """
      slice_key {}
      plots {
        key: "post_export_metrics"
        value {
          calibration_histogram_buckets {
            buckets {
              lower_threshold_inclusive: -inf
              num_weighted_examples {}
              total_weighted_label {}
              total_weighted_refined_prediction {}
            }
            buckets {
              upper_threshold_exclusive: 0.5
              num_weighted_examples {
                value: 1.0
              }
              total_weighted_label {
                value: 1.0
              }
              total_weighted_refined_prediction {}
            }
            buckets {
              lower_threshold_inclusive: 0.5
              upper_threshold_exclusive: 1.0
              num_weighted_examples {
              }
              total_weighted_label {}
              total_weighted_refined_prediction {}
            }
            buckets {
              lower_threshold_inclusive: 1.0
              upper_threshold_exclusive: inf
              num_weighted_examples {
                value: 1.0
              }
              total_weighted_label {
                value: 1.0
              }
              total_weighted_refined_prediction {
                value: 1.0
              }
            }
         }
        }
      }
    """, metrics_for_slice_pb2.PlotsForSlice())

        plot_records = []
        for record in tf.compat.v1.python_io.tf_record_iterator(plots_file):
            plot_records.append(
                metrics_for_slice_pb2.PlotsForSlice.FromString(record))
        self.assertEqual(1, len(plot_records), 'plots: %s' % plot_records)
        self.assertProtoEquals(expected_plots_for_slice, plot_records[0])
    def assertMetricsComputedWithBeamAre(
        self,
        eval_saved_model_path: Text,
        serialized_examples: List[bytes],
        expected_metrics: Dict[Text, Any],
        add_metrics_callbacks: Optional[List[
            types.AddMetricsCallbackType]] = None):
        """Checks metrics computed using Beam.

    Metrics will be computed over all examples, without any slicing. If you
    want to provide your own PCollection (e.g. read a large number of examples
    from a file), if you want to check metrics over certain slices, or if you
    want to add additional post-export metrics, use the more general
    assertGeneralMetricsComputedWithBeamAre.

    Example usage:
      self.assertMetricsComputedWithBeamAre(
        eval_saved_model_path=path,
        serialized_examples=[self.makeExample(age=5, label=1.0),
                             self.makeExample(age=10, label=0.0)],
        expected_metrics={'average_loss': 0.1})

    Args:
      eval_saved_model_path: Path to the directory containing the
        EvalSavedModel.
      serialized_examples: List of serialized example bytes.
      expected_metrics: Dictionary of expected metric values.
      add_metrics_callbacks: Optional. Callbacks for adding additional metrics.
    """
        def check_metrics(got):
            """Check metrics callback."""
            try:
                self.assertEqual(
                    1, len(got),
                    'expecting metrics for exactly one slice, but got %d '
                    'slices instead. metrics were: %s' % (len(got), got))
                (slice_key, value) = got[0]
                self.assertEqual((), slice_key)
                self.assertDictElementsWithinBounds(
                    got_values_dict=value,
                    expected_values_dict=expected_metrics)
            except AssertionError as err:
                raise beam_util.BeamAssertException(err)

        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec()],
            model_specs=[config.ModelSpec(location=eval_saved_model_path)],
            output_data_specs=[config.OutputDataSpec()])
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=eval_saved_model_path,
            add_metrics_callbacks=add_metrics_callbacks)
        extractors = model_eval_lib.default_extractors(
            eval_config=eval_config, eval_shared_model=eval_shared_model)

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            (metrics, _), _ = (
                pipeline
                | 'CreateExamples' >> beam.Create(serialized_examples)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'Extract' >> Extract(extractors=extractors)
                | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator.
                ComputeMetricsAndPlots(eval_shared_model=eval_shared_model))
            # pylint: enable=no-value-for-parameter

            beam_util.assert_that(metrics, check_metrics)