def single_model_analysis(
        model_location: Text,
        data_location: Text,
        output_path: Optional[Text] = None,
        slice_spec: Optional[List[slicer.SingleSliceSpec]] = None
) -> EvalResult:
    """Run model analysis for a single model on a single data set.

    This is a convenience wrapper around run_model_analysis for a single model
    with a single data set. For more complex use cases, use
    tfma.run_model_analysis.

    Args:
      model_location: Path to the export eval saved model.
      data_location: The location of the data files.
      output_path: The directory to output metrics and results to. If None, we
        use a temporary directory. The directory is created if it does not
        exist yet.
      slice_spec: A list of tfma.slicer.SingleSliceSpec. If None or empty, the
        eval config is built without any slicing specs.

    Returns:
      An EvalResult that can be used with the TFMA visualization functions.
    """
    # Get working_dir ready.
    if output_path is None:
        output_path = tempfile.mkdtemp()
    if not tf.io.gfile.exists(output_path):
        tf.io.gfile.makedirs(output_path)

    # slice_spec defaults to None; fall back to an empty list so the default
    # call does not raise TypeError while iterating.
    eval_config = config.EvalConfig(
        slicing_specs=[s.to_proto() for s in slice_spec or []])

    return run_model_analysis(eval_config=eval_config,
                              eval_shared_model=default_eval_shared_model(
                                  eval_saved_model_path=model_location),
                              data_location=data_location,
                              output_path=output_path)  # pytype: disable=bad-return-type
 def testValidateMetricsChangeThresholdAbsoluteFail(self, slicing_specs,
                                                    slice_key):
   """Validation must fail when the diff misses the absolute threshold.

   Args:
     slicing_specs: Slicing specs for the eval config, or None. When None the
       threshold is also set directly on the metric config.
     slice_key: Slice key paired with the metric values being validated.
   """
   lower_is_better = config.GenericChangeThreshold(
       direction=config.MetricDirection.LOWER_IS_BETTER,
       absolute={'value': -1})
   threshold = config.MetricThreshold(change_threshold=lower_is_better)
   # Diff = 0 - .333 = -.333, which is not < -1, so validation must fail.
   mean_prediction = config.MetricConfig(
       class_name='MeanPrediction',
       threshold=threshold if slicing_specs is None else None,
       per_slice_thresholds=[
           config.PerSliceMetricThreshold(
               slicing_specs=slicing_specs, threshold=threshold)
       ])
   candidate_and_baseline = [
       config.ModelSpec(),
       config.ModelSpec(name='baseline', is_baseline=True)
   ]
   eval_config = config.EvalConfig(
       model_specs=candidate_and_baseline,
       slicing_specs=slicing_specs,
       metrics_specs=[
           config.MetricsSpec(metrics=[mean_prediction], model_names=['']),
       ],
   )
   baseline_key = metric_types.MetricKey(
       name='mean_prediction', model_name='baseline')
   diff_key = metric_types.MetricKey(name='mean_prediction', is_diff=True)
   sliced_metrics = (slice_key, {baseline_key: 0.333, diff_key: -0.333})
   result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
   self.assertFalse(result.validation_ok)
    def testSliceKeys(self, model_names, extracts, slice_specs,
                      expected_slices):
        """Runs ExtractSliceKeys over `extracts` and checks the slice keys.

        Args:
          model_names: Names used to build one ModelSpec per model.
          extracts: Input extracts fed into the pipeline.
          slice_specs: Slice specs passed to ExtractSliceKeys.
          expected_slices: Expected sorted slice keys, one entry per extract.
        """
        model_specs = [config.ModelSpec(name=name) for name in model_names]
        eval_config = config.EvalConfig(model_specs=model_specs)

        def check_result(got):
            try:
                self.assertLen(got, 2)
                sorted_slice_keys = []
                for extract in got:
                    self.assertIn(constants.SLICE_KEY_TYPES_KEY, extract)
                    slice_keys = extract[constants.SLICE_KEY_TYPES_KEY]
                    sorted_slice_keys.append(sorted(slice_keys))
                self.assertCountEqual(sorted_slice_keys, expected_slices)
            except AssertionError as err:
                raise util.BeamAssertException(err)

        with beam.Pipeline() as pipeline:
            test_input = (
                pipeline | 'CreateTestInput' >> beam.Create(extracts))
            slice_keys_extracts = (
                test_input
                | 'ExtractSlices' >> slice_key_extractor.ExtractSliceKeys(
                    slice_spec=slice_specs, eval_config=eval_config))

            util.assert_that(slice_keys_extracts, check_result)
Exemple #4
0
    def test_features_extractor_no_features(self):
        """Checks FeaturesExtractor output for three empty serialized records.

        The checker expects a single output element whose entries under
        FEATURES_KEY are all empty.
        """
        eval_config = config.EvalConfig(model_specs=[config.ModelSpec()])
        feature_extractor = features_extractor.FeaturesExtractor(eval_config)
        tfx_io = tf_example_record.TFExampleBeamRecord(
            raw_record_column_name=constants.ARROW_INPUT_COLUMN,
            physical_format='inmem',
            telemetry_descriptors=['testing'])
        empty_records = [b''] * 3

        def check_result(got):
            self.assertLen(got, 1)
            batch = got[0]
            self.assertLen(batch, 3)
            for features in batch[constants.FEATURES_KEY]:
                self.assertEmpty(features)

        with beam.Pipeline() as pipeline:
            record_batches = (
                pipeline
                | 'Create' >> beam.Create(empty_records)
                | 'DecodeToRecordBatch' >> tfx_io.BeamSource(batch_size=3))
            result = (
                record_batches
                |
                'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | feature_extractor.stage_name >> feature_extractor.ptransform)

            util.assert_that(result, check_result, label='CheckResult')
Exemple #5
0
 def testValidateMetricsDivByZero(self):
   """A relative change threshold with a 0.0 baseline must fail validation."""
   higher_is_better = config.GenericChangeThreshold(
       direction=config.MetricDirection.HIGHER_IS_BETTER,
       relative={'value': 0.1})
   threshold = config.MetricThreshold(change_threshold=higher_is_better)
   slicing_specs = [config.SlicingSpec()]
   mean_prediction = config.MetricConfig(
       class_name='MeanPrediction',
       threshold=threshold if slicing_specs is None else None,
       per_slice_thresholds=[
           config.PerSliceMetricThreshold(
               slicing_specs=slicing_specs, threshold=threshold)
       ])
   candidate_and_baseline = [
       config.ModelSpec(name='candidate'),
       config.ModelSpec(name='baseline', is_baseline=True)
   ]
   eval_config = config.EvalConfig(
       model_specs=candidate_and_baseline,
       slicing_specs=slicing_specs,
       metrics_specs=[
           config.MetricsSpec(
               metrics=[mean_prediction],
               model_names=['baseline', 'candidate']),
       ],
   )
   baseline_key = metric_types.MetricKey(
       name='mean_prediction', model_name='baseline')
   candidate_diff_key = metric_types.MetricKey(
       name='mean_prediction', model_name='candidate', is_diff=True)
   # The empty tuple is the slice key; the baseline value is exactly zero.
   sliced_metrics = ((), {baseline_key: 0.0, candidate_diff_key: 0.1})
   result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
   self.assertFalse(result.validation_ok)
Exemple #6
0
 def testRunModelAnalysisForCSVText(self):
     """Runs the analysis end to end on CSV text and spot-checks metrics."""
     model_location = self._exportEvalSavedModel(
         csv_linear_classifier.simple_csv_linear_classifier)
     csv_lines = [
         '3.0,english,1.0', '3.0,chinese,0.0', '4.0,english,1.0',
         '5.0,chinese,1.0'
     ]
     data_location = self._writeCSVToTextFile(csv_lines)
     input_spec = config.InputDataSpec(location=data_location,
                                       file_format='text')
     output_spec = config.OutputDataSpec(
         default_location=self._getTempDir())
     eval_config = config.EvalConfig(
         input_data_specs=[input_spec],
         model_specs=[config.ModelSpec(location=model_location)],
         output_data_specs=[output_spec])
     shared_model = model_eval_lib.default_eval_shared_model(
         eval_saved_model_path=model_location)
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config, eval_shared_models=[shared_model])
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     overall_slice_metrics = {
         'accuracy': {
             'doubleValue': 0.75
         },
         metric_keys.EXAMPLE_COUNT: {
             'doubleValue': 4.0
         }
     }
     expected = {(): overall_slice_metrics}
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
Exemple #7
0
    def testNoConstructFn(self):
        """Checks that an EvalSharedModel without a construct_fn fails.

        First runs the analysis with a bare types.EvalSharedModel and expects
        an AttributeError, then runs it again with the model returned by
        default_eval_shared_model and expects no exception.
        """
        model_location = self._exportEvalSavedModel(
            linear_classifier.simple_linear_classifier)
        examples = [self._makeExample(age=3.0, language='english', label=1.0)]
        data_location = self._writeTFExamplesToTFRecords(examples)
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec(location=data_location)],
            model_specs=[config.ModelSpec(location=model_location)],
            output_data_specs=[
                config.OutputDataSpec(default_location=self._getTempDir())
            ])
        # No construct_fn should fail when Beam attempts to call the construct_fn.
        eval_shared_model = types.EvalSharedModel(model_path=model_location)
        # assertRaisesRegexp is a deprecated alias that was removed from
        # unittest in Python 3.12; assertRaisesRegex is the supported name.
        with self.assertRaisesRegex(AttributeError,
                                    '\'NoneType\' object has no attribute'):
            model_eval_lib.run_model_analysis(
                eval_config=eval_config,
                eval_shared_models=[eval_shared_model])

        # Using the default_eval_shared_model should pass as it has a construct_fn.
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=model_location)
        model_eval_lib.run_model_analysis(
            eval_config=eval_config, eval_shared_models=[eval_shared_model])
 def testValidateMetricsValueThresholdUpperBoundFail(self):
   """A metric value above the configured upper bound must fail validation."""
   # The value 1.5 is not below the upper bound of 1, so this is NOT OK.
   upper_bound = config.GenericValueThreshold(upper_bound={'value': 1})
   example_count = config.MetricConfig(
       class_name='WeightedExampleCount',
       threshold=config.MetricThreshold(value_threshold=upper_bound))
   eval_config = config.EvalConfig(
       model_specs=[
           config.ModelSpec(),
       ],
       slicing_specs=[config.SlicingSpec()],
       metrics_specs=[
           config.MetricsSpec(metrics=[example_count], model_names=['']),
       ],
   )
   metric_values = {
       metric_types.MetricKey(name='weighted_example_count'): 1.5,
   }
   # The empty tuple is the slice key.
   sliced_metrics = ((), metric_values)
   result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
   self.assertFalse(result.validation_ok)
Exemple #9
0
    def assertMetricsComputedWithBeamAre(
        self,
        eval_saved_model_path: Text,
        serialized_examples: List[bytes],
        expected_metrics: Dict[Text, Any],
        add_metrics_callbacks: Optional[List[
            types.AddMetricsCallbackType]] = None):
        """Asserts on metrics computed with a Beam pipeline.

        The metrics are computed over all examples with no slicing. To supply
        your own PCollection (e.g. a large number of examples read from a
        file), to check metrics for particular slices, or to add extra
        post-export metrics, use the more general
        assertGeneralMetricsComputedWithBeamAre instead.

        Example usage:
          self.assertMetricsComputedWithBeamAre(
            eval_saved_model_path=path,
            serialized_examples=[self.makeExample(age=5, label=1.0),
                                 self.makeExample(age=10, label=0.0)],
            expected_metrics={'average_loss': 0.1})

        Args:
          eval_saved_model_path: Path to the directory containing the
            EvalSavedModel.
          serialized_examples: List of serialized example bytes.
          expected_metrics: Dictionary of expected metric values.
          add_metrics_callbacks: Optional. Callbacks for adding additional
            metrics.
        """
        eval_config = config.EvalConfig()
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=eval_saved_model_path,
            add_metrics_callbacks=add_metrics_callbacks)
        extractors = model_eval_lib.default_extractors(
            eval_config=eval_config, eval_shared_model=eval_shared_model)

        def check_metrics(got):
            """Expects one result for the empty slice key within bounds."""
            try:
                num_slices = len(got)
                self.assertEqual(
                    1, num_slices,
                    'expecting metrics for exactly one slice, but got %d '
                    'slices instead. metrics were: %s' % (num_slices, got))
                slice_key, value = got[0]
                self.assertEqual((), slice_key)
                self.assertDictElementsWithinBounds(
                    got_values_dict=value,
                    expected_values_dict=expected_metrics)
            except AssertionError as err:
                raise beam_util.BeamAssertException(err)

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            extracts = (
                pipeline
                | 'CreateExamples' >> beam.Create(serialized_examples)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'Extract' >> Extract(extractors=extractors))
            (metrics, _), _ = (
                extracts
                | 'ComputeMetricsAndPlots' >>
                legacy_metrics_and_plots_evaluator.ComputeMetricsAndPlots(
                    eval_shared_model=eval_shared_model))
            # pylint: enable=no-value-for-parameter

            beam_util.assert_that(metrics, check_metrics)
Exemple #10
0
    def testLabelsExtractorMultiModel(self):
        """Checks labels extracted for two models with different label keys.

        'model1' uses a single label_key while 'model2' maps two outputs to
        their own label keys; both examples are decoded as one batch.
        """
        single_label_spec = config.ModelSpec(name='model1', label_key='label')
        multi_label_spec = config.ModelSpec(
            name='model2',
            label_keys={
                'output1': 'label1',
                'output2': 'label2'
            })
        eval_config = config.EvalConfig(
            model_specs=[single_label_spec, multi_label_spec])
        feature_extractor = features_extractor.FeaturesExtractor(eval_config)
        label_extractor = labels_extractor.LabelsExtractor(eval_config)

        schema_text = """
        feature {
          name: "label"
          type: FLOAT
        }
        feature {
          name: "label1"
          type: FLOAT
        }
        feature {
          name: "label2"
          type: FLOAT
        }
        feature {
          name: "fixed_int"
          type: INT
        }
        """
        schema = text_format.Parse(schema_text, schema_pb2.Schema())
        tfx_io = test_util.InMemoryTFExampleRecord(
            schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN)

        examples = [
            self._makeExample(label=1.0, label1=1.0, label2=0.0, fixed_int=1),
            self._makeExample(label=1.0, label1=1.0, label2=1.0, fixed_int=1)
        ]
        # Expected 'output2' label of model2 for each example, in input order.
        expected_output2_labels = (0.0, 1.0)

        def check_result(got):
            try:
                self.assertLen(got, 1)
                for row, output2 in enumerate(expected_output2_labels):
                    for model_name in ('model1', 'model2'):
                        self.assertIn(model_name,
                                      got[0][constants.LABELS_KEY][row])
                    self.assertAlmostEqual(
                        got[0][constants.LABELS_KEY][row]['model1'],
                        np.array([1.0]))
                    self.assertDictElementsAlmostEqual(
                        got[0][constants.LABELS_KEY][row]['model2'], {
                            'output1': np.array([1.0]),
                            'output2': np.array([output2])
                        })

            except AssertionError as err:
                raise util.BeamAssertException(err)

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            batched_extracts = (
                pipeline
                | 'Create' >> beam.Create(
                    [e.SerializeToString() for e in examples], reshuffle=False)
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=2)
                |
                'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts())
            result = (
                batched_extracts
                | feature_extractor.stage_name >> feature_extractor.ptransform
                | label_extractor.stage_name >> label_extractor.ptransform)

            # pylint: enable=no-value-for-parameter

            util.assert_that(result, check_result, label='result')
  def testWriteMetricsAndPlots(self):
    """Writes metrics and plots through the pipeline and reads them back.

    Two examples are evaluated, after which the single MetricsForSlice and
    PlotsForSlice records written to the temp dir are compared against the
    expected text protos.
    """
    temp_dir = self._getTempDir()
    metrics_file = os.path.join(temp_dir, 'metrics')
    plots_file = os.path.join(self._getTempDir(), 'plots')
    temp_eval_export_dir = os.path.join(self._getTempDir(), 'eval_export_dir')

    _, eval_export_dir = (
        fixed_prediction_estimator.simple_fixed_prediction_estimator(
            None, temp_eval_export_dir))
    options = config.Options(
        disabled_outputs={'values': ['eval_config.json']})
    eval_config = config.EvalConfig(
        model_specs=[config.ModelSpec()], options=options)
    callbacks = [
        post_export_metrics.example_count(),
        post_export_metrics.calibration_plot_and_prediction_histogram(
            num_buckets=2)
    ]
    eval_shared_model = self.createTestEvalSharedModel(
        eval_saved_model_path=eval_export_dir,
        add_metrics_callbacks=callbacks)
    extractors = [
        predict_extractor.PredictExtractor(eval_shared_model),
        slice_key_extractor.SliceKeyExtractor()
    ]
    evaluators = [
        metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(eval_shared_model)
    ]
    output_paths = {
        constants.METRICS_KEY: metrics_file,
        constants.PLOTS_KEY: plots_file
    }
    writers = [
        metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
            output_paths, eval_shared_model.add_metrics_callbacks)
    ]

    with beam.Pipeline() as pipeline:
      examples = [
          self._makeExample(prediction=prediction, label=1.0)
          for prediction in (0.0, 1.0)
      ]
      serialized_examples = [e.SerializeToString() for e in examples]

      # pylint: disable=no-value-for-parameter
      _ = (
          pipeline
          | 'Create' >> beam.Create(serialized_examples)
          | 'ExtractEvaluateAndWriteResults' >>
          model_eval_lib.ExtractEvaluateAndWriteResults(
              eval_config=eval_config,
              eval_shared_model=eval_shared_model,
              extractors=extractors,
              evaluators=evaluators,
              writers=writers))
      # pylint: enable=no-value-for-parameter

    expected_metrics_text = """
        slice_key {}
        metrics {
          key: "average_loss"
          value {
            double_value {
              value: 0.5
            }
          }
        }
        metrics {
          key: "post_export_metrics/example_count"
          value {
            double_value {
              value: 2.0
            }
          }
        }
        """
    expected_metrics_for_slice = text_format.Parse(
        expected_metrics_text, metrics_for_slice_pb2.MetricsForSlice())

    metric_records = [
        metrics_for_slice_pb2.MetricsForSlice.FromString(record)
        for record in tf.compat.v1.python_io.tf_record_iterator(metrics_file)
    ]
    self.assertEqual(1, len(metric_records), 'metrics: %s' % metric_records)
    self.assertProtoEquals(expected_metrics_for_slice, metric_records[0])

    expected_plots_text = """
      slice_key {}
      plots {
        key: "post_export_metrics"
        value {
          calibration_histogram_buckets {
            buckets {
              lower_threshold_inclusive: -inf
              num_weighted_examples {}
              total_weighted_label {}
              total_weighted_refined_prediction {}
            }
            buckets {
              upper_threshold_exclusive: 0.5
              num_weighted_examples {
                value: 1.0
              }
              total_weighted_label {
                value: 1.0
              }
              total_weighted_refined_prediction {}
            }
            buckets {
              lower_threshold_inclusive: 0.5
              upper_threshold_exclusive: 1.0
              num_weighted_examples {
              }
              total_weighted_label {}
              total_weighted_refined_prediction {}
            }
            buckets {
              lower_threshold_inclusive: 1.0
              upper_threshold_exclusive: inf
              num_weighted_examples {
                value: 1.0
              }
              total_weighted_label {
                value: 1.0
              }
              total_weighted_refined_prediction {
                value: 1.0
              }
            }
         }
        }
      }
    """
    expected_plots_for_slice = text_format.Parse(
        expected_plots_text, metrics_for_slice_pb2.PlotsForSlice())

    plot_records = [
        metrics_for_slice_pb2.PlotsForSlice.FromString(record)
        for record in tf.compat.v1.python_io.tf_record_iterator(plots_file)
    ]
    self.assertEqual(1, len(plot_records), 'plots: %s' % plot_records)
    self.assertProtoEquals(expected_plots_for_slice, plot_records[0])
class ModelUtilTest(testutil.TensorflowModelAnalysisTest,
                    parameterized.TestCase):
    def createDenseInputsSchema(self):
        """Returns a Schema parsed from text for the dense-input models.

        The schema declares dense tensor representations for `input_1` and
        `input_2`, plus an extra INT feature named `non_model_feature`.
        """
        schema_text = """
        tensor_representation_group {
          key: ""
          value {
            tensor_representation {
              key: "input_1"
              value {
                dense_tensor {
                  column_name: "input_1"
                  shape { dim { size: 1 } }
                }
              }
            }
            tensor_representation {
              key: "input_2"
              value {
                dense_tensor {
                  column_name: "input_2"
                  shape { dim { size: 1 } }
                }
              }
            }
          }
        }
        feature {
          name: "input_1"
          type: FLOAT
        }
        feature {
          name: "input_2"
          type: FLOAT
        }
        feature {
          name: "non_model_feature"
          type: INT
        }
        """
        return text_format.Parse(schema_text, schema_pb2.Schema())

    def createModelWithSingleInput(self, save_as_keras):
        """Builds and exports a Keras model with a single input named 'input'.

        Two signatures are exported, 'serving_default' and 'custom_signature',
        both traced from the same function and input spec.

        Args:
          save_as_keras: If true, save via model.save(save_format='tf');
            otherwise save via tf.saved_model.save.

        Returns:
          Path of the temporary directory the model was exported to.
        """
        input_layer = tf.keras.layers.Input(shape=(1, ), name='input')
        sigmoid_dense = tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
        model = tf.keras.models.Model(input_layer, sigmoid_dense(input_layer))

        @tf.function
        def serving_default(s):
            return model(s)

        input_spec = {
            'input': tf.TensorSpec(shape=(None, 1),
                                   dtype=tf.string,
                                   name='input'),
        }
        # Each signature gets its own get_concrete_function call.
        signatures = {
            signature_name: serving_default.get_concrete_function(input_spec)
            for signature_name in ('serving_default', 'custom_signature')
        }

        export_path = tempfile.mkdtemp()
        if not save_as_keras:
            tf.saved_model.save(model, export_path, signatures=signatures)
        else:
            model.save(export_path, save_format='tf', signatures=signatures)
        return export_path

    def createModelWithMultipleDenseInputs(self, save_as_keras):
        """Builds and exports a Keras model with two dense inputs.

        Three signatures are exported: 'serving_default' (parses serialized
        tf.Examples), and 'custom_single_output' / 'custom_multi_output' (both
        take the feature dict directly).

        Args:
          save_as_keras: If true, save via model.save(save_format='tf');
            otherwise save via tf.saved_model.save.

        Returns:
          Path of the temporary directory the model was exported to.
        """
        inputs = [
            tf.keras.layers.Input(shape=(1, ), name='input_1'),
            tf.keras.layers.Input(shape=(1, ), name='input_2'),
        ]
        merged_inputs = tf.keras.layers.concatenate(inputs)
        sigmoid_dense = tf.keras.layers.Dense(1,
                                              activation=tf.nn.sigmoid,
                                              name='output')
        output_layer = sigmoid_dense(merged_inputs)
        model = tf.keras.models.Model(inputs, output_layer)

        # Add custom attribute to model to test callables stored as attributes
        model.custom_attribute = tf.keras.models.Model(inputs, output_layer)

        @tf.function
        def serving_default(serialized_tf_examples):
            parse_spec = {
                'input_1': tf.io.FixedLenFeature([1], dtype=tf.float32),
                'input_2': tf.io.FixedLenFeature([1], dtype=tf.float32)
            }
            parsed_features = tf.io.parse_example(serialized_tf_examples,
                                                  parse_spec)
            return model(parsed_features)

        @tf.function
        def custom_single_output(features):
            return model(features)

        @tf.function
        def custom_multi_output(features):
            return {'output1': model(features), 'output2': model(features)}

        serialized_spec = tf.TensorSpec(shape=(None, ),
                                        dtype=tf.string,
                                        name='examples')
        feature_dict_spec = {
            'input_1':
            tf.TensorSpec(shape=(None, 1), dtype=tf.float32, name='input_1'),
            'input_2':
            tf.TensorSpec(shape=(None, 1), dtype=tf.float32, name='input_2')
        }
        signatures = {
            'serving_default':
            serving_default.get_concrete_function(serialized_spec),
            'custom_single_output':
            custom_single_output.get_concrete_function(feature_dict_spec),
            'custom_multi_output':
            custom_multi_output.get_concrete_function(feature_dict_spec)
        }

        export_path = tempfile.mkdtemp()
        if not save_as_keras:
            tf.saved_model.save(model, export_path, signatures=signatures)
        else:
            model.save(export_path, save_format='tf', signatures=signatures)
        return export_path

    def createModelWithMultipleMixedInputs(self, save_as_keras):
        """Builds and exports a Keras model mixing dense, sparse and ragged inputs.

        'input_1' is a dense int64 input, 'input_2' is sparse and 'input_3' is
        ragged. Two signatures are exported, 'serving_default' and
        'custom_signature', both traced from the same function and input spec.

        Args:
          save_as_keras: If true, save via model.save(save_format='tf');
            otherwise save via tf.saved_model.save.

        Returns:
          Path of the temporary directory the model was exported to.
        """
        int_input = tf.keras.layers.Input(shape=(2, ),
                                          name='input_1',
                                          dtype=tf.int64)
        int_as_float = tf.cast(int_input, tf.float32)
        sparse_input = tf.keras.layers.Input(shape=(1, ),
                                             name='input_2',
                                             sparse=True)
        projected_sparse = tf.keras.layers.Dense(
            1, name='dense_input2')(sparse_input)
        ragged_input = tf.keras.layers.Input(shape=(None, ),
                                             name='input_3',
                                             ragged=True)
        densified_ragged = tf.keras.layers.Lambda(lambda x: x.to_tensor())(
            ragged_input)
        densified_ragged.set_shape((None, 1))
        inputs = [int_input, sparse_input, ragged_input]
        merged_inputs = tf.keras.layers.concatenate(
            [int_as_float, projected_sparse, densified_ragged])
        sigmoid_dense = tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
        model = tf.keras.models.Model(inputs, sigmoid_dense(merged_inputs))

        @tf.function
        def serving_default(features):
            return model(features)

        input_spec = {
            'input_1':
            tf.TensorSpec(shape=(None, 2), dtype=tf.int64, name='input_1'),
            'input_2':
            tf.SparseTensorSpec(shape=(None, 1), dtype=tf.float32),
            'input_3':
            tf.RaggedTensorSpec(shape=(None, 1), dtype=tf.float32)
        }
        # Each signature gets its own get_concrete_function call.
        signatures = {
            signature_name: serving_default.get_concrete_function(input_spec)
            for signature_name in ('serving_default', 'custom_signature')
        }

        export_path = tempfile.mkdtemp()
        if not save_as_keras:
            tf.saved_model.save(model, export_path, signatures=signatures)
        else:
            model.save(export_path, save_format='tf', signatures=signatures)
        return export_path

    def testFilterByInputNames(self):
        """Checks that filtering by ['f1', 'f3'] keeps exactly those tensors."""
        tensors = {
            'f1': tf.constant([[1.1], [2.1]], dtype=tf.float32),
            'f2': tf.constant([[1], [2]], dtype=tf.int64),
            'f3': tf.constant([['hello'], ['world']], dtype=tf.string)
        }
        input_names = ['f1', 'f3']
        filtered_tensors = model_util.filter_by_input_names(
            tensors, input_names)
        self.assertLen(filtered_tensors, 2)

        expected_f1 = tf.constant([[1.1], [2.1]], dtype=tf.float32)
        self.assertAllEqual(expected_f1, filtered_tensors['f1'])
        expected_f3 = tf.constant([['hello'], ['world']], dtype=tf.string)
        self.assertAllEqual(expected_f3, filtered_tensors['f3'])

    @parameterized.named_parameters(
        {
            'testcase_name': 'one_baseline',
            'eval_config': text_format.Parse(
                """
             model_specs {
               name: "candidate"
             }
             model_specs {
               name: "baseline"
               is_baseline: true
             }
           """, config.EvalConfig()),
            'expected_baseline_model_spec': text_format.Parse(
                """
             name: "baseline"
             is_baseline: true
           """, config.ModelSpec()),
        },
        {
            'testcase_name': 'no_baseline',
            'eval_config': text_format.Parse(
                """
             model_specs {
               name: "candidate"
             }
           """, config.EvalConfig()),
            'expected_baseline_model_spec': None,
        },
    )
    def test_get_baseline_model(self, eval_config,
                                expected_baseline_model_spec):
        """Checks the baseline ModelSpec (or None) found in an EvalConfig."""
        baseline_spec = model_util.get_baseline_model_spec(eval_config)
        self.assertEqual(expected_baseline_model_spec, baseline_spec)

    @parameterized.named_parameters(
        {
            'testcase_name': 'one_non_baseline',
            'eval_config': text_format.Parse(
                """
             model_specs {
               name: "candidate"
             }
             model_specs {
               name: "baseline"
               is_baseline: true
             }
           """, config.EvalConfig()),
            'expected_non_baseline_model_specs': [
                text_format.Parse(
                    """
             name: "candidate"
           """, config.ModelSpec())
            ],
        },
        {
            'testcase_name': 'no_non_baseline',
            'eval_config': text_format.Parse(
                """
             model_specs {
               name: "baseline"
               is_baseline: true
             }
           """, config.EvalConfig()),
            'expected_non_baseline_model_specs': [],
        },
    )
    def test_get_non_baseline_model(self, eval_config,
                                    expected_non_baseline_model_specs):
        """Checks the non-baseline ModelSpecs found in an EvalConfig."""
        non_baseline_specs = model_util.get_non_baseline_model_specs(
            eval_config)
        self.assertCountEqual(expected_non_baseline_model_specs,
                              non_baseline_specs)

    def testFilterByInputNamesKeras(self):
        """Checks filtering when input names carry KERAS_INPUT_SUFFIX.

        The filtered result is expected to be keyed by the suffixed names.
        """
        tensors = {
            'f1': tf.constant([[1.1], [2.1]], dtype=tf.float32),
            'f2': tf.constant([[1], [2]], dtype=tf.int64),
            'f3': tf.constant([['hello'], ['world']], dtype=tf.string)
        }
        suffixed_f1 = 'f1' + model_util.KERAS_INPUT_SUFFIX
        suffixed_f3 = 'f3' + model_util.KERAS_INPUT_SUFFIX
        filtered_tensors = model_util.filter_by_input_names(
            tensors, [suffixed_f1, suffixed_f3])
        self.assertLen(filtered_tensors, 2)

        expected_f1 = tf.constant([[1.1], [2.1]], dtype=tf.float32)
        self.assertAllEqual(expected_f1, filtered_tensors[suffixed_f1])
        expected_f3 = tf.constant([['hello'], ['world']], dtype=tf.string)
        self.assertAllEqual(expected_f3, filtered_tensors[suffixed_f3])

    @parameterized.named_parameters(
        {
            'testcase_name': 'output_name_and_label_key',
            'model_spec': config.ModelSpec(label_key='label'),
            'output_name': 'output',
            'expected_label_key': 'label',
        },
        {
            'testcase_name': 'output_name_and_label_keys',
            'model_spec': config.ModelSpec(label_keys={'output': 'label'}),
            'output_name': 'output',
            'expected_label_key': 'label',
        },
        {
            'testcase_name': 'output_name_and_no_label_keys',
            'model_spec': config.ModelSpec(),
            'output_name': 'output',
            'expected_label_key': None,
        },
        {
            'testcase_name': 'no_output_name_and_label_key',
            'model_spec': config.ModelSpec(label_key='label'),
            'output_name': '',
            'expected_label_key': 'label',
        },
        {
            'testcase_name': 'no_output_name_and_no_label_keys',
            'model_spec': config.ModelSpec(),
            'output_name': '',
            'expected_label_key': None,
        },
    )
    def testGetLabelKey(self, model_spec, output_name, expected_label_key):
        """Checks the label key resolved for a model spec and output name."""
        label_key = model_util.get_label_key(model_spec, output_name)
        self.assertEqual(expected_label_key, label_key)

    def testGetLabelKeyNoOutputAndLabelKeys(self):
        """Expects ValueError for label_keys combined with an empty output name."""
        with self.assertRaises(ValueError):
            multi_output_spec = config.ModelSpec(
                label_keys={'output1': 'label'})
            model_util.get_label_key(multi_output_spec, '')

    @parameterized.named_parameters(
        dict(testcase_name='single_model_single_key',
             model_specs=[config.ModelSpec(label_key='feature1')],
             field='label_key',
             multi_output_field='label_keys',
             expected_values=[[1.0, 1.1, 1.2]]),
        dict(testcase_name='single_model_multi_key',
             model_specs=[
                 config.ModelSpec(label_keys={
                     'output1': 'feature1',
                     'output2': 'feature2',
                 }),
             ],
             field='label_key',
             multi_output_field='label_keys',
             expected_values=[{
                 'output1': [1.0, 1.1, 1.2],
                 'output2': [2.0, 2.1, 2.2],
             }]),
        dict(testcase_name='multi_model_single_key',
             model_specs=[
                 config.ModelSpec(name='model1',
                                  example_weight_key='feature2'),
                 config.ModelSpec(name='model2',
                                  example_weight_key='feature3'),
             ],
             field='example_weight_key',
             multi_output_field='example_weight_keys',
             expected_values=[{
                 'model1': [2.0, 2.1, 2.2],
                 'model2': [3.0, 3.1, 3.2],
             }]),
        dict(testcase_name='multi_model_multi_key',
             model_specs=[
                 config.ModelSpec(name='model1',
                                  prediction_keys={
                                      'output1': 'feature1',
                                      'output2': 'feature2',
                                  }),
                 config.ModelSpec(name='model2',
                                  prediction_keys={
                                      'output1': 'feature1',
                                      'output3': 'feature3',
                                  }),
             ],
             field='prediction_key',
             multi_output_field='prediction_keys',
             expected_values=[{
                 'model1': {
                     'output1': [1.0, 1.1, 1.2],
                     'output2': [2.0, 2.1, 2.2],
                 },
                 'model2': {
                     'output1': [1.0, 1.1, 1.2],
                     'output3': [3.0, 3.1, 3.2],
                 },
             }]),
    )
    def testGetFeatureValuesForModelSpecField(self, model_specs, field,
                                              multi_output_field,
                                              expected_values):
        # Looks up the feature values referenced by `field` (or by
        # `multi_output_field`) on each model spec and compares them with the
        # expected per-model / per-output structure.
        features = {
            'feature1': [1.0, 1.1, 1.2],
            'feature2': [2.0, 2.1, 2.2],
            'feature3': [3.0, 3.1, 3.2],
        }
        # Only num_rows is needed from the RecordBatch, so a placeholder
        # array with the same length as the features list is used.
        placeholder_batch = pa.RecordBatch.from_arrays([pa.array([1])],
                                                       ['dummy'])
        extracts = {
            constants.ARROW_RECORD_BATCH_KEY: placeholder_batch,
            constants.FEATURES_KEY: [features],
        }

        actual_values = model_util.get_feature_values_for_model_spec_field(
            model_specs, field, multi_output_field, extracts)

        self.assertAlmostEqual(expected_values, actual_values)

    @parameterized.named_parameters(
        dict(testcase_name='single_model_single_key',
             model_specs=[config.ModelSpec(label_key='feature2')],
             field='label_key',
             multi_output_field='label_keys',
             expected_values=[[4.0, 4.1, 4.2]]),
        dict(testcase_name='single_model_multi_key',
             model_specs=[
                 config.ModelSpec(label_keys={
                     'output1': 'feature1',
                     'output2': 'feature2',
                 }),
             ],
             field='label_key',
             multi_output_field='label_keys',
             expected_values=[{
                 'output1': [1.0, 1.1, 1.2],
                 'output2': [4.0, 4.1, 4.2],
             }]),
    )
    def testGetFeatureValuesForModelSpecFieldWithSingleModelTransforedFeatures(
            self, model_specs, field, multi_output_field, expected_values):
        # 'feature2' exists both as a raw and as a transformed feature; the
        # expected values use the transformed one, while 'feature1' (raw
        # only) is still read from the raw features.
        raw_features = {
            'feature1': [1.0, 1.1, 1.2],
            'feature2': [2.0, 2.1, 2.2],
        }
        transformed_features = {
            'feature2': [4.0, 4.1, 4.2],
        }
        # Only num_rows is needed from the RecordBatch, so a placeholder
        # array with the same length as the features list is used.
        placeholder_batch = pa.RecordBatch.from_arrays([pa.array([1])],
                                                       ['dummy'])
        extracts = {
            constants.ARROW_RECORD_BATCH_KEY: placeholder_batch,
            constants.FEATURES_KEY: [raw_features],
            constants.TRANSFORMED_FEATURES_KEY: [transformed_features],
        }

        actual_values = model_util.get_feature_values_for_model_spec_field(
            model_specs, field, multi_output_field, extracts)

        self.assertAlmostEqual(expected_values, actual_values)

    @parameterized.named_parameters(
        dict(testcase_name='multi_model_single_key',
             model_specs=[
                 config.ModelSpec(name='model1',
                                  example_weight_key='feature2'),
                 config.ModelSpec(name='model2',
                                  example_weight_key='feature3'),
             ],
             field='example_weight_key',
             multi_output_field='example_weight_keys',
             expected_values=[{
                 'model1': [4.0, 4.1, 4.2],
                 'model2': [7.0, 7.1, 7.2],
             }]),
        dict(testcase_name='multi_model_multi_key',
             model_specs=[
                 config.ModelSpec(name='model1',
                                  example_weight_keys={
                                      'output1': 'feature1',
                                      'output2': 'feature2',
                                  }),
                 config.ModelSpec(name='model2',
                                  example_weight_keys={
                                      'output1': 'feature1',
                                      'output3': 'feature3',
                                  }),
             ],
             field='example_weight_key',
             multi_output_field='example_weight_keys',
             expected_values=[{
                 'model1': {
                     'output1': [1.0, 1.1, 1.2],
                     'output2': [4.0, 4.1, 4.2],
                 },
                 'model2': {
                     'output1': [1.0, 1.1, 1.2],
                     'output3': [7.0, 7.1, 7.2],
                 },
             }]),
    )
    def testGetFeatureValuesForModelSpecFieldWithMultiModelTransforedFeatures(
            self, model_specs, field, multi_output_field, expected_values):
        # Transformed features are keyed by model name here; the expected
        # values take each model's own transformed entry where one exists
        # and the shared raw feature otherwise.
        raw_features = {
            'feature1': [1.0, 1.1, 1.2],
            'feature2': [2.0, 2.1, 2.2],
        }
        transformed_features_by_model = {
            'model1': {
                'feature2': [4.0, 4.1, 4.2],
                'feature3': [5.0, 5.1, 5.2],
            },
            'model2': {
                'feature2': [6.0, 6.1, 6.2],
                'feature3': [7.0, 7.1, 7.2],
            },
        }
        # Only num_rows is needed from the RecordBatch, so a placeholder
        # array with the same length as the features list is used.
        placeholder_batch = pa.RecordBatch.from_arrays([pa.array([1])],
                                                       ['dummy'])
        extracts = {
            constants.ARROW_RECORD_BATCH_KEY: placeholder_batch,
            constants.FEATURES_KEY: [raw_features],
            constants.TRANSFORMED_FEATURES_KEY: [
                transformed_features_by_model
            ],
        }

        actual_values = model_util.get_feature_values_for_model_spec_field(
            model_specs, field, multi_output_field, extracts)

        self.assertAlmostEqual(expected_values, actual_values)

    def testGetFeatureValuesForModelSpecFieldNoValues(self):
        # With no features in the extracts, the lookup is expected to return
        # None.
        # NOTE(review): the field names passed below ('example_weight' /
        # 'example_weights') differ from the '*_key' / '*_keys' names used by
        # the other tests in this file -- confirm this is intentional.
        extracts = {
            constants.ARROW_RECORD_BATCH_KEY:
            pa.RecordBatch.from_arrays([pa.array([1])], ['dummy']),
        }
        model_specs = [
            config.ModelSpec(name='model1', example_weight_key='feature2')
        ]

        result = model_util.get_feature_values_for_model_spec_field(
            model_specs, 'example_weight', 'example_weights', extracts)

        self.assertIsNone(result)

    @parameterized.named_parameters(
        ('keras_serving_default', True, 'serving_default'),
        ('keras_custom_signature', True, 'custom_signature'),
        ('tf2_serving_default', False, 'serving_default'),
        ('tf2_custom_signature', False, 'custom_signature'),
    )
    def testGetCallableWithSignatures(self, save_as_keras, signature_name):
        # A callable must be found for each named signature, whether the
        # model is loaded through Keras or as a TF2 saved model.
        export_path = self.createModelWithSingleInput(save_as_keras)
        load_model = (tf.keras.models.load_model
                      if save_as_keras else tf.compat.v1.saved_model.load_v2)
        model = load_model(export_path)

        resolved = model_util.get_callable(model, signature_name)

        self.assertIsNotNone(resolved)

    @parameterized.named_parameters(
        ('keras', True),
        ('tf2', False),
    )
    def testGetCallableWithMissingSignatures(self, save_as_keras):
        # Asking for a signature the model does not define is expected to
        # raise ValueError.
        export_path = self.createModelWithSingleInput(save_as_keras)
        load_model = (tf.keras.models.load_model
                      if save_as_keras else tf.compat.v1.saved_model.load_v2)
        model = load_model(export_path)

        with self.assertRaises(ValueError):
            model_util.get_callable(model, 'non_existent')

    @unittest.skipIf(_TF_MAJOR_VERSION < 2,
                     'not all input types supported for TF1')
    def testGetCallableWithKerasModel(self):
        # When no signature name is given, the loaded Keras model itself is
        # expected to be returned as the callable.
        export_path = self.createModelWithMultipleMixedInputs(True)
        keras_model = tf.keras.models.load_model(export_path)

        resolved = model_util.get_callable(keras_model)

        self.assertEqual(keras_model, resolved)

    @parameterized.named_parameters(
        ('keras_serving_default', True, 'serving_default'),
        ('keras_custom_signature', True, 'custom_signature'),
        ('tf2_serving_default', False, None),
        ('tf2_custom_signature', False, 'custom_signature'),
    )
    def testGetInputSpecsWithSignatures(self, save_as_keras, signature_name):
        # Every covered signature of the single-input model should report
        # one string input named 'input' with shape (None, 1).
        export_path = self.createModelWithSingleInput(save_as_keras)
        load_model = (tf.keras.models.load_model
                      if save_as_keras else tf.compat.v1.saved_model.load_v2)
        model = load_model(export_path)
        expected_specs = {
            'input': tf.TensorSpec(name='input',
                                   shape=(None, 1),
                                   dtype=tf.string),
        }

        actual_specs = model_util.get_input_specs(model, signature_name)

        self.assertEqual(expected_specs, actual_specs)

    @parameterized.named_parameters(('keras', True), ('tf2', False))
    def testGetInputSpecsWithMissingSignatures(self, save_as_keras):
        # Requesting input specs for a signature the model does not define is
        # expected to raise ValueError, for both Keras and TF2 loading paths.
        export_path = self.createModelWithSingleInput(save_as_keras)
        if save_as_keras:
            model = tf.keras.models.load_model(export_path)
        else:
            model = tf.compat.v1.saved_model.load_v2(export_path)
        with self.assertRaises(ValueError):
            # Exercise get_input_specs (the function this test is named
            # for); get_callable is already covered by
            # testGetCallableWithMissingSignatures.
            model_util.get_input_specs(model, 'non_existent')

    @unittest.skipIf(_TF_MAJOR_VERSION < 2,
                     'not all input types supported for TF1')
    def testGetInputSpecsWithKerasModel(self):
        # The mixed-input Keras model should report one dense, one sparse and
        # one ragged input spec.
        export_path = self.createModelWithMultipleMixedInputs(True)
        model = tf.keras.models.load_model(export_path)

        # Some versions of TF set the TensorSpec.name and others do not. The
        # name is irrelevant here, so dense specs are rebuilt without it
        # before comparing.
        nameless_specs = {
            key: (tf.TensorSpec(shape=spec.shape, dtype=spec.dtype)
                  if isinstance(spec, tf.TensorSpec) else spec)
            for key, spec in model_util.get_input_specs(model).items()
        }

        expected_specs = {
            'input_1':
            tf.TensorSpec(shape=(None, 2), dtype=tf.int64),
            'input_2':
            tf.SparseTensorSpec(shape=(None, 1), dtype=tf.float32),
            'input_3':
            tf.RaggedTensorSpec(shape=(None, None), dtype=tf.float32),
        }
        self.assertEqual(expected_specs, nameless_specs)

    def testInputSpecsToTensorRepresentations(self):
        # Dense, sparse and ragged input specs should map to dense,
        # varlen-sparse and ragged TensorRepresentations respectively.
        input_specs = {
            'input_1': tf.TensorSpec(shape=(None, 2), dtype=tf.int64),
            'input_2': tf.SparseTensorSpec(shape=(None, 1),
                                           dtype=tf.float32),
            'input_3': tf.RaggedTensorSpec(shape=(None, None),
                                           dtype=tf.float32),
        }
        actual = model_util.input_specs_to_tensor_representations(
            input_specs)

        expected_dense = text_format.Parse(
            """
        dense_tensor {
          column_name: "input_1"
          shape { dim { size: 2 } }
        }
        """, schema_pb2.TensorRepresentation())
        expected_sparse = text_format.Parse(
            """
        varlen_sparse_tensor {
          column_name: "input_2"
        }
        """, schema_pb2.TensorRepresentation())
        expected_ragged = text_format.Parse(
            """
        ragged_tensor {
          feature_path {
            step: "input_3"
          }
        }
        """, schema_pb2.TensorRepresentation())
        expected = {
            'input_1': expected_dense,
            'input_2': expected_sparse,
            'input_3': expected_ragged,
        }
        self.assertEqual(expected, actual)

    def testInputSpecsToTensorRepresentationsRaisesWithUnknownDims(self):
        # A dense spec whose non-batch dimension is unknown is expected to be
        # rejected with ValueError.
        specs_with_unknown_dims = {
            'input_1': tf.TensorSpec(shape=(None, None), dtype=tf.int64),
        }
        with self.assertRaises(ValueError):
            model_util.input_specs_to_tensor_representations(
                specs_with_unknown_dims)

    @parameterized.named_parameters(
        ('keras_default', True,
         {constants.PREDICTIONS_KEY: {'': [None]}},
         None, False, True, 1),
        ('tf_default', False,
         {constants.PREDICTIONS_KEY: {'': [None]}},
         None, False, True, 1),
        ('keras_serving_default', True,
         {constants.PREDICTIONS_KEY: {'': ['serving_default']}},
         None, False, True, 1),
        ('tf_serving_default', False,
         {constants.PREDICTIONS_KEY: {'': ['serving_default']}},
         None, False, True, 1),
        ('keras_custom_single_output', True,
         {constants.PREDICTIONS_KEY: {'': ['custom_single_output']}},
         None, False, True, 1),
        ('tf_custom_single_output', False,
         {constants.PREDICTIONS_KEY: {'': ['custom_single_output']}},
         None, False, True, 1),
        ('keras_custom_multi_output', True,
         {constants.PREDICTIONS_KEY: {'': ['custom_multi_output']}},
         None, False, True, 2),
        ('tf_custom_multi_output', False,
         {constants.PREDICTIONS_KEY: {'': ['custom_multi_output']}},
         None, False, True, 2),
        ('multi_model', True,
         {
             constants.PREDICTIONS_KEY: {
                 'model1': ['custom_multi_output'],
                 'model2': ['custom_multi_output'],
             }
         },
         None, False, True, 2),
        ('default_signatures', True,
         {constants.PREDICTIONS_KEY: {'': []}},
         ['unknown', 'custom_single_output'], False, True, 1),
        ('keras_prefer_dict_outputs', True,
         {constants.FEATURES_KEY: {'': []}},
         ['unknown', 'custom_single_output', 'custom_multi_output'],
         True, True, 3),
        ('tf_prefer_dict_outputs', False,
         {constants.FEATURES_KEY: {'': []}},
         ['unknown', 'custom_single_output', 'custom_multi_output'],
         True, True, 3),
        ('custom_attribute', True,
         {constants.FEATURES_KEY: {'': ['custom_attribute']}},
         None, True, True, 1),
        ('keras_no_schema', True,
         {constants.PREDICTIONS_KEY: {'': [None]}},
         None, False, False, 1),
        ('tf_no_schema', False,
         {constants.PREDICTIONS_KEY: {'': [None]}},
         None, False, False, 1),
    )
    @unittest.skipIf(_TF_MAJOR_VERSION < 2,
                     'not all signatures supported for TF1')
    def testModelSignaturesDoFn(self, save_as_keras, signature_names,
                                default_signature_names, prefer_dict_outputs,
                                use_schema, expected_num_outputs):
        # Runs ModelSignaturesDoFn over one batch of three examples and
        # checks that every requested extract key appears in the single
        # output; with prefer_dict_outputs, each entry must also be a dict
        # holding expected_num_outputs items.
        export_path = self.createModelWithMultipleDenseInputs(save_as_keras)

        # signature_names maps extract key -> {model name -> signatures}.
        # One shared model and one ModelSpec is created per distinct model
        # name.
        eval_shared_models = {}
        model_specs = []
        for signatures_by_model in signature_names.values():
            for model_name in signatures_by_model:
                if model_name in eval_shared_models:
                    continue
                eval_shared_models[
                    model_name] = self.createTestEvalSharedModel(
                        eval_saved_model_path=export_path,
                        model_name=model_name,
                        tags=[tf.saved_model.SERVING])
                model_specs.append(config.ModelSpec(name=model_name))
        eval_config = config.EvalConfig(model_specs=model_specs)

        if use_schema:
            schema = self.createDenseInputsSchema()
        else:
            schema = None
        tfx_io = tf_example_record.TFExampleBeamRecord(
            physical_format='text',
            schema=schema,
            raw_record_column_name=constants.ARROW_INPUT_COLUMN)
        # A tensor adapter config is only supplied when a schema is in use.
        if use_schema:
            tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
                arrow_schema=tfx_io.ArrowSchema(),
                tensor_representations=tfx_io.TensorRepresentations())
        else:
            tensor_adapter_config = None

        examples = [
            self._makeExample(input_1=1.0, input_2=2.0),
            self._makeExample(input_1=3.0, input_2=4.0),
            self._makeExample(input_1=5.0, input_2=6.0),
        ]
        serialized_examples = [e.SerializeToString() for e in examples]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            signatures_do_fn = model_util.ModelSignaturesDoFn(
                eval_config=eval_config,
                eval_shared_models=eval_shared_models,
                signature_names=signature_names,
                default_signature_names=default_signature_names,
                prefer_dict_outputs=prefer_dict_outputs,
                tensor_adapter_config=tensor_adapter_config)
            result = (pipeline
                      | 'Create' >> beam.Create(serialized_examples)
                      | 'BatchExamples' >> tfx_io.BeamSource(batch_size=3)
                      | 'ToExtracts' >> beam.Map(_record_batch_to_extracts)
                      | 'ModelSignatures' >> beam.ParDo(signatures_do_fn))

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    extracts = got[0]
                    for key in signature_names:
                        self.assertIn(key, extracts)
                        if not prefer_dict_outputs:
                            continue
                        for entry in extracts[key]:
                            self.assertIsInstance(entry, dict)
                            self.assertLen(entry, expected_num_outputs)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')

    def testHasRubberStamp(self):
        # No models at all (model agnostic): never rubber stamped.
        self.assertFalse(model_util.has_rubber_stamp(None))

        # Every non-baseline model carries the rubber stamp.
        baseline_model = self.createTestEvalSharedModel(
            model_name=constants.BASELINE_KEY, is_baseline=True)
        stamped_candidate = self.createTestEvalSharedModel(
            model_name=constants.CANDIDATE_KEY, rubber_stamp=True)
        stamped_models = [baseline_model, stamped_candidate]
        self.assertTrue(model_util.has_rubber_stamp(stamped_models))

        # At least one non-baseline model lacks the rubber stamp.
        unstamped_candidate = self.createTestEvalSharedModel(
            model_name=constants.CANDIDATE_KEY)
        self.assertFalse(model_util.has_rubber_stamp([unstamped_candidate]))
        mixed_models = [baseline_model, stamped_candidate, unstamped_candidate]
        self.assertFalse(model_util.has_rubber_stamp(mixed_models))
 def testValidateMetricsMetricTDistributionChangeAndThreshold(
         self, slicing_specs, slice_key):
     # Validates t-distribution valued AUC metrics against an absolute
     # change threshold of -1 (lower is better). The diff metric is 0.1, so
     # validation is expected to fail and report that metric.
     threshold = config.MetricThreshold(
         change_threshold=config.GenericChangeThreshold(
             direction=config.MetricDirection.LOWER_IS_BETTER,
             absolute={'value': -1}))
     # The threshold is attached globally only when no slicing specs are
     # given; the per-slice threshold is always set.
     auc_config = config.MetricConfig(
         class_name='AUC',
         threshold=threshold if slicing_specs is None else None,
         per_slice_thresholds=[
             config.PerSliceMetricThreshold(slicing_specs=slicing_specs,
                                            threshold=threshold)
         ])
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
             config.ModelSpec(name='baseline', is_baseline=True)
         ],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(metrics=[auc_config], model_names=['']),
         ],
     )

     baseline_auc_key = metric_types.MetricKey(name='auc',
                                               model_name='baseline')
     diff_auc_key = metric_types.MetricKey(name='auc', is_diff=True)
     sliced_metrics = (slice_key, {
         baseline_auc_key:
         types.ValueWithTDistribution(sample_mean=0.91,
                                      unsampled_value=0.6),
         # This is the mean of the diff.
         diff_auc_key:
         types.ValueWithTDistribution(sample_mean=0.1,
                                      unsampled_value=0.1),
     })

     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertFalse(result.validation_ok)

     expected = text_format.Parse(
         """
     metric_validations_per_slice {
       failures {
         metric_key {
           name: "auc"
           is_diff: true
         }
         metric_value {
           double_value {
             value: 0.1
           }
         }
       }
     }""", validation_result_pb2.ValidationResult())
     slice_validation = expected.metric_validations_per_slice[0]
     slice_validation.failures[0].metric_threshold.CopyFrom(threshold)
     slice_validation.slice_key.CopyFrom(
         slicer.serialize_slice_key(slice_key))

     # One slicing_details entry is expected per spec that applies to the
     # slice (or a single default entry when no specs were given).
     for spec in slicing_specs or [None]:
         if spec is not None and not slicer.SingleSliceSpec(
                 spec=spec).is_slice_applicable(slice_key):
             continue
         slicing_details = expected.validation_details.slicing_details.add()
         if spec is None:
             slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
         else:
             slicing_details.slicing_spec.CopyFrom(spec)
         slicing_details.num_matching_slices = 1

     self.assertAlmostEqual(result, expected)
    def testPredictExtractorWithSequentialKerasModel(self):
        # Trains and exports a one-layer Sequential Keras model, then runs
        # the batched predict extractor over two examples and checks that
        # batched predictions are emitted.

        # Note that the input will be called 'test_input'
        dense_layer = tf.keras.layers.Dense(1,
                                            activation=tf.nn.sigmoid,
                                            input_shape=(2, ),
                                            name='test')
        model = tf.keras.models.Sequential([dense_layer])
        model.compile(optimizer=tf.keras.optimizers.Adam(lr=.001),
                      loss=tf.keras.losses.binary_crossentropy,
                      metrics=['accuracy'])

        train_features = {'test_input': [[0.0, 0.0], [1.0, 1.0]]}
        labels = [[1], [0]]
        example_weights = [1.0, 0.5]
        dataset = (tf.data.Dataset.from_tensor_slices(
            (train_features, labels, example_weights))
                   .shuffle(buffer_size=1)
                   .repeat()
                   .batch(2))
        model.fit(dataset, steps_per_epoch=1)

        export_dir = self._getExportDir()
        model.save(export_dir, save_format='tf')

        eval_config = config.EvalConfig(model_specs=[config.ModelSpec()])
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
        schema = text_format.Parse(
            """
        tensor_representation_group {
          key: ""
          value {
            tensor_representation {
              key: "test"
              value {
                dense_tensor {
                  column_name: "test"
                  shape { dim { size: 2 } }
                }
              }
            }
          }
        }
        feature {
          name: "test"
          type: FLOAT
        }
        feature {
          name: "non_model_feature"
          type: INT
        }
        """, schema_pb2.Schema())
        tfx_io = test_util.InMemoryTFExampleRecord(
            schema=schema, raw_record_column_name=constants.BATCHED_INPUT_KEY)
        tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema=tfx_io.ArrowSchema(),
            tensor_representations=tfx_io.TensorRepresentations())
        input_extractor = batched_input_extractor.BatchedInputExtractor(
            eval_config)
        predict_extractor = batched_predict_extractor_v2.BatchedPredictExtractor(
            eval_config=eval_config,
            eval_shared_model=eval_shared_model,
            tensor_adapter_config=tensor_adapter_config)

        # Notice that the features are 'test' but the model expects
        # 'test_input'. This tests that the PredictExtractor properly handles
        # this case. 'non_model_feature' should be ignored by the model.
        examples = [
            self._makeExample(test=[0.0, 0.0], non_model_feature=0),
            self._makeExample(test=[1.0, 1.0], non_model_feature=1),
        ]
        serialized_examples = [e.SerializeToString() for e in examples]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(serialized_examples,
                                          reshuffle=False)
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=2)
                | 'InputsToExtracts' >>
                model_eval_lib.BatchedInputsToExtracts()
                | input_extractor.stage_name >> input_extractor.ptransform
                | predict_extractor.stage_name >> predict_extractor.ptransform)

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    # The actual predictions can't be verified, but the keys
                    # can.
                    for extracts in got:
                        self.assertIn(constants.BATCHED_PREDICTIONS_KEY,
                                      extracts)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
Exemple #15
0
    def testTFlitePredictExtractorWithKerasModel(self, multi_model,
                                                 multi_output):
        # Builds a two-input Keras model (with one or two outputs), converts
        # it to TFLite, and runs the TFLite predict extractor for one or two
        # model specs over a batch of two examples.
        #
        # Args:
        #   multi_model: If true, evaluate two model specs ('model1' and
        #     'model2') that both point at the same TFLite model directory.
        #   multi_output: If true, the Keras model gets a second output
        #     layer ('output2').
        input1 = tf.keras.layers.Input(shape=(1, ), name='input1')
        input2 = tf.keras.layers.Input(shape=(1, ), name='input2')
        inputs = [input1, input2]
        input_layer = tf.keras.layers.concatenate(inputs)
        output_layers = {}
        output_layers['output1'] = (tf.keras.layers.Dense(
            1, activation=tf.nn.sigmoid, name='output1')(input_layer))
        if multi_output:
            output_layers['output2'] = (tf.keras.layers.Dense(
                1, activation=tf.nn.sigmoid, name='output2')(input_layer))

        model = tf.keras.models.Model(inputs, output_layers)
        model.compile(optimizer=tf.keras.optimizers.Adam(lr=.001),
                      loss=tf.keras.losses.binary_crossentropy,
                      metrics=['accuracy'])

        train_features = {'input1': [[0.0], [1.0]], 'input2': [[1.0], [0.0]]}
        labels = {'output1': [[1], [0]]}
        if multi_output:
            labels['output2'] = [[1], [0]]

        example_weights = {'output1': [1.0, 0.5]}
        if multi_output:
            example_weights['output2'] = [1.0, 0.5]
        dataset = tf.data.Dataset.from_tensor_slices(
            (train_features, labels, example_weights))
        dataset = dataset.shuffle(buffer_size=1).repeat().batch(2)
        model.fit(dataset, steps_per_epoch=1)

        converter = tf.compat.v2.lite.TFLiteConverter.from_keras_model(model)
        tflite_model = converter.convert()

        # The converted model is written as a file named 'tflite' inside a
        # fresh temporary directory, which is then used as the model path.
        tflite_model_dir = tempfile.mkdtemp()
        with tf.io.gfile.GFile(os.path.join(tflite_model_dir, 'tflite'),
                               'wb') as f:
            f.write(tflite_model)

        model_specs = [config.ModelSpec(name='model1', model_type='tf_lite')]
        if multi_model:
            model_specs.append(
                config.ModelSpec(name='model2', model_type='tf_lite'))

        eval_config = config.EvalConfig(model_specs=model_specs)
        eval_shared_models = [
            self.createTestEvalSharedModel(
                model_name='model1',
                eval_saved_model_path=tflite_model_dir,
                model_type='tf_lite')
        ]
        if multi_model:
            eval_shared_models.append(
                self.createTestEvalSharedModel(
                    model_name='model2',
                    eval_saved_model_path=tflite_model_dir,
                    model_type='tf_lite'))

        schema = text_format.Parse(
            """
        feature {
          name: "input1"
          type: FLOAT
        }
        feature {
          name: "input2"
          type: FLOAT
        }
        feature {
          name: "non_model_feature"
          type: INT
        }
        """, schema_pb2.Schema())
        tfx_io = test_util.InMemoryTFExampleRecord(
            schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN)
        feature_extractor = features_extractor.FeaturesExtractor(eval_config)
        predictor = tflite_predict_extractor.TFLitePredictExtractor(
            eval_config=eval_config, eval_shared_model=eval_shared_models)

        examples = [
            self._makeExample(input1=0.0, input2=1.0, non_model_feature=0),
            self._makeExample(input1=1.0, input2=0.0, non_model_feature=1),
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(
                    [e.SerializeToString() for e in examples], reshuffle=False)
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=2)
                |
                'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | feature_extractor.stage_name >> feature_extractor.ptransform
                | predictor.stage_name >> predictor.ptransform)

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    # One batch in, one extracts dict out, holding one
                    # prediction entry per example.
                    self.assertLen(got, 1)
                    got = got[0]
                    self.assertIn(constants.PREDICTIONS_KEY, got)
                    self.assertLen(got[constants.PREDICTIONS_KEY], 2)

                    for item in got[constants.PREDICTIONS_KEY]:
                        if multi_model:
                            self.assertIn('model1', item)
                            self.assertIn('model2', item)
                            if multi_output:
                                self.assertIn('Identity', item['model1'])
                                self.assertIn('Identity_1', item['model1'])

                        elif multi_output:
                            self.assertIn('Identity', item)
                            self.assertIn('Identity_1', item)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            # Must be registered at pipeline scope (not inside check_result),
            # otherwise the matcher is never attached and nothing is checked.
            util.assert_that(result, check_result, label='result')
    def testPredictExtractorWithMultiModels(self):
        """Runs the batched predict extractor over two named models.

        Two models are built with multi_head.simple_multi_head and registered
        as 'model1' and 'model2'. A single batch of four examples is passed
        through the batched input and predict extractors, and every prediction
        is checked for the expected per-head output keys under both model
        names. The prediction values themselves are not verified.
        """
        temp_export_dir = self._getExportDir()
        export_dir1, _ = multi_head.simple_multi_head(temp_export_dir, None)
        export_dir2, _ = multi_head.simple_multi_head(temp_export_dir, None)

        eval_config = config.EvalConfig(model_specs=[
            config.ModelSpec(name='model1'),
            config.ModelSpec(name='model2')
        ])
        eval_shared_model1 = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir1, tags=[tf.saved_model.SERVING])
        eval_shared_model2 = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir2, tags=[tf.saved_model.SERVING])
        # Feature names must match the keyword arguments passed to
        # _makeExample below (the schema previously misspelled "language").
        schema = text_format.Parse(
            """
        feature {
          name: "age"
          type: FLOAT
        }
        feature {
          name: "language"
          type: BYTES
        }
        feature {
          name: "english_label"
          type: FLOAT
        }
        feature {
          name: "chinese_label"
          type: FLOAT
        }
        feature {
          name: "other_label"
          type: FLOAT
        }
        """, schema_pb2.Schema())
        tfx_io = test_util.InMemoryTFExampleRecord(
            schema=schema, raw_record_column_name=constants.BATCHED_INPUT_KEY)
        tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema=tfx_io.ArrowSchema(),
            tensor_representations=tfx_io.TensorRepresentations())
        input_extractor = batched_input_extractor.BatchedInputExtractor(
            eval_config)
        # The shared models are keyed by the names used in the ModelSpecs.
        predict_extractor = batched_predict_extractor_v2.BatchedPredictExtractor(
            eval_config=eval_config,
            eval_shared_model={
                'model1': eval_shared_model1,
                'model2': eval_shared_model2
            },
            tensor_adapter_config=tensor_adapter_config)

        examples = [
            self._makeExample(age=1.0,
                              language='english',
                              english_label=1.0,
                              chinese_label=0.0,
                              other_label=0.0),
            self._makeExample(age=1.0,
                              language='chinese',
                              english_label=0.0,
                              chinese_label=1.0,
                              other_label=0.0),
            self._makeExample(age=2.0,
                              language='english',
                              english_label=1.0,
                              chinese_label=0.0,
                              other_label=0.0),
            self._makeExample(age=2.0,
                              language='other',
                              english_label=0.0,
                              chinese_label=1.0,
                              other_label=1.0)
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(
                    [e.SerializeToString() for e in examples], reshuffle=False)
                # batch_size=4 keeps all four examples in one record batch.
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=4)
                |
                'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | input_extractor.stage_name >> input_extractor.ptransform
                | predict_extractor.stage_name >> predict_extractor.ptransform)

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                """Asserts one extract whose predictions hold all output keys."""
                try:
                    self.assertLen(got, 1)
                    for item in got:
                        # We can't verify the actual predictions, but we can verify the keys
                        self.assertIn(constants.BATCHED_PREDICTIONS_KEY, item)
                        for pred in item[constants.BATCHED_PREDICTIONS_KEY]:
                            for model_name in ('model1', 'model2'):
                                self.assertIn(model_name, pred)
                                for output_name in ('chinese_head',
                                                    'english_head',
                                                    'other_head'):
                                    for pred_key in ('logistic',
                                                     'probabilities',
                                                     'all_classes'):
                                        self.assertIn(
                                            output_name + '/' + pred_key,
                                            pred[model_name])

                except AssertionError as err:
                    # Beam's assert_that expects failures as BeamAssertException.
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
    def testPredictExtractorWithRegressionModel(self):
        """Checks batched predictions for a fixed-prediction model.

        Three examples are sent through the batched input and predict
        extractors as a single batch; the batched predictions are then compared
        with the 'prediction' values the examples were built from.
        """
        export_root = self._getExportDir()
        export_dir, _ = (fixed_prediction_estimator_extra_fields.
                         simple_fixed_prediction_estimator_extra_fields(
                             export_root, None))

        eval_config = config.EvalConfig(model_specs=[config.ModelSpec()])
        shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

        schema_text = """
        feature {
          name: "prediction"
          type: FLOAT
        }
        feature {
          name: "label"
          type: FLOAT
        }
        feature {
          name: "fixed_int"
          type: INT
        }
        feature {
          name: "fixed_float"
          type: FLOAT
        }
        feature {
          name: "fixed_string"
          type: BYTES
        }
        """
        schema = text_format.Parse(schema_text, schema_pb2.Schema())
        record_source = test_util.InMemoryTFExampleRecord(
            schema=schema, raw_record_column_name=constants.BATCHED_INPUT_KEY)
        adapter_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema=record_source.ArrowSchema(),
            tensor_representations=record_source.TensorRepresentations())
        inputs_stage = batched_input_extractor.BatchedInputExtractor(
            eval_config)
        predict_stage = batched_predict_extractor_v2.BatchedPredictExtractor(
            eval_config=eval_config,
            eval_shared_model=shared_model,
            tensor_adapter_config=adapter_config)

        first_example = self._makeExample(prediction=0.2,
                                          label=1.0,
                                          fixed_int=1,
                                          fixed_float=1.0,
                                          fixed_string='fixed_string1')
        second_example = self._makeExample(prediction=0.8,
                                           label=0.0,
                                           fixed_int=1,
                                           fixed_float=1.0,
                                           fixed_string='fixed_string2')
        third_example = self._makeExample(prediction=0.5,
                                          label=0.0,
                                          fixed_int=2,
                                          fixed_float=1.0,
                                          fixed_string='fixed_string3')
        serialized_examples = [
            example.SerializeToString()
            for example in (first_example, second_example, third_example)
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            raw_records = pipeline | 'Create' >> beam.Create(
                serialized_examples, reshuffle=False)
            # batch_size=3 keeps all three examples in one record batch.
            record_batches = (
                raw_records
                | 'BatchExamples' >> record_source.BeamSource(batch_size=3))
            extracts = (
                record_batches
                |
                'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts())
            result = (
                extracts
                | inputs_stage.stage_name >> inputs_stage.ptransform
                | predict_stage.stage_name >> predict_stage.ptransform)
            # pylint: enable=no-value-for-parameter

            def check_result(outputs):
                """Asserts a single extract holding the expected predictions."""
                try:
                    self.assertLen(outputs, 1)
                    batch_extracts = outputs[0]
                    self.assertIn(constants.BATCHED_PREDICTIONS_KEY,
                                  batch_extracts)
                    self.assertAlmostEqual(
                        batch_extracts[constants.BATCHED_PREDICTIONS_KEY],
                        [0.2, 0.8, 0.5])
                except AssertionError as err:
                    # Beam's assert_that expects failures as BeamAssertException.
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
    def testPreprocessedFeaturesExtractor(self, save_as_keras,
                                          preprocessing_function_names,
                                          expected_extract_keys):
        """Checks the feature keys emitted by the transformed-features stage.

        Args:
          save_as_keras: Forwarded to createModelWithMultipleDenseInputs.
          preprocessing_function_names: Preprocessing function names placed in
            the ModelSpec.
          expected_extract_keys: Mapping from extracts key to the feature keys
            every entry under that key must have.
        """
        export_path = self.createModelWithMultipleDenseInputs(save_as_keras)

        model_spec = config.ModelSpec(
            preprocessing_function_names=preprocessing_function_names)
        eval_config = config.EvalConfig(model_specs=[model_spec])
        shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_path, tags=[tf.saved_model.SERVING])
        record_source = test_util.InMemoryTFExampleRecord(
            schema=self.createDenseInputsSchema(),
            raw_record_column_name=constants.ARROW_INPUT_COLUMN)
        adapter_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema=record_source.ArrowSchema(),
            tensor_representations=record_source.TensorRepresentations())
        features_stage = features_extractor.FeaturesExtractor(eval_config)
        transform_stage = (
            transformed_features_extractor.TransformedFeaturesExtractor(
                eval_config=eval_config,
                eval_shared_model=shared_model,
                tensor_adapter_config=adapter_config))

        input_examples = [
            self._makeExample(input_1=1.0, input_2=2.0),
            self._makeExample(input_1=3.0, input_2=4.0),
            self._makeExample(input_1=5.0, input_2=6.0),
        ]
        serialized_examples = [
            example.SerializeToString() for example in input_examples
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            raw_records = pipeline | 'Create' >> beam.Create(
                serialized_examples, reshuffle=False)
            # Three examples at batch_size=2 are expected to give two extracts.
            record_batches = (
                raw_records
                | 'BatchExamples' >> record_source.BeamSource(batch_size=2))
            extracts = (
                record_batches
                |
                'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts())
            result = (
                extracts
                | features_stage.stage_name >> features_stage.ptransform
                | transform_stage.stage_name >> transform_stage.ptransform)
            # pylint: enable=no-value-for-parameter

            def check_result(outputs):
                """Asserts each extract exposes exactly the expected keys."""
                try:
                    self.assertLen(outputs, 2)
                    for extract in outputs:
                        for key, wanted in expected_extract_keys.items():
                            self.assertIn(key, extract)
                            for features in extract[key]:
                                self.assertEqual(
                                    set(wanted),
                                    set(features.keys()),
                                    msg='got={}'.format(extract))
                except AssertionError as err:
                    # Beam's assert_that expects failures as BeamAssertException.
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
    def testBatchSizeLimit(self):
        """Checks prediction when each record batch holds a single example.

        Four identical examples are read with batch_size=1, so four extracts
        are expected, each carrying the batched predictions key. The prediction
        values are not checked.
        """
        export_root = self._getExportDir()
        _, export_dir = batch_size_limited_classifier.simple_batch_size_limited_classifier(
            None, export_root)
        shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
        eval_config = config.EvalConfig(model_specs=[config.ModelSpec()])

        schema_text = """
        feature {
          name: "classes"
          type: BYTES
        }
        feature {
          name: "scores"
          type: FLOAT
        }
        feature {
          name: "labels"
          type: BYTES
        }
        """
        schema = text_format.Parse(schema_text, schema_pb2.Schema())
        record_source = test_util.InMemoryTFExampleRecord(
            schema=schema, raw_record_column_name=constants.BATCHED_INPUT_KEY)
        adapter_config = tensor_adapter.TensorAdapterConfig(
            arrow_schema=record_source.ArrowSchema(),
            tensor_representations=record_source.TensorRepresentations())
        inputs_stage = batched_input_extractor.BatchedInputExtractor(
            eval_config)
        predict_stage = batched_predict_extractor_v2.BatchedPredictExtractor(
            eval_config=eval_config,
            eval_shared_model=shared_model,
            tensor_adapter_config=adapter_config)

        input_examples = [
            self._makeExample(classes='first', scores=0.0, labels='third')
            for _ in range(4)
        ]
        serialized_examples = [
            example.SerializeToString() for example in input_examples
        ]

        with beam.Pipeline() as pipeline:
            raw_records = pipeline | 'Create' >> beam.Create(
                serialized_examples, reshuffle=False)
            # One example per record batch.
            record_batches = (
                raw_records
                | 'BatchExamples' >> record_source.BeamSource(batch_size=1))
            extracts = (
                record_batches
                |
                'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts())
            predict_extracts = (
                extracts
                | inputs_stage.stage_name >> inputs_stage.ptransform
                | predict_stage.stage_name >> predict_stage.ptransform)

            def check_result(outputs):
                """Asserts four extracts, each with a predictions entry."""
                try:
                    self.assertLen(outputs, 4)
                    # We can't verify the actual predictions, but we can verify the keys.
                    for extract in outputs:
                        self.assertIn(constants.BATCHED_PREDICTIONS_KEY,
                                      extract)
                except AssertionError as err:
                    # Beam's assert_that expects failures as BeamAssertException.
                    raise util.BeamAssertException(err)

            util.assert_that(predict_extracts, check_result, label='result')
Exemple #20
0
 def testRunModelAnalysisExtraFieldsPlusFeatureExtraction(self):
     """Runs run_model_analysis end to end with custom extractors.

     The data is sliced on the 'my_slice' feature and weighted by 'age'. The
     fifth example carries no 'my_slice' feature, and only slices 'a', 'b' and
     'c' appear in the expected metrics.
     """
     model_location = self._exportEvalSavedModel(
         linear_classifier.simple_linear_classifier)
     data_location = self._writeTFExamplesToTFRecords([
         self._makeExample(age=3.0,
                           language='english',
                           label=1.0,
                           my_slice='a'),
         self._makeExample(age=3.0,
                           language='chinese',
                           label=0.0,
                           my_slice='a'),
         self._makeExample(age=4.0,
                           language='english',
                           label=1.0,
                           my_slice='b'),
         self._makeExample(age=5.0,
                           language='chinese',
                           label=1.0,
                           my_slice='c'),
         self._makeExample(age=5.0, language='hindi', label=1.0),
     ])

     slicing_specs = [config.SlicingSpec(feature_keys=['my_slice'])]
     output_spec = config.OutputDataSpec(
         default_location=self._getTempDir())
     eval_config = config.EvalConfig(
         input_data_specs=[config.InputDataSpec(location=data_location)],
         model_specs=[config.ModelSpec(location=model_location)],
         output_data_specs=[output_spec],
         slicing_specs=slicing_specs)

     # Shared model handed to the predict extractor.
     eval_shared_model = model_eval_lib.default_eval_shared_model(
         eval_saved_model_path=model_location, example_weight_key='age')
     slice_spec = [slicer.SingleSliceSpec(spec=slicing_specs[0])]
     extractors_with_feature_extraction = [
         predict_extractor.PredictExtractor(eval_shared_model,
                                            desired_batch_size=3,
                                            materialize=False),
         feature_extractor.FeatureExtractor(
             extract_source=constants.INPUT_KEY,
             extract_dest=constants.FEATURES_PREDICTIONS_LABELS_KEY),
         slice_key_extractor.SliceKeyExtractor(slice_spec,
                                               materialize=False),
     ]

     # A second shared model with the same settings is passed to the analysis.
     analysis_shared_model = model_eval_lib.default_eval_shared_model(
         eval_saved_model_path=model_location, example_weight_key='age')
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config,
         eval_shared_models=[analysis_shared_model],
         extractors=extractors_with_feature_extraction)

     def slice_metrics(accuracy, mean_label, weight, count):
         """Builds the expected metrics entry for one slice."""
         return {
             'accuracy': {
                 'doubleValue': accuracy
             },
             'my_mean_label': {
                 'doubleValue': mean_label
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': weight
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': count
             },
         }

     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     expected = {
         (('my_slice', 'a'), ): slice_metrics(1.0, 0.5, 6.0, 2.0),
         (('my_slice', 'b'), ): slice_metrics(1.0, 1.0, 4.0, 1.0),
         (('my_slice', 'c'), ): slice_metrics(0.0, 1.0, 5.0, 1.0),
     }

     result_config = eval_result.config
     self.assertEqual(result_config.model_specs[0].location,
                      model_location.decode())
     self.assertEqual(result_config.input_data_specs[0].location,
                      data_location)
     self.assertEqual(result_config.slicing_specs[0],
                      config.SlicingSpec(feature_keys=['my_slice']))
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
     self.assertFalse(eval_result.plots)
 def testValidateMetricsMetricValueAndThreshold(self, slicing_specs,
                                                slice_key):
     """Checks the failure details reported for a violated upper bound.

     A weighted example count of 1.5 is validated against an upper bound of 1.
     The result must be not-ok and must carry the metric key, value, threshold,
     slice key and matching slicing details.

     Args:
       slicing_specs: Slicing specs for the eval config, or None.
       slice_key: Slice key of the metrics being validated.
     """
     threshold = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             upper_bound={'value': 1}))
     # 1.5 < 1, NOT OK.
     metric_config = config.MetricConfig(
         class_name='WeightedExampleCount',
         threshold=threshold if slicing_specs is None else None,
         per_slice_thresholds=[
             config.PerSliceMetricThreshold(slicing_specs=slicing_specs,
                                            threshold=threshold)
         ])
     metrics_spec = config.MetricsSpec(metrics=[metric_config],
                                       model_names=[''])
     eval_config = config.EvalConfig(
         model_specs=[config.ModelSpec()],
         slicing_specs=slicing_specs,
         metrics_specs=[metrics_spec],
     )

     metric_key = metric_types.MetricKey(name='weighted_example_count')
     sliced_metrics = (slice_key, {metric_key: 1.5})
     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertFalse(result.validation_ok)

     expected_text = """
     metric_validations_per_slice {
       failures {
         metric_key {
           name: "weighted_example_count"
         }
         metric_value {
           double_value {
             value: 1.5
           }
         }
       }
     }"""
     expected = text_format.Parse(expected_text,
                                  validation_result_pb2.ValidationResult())
     slice_validation = expected.metric_validations_per_slice[0]
     slice_validation.failures[0].metric_threshold.CopyFrom(threshold)
     slice_validation.slice_key.CopyFrom(
         slicer.serialize_slice_key(slice_key))

     # One slicing_details entry per spec that applies to slice_key; with no
     # specs, a single entry with an empty SlicingSpec is expected.
     for spec in slicing_specs or [None]:
         if spec is not None and not slicer.SingleSliceSpec(
                 spec=spec).is_slice_applicable(slice_key):
             continue
         details = expected.validation_details.slicing_details.add()
         details.slicing_spec.CopyFrom(
             spec if spec is not None else config.SlicingSpec())
         details.num_matching_slices = 1
     self.assertEqual(result, expected)
Exemple #22
0
    def testRunModelAnalysisWithKerasModel(self):
        """Runs run_model_analysis on a small saved Keras classifier.

        A single-layer softmax model is fit for one step, saved in TF format
        and evaluated with an AUC metric binarized on class ids 0, 5 and 9.
        Only the presence of the per-class 'auc' entries is checked, not their
        values.
        """
        input_layer = tf.keras.layers.Input(shape=(28 * 28, ), name='data')
        output_layer = tf.keras.layers.Dense(
            10, activation=tf.nn.softmax)(input_layer)
        model = tf.keras.models.Model(input_layer, output_layer)
        # `learning_rate` is the supported keyword; the `lr` alias is
        # deprecated and rejected by newer Keras releases.
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=.001),
                      loss=tf.keras.losses.categorical_crossentropy)

        features = {'data': [[0.0] * 28 * 28]}
        labels = [[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]]
        example_weights = [1.0]
        dataset = tf.data.Dataset.from_tensor_slices(
            (features, labels, example_weights))
        dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
        # The dataset repeats indefinitely, so bound the fit to one step.
        model.fit(dataset, steps_per_epoch=1)

        model_location = os.path.join(self._getTempDir(), 'export_dir')
        model.save(model_location, save_format='tf')

        examples = [
            self._makeExample(data=[0.0] * 28 * 28, label=1.0),
            self._makeExample(data=[1.0] * 28 * 28, label=5.0),
            self._makeExample(data=[1.0] * 28 * 28, label=9.0),
        ]
        data_location = self._writeTFExamplesToTFRecords(examples)
        metrics_spec = config.MetricsSpec()
        # Each Keras metric is serialized into a MetricConfig entry.
        for metric in (tf.keras.metrics.AUC(), ):
            cfg = tf.keras.utils.serialize_keras_object(metric)
            metrics_spec.metrics.append(
                config.MetricConfig(class_name=cfg['class_name'],
                                    config=json.dumps(cfg['config'])))
        for class_id in (0, 5, 9):
            metrics_spec.binarize.class_ids.append(class_id)
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec(location=data_location)],
            model_specs=[
                config.ModelSpec(location=model_location, label_key='label')
            ],
            output_data_specs=[
                config.OutputDataSpec(default_location=self._getTempDir())
            ],
            metrics_specs=[metrics_spec])
        eval_result = model_eval_lib.run_model_analysis(
            eval_config=eval_config,
            eval_shared_models=[
                model_eval_lib.default_eval_shared_model(
                    eval_saved_model_path=model_location,
                    tags=[tf.saved_model.SERVING])
            ])
        self.assertEqual(eval_result.config.model_specs[0].location,
                         model_location)
        self.assertEqual(eval_result.config.input_data_specs[0].location,
                         data_location)
        # No slicing specs were configured; a single result with the empty
        # slice key is expected.
        self.assertLen(eval_result.slicing_metrics, 1)
        got_slice_key, got_metrics = eval_result.slicing_metrics[0]
        self.assertEqual(got_slice_key, ())
        self.assertIn('', got_metrics)  # output_name
        got_metrics = got_metrics['']
        # Only the keys are checked; the True values are placeholders.
        expected_metrics = {
            'classId:0': {
                'auc': True,
            },
            'classId:5': {
                'auc': True,
            },
            'classId:9': {
                'auc': True,
            },
        }
        for class_id in expected_metrics:
            self.assertIn(class_id, got_metrics)
            for k in expected_metrics[class_id]:
                self.assertIn(k, got_metrics[class_id])
    def testModelSignaturesDoFn(self, save_as_keras, signature_names,
                                default_signature_names, prefer_dict_outputs,
                                use_schema, expected_num_outputs):
        """Checks the extracts produced by model_util.ModelSignaturesDoFn.

        Args:
          save_as_keras: Forwarded to createModelWithMultipleDenseInputs.
          signature_names: Mapping from extracts key to a per-model collection
            of signature names; its model names define the models under test.
          default_signature_names: Forwarded to ModelSignaturesDoFn.
          prefer_dict_outputs: Forwarded to ModelSignaturesDoFn; when true,
            every output entry must be a dict.
          use_schema: Whether a schema and tensor adapter config are supplied.
          expected_num_outputs: Expected size of each dict output entry.
        """
        export_path = self.createModelWithMultipleDenseInputs(save_as_keras)

        # One shared model and one ModelSpec per distinct model name.
        shared_models_by_name = {}
        model_specs = []
        for per_model_signatures in signature_names.values():
            for name in per_model_signatures:
                if name in shared_models_by_name:
                    continue
                shared_models_by_name[name] = self.createTestEvalSharedModel(
                    eval_saved_model_path=export_path,
                    model_name=name,
                    tags=[tf.saved_model.SERVING])
                model_specs.append(config.ModelSpec(name=name))
        eval_config = config.EvalConfig(model_specs=model_specs)

        if use_schema:
            schema = self.createDenseInputsSchema()
        else:
            schema = None
        record_source = tf_example_record.TFExampleBeamRecord(
            physical_format='text',
            schema=schema,
            raw_record_column_name=constants.ARROW_INPUT_COLUMN)
        if use_schema:
            adapter_config = tensor_adapter.TensorAdapterConfig(
                arrow_schema=record_source.ArrowSchema(),
                tensor_representations=record_source.TensorRepresentations())
        else:
            adapter_config = None

        input_examples = [
            self._makeExample(input_1=1.0, input_2=2.0),
            self._makeExample(input_1=3.0, input_2=4.0),
            self._makeExample(input_1=5.0, input_2=6.0),
        ]
        serialized_examples = [
            example.SerializeToString() for example in input_examples
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            raw_records = pipeline | 'Create' >> beam.Create(
                serialized_examples)
            # batch_size=3 keeps all three examples in one record batch.
            record_batches = (
                raw_records
                | 'BatchExamples' >> record_source.BeamSource(batch_size=3))
            extracts = (
                record_batches
                | 'ToExtracts' >> beam.Map(_record_batch_to_extracts))
            signatures_fn = model_util.ModelSignaturesDoFn(
                eval_config=eval_config,
                eval_shared_models=shared_models_by_name,
                signature_names=signature_names,
                default_signature_names=default_signature_names,
                prefer_dict_outputs=prefer_dict_outputs,
                tensor_adapter_config=adapter_config)
            result = extracts | 'ModelSignatures' >> beam.ParDo(signatures_fn)
            # pylint: enable=no-value-for-parameter

            def check_result(outputs):
                """Asserts one extract with an entry per signature key."""
                try:
                    self.assertLen(outputs, 1)
                    batch_extracts = outputs[0]
                    for key in signature_names:
                        self.assertIn(key, batch_extracts)
                        if not prefer_dict_outputs:
                            continue
                        for entry in batch_extracts[key]:
                            self.assertIsInstance(entry, dict)
                            self.assertLen(entry, expected_num_outputs)
                except AssertionError as err:
                    # Beam's assert_that expects failures as BeamAssertException.
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
Exemple #24
0
    def testRunModelAnalysisWithQueryBasedMetrics(self):
        """Runs run_model_analysis with a query-based NDCG metric.

        A single-unit sigmoid Keras model over 'age' is fit for one step,
        saved in TF format and evaluated with NDCG (gain key 'age', top_k=1)
        using 'language' as the query key. Only the presence of the expected
        metric keys is checked, not their values.
        """
        input_layer = tf.keras.layers.Input(shape=(1, ), name='age')
        output_layer = tf.keras.layers.Dense(
            1, activation=tf.nn.sigmoid)(input_layer)
        model = tf.keras.models.Model(input_layer, output_layer)
        # `learning_rate` is the supported keyword; the `lr` alias is
        # deprecated and rejected by newer Keras releases.
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=.001),
                      loss=tf.keras.losses.binary_crossentropy)

        features = {'age': [[20.0]]}
        labels = [[1]]
        example_weights = [1.0]
        dataset = tf.data.Dataset.from_tensor_slices(
            (features, labels, example_weights))
        dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
        # The dataset repeats indefinitely, so bound the fit to one step.
        model.fit(dataset, steps_per_epoch=1)

        model_location = os.path.join(self._getTempDir(), 'export_dir')
        model.save(model_location, save_format='tf')

        examples = [
            self._makeExample(age=3.0, language='english', label=1.0),
            self._makeExample(age=5.0, language='chinese', label=0.0),
            self._makeExample(age=3.0, language='english', label=0.0),
            self._makeExample(age=5.0, language='chinese', label=1.0)
        ]
        data_location = self._writeTFExamplesToTFRecords(examples)
        slicing_specs = [config.SlicingSpec()]
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec(location=data_location)],
            model_specs=[
                config.ModelSpec(location=model_location, label_key='label')
            ],
            output_data_specs=[
                config.OutputDataSpec(default_location=self._getTempDir())
            ],
            slicing_specs=slicing_specs,
            metrics_specs=metric_specs.specs_from_metrics(
                [ndcg.NDCG(gain_key='age', name='ndcg')],
                binarize=config.BinarizationOptions(top_k_list=[1]),
                query_key='language'))
        # The same shared model is used by the analysis and the evaluator.
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=model_location,
            tags=[tf.saved_model.SERVING])
        eval_result = model_eval_lib.run_model_analysis(
            eval_config=eval_config,
            eval_shared_models=[eval_shared_model],
            evaluators=[
                metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                    eval_config=eval_config,
                    eval_shared_models=[eval_shared_model])
            ])

        self.assertEqual(eval_result.config.model_specs[0].location,
                         model_location)
        self.assertEqual(eval_result.config.input_data_specs[0].location,
                         data_location)
        # The only slicing spec is the empty one, so a single result with the
        # empty slice key is expected.
        self.assertLen(eval_result.slicing_metrics, 1)
        got_slice_key, got_metrics = eval_result.slicing_metrics[0]
        self.assertEqual(got_slice_key, ())
        self.assertIn('', got_metrics)  # output_name
        got_metrics = got_metrics['']
        # Only the keys are checked; the True values are placeholders.
        expected_metrics = {
            '': {
                'example_count': True,
                'weighted_example_count': True,
            },
            'topK:1': {
                'ndcg': True,
            },
        }
        for group in expected_metrics:
            self.assertIn(group, got_metrics)
            for k in expected_metrics[group]:
                self.assertIn(k, got_metrics[group])
  def testWriteValidationResults(self):
    """Checks that failed threshold validations are written and reloadable.

    Runs a 'candidate' and a 'baseline' model over two examples with
    thresholds configured so that WeightedExampleCount, ExampleCount and
    MeanLabel fail while MeanPrediction passes, then loads the validation
    result back from the output directory and compares the recorded failures.
    """
    model_dir = self._getExportDir()
    baseline_dir = self._getBaselineDir()
    eval_shared_model = self._build_keras_model(model_dir, mul=0)
    baseline_eval_shared_model = self._build_keras_model(baseline_dir, mul=1)
    validations_file = os.path.join(self._getTempDir(),
                                    constants.VALIDATIONS_KEY)

    # (input, label, example_weight) for each example.
    example_values = [(0.0, 1.0, 1.0), (1.0, 0.0, 0.5)]
    examples = [
        self._makeExample(
            input=input_value,
            label=label_value,
            example_weight=weight,
            extra_feature='non_model_feature')
        for input_value, label_value, weight in example_values
    ]

    # 1.5 < 1, NOT OK.
    weighted_example_count_threshold = config.MetricThreshold(
        value_threshold=config.GenericValueThreshold(
            upper_bound={'value': 1}))
    # 2 > 10, NOT OK.
    example_count_threshold = config.MetricThreshold(
        value_threshold=config.GenericValueThreshold(
            lower_bound={'value': 10}))
    # 0 > 0 and 0 > 0%?: NOT OK.
    mean_label_threshold = config.MetricThreshold(
        change_threshold=config.GenericChangeThreshold(
            direction=config.MetricDirection.HIGHER_IS_BETTER,
            relative={'value': 0},
            absolute={'value': 0}))
    # MeanPrediction = (0+0)/(1+0.5) = 0
    # -.01 < 0 < .01, OK.
    # Diff% = -.333/.333 = -100% < -99%, OK.
    # Diff = 0 - .333 = -.333 < 0, OK.
    mean_prediction_threshold = config.MetricThreshold(
        value_threshold=config.GenericValueThreshold(
            upper_bound={'value': .01}, lower_bound={'value': -.01}),
        change_threshold=config.GenericChangeThreshold(
            direction=config.MetricDirection.LOWER_IS_BETTER,
            relative={'value': -.99},
            absolute={'value': 0}))

    thresholds_by_metric = [
        ('WeightedExampleCount', weighted_example_count_threshold),
        ('ExampleCount', example_count_threshold),
        ('MeanLabel', mean_label_threshold),
        ('MeanPrediction', mean_prediction_threshold),
    ]
    metric_configs = [
        config.MetricConfig(class_name=metric_name, threshold=metric_threshold)
        for metric_name, metric_threshold in thresholds_by_metric
    ]

    candidate_spec = config.ModelSpec(
        name='candidate',
        label_key='label',
        example_weight_key='example_weight')
    baseline_spec = config.ModelSpec(
        name='baseline',
        label_key='label',
        example_weight_key='example_weight',
        is_baseline=True)

    eval_config = config.EvalConfig(
        model_specs=[candidate_spec, baseline_spec],
        slicing_specs=[config.SlicingSpec()],
        metrics_specs=[
            config.MetricsSpec(
                metrics=metric_configs,
                model_names=['candidate', 'baseline']),
        ],
        options=config.Options(
            disabled_outputs={'values': ['eval_config.json']}),
    )
    slice_spec = [
        slicer.SingleSliceSpec(spec=spec) for spec in eval_config.slicing_specs
    ]
    eval_shared_models = {
        'candidate': eval_shared_model,
        'baseline': baseline_eval_shared_model
    }
    extractors = [
        input_extractor.InputExtractor(eval_config),
        predict_extractor_v2.PredictExtractor(
            eval_shared_model=eval_shared_models, eval_config=eval_config),
        slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
    ]
    evaluators = [
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=eval_config, eval_shared_model=eval_shared_models)
    ]
    # Only the validations output is written by this test.
    output_paths = {constants.VALIDATIONS_KEY: validations_file}
    writers = [
        metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
            output_paths, add_metrics_callbacks=[])
    ]

    serialized_examples = [
        example.SerializeToString() for example in examples
    ]
    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      _ = (
          pipeline
          | 'Create' >> beam.Create(serialized_examples)
          | 'ExtractEvaluateAndWriteResults' >>
          model_eval_lib.ExtractEvaluateAndWriteResults(
              eval_config=eval_config,
              eval_shared_model=eval_shared_model,
              extractors=extractors,
              evaluators=evaluators,
              writers=writers))
      # pylint: enable=no-value-for-parameter

    validation_result = model_eval_lib.load_validation_result(
        os.path.dirname(validations_file))

    def parse_failure(pbtxt):
      """Parses a text-format ValidationFailure proto."""
      return text_format.Parse(pbtxt,
                               validation_result_pb2.ValidationFailure())

    weighted_example_count_failure = parse_failure(
        """
            metric_key {
              name: "weighted_example_count"
              model_name: "candidate"
            }
            metric_threshold {
              value_threshold {
                upper_bound {
                  value: 1.0
                }
              }
            }
            metric_value {
              double_value {
                value: 1.5
              }
            }
            """)
    example_count_failure = parse_failure(
        """
            metric_key {
              name: "example_count"
            }
            metric_threshold {
              value_threshold {
                lower_bound {
                  value: 10.0
                }
              }
            }
            metric_value {
              double_value {
                value: 2.0
              }
            }
            """)
    mean_label_failure = parse_failure(
        """
            metric_key {
              name: "mean_label"
              model_name: "candidate"
              is_diff: true
            }
            metric_threshold {
              change_threshold {
                absolute {
                  value: 0.0
                }
                relative {
                  value: 0.0
                }
                direction: HIGHER_IS_BETTER
              }
            }
            metric_value {
              double_value {
                value: 0.0
              }
            }
            """)
    expected_validations = [
        weighted_example_count_failure,
        example_count_failure,
        mean_label_failure,
    ]

    self.assertFalse(validation_result.validation_ok)
    validations_per_slice = validation_result.metric_validations_per_slice
    self.assertLen(validations_per_slice, 1)
    self.assertCountEqual(expected_validations,
                          validations_per_slice[0].failures)
Exemple #26
0
 def testRunModelAnalysisWithLegacyQueryExtractor(self):
     """Runs model analysis end to end with the legacy query-based evaluator.

     Four examples in two 'language' query groups are evaluated with both the
     legacy metrics-and-plots evaluator and the query-based metrics evaluator,
     and a subset of the resulting metrics is checked.
     """
     model_location = self._exportEvalSavedModel(
         linear_classifier.simple_linear_classifier)
     # (age, language, label) for each example.
     example_values = [
         (3.0, 'english', 1.0),
         (3.0, 'chinese', 0.0),
         (4.0, 'english', 0.0),
         (5.0, 'chinese', 1.0),
     ]
     examples = [
         self._makeExample(age=age, language=language, label=label)
         for age, language, label in example_values
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)
     eval_config = config.EvalConfig(
         input_data_specs=[config.InputDataSpec(location=data_location)],
         model_specs=[config.ModelSpec(location=model_location)],
         output_data_specs=[
             config.OutputDataSpec(default_location=self._getTempDir())
         ],
         slicing_specs=[config.SlicingSpec()])
     eval_shared_model = model_eval_lib.default_eval_shared_model(
         eval_saved_model_path=model_location, example_weight_key='age')

     legacy_evaluator = metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
         eval_shared_model)
     query_combine_fns = [
         query_statistics.QueryStatisticsCombineFn(),
         legacy_ndcg.NdcgMetricCombineFn(
             at_vals=[1], gain_key='label', weight_key=''),
     ]
     query_evaluator = (
         query_based_metrics_evaluator.QueryBasedMetricsEvaluator(
             query_id='language',
             prediction_key='logistic',
             combine_fns=query_combine_fns))
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config,
         eval_shared_models=[eval_shared_model],
         evaluators=[legacy_evaluator, query_evaluator])

     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     expected_values = [
         ('post_export_metrics/total_queries', 2.0),
         ('post_export_metrics/min_documents', 2.0),
         ('post_export_metrics/max_documents', 2.0),
         ('post_export_metrics/total_documents', 4.0),
         ('post_export_metrics/ndcg@1', 0.5),
         ('post_export_metrics/example_weight', 15.0),
         ('post_export_metrics/example_count', 4.0),
     ]
     expected = {
         (): {
             metric_name: {
                 'doubleValue': metric_value
             } for metric_name, metric_value in expected_values
         }
     }

     result_config = eval_result.config
     self.assertEqual(result_config.model_specs[0].location,
                      model_location.decode())
     self.assertEqual(result_config.input_data_specs[0].location,
                      data_location)
     self.assertEqual(result_config.slicing_specs[0], config.SlicingSpec())
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
     self.assertFalse(eval_result.plots)
Exemple #27
0
    def testLabelsExtractor(self, label):
        """Checks the labels LabelsExtractor produces for a single batch.

        Three examples are batched together and run through the features and
        labels extractors; the test then verifies the per-example labels in
        the single resulting extract.

        Args:
          label: Name of the label feature, or None to configure the model
            spec without a label key (each extracted label is then expected
            to be None). Presumably supplied by a parameterized decorator
            outside this block.
        """
        model_spec = config.ModelSpec(label_key=label)
        eval_config = config.EvalConfig(model_specs=[model_spec])
        feature_extractor = features_extractor.FeaturesExtractor(eval_config)
        label_extractor = labels_extractor.LabelsExtractor(eval_config)

        # The label feature is only part of the schema when a label key is
        # configured.
        label_feature = ''
        if label is not None:
            label_feature = """
          feature {
            name: "%s"
            type: FLOAT
          }
          """ % label
        schema = text_format.Parse(
            label_feature + """
        feature {
          name: "fixed_int"
          type: INT
        }
        """, schema_pb2.Schema())
        tfx_io = test_util.InMemoryTFExampleRecord(
            schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN)

        def maybe_add_key(d, key, value):
            """Sets d[key] = value unless key is None; returns d."""
            if key is not None:
                d[key] = value
            return d

        example_kwargs = [
            maybe_add_key({
                'fixed_int': 1,
            }, label, 1.0),
            maybe_add_key({
                'fixed_int': 1,
            }, label, 0.0),
            maybe_add_key({
                'fixed_int': 2,
            }, label, 0.0),
        ]
        # Label values in the same order as example_kwargs above.
        expected_labels = [1.0, 0.0, 0.0]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create([
                    self._makeExample(**kwargs).SerializeToString()
                    for kwargs in example_kwargs
                ],
                                          reshuffle=False)
                | 'BatchExamples' >> tfx_io.BeamSource(batch_size=3)
                |
                'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | feature_extractor.stage_name >> feature_extractor.ptransform
                | label_extractor.stage_name >> label_extractor.ptransform)

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                """Asserts on the single batched extract from the pipeline."""
                try:
                    self.assertLen(got, 1)
                    got_labels = got[0][constants.LABELS_KEY]
                    for index, expected_label in enumerate(expected_labels):
                        self.assertAlmostEqual(
                            got_labels[index],
                            np.array([expected_label])
                            if label is not None else None)

                except AssertionError as err:
                    # Beam's assert_that expects failures to be reported as
                    # BeamAssertException.
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
Exemple #28
0
 def testRunModelAnalysisWithUncertainty(self):
     """Runs model analysis with confidence intervals and k-anonymization.

     Metrics are sliced by 'language' with confidence intervals enabled and a
     k-anonymization count of 2, so the single-example 'hindi' slice is
     expected to carry an error entry instead of metrics.
     """
     model_location = self._exportEvalSavedModel(
         linear_classifier.simple_linear_classifier)
     # (age, language, label) for each example.
     example_values = [
         (3.0, 'english', 1.0),
         (3.0, 'chinese', 0.0),
         (4.0, 'english', 1.0),
         (5.0, 'chinese', 1.0),
         (5.0, 'hindi', 1.0),
     ]
     examples = [
         self._makeExample(age=age, language=language, label=label)
         for age, language, label in example_values
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)

     options = config.Options()
     options.compute_confidence_intervals.value = True
     options.k_anonymization_count.value = 2
     eval_config = config.EvalConfig(
         input_data_specs=[config.InputDataSpec(location=data_location)],
         model_specs=[config.ModelSpec(location=model_location)],
         output_data_specs=[
             config.OutputDataSpec(default_location=self._getTempDir())
         ],
         slicing_specs=[config.SlicingSpec(feature_keys=['language'])],
         options=options)
     eval_shared_model = model_eval_lib.default_eval_shared_model(
         eval_saved_model_path=model_location, example_weight_key='age')
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config, eval_shared_models=[eval_shared_model])

     def bounded_one():
         """Returns a fresh bounded value of exactly 1.0."""
         return {
             'boundedValue': {
                 'value': 1.0,
                 'lowerBound': 1.0,
                 'upperBound': 1.0,
                 'methodology': 'POISSON_BOOTSTRAP'
             }
         }

     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     hindi_metrics = {
         u'__ERROR__': {
             'debugMessage':
             u'Example count for this slice key is lower than the '
             u'minimum required value: 2. No data is aggregated for '
             u'this slice.'
         },
     }
     chinese_metrics = {
         metric_keys.EXAMPLE_WEIGHT: {
             'doubleValue': 8.0
         },
         metric_keys.EXAMPLE_COUNT: {
             'doubleValue': 2.0
         },
     }
     english_metrics = {
         'accuracy': bounded_one(),
         'my_mean_label': bounded_one(),
         metric_keys.EXAMPLE_WEIGHT: {
             'doubleValue': 7.0
         },
         metric_keys.EXAMPLE_COUNT: {
             'doubleValue': 2.0
         },
     }
     expected = {
         (('language', 'hindi'), ): hindi_metrics,
         (('language', 'chinese'), ): chinese_metrics,
         (('language', 'english'), ): english_metrics,
     }

     result_config = eval_result.config
     self.assertEqual(result_config.model_specs[0].location,
                      model_location.decode())
     self.assertEqual(result_config.input_data_specs[0].location,
                      data_location)
     self.assertEqual(result_config.slicing_specs[0],
                      config.SlicingSpec(feature_keys=['language']))
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
     self.assertFalse(eval_result.plots)
Exemple #29
0
    def assertGeneralMetricsComputedWithBeamAre(
            self, eval_saved_model_path: Text,
            examples_pcollection: beam.pvalue.PCollection,
            slice_spec: List[slicer.SingleSliceSpec],
            add_metrics_callbacks: List[types.AddMetricsCallbackType],
            expected_slice_metrics: Dict[Any, Dict[Text, Any]]):
        """Checks metrics computed using Beam.

        A more general version of assertMetricsComputedWithBeamAre. Note that
        the caller is responsible for setting up and running the Beam
        pipeline.

        Example usage:
          def add_metrics(features, predictions, labels):
            metric_ops = {
                'mse': tf.metrics.mean_squared_error(
                    labels, predictions['logits']),
                'mae': tf.metrics.mean_absolute_error(
                    labels, predictions['logits']),
            }
            return metric_ops

          with beam.Pipeline() as pipeline:
            expected_slice_metrics = {
                (): {
                    'mae': 0.1,
                    'mse': 0.2,
                    tfma.post_export_metrics.metric_keys.AUC:
                        tfma.test.BoundedValue(lower_bound=0.5)
                },
                (('age', 10),): {
                    'mae': 0.2,
                    'mse': 0.3,
                    tfma.post_export_metrics.metric_keys.AUC:
                        tfma.test.BoundedValue(lower_bound=0.5)
                },
            }
            examples = (
                pipeline
                | 'ReadExamples' >> beam.io.ReadFromTFRecord(path))
            self.assertGeneralMetricsComputedWithBeamAre(
                eval_saved_model_path=path,
                examples_pcollection=examples,
                slice_spec=[tfma.slicer.SingleSliceSpec(),
                            tfma.slicer.SingleSliceSpec(columns=['age'])],
                add_metrics_callbacks=[
                    add_metrics, tfma.post_export_metrics.auc()],
                expected_slice_metrics=expected_slice_metrics)

        Args:
          eval_saved_model_path: Path to the directory containing the
            EvalSavedModel.
          examples_pcollection: A PCollection of serialized example bytes.
          slice_spec: List of slice specifications. If empty or None, the
            EvalConfig is built without slicing specs.
          add_metrics_callbacks: Callbacks for adding additional metrics.
          expected_slice_metrics: Dictionary of dictionaries describing the
            expected metrics for each slice. The outer dictionary maps slice
            keys to the expected metrics for that slice.
        """
        def check_metrics(got):
            """Check metrics callback."""
            try:
                slices = {slice_key: value for slice_key, value in got}
                # assertCountEqual is the Python 3 name for the order-
                # insensitive comparison formerly called assertItemsEqual.
                self.assertCountEqual(list(slices.keys()),
                                      list(expected_slice_metrics.keys()))
                for slice_key, expected_metrics in (
                        expected_slice_metrics.items()):
                    self.assertDictElementsWithinBounds(
                        got_values_dict=slices[slice_key],
                        expected_values_dict=expected_metrics)
            except AssertionError as err:
                # Beam's assert_that expects failures to be reported as
                # BeamAssertException.
                raise beam_util.BeamAssertException(err)

        slicing_specs = None
        if slice_spec:
            slicing_specs = [s.to_proto() for s in slice_spec]
        eval_config = config.EvalConfig(slicing_specs=slicing_specs)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=eval_saved_model_path,
            add_metrics_callbacks=add_metrics_callbacks)
        extractors = model_eval_lib.default_extractors(
            eval_config=eval_config, eval_shared_model=eval_shared_model)

        # Only the metrics output is checked; the other outputs of
        # ComputeMetricsAndPlots are discarded.
        # pylint: disable=no-value-for-parameter
        (metrics,
         _), _ = (examples_pcollection
                  | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                  | 'Extract' >> Extract(extractors=extractors)
                  | 'ComputeMetricsAndPlots' >>
                  legacy_metrics_and_plots_evaluator.ComputeMetricsAndPlots(
                      eval_shared_model=eval_shared_model))
        # pylint: enable=no-value-for-parameter

        beam_util.assert_that(metrics, check_metrics)
    def testUpdateConfigWithDefaultsRemoveBaselineModel(self):
        """Checks that maybe_remove_baseline strips the baseline model.

        The input config has a candidate and a baseline model with both value
        and change thresholds. With maybe_remove_baseline=True the expected
        result keeps a single unnamed model spec, keeps only the value
        thresholds, and sets model_names to [""].
        """
        eval_config_pbtxt = """
      model_specs { name: "candidate" }
      model_specs { name: "baseline" is_baseline: true }
      metrics_specs {
        metrics {
          class_name: "MeanLabel"
          threshold {
            value_threshold {
              lower_bound { value: 0.9 }
            }
            change_threshold {
              direction: HIGHER_IS_BETTER
              absolute{ value: -1e-10 }
            }
          }
        }
        thresholds {
          key: "my_metric"
          value {
            value_threshold {
              lower_bound { value: 0.9 }
            }
            change_threshold {
              direction: HIGHER_IS_BETTER
              absolute{ value: -1e-10 }
            }
          }
        }
      }
    """
        eval_config = text_format.Parse(eval_config_pbtxt, config.EvalConfig())

        expected_eval_config_pbtxt = """
      model_specs {}
      metrics_specs {
        metrics {
          class_name: "MeanLabel"
          threshold {
            value_threshold {
              lower_bound { value: 0.9 }
            }
          }
        }
        thresholds {
          key: "my_metric"
          value {
            value_threshold {
              lower_bound { value: 0.9 }
            }
          }
        }
        model_names: [""]
      }
    """
        expected_eval_config = text_format.Parse(expected_eval_config_pbtxt,
                                                 config.EvalConfig())

        got_eval_config = config.update_eval_config_with_defaults(
            eval_config, maybe_remove_baseline=True)
        # assertProtoEquals takes the expected proto first (assuming this is
        # tf.test.TestCase's implementation), so failure messages label the
        # two sides correctly.
        self.assertProtoEquals(expected_eval_config, got_eval_config)