def _runTestWithCustomCheck(self,
                            examples,
                            eval_export_dir,
                            metrics,
                            custom_metrics_check=None,
                            custom_plots_check=None):
  # make sure we are doing some checks
  self.assertTrue(custom_metrics_check is not None or
                  custom_plots_check is not None)
  serialized_examples = [ex.SerializeToString() for ex in examples]
  eval_shared_model = types.EvalSharedModel(
      model_path=eval_export_dir, add_metrics_callbacks=metrics)
  extractors = model_eval_lib.default_extractors(
      eval_shared_model=eval_shared_model)
  with beam.Pipeline() as pipeline:
    metrics, plots = (
        pipeline
        | 'Create' >> beam.Create(serialized_examples)
        | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
        | 'Extract' >> evaluate.Extract(extractors=extractors)
        | 'Evaluate' >> evaluate.Evaluate(eval_shared_model=eval_shared_model))
    if custom_metrics_check is not None:
      util.assert_that(metrics, custom_metrics_check, label='metrics')
    if custom_plots_check is not None:
      util.assert_that(plots, custom_plots_check, label='plot')
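
A hedged sketch of how a test might call the helper above, reusing names that appear in the other examples in this listing (linear_classifier, post_export_metrics, metric_keys, util); the test name and the specific assertion are illustrative only, not part of the original suite.

def testExampleCountViaCustomCheck(self):  # hypothetical test name
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=0.0),
  ]

  def check_metrics(got):
    try:
      # Expect a single overall slice containing the example count metric.
      self.assertEqual(1, len(got), 'got: %s' % got)
      (slice_key, value) = got[0]
      self.assertEqual((), slice_key)
      self.assertIn(metric_keys.EXAMPLE_COUNT, value)
    except AssertionError as err:
      raise util.BeamAssertException(err)

  self._runTestWithCustomCheck(
      examples,
      eval_export_dir,
      [post_export_metrics.example_count()],
      custom_metrics_check=check_metrics)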
Example 2
    def testBuildAnalysisTable(self):
        model_location = self._exportEvalSavedModel(
            linear_classifier.simple_linear_classifier)
        eval_shared_model = types.EvalSharedModel(model_path=model_location)

        example1 = self._makeExample(age=3.0,
                                     language='english',
                                     label=1.0,
                                     slice_key='first_slice')

        with beam.Pipeline() as pipeline:
            result = (
                pipeline
                | 'CreateInput' >> beam.Create([example1.SerializeToString()])
                | 'BuildTable' >> contrib.BuildAnalysisTable(
                    eval_shared_model=eval_shared_model))

            def check_result(got):
                self.assertEqual(1, len(got), 'got: %s' % got)
                extracts = got[0]

                # Values of type MaterializedColumn are emitted to signal to
                # downstream sink components to output the data to file.
                materialized_dict = dict(
                    (k, v) for k, v in extracts.items()
                    if isinstance(v, types.MaterializedColumn))
                self._assertMaterializedColumns(
                    materialized_dict,
                    {
                        # Slice key
                        'features__slice_key':
                        types.MaterializedColumn(name='features__slice_key',
                                                 value=[b'first_slice']),

                        # Features
                        'features__language':
                        types.MaterializedColumn(name='features__language',
                                                 value=[b'english']),
                        'features__age':
                        types.MaterializedColumn(name='features__age',
                                                 value=np.array(
                                                     [3.], dtype=np.float32)),

                        # Label
                        'features__label':
                        types.MaterializedColumn(name='features__label',
                                                 value=np.array(
                                                     [1.], dtype=np.float32)),
                        'labels':
                        types.MaterializedColumn(name='labels',
                                                 value=np.array(
                                                     [1.], dtype=np.float32)),
                    })
                self._assertMaterializedColumnsExist(materialized_dict, [
                    'predictions__logits', 'predictions__probabilities',
                    'predictions__classes', 'predictions__logistic',
                    'predictions__class_ids', constants.SLICE_KEYS_KEY
                ])

            util.assert_that(result[constants.ANALYSIS_KEY], check_result)
def default_eval_shared_model(eval_saved_model_path,
                              add_metrics_callbacks=None,
                              example_weight_key=None):
    """Returns default EvalSharedModel.

  Args:
    eval_saved_model_path: Path to EvalSavedModel.
    add_metrics_callbacks: Optional list of callbacks for adding additional
      metrics to the graph (see EvalSharedModel for more information on how to
      configure additional metrics). Metrics for example counts and example
      weight will be added automatically.
    example_weight_key: The key of the example weight column. If None, weight
      will be 1 for each example.
  """
    # Always compute example weight and example count.
    # pytype: disable=module-attr
    if not add_metrics_callbacks:
        add_metrics_callbacks = []
    example_count_callback = post_export_metrics.example_count()
    add_metrics_callbacks.append(example_count_callback)
    if example_weight_key:
        example_weight_callback = post_export_metrics.example_weight(
            example_weight_key)
        add_metrics_callbacks.append(example_weight_callback)
    # pytype: enable=module-attr

    return types.EvalSharedModel(model_path=eval_saved_model_path,
                                 add_metrics_callbacks=add_metrics_callbacks,
                                 example_weight_key=example_weight_key)
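
A minimal usage sketch for the helper above, assuming the module imports used throughout these examples (model_eval_lib in particular); the paths and the example_weight_key value are hypothetical.

# Hypothetical paths; run_model_analysis usage mirrors the testNoConstructFn
# example later in this listing.
eval_shared_model = default_eval_shared_model(
    eval_saved_model_path='/tmp/eval_saved_model',
    example_weight_key='age')
model_eval_lib.run_model_analysis(
    eval_shared_model=eval_shared_model,
    data_location='/tmp/examples.tfrecord')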
Example 4
    def testEvaluateNoSlicing(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)
        eval_shared_model = types.EvalSharedModel(
            model_path=eval_export_dir,
            add_metrics_callbacks=[_addExampleCountMetricCallback])
        extractors = [
            predict_extractor.PredictExtractor(eval_shared_model),
            slice_key_extractor.SliceKeyExtractor()
        ]

        with beam.Pipeline() as pipeline:
            example1 = self._makeExample(age=3.0,
                                         language='english',
                                         label=1.0)
            example2 = self._makeExample(age=3.0,
                                         language='chinese',
                                         label=0.0)
            example3 = self._makeExample(age=4.0,
                                         language='english',
                                         label=1.0)
            example4 = self._makeExample(age=5.0,
                                         language='chinese',
                                         label=0.0)

            metrics, _ = (
                pipeline
                | 'Create' >> beam.Create([
                    example1.SerializeToString(),
                    example2.SerializeToString(),
                    example3.SerializeToString(),
                    example4.SerializeToString()
                ])
                | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
                | 'Extract' >> evaluate.Extract(extractors=extractors)
                | 'Evaluate' >>
                evaluate.Evaluate(eval_shared_model=eval_shared_model))

            def check_result(got):
                try:
                    self.assertEqual(1, len(got), 'got: %s' % got)
                    (slice_key, value) = got[0]
                    self.assertEqual((), slice_key)
                    self.assertDictElementsAlmostEqual(
                        value, {
                            'accuracy': 1.0,
                            'label/mean': 0.5,
                            'my_mean_age': 3.75,
                            'my_mean_age_times_label': 1.75,
                            'added_example_count': 4.0
                        })
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics, check_result)
Example 5
    def assertMetricsComputedWithBeamAre(self, eval_saved_model_path,
                                         serialized_examples,
                                         expected_metrics):
        """Checks metrics computed using Beam.

    Metrics will be computed over all examples, without any slicing. If you
    want to provide your own PCollection (e.g. read a large number of examples
    from a file), if you want to check metrics over certain slices, or if you
    want to add additional post-export metrics, use the more general
    assertGeneralMetricsComputedWithBeamAre.

    Example usage:
      self.assertMetricsComputedWithBeamAre(
        eval_saved_model_path=path,
        serialized_examples=[self.makeExample(age=5, label=1.0),
                             self.makeExample(age=10, label=0.0)],
        expected_metrics={'average_loss': 0.1})

    Args:
      eval_saved_model_path: Path to the directory containing the
        EvalSavedModel.
      serialized_examples: List of serialized example bytes.
      expected_metrics: Dictionary of expected metric values.
    """
        def check_metrics(got):
            """Check metrics callback."""
            try:
                self.assertEqual(
                    1, len(got),
                    'expecting metrics for exactly one slice, but got %d '
                    'slices instead. metrics were: %s' % (len(got), got))
                (slice_key, value) = got[0]
                self.assertEqual((), slice_key)
                self.assertDictElementsWithinBounds(
                    got_values_dict=value,
                    expected_values_dict=expected_metrics)
            except AssertionError as err:
                raise beam_util.BeamAssertException(err)

        eval_shared_model = types.EvalSharedModel(
            model_path=eval_saved_model_path)
        extractors = model_eval_lib.default_extractors(
            eval_shared_model=eval_shared_model)

        with beam.Pipeline() as pipeline:
            metrics, _ = (
                pipeline
                | 'CreateExamples' >> beam.Create(serialized_examples)
                | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
                | 'Extract' >> evaluate.Extract(extractors=extractors)
                | 'Evaluate' >>
                evaluate.Evaluate(eval_shared_model=eval_shared_model))

            beam_util.assert_that(metrics, check_metrics)
Example 6
    def testEvaluateNoSlicingAddPostExportAndCustomMetricsUnsupervisedModel(
            self):
        # Mainly for testing that the ExampleCount post export metric works with
        # unsupervised models.
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = (fixed_prediction_estimator_no_labels.
                              simple_fixed_prediction_estimator_no_labels(
                                  None, temp_eval_export_dir))
        eval_shared_model = types.EvalSharedModel(
            model_path=eval_export_dir,
            add_metrics_callbacks=[
                post_export_metrics.example_count(),
                post_export_metrics.example_weight(
                    example_weight_key='prediction')
            ])
        extractors = [
            predict_extractor.PredictExtractor(eval_shared_model),
            slice_key_extractor.SliceKeyExtractor()
        ]

        with beam.Pipeline() as pipeline:
            example1 = self._makeExample(prediction=1.0)
            example2 = self._makeExample(prediction=2.0)

            metrics, plots = (
                pipeline
                | 'Create' >> beam.Create([
                    example1.SerializeToString(),
                    example2.SerializeToString(),
                ])
                | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
                | 'Extract' >> evaluate.Extract(extractors=extractors)
                | 'Evaluate' >>
                evaluate.Evaluate(eval_shared_model=eval_shared_model))

            def check_result(got):
                try:
                    self.assertEqual(1, len(got), 'got: %s' % got)
                    (slice_key, value) = got[0]
                    self.assertEqual((), slice_key)
                    self.assertDictElementsAlmostEqual(
                        got_values_dict=value,
                        expected_values_dict={
                            'average_loss': 2.5,
                            metric_keys.EXAMPLE_COUNT: 2.0,
                            metric_keys.EXAMPLE_WEIGHT: 3.0
                        })
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics, check_result, label='metrics')
            util.assert_that(plots, util.is_empty(), label='plots')
Example 7
    def createTestEvalSharedModel(self,
                                  eval_saved_model_path,
                                  add_metrics_callbacks=None,
                                  include_default_metrics=True,
                                  example_weight_key=None):

        return types.EvalSharedModel(
            model_path=eval_saved_model_path,
            add_metrics_callbacks=add_metrics_callbacks,
            example_weight_key=example_weight_key,
            construct_fn=dofn.make_construct_fn(eval_saved_model_path,
                                                add_metrics_callbacks,
                                                include_default_metrics))
Example 8
def default_eval_shared_model(
        eval_saved_model_path: Text,
        add_metrics_callbacks: Optional[List[
            types.AddMetricsCallbackType]] = None,
        include_default_metrics: Optional[bool] = True,
        example_weight_key: Optional[Text] = None,
        additional_fetches: Optional[List[Text]] = None
) -> types.EvalSharedModel:
    """Returns default EvalSharedModel.

  Args:
    eval_saved_model_path: Path to EvalSavedModel.
    add_metrics_callbacks: Optional list of callbacks for adding additional
      metrics to the graph (see EvalSharedModel for more information on how to
      configure additional metrics). Metrics for example counts and example
      weight will be added automatically.
    include_default_metrics: True to include the default metrics that are part
      of the saved model graph during evaluation.
    example_weight_key: Deprecated.
    additional_fetches: Prefixes of additional tensors stored in
      signature_def.inputs that should be fetched at prediction time. The
      "features" and "labels" tensors are handled automatically and should not
      be included.
  """
    # Always compute example weight and example count.
    # PyType doesn't know about the magic exports we do in post_export_metrics.
    # Additionally, the lines seem to get reordered in compilation, so we can't
    # just put the disable-attr on the add_metrics_callbacks lines.
    # pytype: disable=module-attr
    if not add_metrics_callbacks:
        add_metrics_callbacks = []
    example_count_callback = post_export_metrics.example_count()
    add_metrics_callbacks.append(example_count_callback)
    # TODO(b/126924645): Remove
    if example_weight_key:
        example_weight_callback = post_export_metrics.example_weight(
            example_weight_key)
        add_metrics_callbacks.append(example_weight_callback)
    # pytype: enable=module-attr

    return types.EvalSharedModel(
        model_path=eval_saved_model_path,
        add_metrics_callbacks=add_metrics_callbacks,
        include_default_metrics=include_default_metrics,
        example_weight_key=example_weight_key,
        additional_fetches=additional_fetches,
        construct_fn=dofn.make_construct_fn(
            eval_saved_model_path,
            add_metrics_callbacks,
            include_default_metrics,
            additional_fetches=additional_fetches))
Example 9
    def testAggregateOverallSlice(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)

        eval_saved_model = load.EvalSavedModel(eval_export_dir)
        eval_shared_model = types.EvalSharedModel(model_path=eval_export_dir)

        with beam.Pipeline() as pipeline:
            example1 = self._makeExample(age=3.0,
                                         language='english',
                                         label=1.0)
            example2 = self._makeExample(age=3.0,
                                         language='chinese',
                                         label=0.0)
            example3 = self._makeExample(age=4.0,
                                         language='english',
                                         label=1.0)
            example4 = self._makeExample(age=5.0,
                                         language='chinese',
                                         label=0.0)

            predict_result = eval_saved_model.predict_list([
                example1.SerializeToString(),
                example2.SerializeToString(),
                example3.SerializeToString(),
                example4.SerializeToString()
            ])

            metrics, _ = (
                pipeline
                | 'CreateTestInput' >> beam.Create(
                    create_test_input(predict_result, [()]))
                | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
                    eval_shared_model=eval_shared_model, desired_batch_size=3))

            def check_result(got):
                self.assertEqual(1, len(got), 'got: %s' % got)
                slice_key, metrics = got[0]
                self.assertEqual(slice_key, ())
                self.assertDictElementsAlmostEqual(
                    metrics, {
                        'accuracy': 1.0,
                        'label/mean': 0.5,
                        'my_mean_age': 3.75,
                        'my_mean_age_times_label': 1.75,
                    })

            util.assert_that(metrics, check_result)
Example 10
    def testBuildAnalysisTableWithSlices(self):
        model_location = self._exportEvalSavedModel(
            linear_classifier.simple_linear_classifier)
        eval_shared_model = types.EvalSharedModel(model_path=model_location)

        example1 = self._makeExample(age=3.0,
                                     language='english',
                                     label=1.0,
                                     slice_key='first_slice')
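        # Three illustrative slice specs: slice by every value of 'age', slice
        # where age == 3, and cross 'age' with language == 'english'. Together
        # they yield the slice keys asserted below (b'age:3.0', b'age:3',
        # b'age_X_language:3.0_X_english').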
        slice_spec = [
            slicer.SingleSliceSpec(columns=['age']),
            slicer.SingleSliceSpec(features=[('age', 3)]),
            slicer.SingleSliceSpec(columns=['age'],
                                   features=[('language', 'english')])
        ]

        with beam.Pipeline() as pipeline:
            result = (
                pipeline
                | 'CreateInput' >> beam.Create([example1.SerializeToString()])
                | 'BuildTable' >> contrib.BuildAnalysisTable(
                    eval_shared_model, slice_spec))

            def check_result(got):
                self.assertEqual(1, len(got), 'got: %s' % got)
                extracts = got[0]

                # Values of type MaterializedColumn are emitted to signal to
                # downstream sink components to output the data to file.
                materialized_dict = dict(
                    (k, v) for k, v in extracts.items()
                    if isinstance(v, types.MaterializedColumn))
                self._assertMaterializedColumns(
                    materialized_dict, {
                        constants.SLICE_KEYS_KEY:
                        types.MaterializedColumn(
                            name=constants.SLICE_KEYS_KEY,
                            value=[
                                b'age:3.0', b'age:3',
                                b'age_X_language:3.0_X_english'
                            ])
                    })
                self._assertMaterializedColumnsExist(materialized_dict, [
                    'predictions__logits', 'predictions__probabilities',
                    'predictions__classes', 'predictions__logistic',
                    'predictions__class_ids'
                ])

            util.assert_that(result[constants.ANALYSIS_KEY], check_result)
Example 11
  def testNoConstructFn(self):
    model_location = self._exportEvalSavedModel(
        linear_classifier.simple_linear_classifier)
    examples = [self._makeExample(age=3.0, language='english', label=1.0)]
    data_location = self._writeTFExamplesToTFRecords(examples)
    # With no construct_fn, this should fail when Beam attempts to call the
    # construct_fn.
    eval_shared_model = types.EvalSharedModel(model_path=model_location)
    with self.assertRaisesRegexp(TypeError,
                                 '\'NoneType\' object is not callable'):
      model_eval_lib.run_model_analysis(
          eval_shared_model=eval_shared_model, data_location=data_location)

    # Using the default_eval_shared_model should pass as it has a construct_fn.
    eval_shared_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=model_location)
    model_eval_lib.run_model_analysis(
        eval_shared_model=eval_shared_model, data_location=data_location)
Example 12
    def createTestEvalSharedModel(
        self,
        eval_saved_model_path: Text,
        add_metrics_callbacks: Optional[List[
            types.AddMetricsCallbackType]] = None,
        include_default_metrics: Optional[bool] = True,
        example_weight_key: Optional[Text] = None,
        additional_fetches: Optional[List[Text]] = None
    ) -> types.EvalSharedModel:

        return types.EvalSharedModel(
            model_path=eval_saved_model_path,
            add_metrics_callbacks=add_metrics_callbacks,
            example_weight_key=example_weight_key,
            construct_fn=dofn.make_construct_fn(eval_saved_model_path,
                                                add_metrics_callbacks,
                                                include_default_metrics,
                                                additional_fetches))
  def testPredict(self):
    temp_eval_export_dir = self._getEvalExportDir()
    _, eval_export_dir = linear_classifier.simple_linear_classifier(
        None, temp_eval_export_dir)
    eval_shared_model = types.EvalSharedModel(model_path=eval_export_dir)

    with beam.Pipeline() as pipeline:
      examples = [
          self._makeExample(age=3.0, language='english', label=1.0),
          self._makeExample(age=3.0, language='chinese', label=0.0),
          self._makeExample(age=4.0, language='english', label=1.0),
          self._makeExample(age=5.0, language='chinese', label=0.0),
      ]
      serialized_examples = [e.SerializeToString() for e in examples]

      predict_extracts = (
          pipeline
          | beam.Create(serialized_examples)
          # Our diagnostic outputs pass types.ExampleAndExtracts throughout;
          # however, our aggregating functions do not use this interface.
          | beam.Map(lambda x: types.ExampleAndExtracts(example=x, extracts={}))
          | 'Predict' >> predict_extractor.TFMAPredict(
              eval_shared_model=eval_shared_model, desired_batch_size=3))

      def check_result(got):
        try:
          self.assertEqual(4, len(got), 'got: %s' % got)
          for item in got:
            extracts_dict = item.extracts
            self.assertTrue('fpl' in extracts_dict)
            fpl = extracts_dict['fpl']
            # Verify fpl contains features, probabilities, and correct labels.
            self.assertIn('language', fpl.features)
            self.assertIn('age', fpl.features)
            self.assertIn('label', fpl.features)
            self.assertIn('probabilities', fpl.predictions)
            self.assertAlmostEqual(fpl.features['label'],
                                   fpl.labels['__labels'])

        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(predict_extracts, check_result)
Example 14
    def createTestEvalSharedModel(
            self,
            eval_saved_model_path: Optional[Text] = None,
            add_metrics_callbacks: Optional[List[
                types.AddMetricsCallbackType]] = None,
            include_default_metrics: Optional[bool] = True,
            example_weight_key: Optional[Union[Text, Dict[Text, Text]]] = None,
            additional_fetches: Optional[List[Text]] = None,
            tags: Optional[Text] = None,
            model_type: Optional[Text] = None,
            model_name: Text = '',
            rubber_stamp: Optional[bool] = False,
            is_baseline: Optional[bool] = False) -> types.EvalSharedModel:

        if not model_type:
            model_type = model_util.get_model_type(None, eval_saved_model_path,
                                                   tags)
        if not tags:
            if model_type in (constants.TF_GENERIC, constants.TF_ESTIMATOR):
                model_type = constants.TF_ESTIMATOR
                tags = [eval_constants.EVAL_TAG]
            else:
                tags = [tf.saved_model.SERVING]

        return types.EvalSharedModel(
            model_name=model_name,
            model_type=model_type,
            model_path=eval_saved_model_path,
            add_metrics_callbacks=add_metrics_callbacks,
            example_weight_key=example_weight_key,
            rubber_stamp=rubber_stamp,
            is_baseline=is_baseline,
            model_loader=types.ModelLoader(
                tags=tags,
                construct_fn=model_util.model_construct_fn(
                    eval_saved_model_path=eval_saved_model_path,
                    model_type=model_type,
                    add_metrics_callbacks=add_metrics_callbacks,
                    include_default_metrics=include_default_metrics,
                    additional_fetches=additional_fetches,
                    tags=tags)))
Example 15
def default_eval_shared_model(eval_saved_model_path,
                              add_metrics_callbacks=None,
                              include_default_metrics=True,
                              example_weight_key=None):
    """Returns default EvalSharedModel.

  Args:
    eval_saved_model_path: Path to EvalSavedModel.
    add_metrics_callbacks: Optional list of callbacks for adding additional
      metrics to the graph (see EvalSharedModel for more information on how to
      configure additional metrics). Metrics for example counts and example
      weight will be added automatically.
    include_default_metrics: True to include the default metrics that are part
      of the saved model graph during evaluation.
    example_weight_key: The key of the example weight column. If None, weight
      will be 1 for each example.
  """
    # Always compute example weight and example count.
    # PyType doesn't know about the magic exports we do in post_export_metrics.
    # Additionally, the lines seem to get reordered in compilation, so we can't
    # just put the disable-attr on the add_metrics_callbacks lines.
    # pytype: disable=module-attr
    if not add_metrics_callbacks:
        add_metrics_callbacks = []
    example_count_callback = post_export_metrics.example_count()
    add_metrics_callbacks.append(example_count_callback)
    if example_weight_key:
        example_weight_callback = post_export_metrics.example_weight(
            example_weight_key)
        add_metrics_callbacks.append(example_weight_callback)
    # pytype: enable=module-attr

    return types.EvalSharedModel(
        model_path=eval_saved_model_path,
        add_metrics_callbacks=add_metrics_callbacks,
        include_default_metrics=include_default_metrics,
        example_weight_key=example_weight_key,
        construct_fn=dofn.make_construct_fn(eval_saved_model_path,
                                            add_metrics_callbacks,
                                            include_default_metrics))
Example 16
  def createTestEvalSharedModel(
      self,
      eval_saved_model_path: Optional[Text] = None,
      add_metrics_callbacks: Optional[List[
          types.AddMetricsCallbackType]] = None,
      include_default_metrics: Optional[bool] = True,
      example_weight_key: Optional[Union[Text, Dict[Text, Text]]] = None,
      additional_fetches: Optional[List[Text]] = None,
      tags: Optional[Text] = None) -> types.EvalSharedModel:

    return types.EvalSharedModel(
        eval_saved_model_path,
        add_metrics_callbacks=add_metrics_callbacks,
        example_weight_key=example_weight_key,
        model_loader=types.ModelLoader(
            tags=tags,
            construct_fn=model_util.model_construct_fn(
                eval_saved_model_path=eval_saved_model_path,
                add_metrics_callbacks=add_metrics_callbacks,
                include_default_metrics=include_default_metrics,
                additional_fetches=additional_fetches,
                tags=tags)))
  def testPredictMultipleExampleRefPerRawExampleBytes(self):
    temp_eval_export_dir = self._getEvalExportDir()
    _, eval_export_dir = (
        fake_multi_examples_per_input_estimator
        .fake_multi_examples_per_input_estimator(None, temp_eval_export_dir))
    eval_shared_model = types.EvalSharedModel(model_path=eval_export_dir)

    # The trailing zeros make an "empty" output batch.
    raw_example_bytes = ['0', '3', '1', '0', '2', '0', '0', '0', '0']

    def check_result(got):
      try:
        self.assertEqual(6, len(got), 'got: %s' % got)
        self.assertEqual(
            ['3', '3', '3', '1', '2', '2'],
            [example_and_extracts.example for example_and_extracts in got])

        for item in got:
          extracts_dict = item.extracts
          self.assertTrue('fpl' in extracts_dict)
          fpl = extracts_dict['fpl']
          self.assertIn('input_index', fpl.features)
          self.assertIn('example_count', fpl.features)
          self.assertIn('intra_input_index', fpl.features)

      except AssertionError as err:
        raise util.BeamAssertException(err)

    with beam.Pipeline() as pipeline:
      predict_extracts = (
          pipeline
          | beam.Create(raw_example_bytes)
          # Our diagnostic outputs pass types.ExampleAndExtracts throughout;
          # however, our aggregating functions do not use this interface.
          | beam.Map(lambda x: types.ExampleAndExtracts(example=x, extracts={}))
          | 'Predict' >> predict_extractor.TFMAPredict(
              eval_shared_model=eval_shared_model, desired_batch_size=3))

      util.assert_that(predict_extracts, check_result)
  def testNoConstructFn(self):
    model_location = self._exportEvalSavedModel(
        linear_classifier.simple_linear_classifier)
    examples = [self._makeExample(age=3.0, language='english', label=1.0)]
    data_location = self._writeTFExamplesToTFRecords(examples)
    eval_config = config.EvalConfig(
        input_data_specs=[config.InputDataSpec(location=data_location)],
        model_specs=[config.ModelSpec(location=model_location)],
        output_data_specs=[
            config.OutputDataSpec(default_location=self._getTempDir())
        ])
    # With no construct_fn, this should fail when Beam attempts to call the
    # construct_fn.
    eval_shared_model = types.EvalSharedModel(model_path=model_location)
    with self.assertRaisesRegexp(AttributeError,
                                 '\'NoneType\' object has no attribute'):
      model_eval_lib.run_model_analysis(
          eval_config=eval_config, eval_shared_models=[eval_shared_model])

    # Using the default_eval_shared_model should pass as it has a construct_fn.
    eval_shared_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=model_location)
    model_eval_lib.run_model_analysis(
        eval_config=eval_config, eval_shared_models=[eval_shared_model])
    def testModelAgnosticConstructFn(self):
        # End to end test for the entire flow going from tf.Examples -> metrics
        # with slicing.
        with beam.Pipeline() as pipeline:
            # Set up the inputs. All we need are tf.Examples and an example
            # parsing spec with an explicit mapping from key to
            # (Features, Predictions, Labels).
            examples = [
                self._makeExample(age=3.0,
                                  language='english',
                                  probabilities=1.0,
                                  labels=1.0),
                self._makeExample(age=3.0,
                                  language='chinese',
                                  probabilities=3.0,
                                  labels=0.0),
                self._makeExample(age=4.0,
                                  language='english',
                                  probabilities=2.0,
                                  labels=1.0),
                self._makeExample(age=5.0,
                                  language='chinese',
                                  probabilities=3.0,
                                  labels=0.0),
                # Add some examples with no language.
                self._makeExample(age=5.0, probabilities=2.0, labels=10.0),
                self._makeExample(age=6.0, probabilities=1.0, labels=0.0)
            ]
            serialized_examples = [e.SerializeToString() for e in examples]

            # Set up the feature spec used to parse our example keys.
            feature_map = {
                'age': tf.FixedLenFeature([], tf.float32),
                'language': tf.VarLenFeature(tf.string),
                'probabilities': tf.FixedLenFeature([], tf.float32),
                'labels': tf.FixedLenFeature([], tf.float32)
            }

            model_agnostic_config = agnostic_predict.ModelAgnosticConfig(
                label_keys=['labels'],
                prediction_keys=['probabilities'],
                feature_spec=feature_map)

            # Set up the Model Agnostic Extractor
            extractors = [
                model_agnostic_extractor.ModelAgnosticExtractor(
                    model_agnostic_config=model_agnostic_config,
                    desired_batch_size=3),
                slice_key_extractor.SliceKeyExtractor([
                    slicer.SingleSliceSpec(),
                    slicer.SingleSliceSpec(columns=['language'])
                ])
            ]

            # Set up the metrics we wish to calculate via a metric callback. In
            # particular, this metric calculates the mean and sum of all labels.
            eval_shared_model = types.EvalSharedModel(
                add_metrics_callbacks=[add_mean_callback],
                construct_fn=model_agnostic_evaluate_graph.make_construct_fn(
                    add_metrics_callbacks=[add_mean_callback],
                    fpl_feed_config=model_agnostic_extractor.
                    ModelAgnosticGetFPLFeedConfig(model_agnostic_config)))

            # Run our pipeline doing Extract -> Slice -> Fanout -> Calculate Metrics.
            metrics, _ = (
                pipeline
                | 'Create Examples' >> beam.Create(serialized_examples)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
                | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator.
                ComputeMetricsAndPlots(eval_shared_model=eval_shared_model))

            # Verify our metrics are properly generated per slice.
            def check_result(got):
                self.assertEqual(3, len(got), 'got: %s' % got)
                slices = {}
                for slice_key, metrics in got:
                    slices[slice_key] = metrics
                overall_slice = ()
                english_slice = (('language', b'english'), )
                chinese_slice = (('language', b'chinese'), )

                self.assertItemsEqual(
                    list(slices.keys()),
                    [overall_slice, english_slice, chinese_slice])
                # Overall slice has label/predictions sum = 24 and 12 elements.
                self.assertDictElementsAlmostEqual(slices[overall_slice], {
                    'tf_metric_mean': 2.0,
                    'py_func_total_label': 24.0,
                })
                # English slice has label/predictions sum = 5 and 4 elements.
                self.assertDictElementsAlmostEqual(slices[english_slice], {
                    'tf_metric_mean': 1.25,
                    'py_func_total_label': 5.0,
                })
                # Chinese slice has label/predictions sum = 6 and 4 elements.
                self.assertDictElementsAlmostEqual(slices[chinese_slice], {
                    'tf_metric_mean': 1.5,
                    'py_func_total_label': 6.0,
                })

            util.assert_that(metrics, check_result)
Example 20
    def assertGeneralMetricsComputedWithBeamAre(self, eval_saved_model_path,
                                                examples_pcollection,
                                                slice_spec,
                                                add_metrics_callbacks,
                                                expected_slice_metrics):
        """Checks metrics computed using Beam.

    A more general version of assertMetricsComputedWithBeamAre. Note that the
    caller is responsible for setting up and running the Beam pipeline.

    Example usage:
      def add_metrics(features, predictions, labels):
        metric_ops = {
            'mse': tf.metrics.mean_squared_error(labels,
                                                 predictions['logits']),
            'mae': tf.metrics.mean_absolute_error(labels,
                                                  predictions['logits']),
        }
        return metric_ops

      with beam.Pipeline() as pipeline:
        expected_slice_metrics = {
            (): {
              'mae': 0.1,
              'mse': 0.2,
              tfma.post_export_metrics.metric_keys.AUC:
                tfma.test.BoundedValue(lower_bound=0.5)
            },
            (('age', 10),): {
              'mae': 0.2,
              'mse': 0.3,
              tfma.post_export_metrics.metric_keys.AUC:
                tfma.test.BoundedValue(lower_bound=0.5)
            },
        }
        examples = pipeline | 'ReadExamples' >> beam.io.ReadFromTFRecord(path)
        self.assertGeneralMetricsComputedWithBeamAre(
          eval_saved_model_path=path,
          examples_pcollection=examples,
          slice_spec=[tfma.SingleSliceSpec(),
                      tfma.SingleSliceSpec(columns=['age'])],
          add_metrics_callbacks=[
            add_metrics, tfma.post_export_metrics.auc()],
          expected_slice_metrics=expected_slice_metrics)

    Args:
      eval_saved_model_path: Path to the directory containing the
        EvalSavedModel.
      examples_pcollection: A PCollection of serialized example bytes.
      slice_spec: List of slice specifications.
      add_metrics_callbacks: Callbacks for adding additional metrics.
      expected_slice_metrics: Dictionary of dictionaries describing the expected
        metrics for each slice. The outer dictionary maps slice keys to the
        expected metrics for that slice.
    """
        def check_metrics(got):
            """Check metrics callback."""
            try:
                slices = {}
                for slice_key, value in got:
                    slices[slice_key] = value
                self.assertItemsEqual(list(slices.keys()),
                                      list(expected_slice_metrics.keys()))
                for slice_key, expected_metrics in expected_slice_metrics.items(
                ):
                    self.assertDictElementsWithinBounds(
                        got_values_dict=slices[slice_key],
                        expected_values_dict=expected_metrics)
            except AssertionError as err:
                raise beam_util.BeamAssertException(err)

        eval_shared_model = types.EvalSharedModel(
            model_path=eval_saved_model_path,
            add_metrics_callbacks=add_metrics_callbacks)
        extractors = model_eval_lib.default_extractors(
            eval_shared_model=eval_shared_model, slice_spec=slice_spec)

        metrics, _ = (
            examples_pcollection
            | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
            | 'Extract' >> evaluate.Extract(extractors=extractors)
            | 'Evaluate' >>
            evaluate.Evaluate(eval_shared_model=eval_shared_model))

        beam_util.assert_that(metrics, check_metrics)
Example 21
def default_eval_shared_model(
    eval_saved_model_path: Text,
    add_metrics_callbacks: Optional[List[types.AddMetricsCallbackType]] = None,
    include_default_metrics: Optional[bool] = True,
    example_weight_key: Optional[Union[Text, Dict[Text, Text]]] = None,
    additional_fetches: Optional[List[Text]] = None,
    blacklist_feature_fetches: Optional[List[Text]] = None,
    tags: Optional[List[Text]] = None,
    eval_config: Optional[config.EvalConfig] = None) -> types.EvalSharedModel:
  """Returns default EvalSharedModel.

  Args:
    eval_saved_model_path: Path to EvalSavedModel.
    add_metrics_callbacks: Optional list of callbacks for adding additional
      metrics to the graph (see EvalSharedModel for more information on how to
      configure additional metrics). Metrics for example count and example
      weights will be added automatically.
    include_default_metrics: True to include the default metrics that are part
      of the saved model graph during evaluation. Note that
      eval_config.options.include_default_metrics must also be true.
    example_weight_key: Example weight key (single-output model) or dict of
      example weight keys (multi-output model) keyed by output name.
    additional_fetches: Prefixes of additional tensors stored in
      signature_def.inputs that should be fetched at prediction time. The
      "features" and "labels" tensors are handled automatically and should not
      be included.
    blacklist_feature_fetches: List of tensor names in the features dictionary
      which should be excluded from the fetches request. This is useful in
      scenarios where features are large (e.g. images) and can lead to excessive
      memory use if stored.
    tags: Model tags (e.g. 'serve' for serving or 'eval' for EvalSavedModel).
    eval_config: Eval config. Only used for setting default tags.
  """
  if tags is None:
    if eval_config:
      # Default to serving unless all the signature_names are eval. We do not
      # support running with a mixture of eval and non-eval tags.
      signatures = [s.signature_name for s in eval_config.model_specs]
      if eval_constants.EVAL_TAG in signatures:
        if not all(s == eval_constants.EVAL_TAG for s in signatures):
          tf.compat.v1.logging.warning(
              'mixture of eval and non-eval signatures used: '
              'eval_config={}'.format(eval_config))
        tags = [eval_constants.EVAL_TAG]
      else:
        tags = [tf.saved_model.SERVING]
    else:
      tags = [eval_constants.EVAL_TAG]

  # Backwards compatibility for legacy add_metrics_callbacks implementation.
  if tags == [eval_constants.EVAL_TAG]:
    # PyType doesn't know about the magic exports we do in post_export_metrics.
    # Additionally, the lines seem to get reordered in compilation, so we can't
    # just put the disable-attr on the add_metrics_callbacks lines.
    # pytype: disable=module-attr
    if not add_metrics_callbacks:
      add_metrics_callbacks = []
    # Always compute example weight and example count.
    example_count_callback = post_export_metrics.example_count()
    add_metrics_callbacks.append(example_count_callback)
    if example_weight_key:
      if isinstance(example_weight_key, dict):
        for output_name, key in example_weight_key.items():
          example_weight_callback = post_export_metrics.example_weight(
              key, metric_tag=output_name)
          add_metrics_callbacks.append(example_weight_callback)
      else:
        example_weight_callback = post_export_metrics.example_weight(
            example_weight_key)
        add_metrics_callbacks.append(example_weight_callback)
    # pytype: enable=module-attr

  return types.EvalSharedModel(
      model_path=eval_saved_model_path,
      add_metrics_callbacks=add_metrics_callbacks,
      include_default_metrics=include_default_metrics,
      example_weight_key=example_weight_key,
      additional_fetches=additional_fetches,
      model_loader=types.ModelLoader(
          tags=tags,
          construct_fn=model_util.model_construct_fn(
              eval_saved_model_path=eval_saved_model_path,
              add_metrics_callbacks=add_metrics_callbacks,
              include_default_metrics=include_default_metrics,
              additional_fetches=additional_fetches,
              blacklist_feature_fetches=blacklist_feature_fetches,
              tags=tags)))
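
A short sketch of how the tag defaulting above behaves, with hypothetical paths; this is an assumption-laden illustration, not code from the TFMA sources.

# Loading with an explicit serving tag skips the legacy add_metrics_callbacks
# block above, which only runs for the EvalSavedModel 'eval' tag.
serving_model = default_eval_shared_model(
    eval_saved_model_path='/tmp/saved_model',  # hypothetical path
    tags=[tf.saved_model.SERVING])

# With no tags and no eval_config, the helper falls back to the EvalSavedModel
# 'eval' tag and appends the example count/weight callbacks automatically.
legacy_model = default_eval_shared_model(
    eval_saved_model_path='/tmp/eval_saved_model',  # hypothetical path
    example_weight_key='age')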
def default_eval_shared_model(
    eval_saved_model_path: Text,
    add_metrics_callbacks: Optional[List[types.AddMetricsCallbackType]] = None,
    include_default_metrics: Optional[bool] = True,
    example_weight_key: Optional[Union[Text, Dict[Text, Text]]] = None,
    additional_fetches: Optional[List[Text]] = None,
    blacklist_feature_fetches: Optional[List[Text]] = None
) -> types.EvalSharedModel:
  """Returns default EvalSharedModel.

  Args:
    eval_saved_model_path: Path to EvalSavedModel.
    add_metrics_callbacks: Optional list of callbacks for adding additional
      metrics to the graph (see EvalSharedModel for more information on how to
      configure additional metrics). Metrics for example count and example
      weights will be added automatically.
    include_default_metrics: True to include the default metrics that are part
      of the saved model graph during evaluation.
    example_weight_key: Example weight key (single-output model) or dict of
      example weight keys (multi-output model) keyed by output name.
    additional_fetches: Prefixes of additional tensors stored in
      signature_def.inputs that should be fetched at prediction time. The
      "features" and "labels" tensors are handled automatically and should not
      be included.
    blacklist_feature_fetches: List of tensor names in the features dictionary
      which should be excluded from the fetches request. This is useful in
      scenarios where features are large (e.g. images) and can lead to excessive
      memory use if stored.
  """
  # Always compute example weight and example count.
  # PyType doesn't know about the magic exports we do in post_export_metrics.
  # Additionally, the lines seem to get reordered in compilation, so we can't
  # just put the disable-attr on the add_metrics_callbacks lines.
  # pytype: disable=module-attr
  if not add_metrics_callbacks:
    add_metrics_callbacks = []
  example_count_callback = post_export_metrics.example_count()
  add_metrics_callbacks.append(example_count_callback)
  if example_weight_key:
    if isinstance(example_weight_key, dict):
      for output_name, key in example_weight_key.items():
        example_weight_callback = post_export_metrics.example_weight(
            key, metric_tag=output_name)
        add_metrics_callbacks.append(example_weight_callback)
    else:
      example_weight_callback = post_export_metrics.example_weight(
          example_weight_key)
      add_metrics_callbacks.append(example_weight_callback)
  # pytype: enable=module-attr

  return types.EvalSharedModel(
      model_path=eval_saved_model_path,
      add_metrics_callbacks=add_metrics_callbacks,
      include_default_metrics=include_default_metrics,
      example_weight_key=example_weight_key,
      additional_fetches=additional_fetches,
      construct_fn=dofn.make_construct_fn(
          eval_saved_model_path,
          add_metrics_callbacks,
          include_default_metrics,
          additional_fetches=additional_fetches,
          blacklist_feature_fetches=blacklist_feature_fetches))
Example 23
    def testEvaluateWithSlicingAndDifferentBatchSizes(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)
        eval_shared_model = types.EvalSharedModel(
            model_path=eval_export_dir,
            add_metrics_callbacks=[_addExampleCountMetricCallback])
        extractors = [
            predict_extractor.PredictExtractor(eval_shared_model),
            slice_key_extractor.SliceKeyExtractor([
                slicer.SingleSliceSpec(),
                slicer.SingleSliceSpec(columns=['slice_key'])
            ])
        ]

        for batch_size in [1, 2, 4, 8]:

            with beam.Pipeline() as pipeline:
                example1 = self._makeExample(age=3.0,
                                             language='english',
                                             label=1.0,
                                             slice_key='first_slice')
                example2 = self._makeExample(age=3.0,
                                             language='chinese',
                                             label=0.0,
                                             slice_key='first_slice')
                example3 = self._makeExample(age=4.0,
                                             language='english',
                                             label=0.0,
                                             slice_key='second_slice')
                example4 = self._makeExample(age=5.0,
                                             language='chinese',
                                             label=1.0,
                                             slice_key='second_slice')
                example5 = self._makeExample(age=5.0,
                                             language='chinese',
                                             label=1.0,
                                             slice_key='second_slice')

                metrics, plots = (
                    pipeline
                    | 'Create' >> beam.Create([
                        example1.SerializeToString(),
                        example2.SerializeToString(),
                        example3.SerializeToString(),
                        example4.SerializeToString(),
                        example5.SerializeToString(),
                    ])
                    | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
                    | 'Extractors' >> evaluate.Extract(extractors=extractors)
                    | 'Evaluate' >> evaluate.Evaluate(
                        eval_shared_model=eval_shared_model,
                        desired_batch_size=batch_size))

                def check_result(got):
                    try:
                        self.assertEqual(3, len(got), 'got: %s' % got)
                        slices = {}
                        for slice_key, value in got:
                            slices[slice_key] = value
                        overall_slice = ()
                        first_slice = (('slice_key', b'first_slice'), )
                        second_slice = (('slice_key', b'second_slice'), )
                        self.assertItemsEqual(
                            list(slices.keys()),
                            [overall_slice, first_slice, second_slice])
                        self.assertDictElementsAlmostEqual(
                            slices[overall_slice], {
                                'accuracy': 0.4,
                                'label/mean': 0.6,
                                'my_mean_age': 4.0,
                                'my_mean_age_times_label': 2.6,
                                'added_example_count': 5.0
                            })
                        self.assertDictElementsAlmostEqual(
                            slices[first_slice], {
                                'accuracy': 1.0,
                                'label/mean': 0.5,
                                'my_mean_age': 3.0,
                                'my_mean_age_times_label': 1.5,
                                'added_example_count': 2.0
                            })
                        self.assertDictElementsAlmostEqual(
                            slices[second_slice], {
                                'accuracy': 0.0,
                                'label/mean': 2.0 / 3.0,
                                'my_mean_age': 14.0 / 3.0,
                                'my_mean_age_times_label': 10.0 / 3.0,
                                'added_example_count': 3.0
                            })

                    except AssertionError as err:
                        # This function is redefined every iteration, so it will have the
                        # right value of batch_size.
                        raise util.BeamAssertException(
                            'batch_size = %d, error: %s' % (batch_size, err))  # pylint: disable=cell-var-from-loop

                util.assert_that(metrics, check_result, label='metrics')
                util.assert_that(plots, util.is_empty(), label='plots')
Example 24
    def testEvaluateNoSlicingAddPostExportAndCustomMetrics(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)
        eval_shared_model = types.EvalSharedModel(
            model_path=eval_export_dir,
            add_metrics_callbacks=[
                _addExampleCountMetricCallback,
                # Note that since everything runs in-process this doesn't
                # actually test that the py_func can be correctly recreated
                # on workers in a distributed context.
                _addPyFuncMetricCallback,
                post_export_metrics.example_count(),
                post_export_metrics.example_weight(example_weight_key='age')
            ])
        extractors = [
            predict_extractor.PredictExtractor(eval_shared_model),
            slice_key_extractor.SliceKeyExtractor()
        ]

        with beam.Pipeline() as pipeline:
            example1 = self._makeExample(age=3.0,
                                         language='english',
                                         label=1.0)
            example2 = self._makeExample(age=3.0,
                                         language='chinese',
                                         label=0.0)
            example3 = self._makeExample(age=4.0,
                                         language='english',
                                         label=1.0)
            example4 = self._makeExample(age=5.0,
                                         language='chinese',
                                         label=0.0)

            metrics, plots = (
                pipeline
                | 'Create' >> beam.Create([
                    example1.SerializeToString(),
                    example2.SerializeToString(),
                    example3.SerializeToString(),
                    example4.SerializeToString()
                ])
                | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
                | 'Extract' >> evaluate.Extract(extractors=extractors)
                | 'Evaluate' >>
                evaluate.Evaluate(eval_shared_model=eval_shared_model))

            def check_result(got):
                try:
                    self.assertEqual(1, len(got), 'got: %s' % got)
                    (slice_key, value) = got[0]
                    self.assertEqual((), slice_key)
                    self.assertDictElementsAlmostEqual(
                        got_values_dict=value,
                        expected_values_dict={
                            'accuracy': 1.0,
                            'label/mean': 0.5,
                            'my_mean_age': 3.75,
                            'my_mean_age_times_label': 1.75,
                            'added_example_count': 4.0,
                            'py_func_label_sum': 2.0,
                            metric_keys.EXAMPLE_COUNT: 4.0,
                            metric_keys.EXAMPLE_WEIGHT: 15.0
                        })
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics, check_result, label='metrics')
            util.assert_that(plots, util.is_empty(), label='plots')
Example 25
    def testAggregateMultipleSlices(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)

        eval_saved_model = load.EvalSavedModel(eval_export_dir)
        eval_shared_model = types.EvalSharedModel(model_path=eval_export_dir)

        with beam.Pipeline() as pipeline:
            example1 = self._makeExample(age=3.0,
                                         language='english',
                                         label=1.0)
            example2 = self._makeExample(age=3.0,
                                         language='chinese',
                                         label=0.0)
            example3 = self._makeExample(age=4.0,
                                         language='english',
                                         label=1.0)
            example4 = self._makeExample(age=5.0,
                                         language='chinese',
                                         label=0.0)

            predict_result_english_slice = eval_saved_model.predict_list(
                [example1.SerializeToString(),
                 example3.SerializeToString()])

            predict_result_chinese_slice = eval_saved_model.predict_list(
                [example2.SerializeToString(),
                 example4.SerializeToString()])

            test_input = (
                create_test_input(predict_result_english_slice, [(
                    ('language', 'english'))]) +
                create_test_input(predict_result_chinese_slice, [(
                    ('language', 'chinese'))]) +
                # Overall slice
                create_test_input(
                    predict_result_english_slice +
                    predict_result_chinese_slice, [()]))

            metrics, _ = (
                pipeline
                | 'CreateTestInput' >> beam.Create(test_input)
                | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
                    eval_shared_model=eval_shared_model, desired_batch_size=3))

            def check_result(got):
                self.assertEqual(3, len(got), 'got: %s' % got)
                slices = {}
                for slice_key, metrics in got:
                    slices[slice_key] = metrics
                overall_slice = ()
                english_slice = (('language', 'english'))
                chinese_slice = (('language', 'chinese'))
                self.assertItemsEqual(
                    list(slices.keys()),
                    [overall_slice, english_slice, chinese_slice])
                self.assertDictElementsAlmostEqual(
                    slices[overall_slice], {
                        'accuracy': 1.0,
                        'label/mean': 0.5,
                        'my_mean_age': 3.75,
                        'my_mean_age_times_label': 1.75,
                    })
                self.assertDictElementsAlmostEqual(
                    slices[english_slice], {
                        'accuracy': 1.0,
                        'label/mean': 1.0,
                        'my_mean_age': 3.5,
                        'my_mean_age_times_label': 3.5,
                    })
                self.assertDictElementsAlmostEqual(
                    slices[chinese_slice], {
                        'accuracy': 1.0,
                        'label/mean': 0.0,
                        'my_mean_age': 4.0,
                        'my_mean_age_times_label': 0.0,
                    })

            util.assert_that(metrics, check_result)
Example 26
    def testEvaluateWithPlots(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = (
            fixed_prediction_estimator.simple_fixed_prediction_estimator(
                None, temp_eval_export_dir))
        eval_shared_model = types.EvalSharedModel(
            model_path=eval_export_dir,
            add_metrics_callbacks=[
                post_export_metrics.example_count(),
                post_export_metrics.auc_plots()
            ])
        extractors = [
            predict_extractor.PredictExtractor(eval_shared_model),
            slice_key_extractor.SliceKeyExtractor()
        ]

        with beam.Pipeline() as pipeline:
            example1 = self._makeExample(prediction=0.0, label=1.0)
            example2 = self._makeExample(prediction=0.7, label=0.0)
            example3 = self._makeExample(prediction=0.8, label=1.0)
            example4 = self._makeExample(prediction=1.0, label=1.0)

            metrics, plots = (
                pipeline
                | 'Create' >> beam.Create([
                    example1.SerializeToString(),
                    example2.SerializeToString(),
                    example3.SerializeToString(),
                    example4.SerializeToString()
                ])
                | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
                | 'Extract' >> evaluate.Extract(extractors=extractors)
                | 'Evaluate' >>
                evaluate.Evaluate(eval_shared_model=eval_shared_model))

            def check_metrics(got):
                try:
                    self.assertEqual(1, len(got), 'got: %s' % got)
                    (slice_key, value) = got[0]
                    self.assertEqual((), slice_key)
                    self.assertDictElementsAlmostEqual(
                        got_values_dict=value,
                        expected_values_dict={
                            metric_keys.EXAMPLE_COUNT: 4.0,
                        })
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics, check_metrics, label='metrics')

            def check_plots(got):
                try:
                    self.assertEqual(1, len(got), 'got: %s' % got)
                    (slice_key, value) = got[0]
                    self.assertEqual((), slice_key)
                    self.assertDictMatrixRowsAlmostEqual(
                        got_values_dict=value,
                        expected_values_dict={
                            metric_keys.AUC_PLOTS_MATRICES:
                            [(8001, [2, 1, 0, 1, 1.0 / 1.0, 1.0 / 3.0])],
                        })
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(plots, check_plots, label='plots')