Example #1
    def _runTestWithCustomCheck(self,
                                examples,
                                eval_export_dir,
                                metrics_callbacks,
                                slice_spec=None,
                                custom_metrics_check=None,
                                custom_plots_check=None,
                                custom_result_check=None):
        # make sure we are doing some checks
        self.assertTrue(custom_metrics_check is not None
                        or custom_plots_check is not None
                        or custom_result_check is not None)
        serialized_examples = [ex.SerializeToString() for ex in examples]
        slicing_specs = None
        if slice_spec:
            slicing_specs = [s.to_proto() for s in slice_spec]
        eval_config = config.EvalConfig(slicing_specs=slicing_specs)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=eval_export_dir,
            add_metrics_callbacks=metrics_callbacks)
        extractors = model_eval_lib.default_extractors(
            eval_config=eval_config, eval_shared_model=eval_shared_model)
        tfx_io = raw_tf_record.RawBeamRecordTFXIO(
            physical_format='inmemory',
            raw_record_column_name=constants.ARROW_INPUT_COLUMN,
            telemetry_descriptors=['TFMATest'])
        with beam.Pipeline() as pipeline:
            (metrics, plots), _ = (
                pipeline
                | 'Create' >> beam.Create(serialized_examples)
                | 'BatchExamples' >> tfx_io.BeamSource()
                | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
                | 'ComputeMetricsAndPlots' >>
                legacy_metrics_and_plots_evaluator._ComputeMetricsAndPlots(  # pylint: disable=protected-access
                    eval_shared_model=eval_shared_model,
                    compute_confidence_intervals=self.compute_confidence_intervals,
                    random_seed_for_testing=self.deterministic_test_seed))
            if custom_metrics_check is not None:
                util.assert_that(metrics,
                                 custom_metrics_check,
                                 label='metrics')
            if custom_plots_check is not None:
                util.assert_that(plots, custom_plots_check, label='plot')

        result = pipeline.run()
        if custom_result_check is not None:
            custom_result_check(result)
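
For orientation, here is a minimal sketch of how a test might call this helper. The test name, example values, and expected count are illustrative assumptions; _getEvalExportDir, simple_linear_classifier, _makeExample, assertDictElementsAlmostEqual, and _addExampleCountMetricCallback are the helpers that appear in the other examples on this page.

    def testAddedExampleCountSketch(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)
        examples = [
            self._makeExample(age=3.0, language='english', label=1.0),
            self._makeExample(age=5.0, language='chinese', label=0.0),
        ]

        def check_metrics(got):
            # Expect a single overall slice containing the callback's metric.
            self.assertEqual(1, len(got))
            (slice_key, value) = got[0]
            self.assertEqual((), slice_key)
            self.assertDictElementsAlmostEqual(value,
                                               {'added_example_count': 2.0})

        self._runTestWithCustomCheck(examples,
                                     eval_export_dir,
                                     [_addExampleCountMetricCallback],
                                     custom_metrics_check=check_metrics)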
Example #2
    def testEvaluateWithSlicingAndUncertainty(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=eval_export_dir,
            add_metrics_callbacks=[_addExampleCountMetricCallback])
        extractors = [
            legacy_predict_extractor.PredictExtractor(eval_shared_model),
            slice_key_extractor.SliceKeyExtractor([
                slicer.SingleSliceSpec(),
                slicer.SingleSliceSpec(columns=['slice_key'])
            ])
        ]

        for batch_size in [1, 2, 4, 8]:

            with beam.Pipeline() as pipeline:
                example1 = self._makeExample(age=3.0,
                                             language='english',
                                             label=1.0,
                                             slice_key='first_slice')
                example2 = self._makeExample(age=3.0,
                                             language='chinese',
                                             label=0.0,
                                             slice_key='first_slice')
                example3 = self._makeExample(age=4.0,
                                             language='english',
                                             label=0.0,
                                             slice_key='second_slice')
                example4 = self._makeExample(age=5.0,
                                             language='chinese',
                                             label=1.0,
                                             slice_key='second_slice')
                example5 = self._makeExample(age=5.0,
                                             language='chinese',
                                             label=1.0,
                                             slice_key='second_slice')

                (metrics, _), _ = (
                    pipeline
                    | 'Create' >> beam.Create([
                        example1.SerializeToString(),
                        example2.SerializeToString(),
                        example3.SerializeToString(),
                        example4.SerializeToString(),
                        example5.SerializeToString(),
                    ])
                    | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                    | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
                    | 'ComputeMetricsAndPlots' >>
                    metrics_and_plots_evaluator._ComputeMetricsAndPlots(
                        eval_shared_model=eval_shared_model,
                        desired_batch_size=batch_size,
                        compute_confidence_intervals=True))

                def check_result(got):
                    try:
                        self.assertEqual(3, len(got), 'got: %s' % got)
                        slices = {}
                        for slice_key, value in got:
                            slices[slice_key] = value
                        overall_slice = ()
                        first_slice = (('slice_key', 'first_slice'), )
                        second_slice = (('slice_key', 'second_slice'), )
                        self.assertCountEqual(
                            list(slices.keys()),
                            [overall_slice, first_slice, second_slice])
                        self.assertDictElementsWithTDistributionAlmostEqual(
                            slices[overall_slice], {
                                'accuracy': 0.4,
                                'label/mean': 0.6,
                                'my_mean_age': 4.0,
                                'my_mean_age_times_label': 2.6,
                                'added_example_count': 5.0
                            })
                        self.assertDictElementsWithTDistributionAlmostEqual(
                            slices[first_slice], {
                                'accuracy': 1.0,
                                'label/mean': 0.5,
                                'my_mean_age': 3.0,
                                'my_mean_age_times_label': 1.5,
                                'added_example_count': 2.0
                            })
                        self.assertDictElementsWithTDistributionAlmostEqual(
                            slices[second_slice], {
                                'accuracy': 0.0,
                                'label/mean': 2.0 / 3.0,
                                'my_mean_age': 14.0 / 3.0,
                                'my_mean_age_times_label': 10.0 / 3.0,
                                'added_example_count': 3.0
                            })

                    except AssertionError as err:
                        # This function is redefined every iteration, so it will have the
                        # right value of batch_size.
                        raise util.BeamAssertException(
                            'batch_size = %d, error: %s' % (batch_size, err))  # pylint: disable=cell-var-from-loop

                util.assert_that(metrics, check_result, label='metrics')
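
As a quick sanity check on the expected values, here is a short worked sketch of the second_slice numbers, derived only from example3, example4, and example5 above:

    # second_slice holds example3 (age=4, label=0), example4 and example5
    # (both age=5, label=1).
    labels = [0.0, 1.0, 1.0]
    ages = [4.0, 5.0, 5.0]
    assert sum(labels) / 3 == 2.0 / 3.0    # label/mean
    assert sum(ages) / 3 == 14.0 / 3.0     # my_mean_age
    assert sum(a * l for a, l in zip(ages, labels)) / 3 == 10.0 / 3.0  # my_mean_age_times_label
    assert len(labels) == 3                # added_example_count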
Example #3
    def assertGeneralMetricsComputedWithBeamAre(
            self, eval_saved_model_path: str,
            examples_pcollection: beam.pvalue.PCollection,
            slice_spec: List[slicer.SingleSliceSpec],
            add_metrics_callbacks: List[types.AddMetricsCallbackType],
            expected_slice_metrics: Dict[Any, Dict[str, Any]]):
        """Checks metrics computed using Beam.

    A more general version of assertMetricsComputedWithBeamAre. Note that the
    caller is responsible for setting up and running the Beam pipeline.

    Example usage:
      def add_metrics(features, predictions, labels):
        metric_ops = {
          'mse': tf.metrics.mean_squared_error(labels, predictions['logits']),
          'mae': tf.metrics.mean_absolute_error(labels, predictions['logits']),
        }
        return metric_ops

      with beam.Pipeline() as pipeline:
        expected_slice_metrics = {
            (): {
              'mae': 0.1,
              'mse': 0.2,
              tfma.post_export_metrics.metric_keys.AUC:
                tfma.test.BoundedValue(lower_bound=0.5)
            },
            (('age', 10),): {
              'mae': 0.2,
              'mse': 0.3,
              tfma.post_export_metrics.metric_keys.AUC:
                tfma.test.BoundedValue(lower_bound=0.5)
            },
        }
        examples = pipeline | 'ReadExamples' >> beam.io.ReadFromTFRecord(path)
        self.assertGeneralMetricsComputedWithBeamAre(
          eval_saved_model_path=path,
          examples_pcollection=examples,
          slice_spec=[tfma.slicer.SingleSliceSpec(),
                      tfma.slicer.SingleSliceSpec(columns=['age'])],
          add_metrics_callbacks=[
            add_metrics, tfma.post_export_metrics.auc()],
          expected_slice_metrics=expected_slice_metrics)

    Args:
      eval_saved_model_path: Path to the directory containing the
        EvalSavedModel.
      examples_pcollection: A PCollection of serialized example bytes.
      slice_spec: List of slice specifications.
      add_metrics_callbacks: Callbacks for adding additional metrics.
      expected_slice_metrics: Dictionary of dictionaries describing the expected
        metrics for each slice. The outer dictionary maps slice keys to the
        expected metrics for that slice.
    """
        def check_metrics(got):
            """Check metrics callback."""
            try:
                slices = {}
                for slice_key, value in got:
                    slices[slice_key] = value
                self.assertCountEqual(list(slices.keys()),
                                      list(expected_slice_metrics.keys()))
                for slice_key, expected_metrics in expected_slice_metrics.items():
                    self.assertDictElementsWithinBounds(
                        got_values_dict=slices[slice_key],
                        expected_values_dict=expected_metrics)
            except AssertionError as err:
                raise beam_util.BeamAssertException(err)

        slicing_specs = None
        if slice_spec:
            slicing_specs = [s.to_proto() for s in slice_spec]
        eval_config = config_pb2.EvalConfig(slicing_specs=slicing_specs)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=eval_saved_model_path,
            add_metrics_callbacks=add_metrics_callbacks)
        extractors = model_eval_lib.default_extractors(
            eval_config=eval_config, eval_shared_model=eval_shared_model)

        tfx_io = raw_tf_record.RawBeamRecordTFXIO(
            physical_format='inmemory',
            raw_record_column_name=constants.ARROW_INPUT_COLUMN,
            telemetry_descriptors=['TFMATest'])
        # pylint: disable=no-value-for-parameter
        (metrics, _), _ = (
            examples_pcollection
            | 'BatchExamples' >> tfx_io.BeamSource()
            | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
            | 'Extract' >> Extract(extractors=extractors)
            | 'ComputeMetricsAndPlots' >>
            legacy_metrics_and_plots_evaluator._ComputeMetricsAndPlots(  # pylint: disable=protected-access
                eval_shared_model=eval_shared_model))
        # pylint: enable=no-value-for-parameter

        beam_util.assert_that(metrics, check_metrics)
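
Because the caller owns the pipeline here, the examples PCollection does not have to come from a TFRecord file. Below is a minimal sketch using an in-memory beam.Create; eval_export_dir is assumed to point at an already-exported EvalSavedModel (for example, from simple_linear_classifier as in Example #2), and the feature values and expected count are illustrative.

    with beam.Pipeline() as pipeline:
        examples = pipeline | 'CreateExamples' >> beam.Create([
            self._makeExample(age=3.0, language='english',
                              label=1.0).SerializeToString(),
            self._makeExample(age=5.0, language='chinese',
                              label=0.0).SerializeToString(),
        ])
        self.assertGeneralMetricsComputedWithBeamAre(
            eval_saved_model_path=eval_export_dir,
            examples_pcollection=examples,
            slice_spec=[slicer.SingleSliceSpec()],
            add_metrics_callbacks=[_addExampleCountMetricCallback],
            expected_slice_metrics={(): {'added_example_count': 2.0}})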
Example #4
    def assertMetricsComputedWithBeamAre(
        self,
        eval_saved_model_path: str,
        serialized_examples: List[bytes],
        expected_metrics: Dict[str, Any],
        add_metrics_callbacks: Optional[List[
            types.AddMetricsCallbackType]] = None):
        """Checks metrics computed using Beam.

    Metrics will be computed over all examples, without any slicing. If you
    want to provide your own PCollection (e.g. read a large number of examples
    from a file), if you want to check metrics over certain slices, or if you
    want to add additional post-export metrics, use the more general
    assertGeneralMetricsComputedWithBeamAre.

    Example usage:
      self.assertMetricsComputedWithBeamAre(
        eval_saved_model_path=path,
        serialized_examples=[self.makeExample(age=5, label=1.0),
                             self.makeExample(age=10, label=0.0)],
        expected_metrics={'average_loss': 0.1})

    Args:
      eval_saved_model_path: Path to the directory containing the
        EvalSavedModel.
      serialized_examples: List of serialized example bytes.
      expected_metrics: Dictionary of expected metric values.
      add_metrics_callbacks: Optional. Callbacks for adding additional metrics.
    """
        def check_metrics(got):
            """Check metrics callback."""
            try:
                self.assertEqual(
                    1, len(got),
                    'expecting metrics for exactly one slice, but got %d '
                    'slices instead. metrics were: %s' % (len(got), got))
                (slice_key, value) = got[0]
                self.assertEqual((), slice_key)
                self.assertDictElementsWithinBounds(
                    got_values_dict=value,
                    expected_values_dict=expected_metrics)
            except AssertionError as err:
                raise beam_util.BeamAssertException(err)

        eval_config = config_pb2.EvalConfig()
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=eval_saved_model_path,
            add_metrics_callbacks=add_metrics_callbacks)
        extractors = model_eval_lib.default_extractors(
            eval_config=eval_config, eval_shared_model=eval_shared_model)

        tfx_io = raw_tf_record.RawBeamRecordTFXIO(
            physical_format='inmemory',
            raw_record_column_name=constants.ARROW_INPUT_COLUMN,
            telemetry_descriptors=['TFMATest'])
        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            (metrics, _), _ = (
                pipeline
                | 'CreateExamples' >> beam.Create(serialized_examples)
                | 'BatchExamples' >> tfx_io.BeamSource()
                | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
                | 'Extract' >> Extract(extractors=extractors)
                | 'ComputeMetricsAndPlots' >>
                legacy_metrics_and_plots_evaluator._ComputeMetricsAndPlots(  # pylint: disable=protected-access
                    eval_shared_model=eval_shared_model))
            # pylint: enable=no-value-for-parameter

            beam_util.assert_that(metrics, check_metrics)
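
The optional add_metrics_callbacks argument can add metrics beyond those exported with the model. A hedged sketch, reusing the callback from Example #2; eval_export_dir is again assumed to hold an exported EvalSavedModel and the expected value is illustrative:

    self.assertMetricsComputedWithBeamAre(
        eval_saved_model_path=eval_export_dir,
        serialized_examples=[
            self._makeExample(age=3.0, language='english',
                              label=1.0).SerializeToString(),
            self._makeExample(age=5.0, language='chinese',
                              label=0.0).SerializeToString(),
        ],
        expected_metrics={'added_example_count': 2.0},
        add_metrics_callbacks=[_addExampleCountMetricCallback])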
Example #5
    def testModelAgnosticConstructFn(self):
        # End to end test for the entire flow going from tf.Examples -> metrics
        # with slicing.
        with beam.Pipeline() as pipeline:
            # Set up the inputs. All we need are tf.Examples and an example parsing
            # spec with explicit mapping for key to (Features, Predictions, Labels).
            # TODO(b/119788402): Add fairness data examples/callbacks as another
            # test.
            examples = [
                self._makeExample(age=3.0,
                                  language='english',
                                  probabilities=1.0,
                                  labels=1.0),
                self._makeExample(age=3.0,
                                  language='chinese',
                                  probabilities=3.0,
                                  labels=0.0),
                self._makeExample(age=4.0,
                                  language='english',
                                  probabilities=2.0,
                                  labels=1.0),
                self._makeExample(age=5.0,
                                  language='chinese',
                                  probabilities=3.0,
                                  labels=0.0),
                # Add some examples with no language.
                self._makeExample(age=5.0, probabilities=2.0, labels=10.0),
                self._makeExample(age=6.0, probabilities=1.0, labels=0.0)
            ]
            serialized_examples = [e.SerializeToString() for e in examples]

            # Set up a config to bucket our example keys.
            feature_map = {
                'age': tf.io.FixedLenFeature([], tf.float32),
                'language': tf.io.VarLenFeature(tf.string),
                'probabilities': tf.io.FixedLenFeature([], tf.float32),
                'labels': tf.io.FixedLenFeature([], tf.float32)
            }

            model_agnostic_config = agnostic_predict.ModelAgnosticConfig(
                label_keys=['labels'],
                prediction_keys=['probabilities'],
                feature_spec=feature_map)

            # Set up the Model Agnostic Extractor
            extractors = [
                model_agnostic_extractor.ModelAgnosticExtractor(
                    model_agnostic_config=model_agnostic_config,
                    desired_batch_size=3),
                slice_key_extractor.SliceKeyExtractor([
                    slicer.SingleSliceSpec(),
                    slicer.SingleSliceSpec(columns=['language'])
                ])
            ]

            # Set up the metrics we wish to calculate via a metric callback. In
            # particular, this callback computes the mean and total over the
            # combined labels and predictions (a sketch of add_mean_callback
            # follows this example).
            eval_shared_model = types.EvalSharedModel(
                add_metrics_callbacks=[add_mean_callback],
                construct_fn=model_agnostic_evaluate_graph.make_construct_fn(
                    add_metrics_callbacks=[add_mean_callback],
                    config=model_agnostic_config))

            # Run our pipeline doing Extract -> Slice -> Fanout -> Calculate Metrics.
            (metrics, _), _ = (
                pipeline
                | 'Create Examples' >> beam.Create(serialized_examples)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
                | 'ComputeMetricsAndPlots' >>
                legacy_metrics_and_plots_evaluator._ComputeMetricsAndPlots(  # pylint: disable=protected-access
                    eval_shared_model=eval_shared_model))

            # Verify our metrics are properly generated per slice.
            def check_result(got):
                self.assertEqual(3, len(got), 'got: %s' % got)
                slices = {}
                for slice_key, metrics in got:
                    slices[slice_key] = metrics
                overall_slice = ()
                english_slice = (('language', 'english'), )
                chinese_slice = (('language', 'chinese'), )

                self.assertCountEqual(
                    list(slices.keys()),
                    [overall_slice, english_slice, chinese_slice])
                # Overall slice has label/predictions sum = 24 and 12 elements.
                self.assertDictElementsAlmostEqual(slices[overall_slice], {
                    'tf_metric_mean': 2.0,
                    'py_func_total_label': 24.0,
                })
                # English slice has label/predictions sum = 5 and 4 elements.
                self.assertDictElementsAlmostEqual(slices[english_slice], {
                    'tf_metric_mean': 1.25,
                    'py_func_total_label': 5.0,
                })
                # Chinese slice has label/predictions sum = 6 and 4 elements.
                self.assertDictElementsAlmostEqual(slices[chinese_slice], {
                    'tf_metric_mean': 1.5,
                    'py_func_total_label': 6.0,
                })

            util.assert_that(metrics, check_result)
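
add_mean_callback is referenced above but not shown. Below is a hedged sketch of what such a callback could look like, reconstructed from the values the test asserts (a streaming mean and a running total over the combined labels and predictions). It assumes the legacy TFMA callback signature (features, predictions, labels) -> dict of (value_op, update_op) pairs and that labels and predictions arrive as dicts keyed by the names in ModelAgnosticConfig; the real callback in the TFMA test module may differ.

    import numpy as np
    import tensorflow as tf


    def add_mean_callback(features, predictions, labels):
        """Adds a streaming mean and running total over labels + predictions."""
        del features  # Unused in this sketch.
        # Combine labels and predictions, as suggested by the
        # 'label/predictions sum' comments in the checks above.
        values = tf.concat([
            tf.cast(labels['labels'], tf.float64),
            tf.cast(predictions['probabilities'], tf.float64),
        ], axis=0)
        metric_ops = {
            # tf.metrics.mean returns a (value_op, update_op) pair directly.
            'tf_metric_mean': tf.compat.v1.metrics.mean(values),
        }
        # Running total kept in a local metric variable and updated through a
        # py_func, which is what the 'py_func_total_label' name hints at.
        total_label = tf.compat.v1.Variable(
            initial_value=0.0,
            dtype=tf.float64,
            trainable=False,
            collections=[
                tf.compat.v1.GraphKeys.METRIC_VARIABLES,
                tf.compat.v1.GraphKeys.LOCAL_VARIABLES,
            ],
            name='total_label')
        update_op = tf.compat.v1.assign_add(
            total_label,
            tf.compat.v1.py_func(lambda x: np.sum(x, dtype=np.float64),
                                 [values], tf.float64))
        metric_ops['py_func_total_label'] = (tf.identity(total_label),
                                             update_op)
        return metric_ops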