def testMultiModelPredict(self):
    temp_eval_export_dir = self._getEvalExportDir()
    _, model1_dir = linear_classifier.simple_linear_classifier(
        None, temp_eval_export_dir)
    model1 = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=model1_dir)
    _, model2_dir = linear_classifier.simple_linear_classifier(
        None, temp_eval_export_dir)
    model2 = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=model2_dir)
    eval_shared_model = {'model1': model1, 'model2': model2}
    eval_config = config.EvalConfig(model_specs=[
        config.ModelSpec(name='model1', example_weight_key='age'),
        config.ModelSpec(name='model2', example_weight_key='age')
    ])

    tfx_io = raw_tf_record.RawBeamRecordTFXIO(
        physical_format='inmemory',
        raw_record_column_name=constants.ARROW_INPUT_COLUMN,
        telemetry_descriptors=['TFMATest'])
    extractor = predict_extractor.PredictExtractor(
        eval_shared_model, eval_config=eval_config)
    with beam.Pipeline() as pipeline:
      examples = [
          self._makeExample(age=3.0, language='english', label=1.0),
          self._makeExample(age=3.0, language='chinese', label=0.0),
          self._makeExample(age=4.0, language='english', label=1.0),
          self._makeExample(age=5.0, language='chinese', label=0.0),
      ]
      serialized_examples = [e.SerializeToString() for e in examples]

      predict_extracts = (
          pipeline
          | beam.Create(serialized_examples, reshuffle=False)
          | 'BatchExamples' >> tfx_io.BeamSource(batch_size=2)
          | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
          | 'Predict' >> extractor.ptransform)

      def check_result(got):
        try:
          self.assertLen(got, 2)
          for item in got:
            self.assertIn(constants.FEATURES_KEY, item)
            for feature in ('language', 'age'):
              for features_dict in item[constants.FEATURES_KEY]:
                self.assertIn(feature, features_dict)
            self.assertIn(constants.LABELS_KEY, item)
            self.assertIn(constants.PREDICTIONS_KEY, item)
            for model in ('model1', 'model2'):
              for predictions_dict in item[constants.PREDICTIONS_KEY]:
                self.assertIn(model, predictions_dict)
            self.assertIn(constants.EXAMPLE_WEIGHTS_KEY, item)
            for i in range(len(item[constants.FEATURES_KEY])):
              self.assertAlmostEqual(item[constants.FEATURES_KEY][i]['age'],
                                     item[constants.EXAMPLE_WEIGHTS_KEY][i])

        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(predict_extracts, check_result)
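
# The tests in this listing rely on a _makeExample helper defined elsewhere
# (presumably on a shared test base class). The following module-level sketch is
# an assumption for illustration, not the original implementation: it builds a
# tf.train.Example from keyword arguments, encoding floats as float_list
# features and strings as bytes_list features.
import tensorflow as tf


def make_example_sketch(**kwargs):
    """Builds a tf.train.Example from simple float/str keyword arguments."""
    feature = {}
    for name, value in kwargs.items():
        if isinstance(value, float):
            feature[name] = tf.train.Feature(
                float_list=tf.train.FloatList(value=[value]))
        else:
            feature[name] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[value.encode('utf-8')]))
    return tf.train.Example(features=tf.train.Features(feature=feature))


# Example usage: make_example_sketch(age=3.0, language='english', label=1.0)
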
    def testMultiModelPredict(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, model1_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)
        model1 = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=model1_dir)
        _, model2_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)
        model2 = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=model2_dir)
        eval_config = config.EvalConfig(model_specs=[
            config.ModelSpec(name='model1', example_weight_key='age'),
            config.ModelSpec(name='model2', example_weight_key='age')
        ])

        with beam.Pipeline() as pipeline:
            examples = [
                self._makeExample(age=3.0, language='english', label=1.0),
                self._makeExample(age=3.0, language='chinese', label=0.0),
                self._makeExample(age=4.0, language='english', label=1.0),
                self._makeExample(age=5.0, language='chinese', label=0.0),
            ]
            serialized_examples = [e.SerializeToString() for e in examples]

            predict_extracts = (
                pipeline
                | beam.Create(serialized_examples, reshuffle=False)
                # Our diagnostic outputs pass types.Extracts throughout; however, our
                # aggregating functions do not use this interface.
                | beam.Map(lambda x: {constants.INPUT_KEY: x})
                | 'Predict' >> predict_extractor._TFMAPredict(
                    eval_shared_models={
                        'model1': model1,
                        'model2': model2
                    },
                    desired_batch_size=3,
                    eval_config=eval_config))

            def check_result(got):
                try:
                    self.assertLen(got, 4)
                    for item in got:
                        self.assertIn(constants.FEATURES_KEY, item)
                        for feature in ('language', 'age'):
                            self.assertIn(feature,
                                          item[constants.FEATURES_KEY])
                        self.assertIn(constants.LABELS_KEY, item)
                        self.assertIn(constants.PREDICTIONS_KEY, item)
                        for model in ('model1', 'model2'):
                            self.assertIn(model,
                                          item[constants.PREDICTIONS_KEY])
                        self.assertIn(constants.EXAMPLE_WEIGHTS_KEY, item)
                        self.assertAlmostEqual(
                            item[constants.FEATURES_KEY]['age'],
                            item[constants.EXAMPLE_WEIGHTS_KEY])

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(predict_extracts, check_result)
    def testEvaluateExistingMetricsWithExportedCustomMetrics(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)

        eval_saved_model = load.EvalSavedModel(eval_export_dir)
        example1 = self._makeExample(age=3.0, language='english', label=1.0)
        features_predictions_labels = self.predict_injective_single_example(
            eval_saved_model, example1.SerializeToString())
        eval_saved_model.perform_metrics_update(features_predictions_labels)

        example2 = self._makeExample(age=2.0, language='chinese', label=0.0)
        features_predictions_labels = self.predict_injective_single_example(
            eval_saved_model, example2.SerializeToString())
        eval_saved_model.perform_metrics_update(features_predictions_labels)

        metric_values = eval_saved_model.get_metric_values()
        self.assertDictElementsAlmostEqual(
            metric_values, {
                'accuracy': 1.0,
                'auc': 1.0,
                'my_mean_age': 2.5,
                'my_mean_label': 0.5,
                'my_mean_age_times_label': 1.5
            })

        self.assertIn('my_mean_prediction', metric_values)
        self.assertIn('prediction/mean', metric_values)
        self.assertAlmostEqual(metric_values['prediction/mean'],
                               metric_values['my_mean_prediction'],
                               places=5)
    def testNativeEvalSavedModelMetricComputations(self):
        temp_export_dir = self._getExportDir()
        _, export_dir = linear_classifier.simple_linear_classifier(
            None, temp_export_dir)

        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir)

        computation = (
            eval_saved_model_util.metric_computations_using_eval_saved_model(
                '', eval_shared_model.model_loader)[0])

        examples = [
            self._makeExample(age=3.0, language='english', label=1.0),
            self._makeExample(age=3.0, language='chinese', label=0.0),
            self._makeExample(age=4.0, language='english', label=1.0),
            self._makeExample(age=5.0, language='chinese', label=0.0)
        ]

        extracts = []
        for e in examples:
            extracts.append({constants.INPUT_KEY: e.SerializeToString()})

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(extracts)
                | 'Process' >> beam.ParDo(computation.preprocessor)
                | 'ToStandardMetricInputs' >> beam.Map(
                    metric_types.StandardMetricInputs)
                | 'AddSlice' >> beam.Map(lambda x: ((), x))
                | 'ComputeMetric' >> beam.CombinePerKey(computation.combiner))

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())
                    self.assertDictElementsAlmostEqual(
                        got_metrics, {
                            metric_types.MetricKey(name='accuracy'):
                            1.0,
                            metric_types.MetricKey(name='label/mean'):
                            0.5,
                            metric_types.MetricKey(name='my_mean_age'):
                            3.75,
                            metric_types.MetricKey(name='my_mean_age_times_label'):
                            1.75
                        })

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
    def testEvaluateNoSlicingAddPostExportAndCustomMetrics(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)

        with beam.Pipeline() as pipeline:
            example1 = self._makeExample(age=3.0,
                                         language='english',
                                         label=1.0)
            example2 = self._makeExample(age=3.0,
                                         language='chinese',
                                         label=0.0)
            example3 = self._makeExample(age=4.0,
                                         language='english',
                                         label=1.0)
            example4 = self._makeExample(age=5.0,
                                         language='chinese',
                                         label=0.0)

            metrics, plots = (pipeline
                              | beam.Create([
                                  example1.SerializeToString(),
                                  example2.SerializeToString(),
                                  example3.SerializeToString(),
                                  example4.SerializeToString()
                              ])
                              | evaluate.Evaluate(
                                  eval_saved_model_path=eval_export_dir,
                                  add_metrics_callbacks=[
                                      _addExampleCountMetricCallback,
                                      post_export_metrics.example_count(),
                                      post_export_metrics.example_weight(
                                          example_weight_key='age')
                                  ]))

            def check_result(got):
                try:
                    self.assertEqual(1, len(got), 'got: %s' % got)
                    (slice_key, value) = got[0]
                    self.assertEqual((), slice_key)
                    self.assertDictElementsAlmostEqual(
                        got_values_dict=value,
                        expected_values_dict={
                            'accuracy': 1.0,
                            'label/mean': 0.5,
                            'my_mean_age': 3.75,
                            'my_mean_age_times_label': 1.75,
                            'added_example_count': 4.0,
                            metric_keys.EXAMPLE_COUNT: 4.0,
                            metric_keys.EXAMPLE_WEIGHT: 15.0
                        })
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics, check_result, label='metrics')
            util.assert_that(plots, util.is_empty(), label='plots')
    def testEvaluateNoSlicing(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=eval_export_dir,
            add_metrics_callbacks=[_addExampleCountMetricCallback])
        extractors = [
            predict_extractor.PredictExtractor(eval_shared_model),
            slice_key_extractor.SliceKeyExtractor()
        ]

        with beam.Pipeline() as pipeline:
            example1 = self._makeExample(age=3.0,
                                         language='english',
                                         label=1.0)
            example2 = self._makeExample(age=3.0,
                                         language='chinese',
                                         label=0.0)
            example3 = self._makeExample(age=4.0,
                                         language='english',
                                         label=1.0)
            example4 = self._makeExample(age=5.0,
                                         language='chinese',
                                         label=0.0)

            metrics, _ = (
                pipeline
                | 'Create' >> beam.Create([
                    example1.SerializeToString(),
                    example2.SerializeToString(),
                    example3.SerializeToString(),
                    example4.SerializeToString()
                ])
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
                | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator.
                ComputeMetricsAndPlots(eval_shared_model=eval_shared_model))

            def check_result(got):
                try:
                    self.assertEqual(1, len(got), 'got: %s' % got)
                    (slice_key, value) = got[0]
                    self.assertEqual((), slice_key)
                    self.assertDictElementsAlmostEqual(
                        value, {
                            'accuracy': 1.0,
                            'label/mean': 0.5,
                            'my_mean_age': 3.75,
                            'my_mean_age_times_label': 1.75,
                            'added_example_count': 4.0
                        })
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics, check_result)
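
# Several of these tests pass an _addExampleCountMetricCallback into
# add_metrics_callbacks. TFMA callbacks of this kind receive the features,
# predictions, and labels dictionaries and return a dict mapping metric names to
# (value_op, update_op) pairs, in the style of tf.compat.v1.metrics. The sketch
# below is an assumption for illustration, not the original helper; it keeps a
# running example count in a local metric variable.
import tensorflow as tf


def add_example_count_metric_callback_sketch(features_dict, predictions_dict,
                                             labels_dict):
    del features_dict, labels_dict  # Only the batch size is needed.
    count = tf.compat.v1.get_variable(
        'added_example_count_total',
        initializer=0.0,
        trainable=False,
        collections=[
            tf.compat.v1.GraphKeys.LOCAL_VARIABLES,
            tf.compat.v1.GraphKeys.METRIC_VARIABLES
        ])
    # Count the rows of an arbitrary prediction tensor in this batch.
    any_prediction = list(predictions_dict.values())[0]
    batch_size = tf.cast(tf.shape(any_prediction)[0], tf.float32)
    update_op = tf.compat.v1.assign_add(count, batch_size)
    return {'added_example_count': (count, update_op)}
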
    def testPredict(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)

        with beam.Pipeline() as pipeline:
            example1 = self._makeExample(age=3.0,
                                         language='english',
                                         label=1.0)
            example2 = self._makeExample(age=3.0,
                                         language='chinese',
                                         label=0.0)
            example3 = self._makeExample(age=4.0,
                                         language='english',
                                         label=1.0)
            example4 = self._makeExample(age=5.0,
                                         language='chinese',
                                         label=0.0)

            predict_extracts = (
                pipeline
                | beam.Create([
                    example1.SerializeToString(),
                    example2.SerializeToString(),
                    example3.SerializeToString(),
                    example4.SerializeToString()
                ])
                # Our diagnostic outputs pass types.ExampleAndExtracts throughout;
                # however, our aggregating functions do not use this interface.
                | beam.Map(
                    lambda x: types.ExampleAndExtracts(example=x, extracts={}))
                | 'Predict' >> predict_extractor.TFMAPredict(
                    eval_saved_model_path=eval_export_dir,
                    add_metrics_callbacks=None,
                    shared_handle=shared.Shared(),
                    desired_batch_size=3))

            def check_result(got):
                try:
                    self.assertEqual(4, len(got), 'got: %s' % got)
                    for item in got:
                        extracts_dict = item.extracts
                        self.assertIn('fpl', extracts_dict)
                        fpl = extracts_dict['fpl']
                        # Verify fpl contains features, probabilities, and correct labels.
                        self.assertIn('language', fpl.features)
                        self.assertIn('age', fpl.features)
                        self.assertIn('label', fpl.features)
                        self.assertIn('probabilities', fpl.predictions)
                        self.assertAlmostEqual(fpl.features['label'],
                                               fpl.labels['__labels'])
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(predict_extracts, check_result)
    def testAggregateOverallSlice(self):

        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)

        eval_saved_model = load.EvalSavedModel(eval_export_dir)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=eval_export_dir)

        with beam.Pipeline() as pipeline:
            example1 = self._makeExample(age=3.0,
                                         language='english',
                                         label=1.0)
            example2 = self._makeExample(age=3.0,
                                         language='chinese',
                                         label=0.0)
            example3 = self._makeExample(age=4.0,
                                         language='english',
                                         label=1.0)
            example4 = self._makeExample(age=5.0,
                                         language='chinese',
                                         label=0.0)

            predict_result = eval_saved_model.as_features_predictions_labels(
                eval_saved_model.predict_list([
                    example1.SerializeToString(),
                    example2.SerializeToString(),
                    example3.SerializeToString(),
                    example4.SerializeToString()
                ]))

            metrics, _ = (
                pipeline
                | 'CreateTestInput' >> beam.Create(
                    create_test_input(predict_result, [()]))
                | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
                    eval_shared_model=eval_shared_model, desired_batch_size=3))

            def check_result(got):
                self.assertEqual(1, len(got), 'got: %s' % got)
                slice_key, metrics = got[0]
                self.assertEqual(slice_key, ())
                self.assertDictElementsAlmostEqual(
                    metrics, {
                        'accuracy': 1.0,
                        'label/mean': 0.5,
                        'my_mean_age': 3.75,
                        'my_mean_age_times_label': 1.75,
                    })

            util.assert_that(metrics, check_result)
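
# The aggregate tests in this listing build inputs with a create_test_input
# helper defined elsewhere in the test module. A minimal sketch (an assumption
# for illustration, not the original helper) that pairs every prediction result
# with every slice key, which is the shape these tests appear to feed into
# aggregate.ComputePerSliceMetrics:


def create_test_input_sketch(predict_list, slice_keys):
    """Returns (slice_key, prediction_result) tuples for all combinations."""
    results = []
    for prediction in predict_list:
        for slice_key in slice_keys:
            results.append((slice_key, prediction))
    return results


# For instance, create_test_input_sketch(predict_result, [()]) attaches every
# prediction to the overall (empty) slice key, as in testAggregateOverallSlice
# above.
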
    def testPredict(self, features_blacklist):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=eval_export_dir,
            blacklist_feature_fetches=features_blacklist)
        with beam.Pipeline() as pipeline:
            examples = [
                self._makeExample(age=3.0, language='english', label=1.0),
                self._makeExample(age=3.0, language='chinese', label=0.0),
                self._makeExample(age=4.0, language='english', label=1.0),
                self._makeExample(age=5.0, language='chinese', label=0.0),
            ]
            serialized_examples = [e.SerializeToString() for e in examples]

            predict_extracts = (
                pipeline
                | beam.Create(serialized_examples, reshuffle=False)
                # Our diagnostic outputs pass types.Extracts throughout; however, our
                # aggregating functions do not use this interface.
                | beam.Map(lambda x: {constants.INPUT_KEY: x})
                | 'Predict' >> predict_extractor._TFMAPredict(
                    eval_shared_models={'': eval_shared_model},
                    desired_batch_size=3))

            def check_result(got):
                try:
                    self.assertLen(got, 4)
                    for item in got:
                        self.assertIn(
                            constants.FEATURES_PREDICTIONS_LABELS_KEY, item)
                        fpl = item[constants.FEATURES_PREDICTIONS_LABELS_KEY]
                        # Verify fpl contains features, probabilities, and correct labels.
                        blacklisted_features = set(features_blacklist or [])
                        expected_features = (
                            set(['language', 'age', 'label']) -
                            blacklisted_features)
                        for feature in expected_features:
                            self.assertIn(feature, fpl.features)
                        for feature in blacklisted_features:
                            self.assertNotIn(feature, fpl.features)
                        self.assertAlmostEqual(fpl.features['label'],
                                               fpl.labels['__labels'])

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(predict_extracts, check_result)
    def testEvaluateNoSlicing(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)

        with beam.Pipeline() as pipeline:
            example1 = self._makeExample(age=3.0,
                                         language='english',
                                         label=1.0)
            example2 = self._makeExample(age=3.0,
                                         language='chinese',
                                         label=0.0)
            example3 = self._makeExample(age=4.0,
                                         language='english',
                                         label=1.0)
            example4 = self._makeExample(age=5.0,
                                         language='chinese',
                                         label=0.0)

            metrics, _ = (
                pipeline
                | beam.Create([
                    example1.SerializeToString(),
                    example2.SerializeToString(),
                    example3.SerializeToString(),
                    example4.SerializeToString()
                ])
                | evaluate.Evaluate(
                    eval_saved_model_path=eval_export_dir,
                    add_metrics_callbacks=[_addExampleCountMetricCallback]))

            def check_result(got):
                try:
                    self.assertEqual(1, len(got), 'got: %s' % got)
                    (slice_key, value) = got[0]
                    self.assertEqual((), slice_key)
                    self.assertDictElementsAlmostEqual(
                        value, {
                            'accuracy': 1.0,
                            'label/mean': 0.5,
                            'my_mean_age': 3.75,
                            'my_mean_age_times_label': 1.75,
                            'added_example_count': 4.0
                        })
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics, check_result)
    def testPostExportMetricsLinearClassifier(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)
        examples = [
            self._makeExample(age=3.0, language='english', label=1.0),
            self._makeExample(age=3.0, language='chinese', label=0.0),
            self._makeExample(age=4.0, language='english', label=1.0),
            self._makeExample(age=5.0, language='chinese', label=0.0)
        ]
        metrics_to_check = [
            (metric_keys.EXAMPLE_COUNT, post_export_metrics.example_count(),
             4.0),
            (metric_keys.EXAMPLE_WEIGHT,
             post_export_metrics.example_weight('age'), 15.0),
        ]
        self._runTest(examples, eval_export_dir, metrics_to_check)
    def testPostExportMetricsLinearClassifier(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)
        examples = [
            self._makeExample(age=3.0, language='english', label=1.0),
            self._makeExample(age=3.0, language='chinese', label=0.0),
            self._makeExample(age=4.0, language='english', label=1.0),
            self._makeExample(age=5.0, language='chinese', label=0.0)
        ]
        expected_values_dict = {
            metric_keys.EXAMPLE_COUNT: 4.0,
            metric_keys.EXAMPLE_WEIGHT: 15.0,
        }
        self._runTest(examples, eval_export_dir, [
            post_export_metrics.example_count(),
            post_export_metrics.example_weight('age')
        ], expected_values_dict)
    def testPredict(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)
        eval_shared_model = types.EvalSharedModel(model_path=eval_export_dir)

        with beam.Pipeline() as pipeline:
            examples = [
                self._makeExample(age=3.0, language='english', label=1.0),
                self._makeExample(age=3.0, language='chinese', label=0.0),
                self._makeExample(age=4.0, language='english', label=1.0),
                self._makeExample(age=5.0, language='chinese', label=0.0),
            ]
            serialized_examples = [e.SerializeToString() for e in examples]

            predict_extracts = (
                pipeline
                | beam.Create(serialized_examples)
                # Our diagnostic outputs pass types.Extracts throughout; however, our
                # aggregating functions do not use this interface.
                | beam.Map(lambda x: {constants.INPUT_KEY: x})
                | 'Predict' >> predict_extractor._TFMAPredict(
                    eval_shared_model=eval_shared_model, desired_batch_size=3))

            def check_result(got):
                try:
                    self.assertEqual(4, len(got), 'got: %s' % got)
                    for item in got:
                        self.assertIn(
                            constants.FEATURES_PREDICTIONS_LABELS_KEY, item)
                        fpl = item[constants.FEATURES_PREDICTIONS_LABELS_KEY]
                        # Verify fpl contains features, probabilities, and correct labels.
                        self.assertIn('language', fpl.features)
                        self.assertIn('age', fpl.features)
                        self.assertIn('label', fpl.features)
                        self.assertIn('probabilities', fpl.predictions)
                        self.assertAlmostEqual(fpl.features['label'],
                                               fpl.labels['__labels'])

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(predict_extracts, check_result)
    def testCalibrationPlotAndPredictionHistogramLinearClassifier(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = (linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir))

        examples = [
            self._makeExample(age=3.0, language='english', label=1.0),
            self._makeExample(age=3.0, language='chinese', label=0.0),
            self._makeExample(age=4.0, language='english', label=1.0),
            self._makeExample(age=5.0, language='chinese', label=0.0)
        ]

        def check_result(got):  # pylint: disable=invalid-name
            try:
                self.assertEqual(1, len(got), 'got: %s' % got)
                (slice_key, value) = got[0]
                self.assertEqual((), slice_key)
                self.assertIn(metric_keys.CALIBRATION_PLOT_MATRICES, value)
                # We just check that the bucket sums look sane, since we don't know
                # the exact predictions of the model.
                #
                # Note that the correctness of the bucketing is tested in the other
                # two tests with the fixed prediction estimator. This test is more
                # for ensuring that this metric is compatible with the canned
                # Estimators, for which the prediction Tensor returned for a batch
                # of examples will be a N x 1 Tensor, rather than just an N element
                # vector.
                buckets = value[metric_keys.CALIBRATION_PLOT_MATRICES]
                bucket_sums = np.sum(buckets, axis=0)
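                # The four labels above are 1.0 + 0.0 + 1.0 + 0.0 = 2.0, and
                # with what is presumably the default example weight of 1.0 per
                # example, four examples give a weight sum of 4.0, which is
                # what the checks below expect.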
                self.assertAlmostEqual(bucket_sums[1], 2.0)  # label sum
                self.assertAlmostEqual(bucket_sums[2], 4.0)  # weight sum
            except AssertionError as err:
                raise util.BeamAssertException(err)

        self._runTestWithCustomCheck(
            examples,
            eval_export_dir,
            [post_export_metrics.calibration_plot_and_prediction_histogram()],
            custom_plots_check=check_result)
    def testEvaluateNoSlicingAddPostExportAndCustomMetrics(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=eval_export_dir,
            add_metrics_callbacks=[
                _addExampleCountMetricCallback,
                # Note that since everything runs in-process this doesn't
                # actually test that the py_func can be correctly recreated
                # on workers in a distributed context.
                _addPyFuncMetricCallback,
                post_export_metrics.example_count(),
                post_export_metrics.example_weight(example_weight_key='age')
            ])
        extractors = [
            predict_extractor.PredictExtractor(eval_shared_model),
            slice_key_extractor.SliceKeyExtractor()
        ]

        with beam.Pipeline() as pipeline:
            example1 = self._makeExample(age=3.0,
                                         language='english',
                                         label=1.0)
            example2 = self._makeExample(age=3.0,
                                         language='chinese',
                                         label=0.0)
            example3 = self._makeExample(age=4.0,
                                         language='english',
                                         label=1.0)
            example4 = self._makeExample(age=5.0,
                                         language='chinese',
                                         label=0.0)

            metrics, plots = (
                pipeline
                | 'Create' >> beam.Create([
                    example1.SerializeToString(),
                    example2.SerializeToString(),
                    example3.SerializeToString(),
                    example4.SerializeToString()
                ])
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
                | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator.
                ComputeMetricsAndPlots(eval_shared_model=eval_shared_model))

            def check_result(got):
                try:
                    self.assertEqual(1, len(got), 'got: %s' % got)
                    (slice_key, value) = got[0]
                    self.assertEqual((), slice_key)
                    self.assertDictElementsAlmostEqual(
                        got_values_dict=value,
                        expected_values_dict={
                            'accuracy': 1.0,
                            'label/mean': 0.5,
                            'my_mean_age': 3.75,
                            'my_mean_age_times_label': 1.75,
                            'added_example_count': 4.0,
                            'py_func_label_sum': 2.0,
                            metric_keys.EXAMPLE_COUNT: 4.0,
                            metric_keys.EXAMPLE_WEIGHT: 15.0
                        })
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics, check_result, label='metrics')
            util.assert_that(plots, util.is_empty(), label='plots')
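
# The test above also registers an _addPyFuncMetricCallback whose expected value
# is py_func_label_sum == 2.0, the sum of the four labels. The sketch below is
# an assumption for illustration (not the original helper) and assumes
# labels_dict is a single label tensor; it uses tf.compat.v1.py_func so that
# part of the metric runs as ordinary Python, which is exactly what a
# distributed runner would need to recreate on workers, as the comment in the
# test notes.
import numpy as np
import tensorflow as tf


def add_py_func_metric_callback_sketch(features_dict, predictions_dict,
                                       labels_dict):
    del features_dict, predictions_dict  # Unused in this sketch.
    total = tf.compat.v1.get_variable(
        'py_func_label_sum_total',
        initializer=0.0,
        trainable=False,
        collections=[
            tf.compat.v1.GraphKeys.LOCAL_VARIABLES,
            tf.compat.v1.GraphKeys.METRIC_VARIABLES
        ])

    def _batch_label_sum(labels):
        # Plain Python/NumPy executed via py_func at graph run time.
        return np.sum(labels).astype(np.float32)

    batch_sum = tf.compat.v1.py_func(
        _batch_label_sum, [labels_dict], tf.float32)
    update_op = tf.compat.v1.assign_add(total, batch_sum)
    return {'py_func_label_sum': (total, update_op)}
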
    def testEvaluateWithSlicingAndDifferentBatchSizes(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=eval_export_dir,
            add_metrics_callbacks=[_addExampleCountMetricCallback])
        extractors = [
            predict_extractor.PredictExtractor(eval_shared_model),
            slice_key_extractor.SliceKeyExtractor([
                slicer.SingleSliceSpec(),
                slicer.SingleSliceSpec(columns=['slice_key'])
            ])
        ]

        for batch_size in [1, 2, 4, 8]:

            with beam.Pipeline() as pipeline:
                example1 = self._makeExample(age=3.0,
                                             language='english',
                                             label=1.0,
                                             slice_key='first_slice')
                example2 = self._makeExample(age=3.0,
                                             language='chinese',
                                             label=0.0,
                                             slice_key='first_slice')
                example3 = self._makeExample(age=4.0,
                                             language='english',
                                             label=0.0,
                                             slice_key='second_slice')
                example4 = self._makeExample(age=5.0,
                                             language='chinese',
                                             label=1.0,
                                             slice_key='second_slice')
                example5 = self._makeExample(age=5.0,
                                             language='chinese',
                                             label=1.0,
                                             slice_key='second_slice')

                metrics, plots = (
                    pipeline
                    | 'Create' >> beam.Create([
                        example1.SerializeToString(),
                        example2.SerializeToString(),
                        example3.SerializeToString(),
                        example4.SerializeToString(),
                        example5.SerializeToString(),
                    ])
                    | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                    | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
                    | 'ComputeMetricsAndPlots' >>
                    metrics_and_plots_evaluator.ComputeMetricsAndPlots(
                        eval_shared_model=eval_shared_model,
                        desired_batch_size=batch_size))

                def check_result(got):
                    try:
                        self.assertEqual(3, len(got), 'got: %s' % got)
                        slices = {}
                        for slice_key, value in got:
                            slices[slice_key] = value
                        overall_slice = ()
                        first_slice = (('slice_key', b'first_slice'), )
                        second_slice = (('slice_key', b'second_slice'), )
                        self.assertItemsEqual(
                            list(slices.keys()),
                            [overall_slice, first_slice, second_slice])
                        self.assertDictElementsAlmostEqual(
                            slices[overall_slice], {
                                'accuracy': 0.4,
                                'label/mean': 0.6,
                                'my_mean_age': 4.0,
                                'my_mean_age_times_label': 2.6,
                                'added_example_count': 5.0
                            })
                        self.assertDictElementsAlmostEqual(
                            slices[first_slice], {
                                'accuracy': 1.0,
                                'label/mean': 0.5,
                                'my_mean_age': 3.0,
                                'my_mean_age_times_label': 1.5,
                                'added_example_count': 2.0
                            })
                        self.assertDictElementsAlmostEqual(
                            slices[second_slice], {
                                'accuracy': 0.0,
                                'label/mean': 2.0 / 3.0,
                                'my_mean_age': 14.0 / 3.0,
                                'my_mean_age_times_label': 10.0 / 3.0,
                                'added_example_count': 3.0
                            })

                    except AssertionError as err:
                        # This function is redefined every iteration, so it will have the
                        # right value of batch_size.
                        raise util.BeamAssertException(
                            'batch_size = %d, error: %s' % (batch_size, err))  # pylint: disable=cell-var-from-loop

                util.assert_that(metrics, check_result, label='metrics')
                util.assert_that(plots, util.is_empty(), label='plots')
  def testAggregateMultipleSlices(self):
    temp_eval_export_dir = self._getEvalExportDir()
    _, eval_export_dir = linear_classifier.simple_linear_classifier(
        None, temp_eval_export_dir)

    eval_shared_model = self.createTestEvalSharedModel(
        eval_saved_model_path=eval_export_dir)

    with beam.Pipeline() as pipeline:
      example1 = self._makeExample(age=3.0, language='english', label=1.0)
      example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
      example3 = self._makeExample(age=4.0, language='english', label=1.0)
      example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

      predict_result_english_slice = ([
          example1.SerializeToString(),
          example3.SerializeToString()
      ])

      predict_result_chinese_slice = ([
          example2.SerializeToString(),
          example4.SerializeToString()
      ])

      test_input = (
          create_test_input(predict_result_english_slice, [(
              ('language', 'english'))]) +
          create_test_input(predict_result_chinese_slice, [(
              ('language', 'chinese'))]) +
          # Overall slice
          create_test_input(
              predict_result_english_slice + predict_result_chinese_slice,
              [()]))

      metrics = (
          pipeline
          | 'CreateTestInput' >> beam.Create(test_input)
          | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
              eval_shared_model=eval_shared_model, desired_batch_size=3))

      def check_result(got):
        self.assertEqual(3, len(got), 'got: %s' % got)
        slices = {}
        for slice_key, metrics in got:
          slices[slice_key] = metrics
        overall_slice = ()
        english_slice = (('language', 'english'))
        chinese_slice = (('language', 'chinese'))
        self.assertCountEqual(
            list(slices.keys()), [overall_slice, english_slice, chinese_slice])
        self.assertDictElementsAlmostEqual(
            slices[overall_slice], {
                'accuracy': 1.0,
                'label/mean': 0.5,
                'my_mean_age': 3.75,
                'my_mean_age_times_label': 1.75,
            })
        self.assertDictElementsAlmostEqual(
            slices[english_slice], {
                'accuracy': 1.0,
                'label/mean': 1.0,
                'my_mean_age': 3.5,
                'my_mean_age_times_label': 3.5,
            })
        self.assertDictElementsAlmostEqual(
            slices[chinese_slice], {
                'accuracy': 1.0,
                'label/mean': 0.0,
                'my_mean_age': 4.0,
                'my_mean_age_times_label': 0.0,
            })

      util.assert_that(metrics, check_result)
  def testAggregateMultipleSlicesWithSampling(self):
    temp_eval_export_dir = self._getEvalExportDir()
    _, eval_export_dir = linear_classifier.simple_linear_classifier(
        None, temp_eval_export_dir)

    eval_shared_model = self.createTestEvalSharedModel(
        eval_saved_model_path=eval_export_dir)

    with beam.Pipeline() as pipeline:
      example1 = self._makeExample(age=3.0, language='english', label=1.0)
      example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
      example3 = self._makeExample(age=4.0, language='english', label=1.0)
      example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

      predict_result_english_slice = ([
          example1.SerializeToString(),
          example3.SerializeToString()
      ])

      predict_result_chinese_slice = ([
          example2.SerializeToString(),
          example4.SerializeToString()
      ])

      test_input = (
          create_test_input(predict_result_english_slice, [(
              ('language', 'english'))]) +
          create_test_input(predict_result_chinese_slice, [(
              ('language', 'chinese'))]) +
          # Overall slice
          create_test_input(
              predict_result_english_slice + predict_result_chinese_slice,
              [()]))
      metrics = (
          pipeline
          | 'CreateTestInput' >> beam.Create(test_input)
          | 'ComputePerSliceMetrics' >>
          poisson_bootstrap.ComputeWithConfidenceIntervals(
              aggregate.ComputePerSliceMetrics,
              num_bootstrap_samples=10,
              eval_shared_model=eval_shared_model,
              desired_batch_size=3))

      def assert_almost_equal_to_value_with_t_distribution(
          target,
          unsampled_value,
          sample_mean,
          sample_standard_deviation,
          sample_degrees_of_freedom,
          delta=2):
        self.assertEqual(target.unsampled_value, unsampled_value)
        self.assertAlmostEqual(target.sample_mean, sample_mean, delta=delta)
        self.assertAlmostEqual(
            target.sample_standard_deviation,
            sample_standard_deviation,
            delta=delta)
        # The Poisson resampling could return [0, 0, ...], which would reduce
        # the number of samples.
        self.assertLessEqual(target.sample_degrees_of_freedom,
                             sample_degrees_of_freedom)

      def check_overall_slice(slices):
        my_dict = slices[()]
        assert_almost_equal_to_value_with_t_distribution(
            my_dict['my_mean_age'], 3.75, 3.64, 0.34, 19)
        assert_almost_equal_to_value_with_t_distribution(
            my_dict['accuracy'], 1.0, 1.0, 0, 19)
        assert_almost_equal_to_value_with_t_distribution(
            my_dict['label/mean'], 0.5, 0.59, 0.29, 19)
        assert_almost_equal_to_value_with_t_distribution(
            my_dict['my_mean_age_times_label'], 1.75, 2.15, 1.06, 19)

      def check_english_slice(slices):
        my_dict = slices[(('language', 'english'))]
        assert_almost_equal_to_value_with_t_distribution(
            my_dict['my_mean_age'], 3.5, 3.18, 0.28, 19)
        assert_almost_equal_to_value_with_t_distribution(
            my_dict['accuracy'], 1.0, 1.0, 0, 19)
        assert_almost_equal_to_value_with_t_distribution(
            my_dict['label/mean'], 1.0, 1.0, 0, 19)
        assert_almost_equal_to_value_with_t_distribution(
            my_dict['my_mean_age_times_label'], 3.5, 3.18, 0.28, 19)

      def check_chinese_slice(slices):
        my_dict = slices[(('language', 'chinese'))]
        assert_almost_equal_to_value_with_t_distribution(
            my_dict['my_mean_age'], 4.0, 4.12, 0.83, 19)
        assert_almost_equal_to_value_with_t_distribution(
            my_dict['accuracy'], 1.0, 1.0, 0, 19)
        assert_almost_equal_to_value_with_t_distribution(
            my_dict['label/mean'], 0, 0, 0, 19)
        assert_almost_equal_to_value_with_t_distribution(
            my_dict['my_mean_age_times_label'], 0, 0, 0, 19)

      def check_result(got):
        self.assertEqual(3, len(got), 'got: %s' % got)
        slices = {}
        for slice_key, metrics in got:
          slices[slice_key] = metrics
        check_overall_slice(slices)
        check_english_slice(slices)
        check_chinese_slice(slices)

      util.assert_that(metrics, check_result)
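
# The confidence interval test above uses Poisson bootstrapping: each example is
# effectively reweighted by an independent Poisson(1) draw per bootstrap sample,
# so a sample can occasionally give every example weight 0, which is why the
# degrees-of-freedom check uses assertLessEqual. A small illustrative sketch
# (not TFMA code) of drawing such resampling weights with NumPy:
import numpy as np


def poisson_bootstrap_weights_sketch(num_examples, num_samples, seed=None):
    """Draws Poisson(1) resampling weights, one row per bootstrap sample."""
    rng = np.random.RandomState(seed)
    return rng.poisson(lam=1.0, size=(num_samples, num_examples))


# With 4 examples, an all-zero row occurs with probability exp(-4), about 1.8%,
# so across 10 bootstrap samples an empty resample is quite possible.
weights = poisson_bootstrap_weights_sketch(
    num_examples=4, num_samples=10, seed=0)
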
    def testEvaluateWithEvalSavedModel(self):
        temp_export_dir = self._getExportDir()
        _, export_dir = linear_classifier.simple_linear_classifier(
            None, temp_export_dir)
        eval_config = config.EvalConfig(
            model_specs=[config.ModelSpec(signature_name='eval')],
            slicing_specs=[
                config.SlicingSpec(),
                config.SlicingSpec(feature_keys=['slice_key']),
            ])
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir,
            add_metrics_callbacks=[_addExampleCountMetricCallback])
        slice_spec = [
            slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
        ]
        extractors = [
            predict_extractor.PredictExtractor(eval_shared_model),
            slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
        ]
        evaluators = [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config, eval_shared_model=eval_shared_model)
        ]

        examples = [
            self._makeExample(age=3.0,
                              language='english',
                              label=1.0,
                              slice_key='first_slice'),
            self._makeExample(age=3.0,
                              language='chinese',
                              label=0.0,
                              slice_key='first_slice'),
            self._makeExample(age=4.0,
                              language='english',
                              label=0.0,
                              slice_key='second_slice'),
            self._makeExample(age=5.0,
                              language='chinese',
                              label=1.0,
                              slice_key='second_slice'),
            self._makeExample(age=5.0,
                              language='chinese',
                              label=1.0,
                              slice_key='second_slice')
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            metrics = (
                pipeline
                | 'Create' >> beam.Create(
                    [e.SerializeToString() for e in examples])
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                    extractors=extractors, evaluators=evaluators))

            # pylint: enable=no-value-for-parameter

            def check_metrics(got):
                try:
                    self.assertLen(got, 3)
                    slices = {}
                    for slice_key, value in got:
                        slices[slice_key] = value
                    overall_slice = ()
                    first_slice = (('slice_key', b'first_slice'), )
                    second_slice = (('slice_key', b'second_slice'), )
                    self.assertCountEqual(
                        list(slices.keys()),
                        [overall_slice, first_slice, second_slice])
                    self.assertDictElementsAlmostEqual(
                        slices[overall_slice], {
                            metric_types.MetricKey(name='accuracy'):
                            0.4,
                            metric_types.MetricKey(name='label/mean'):
                            0.6,
                            metric_types.MetricKey(name='my_mean_age'):
                            4.0,
                            metric_types.MetricKey(name='my_mean_age_times_label'):
                            2.6,
                            metric_types.MetricKey(name='added_example_count'):
                            5.0
                        })
                    self.assertDictElementsAlmostEqual(
                        slices[first_slice], {
                            metric_types.MetricKey(name='accuracy'):
                            1.0,
                            metric_types.MetricKey(name='label/mean'):
                            0.5,
                            metric_types.MetricKey(name='my_mean_age'):
                            3.0,
                            metric_types.MetricKey(name='my_mean_age_times_label'):
                            1.5,
                            metric_types.MetricKey(name='added_example_count'):
                            2.0
                        })
                    self.assertDictElementsAlmostEqual(
                        slices[second_slice], {
                            metric_types.MetricKey(name='accuracy'):
                            0.0,
                            metric_types.MetricKey(name='label/mean'):
                            2.0 / 3.0,
                            metric_types.MetricKey(name='my_mean_age'):
                            14.0 / 3.0,
                            metric_types.MetricKey(name='my_mean_age_times_label'):
                            10.0 / 3.0,
                            metric_types.MetricKey(name='added_example_count'):
                            3.0
                        })

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics[constants.METRICS_KEY],
                             check_metrics,
                             label='metrics')
    def testAggregateMultipleSlicesWithSampling(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)

        eval_saved_model = load.EvalSavedModel(eval_export_dir)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=eval_export_dir)

        with beam.Pipeline() as pipeline:
            example1 = self._makeExample(age=3.0,
                                         language='english',
                                         label=1.0)
            example2 = self._makeExample(age=3.0,
                                         language='chinese',
                                         label=0.0)
            example3 = self._makeExample(age=4.0,
                                         language='english',
                                         label=1.0)
            example4 = self._makeExample(age=5.0,
                                         language='chinese',
                                         label=0.0)

            predict_result_english_slice = (
                eval_saved_model.as_features_predictions_labels(
                    eval_saved_model.predict_list([
                        example1.SerializeToString(),
                        example3.SerializeToString()
                    ])))

            predict_result_chinese_slice = (
                eval_saved_model.as_features_predictions_labels(
                    eval_saved_model.predict_list([
                        example2.SerializeToString(),
                        example4.SerializeToString()
                    ])))

            test_input = (
                create_test_input(predict_result_english_slice, [(
                    ('language', 'english'))]) +
                create_test_input(predict_result_chinese_slice, [(
                    ('language', 'chinese'))]) +
                # Overall slice
                create_test_input(
                    predict_result_english_slice +
                    predict_result_chinese_slice, [()]))
            metrics, _ = (
                pipeline
                | 'CreateTestInput' >> beam.Create(test_input)
                | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
                    eval_shared_model=eval_shared_model,
                    desired_batch_size=3,
                    num_bootstrap_samples=10))

            def check_overall_slice(slices):
                my_dict = slices[()]
                self.assertAlmostEqual(3.75,
                                       my_dict['my_mean_age'].value,
                                       delta=1)
                self.assertAlmostEqual(3.75,
                                       my_dict['my_mean_age'].unsampled_value)
                for value in my_dict['accuracy']:
                    self.assertAlmostEqual(1.0, value)
                for value in my_dict['label/mean']:
                    self.assertAlmostEqual(0.5, value, delta=0.5)
                for value in my_dict['my_mean_age_times_label']:
                    self.assertAlmostEqual(2.5, value, delta=2.5)

            def check_english_slice(slices):
                my_dict = slices[(('language', 'english'))]
                self.assertAlmostEqual(3.5,
                                       my_dict['my_mean_age'].value,
                                       delta=1)
                self.assertAlmostEqual(3.5,
                                       my_dict['my_mean_age'].unsampled_value)
                for value in my_dict['accuracy']:
                    self.assertAlmostEqual(1.0, value)
                for value in my_dict['label/mean']:
                    self.assertAlmostEqual(1.0, value)
                for value in my_dict['my_mean_age_times_label']:
                    self.assertAlmostEqual(3.5, value, delta=1)

            def check_chinese_slice(slices):
                my_dict = slices[(('language', 'chinese'))]
                self.assertAlmostEqual(4.0,
                                       my_dict['my_mean_age'].value,
                                       delta=1)
                self.assertAlmostEqual(4.0,
                                       my_dict['my_mean_age'].unsampled_value)
                for value in my_dict['accuracy']:
                    self.assertAlmostEqual(1.0, value)
                for value in my_dict['label/mean']:
                    self.assertAlmostEqual(0, value)
                for value in my_dict['my_mean_age_times_label']:
                    self.assertAlmostEqual(0, value)

            def check_result(got):
                self.assertEqual(3, len(got), 'got: %s' % got)
                slices = {}
                for slice_key, metrics in got:
                    slices[slice_key] = metrics
                check_overall_slice(slices)
                check_english_slice(slices)
                check_chinese_slice(slices)

            util.assert_that(metrics, check_result)