def ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_shared_model: types.EvalSharedModel,
    desired_batch_size: Optional[int] = None,
    num_bootstrap_samples: Optional[int] = 1,
    random_seed_for_testing: Optional[int] = None
) -> Tuple[beam.pvalue.DoOutputsTuple, beam.pvalue.PCollection]:
    """Computes metrics and plots using the EvalSavedModel.

    Args:
      extracts: PCollection of Extracts. The extracts MUST contain a
        FeaturesPredictionsLabels extract keyed by
        tfma.FEATURE_PREDICTIONS_LABELS_KEY and a list of SliceKeyType extracts
        keyed by tfma.SLICE_KEY_TYPES_KEY. Typically these will be added by
        calling the default_extractors function.
      eval_shared_model: Shared model parameters for EvalSavedModel, including
        any additional metrics (see EvalSharedModel for more information on
        how to configure additional metrics).
      desired_batch_size: Optional batch size for batching in Aggregate.
      num_bootstrap_samples: Set to a value > 1 to run the metrics analysis
        over multiple bootstrap samples and compute uncertainty intervals.
      random_seed_for_testing: Provide for deterministic tests only.

    Returns:
      A tuple of (DoOutputsTuple, PCollection): the DoOutputsTuple holds the
      PCollection of (slice key, metrics) and the PCollection of
      (slice key, plot metrics); the second element is a PCollection of
      (slice key, example count) pairs.
    """

    _ = (extracts.pipeline
         | counter_util.IncrementMetricsComputationCounters(
             eval_shared_model.add_metrics_callbacks))

    # pylint: disable=no-value-for-parameter
    slices = (
        extracts

        # Input: one example at a time, with slice keys in extracts.
        # Output: one fpl example per slice key (the example fans out into n
        #         logical examples: a reference to it is emitted once per
        #         applicable slice key).
        | 'FanoutSlices' >> slicer.FanoutSlices())

    slices_count = (slices
                    | 'ExtractSliceKeys' >> beam.Keys()
                    | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

    aggregated_metrics = (
        slices
        # Metrics are computed per slice key.
        # Output: Multi-outputs, a dict of slice key to computed metrics, and
        # plots if applicable.
        | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
            eval_shared_model=eval_shared_model,
            desired_batch_size=desired_batch_size,
            num_bootstrap_samples=num_bootstrap_samples,
            random_seed_for_testing=random_seed_for_testing))
    return (aggregated_metrics, slices_count)
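
For context, a minimal usage sketch of the function above, called directly as defined here (in the TFMA codebase this function is typically wrapped as a Beam PTransform via @beam.ptransform_fn). The build_extracts step is a hypothetical stand-in for the default_extractors stages mentioned in the docstring, and the inner unpacking assumes the DoOutputsTuple yields (metrics, plots) in order, as the tests below do with `metrics, _ = ...`:

import apache_beam as beam

def evaluate(pipeline, serialized_examples, eval_shared_model, build_extracts):
  # build_extracts: hypothetical PTransform that attaches the
  # FeaturesPredictionsLabels and slice-key entries required above.
  extracts = (
      pipeline
      | 'CreateExamples' >> beam.Create(serialized_examples)
      | 'BuildExtracts' >> build_extracts)
  (metrics, plots), slice_counts = ComputeMetricsAndPlots(
      extracts,
      eval_shared_model,
      desired_batch_size=100,      # optional tuning knob
      num_bootstrap_samples=20)    # > 1 turns on uncertainty intervals
  return metrics, plots, slice_counts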
Example #2
    def testAggregateOverallSlice(self):

        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)

        eval_saved_model = load.EvalSavedModel(eval_export_dir)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=eval_export_dir)

        with beam.Pipeline() as pipeline:
            example1 = self._makeExample(age=3.0,
                                         language='english',
                                         label=1.0)
            example2 = self._makeExample(age=3.0,
                                         language='chinese',
                                         label=0.0)
            example3 = self._makeExample(age=4.0,
                                         language='english',
                                         label=1.0)
            example4 = self._makeExample(age=5.0,
                                         language='chinese',
                                         label=0.0)

            predict_result = eval_saved_model.as_features_predictions_labels(
                eval_saved_model.predict_list([
                    example1.SerializeToString(),
                    example2.SerializeToString(),
                    example3.SerializeToString(),
                    example4.SerializeToString()
                ]))

            metrics, _ = (
                pipeline
                | 'CreateTestInput' >> beam.Create(
                    create_test_input(predict_result, [()]))
                | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
                    eval_shared_model=eval_shared_model, desired_batch_size=3))

            def check_result(got):
                self.assertEqual(1, len(got), 'got: %s' % got)
                slice_key, metrics = got[0]
                self.assertEqual(slice_key, ())
                self.assertDictElementsAlmostEqual(
                    metrics, {
                        'accuracy': 1.0,
                        'label/mean': 0.5,
                        'my_mean_age': 3.75,
                        'my_mean_age_times_label': 1.75,
                    })

            util.assert_that(metrics, check_result)
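
The test relies on a create_test_input helper defined elsewhere in the module. A plausible reconstruction, assuming ComputePerSliceMetrics consumes (slice key, FeaturesPredictionsLabels) pairs and that each FPL should be paired with every slice key it belongs to:

def create_test_input(predict_list, slice_keys):
  """Hypothetical reconstruction of the helper used by these tests.

  Pairs every FeaturesPredictionsLabels in `predict_list` with every slice
  key in `slice_keys`, producing the (slice key, FPL) elements that
  ComputePerSliceMetrics appears to consume.
  """
  return [(slice_key, fpl)
          for fpl in predict_list
          for slice_key in slice_keys]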
Example #3
def ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts,
    eval_shared_model,
    desired_batch_size=None,
    num_bootstrap_samples=1,
    random_seed=None,
):
  """Computes metrics and plots using the EvalSavedModel.

  Args:
    extracts: PCollection of Extracts. The extracts MUST contain a
      FeaturesPredictionsLabels extract keyed by
      tfma.FEATURE_PREDICTIONS_LABELS_KEY and a list of SliceKeyType extracts
      keyed by tfma.SLICE_KEY_TYPES_KEY. Typically these will be added by
      calling the default_extractors function.
    eval_shared_model: Shared model parameters for EvalSavedModel including any
      additional metrics (see EvalSharedModel for more information on how to
      configure additional metrics).
    desired_batch_size: Optional batch size for batching in Aggregate.
    num_bootstrap_samples: Set to a value > 1 to run metrics analysis over
      multiple bootstrap samples and compute uncertainty intervals.
    random_seed: Provide for deterministic tests only.

  Returns:
    DoOutputsTuple. The tuple entries are
    PCollection of (slice key, metrics) and
    PCollection of (slice key, plot metrics).
  """
  # pylint: disable=no-value-for-parameter
  return (
      extracts

      # Input: one example at a time, with slice keys in extracts.
      # Output: one fpl example per slice key (the example fans out into n
      #         logical examples, one per applicable slice key).
      | 'FanoutSlices' >> slicer.FanoutSlices()

      # Each slice key lands on one shard where metrics are computed for all
      # examples in that shard -- the "map" and "reduce" parts of the
      # computation happen within this shard.
      # Output: Multi-outputs, a dict of slice key to computed metrics, and
      # plots if applicable.
      | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
          eval_shared_model=eval_shared_model,
          desired_batch_size=desired_batch_size,
          num_bootstrap_samples=num_bootstrap_samples,
          random_seed=random_seed))
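
Downstream, the two entries of the returned DoOutputsTuple can be unpacked and written out separately. A minimal sketch with placeholder serialization (repr is a stand-in, not TFMA's actual output format):

import apache_beam as beam

def write_metrics_and_plots(metrics_and_plots, output_prefix):
  # Unpacking assumes the DoOutputsTuple yields (metrics, plots) in order,
  # as the tests in this listing do with `metrics, _ = ...`.
  metrics, plots = metrics_and_plots
  _ = (metrics
       | 'FormatMetrics' >> beam.Map(repr)  # placeholder serialization
       | 'WriteMetrics' >> beam.io.WriteToText(output_prefix + '-metrics'))
  _ = (plots
       | 'FormatPlots' >> beam.Map(repr)
       | 'WritePlots' >> beam.io.WriteToText(output_prefix + '-plots'))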
Example #4
  def testAggregateMultipleSlices(self):
    temp_eval_export_dir = self._getEvalExportDir()
    _, eval_export_dir = linear_classifier.simple_linear_classifier(
        None, temp_eval_export_dir)

    eval_shared_model = self.createTestEvalSharedModel(
        eval_saved_model_path=eval_export_dir)

    with beam.Pipeline() as pipeline:
      example1 = self._makeExample(age=3.0, language='english', label=1.0)
      example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
      example3 = self._makeExample(age=4.0, language='english', label=1.0)
      example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

      predict_result_english_slice = ([
          example1.SerializeToString(),
          example3.SerializeToString()
      ])

      predict_result_chinese_slice = ([
          example2.SerializeToString(),
          example4.SerializeToString()
      ])

      test_input = (
          create_test_input(predict_result_english_slice, [(
              ('language', 'english'))]) +
          create_test_input(predict_result_chinese_slice, [(
              ('language', 'chinese'))]) +
          # Overall slice
          create_test_input(
              predict_result_english_slice + predict_result_chinese_slice,
              [()]))

      metrics = (
          pipeline
          | 'CreateTestInput' >> beam.Create(test_input)
          | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
              eval_shared_model=eval_shared_model, desired_batch_size=3))

      def check_result(got):
        self.assertEqual(3, len(got), 'got: %s' % got)
        slices = {}
        for slice_key, metrics in got:
          slices[slice_key] = metrics
        overall_slice = ()
        english_slice = (('language', 'english'))
        chinese_slice = (('language', 'chinese'))
        self.assertCountEqual(
            list(slices.keys()), [overall_slice, english_slice, chinese_slice])
        self.assertDictElementsAlmostEqual(
            slices[overall_slice], {
                'accuracy': 1.0,
                'label/mean': 0.5,
                'my_mean_age': 3.75,
                'my_mean_age_times_label': 1.75,
            })
        self.assertDictElementsAlmostEqual(
            slices[english_slice], {
                'accuracy': 1.0,
                'label/mean': 1.0,
                'my_mean_age': 3.5,
                'my_mean_age_times_label': 3.5,
            })
        self.assertDictElementsAlmostEqual(
            slices[chinese_slice], {
                'accuracy': 1.0,
                'label/mean': 0.0,
                'my_mean_age': 4.0,
                'my_mean_age_times_label': 0.0,
            })

      util.assert_that(metrics, check_result)
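
The expected values follow directly from the four examples above; a quick sanity check of the per-slice means:

# Ages and labels per slice, taken from example1..example4 above.
english = [(3.0, 1.0), (4.0, 1.0)]
chinese = [(3.0, 0.0), (5.0, 0.0)]
overall = english + chinese

def mean_age(pairs):
  return sum(age for age, _ in pairs) / len(pairs)

def mean_age_times_label(pairs):
  return sum(age * label for age, label in pairs) / len(pairs)

assert mean_age(overall) == 3.75              # (3 + 3 + 4 + 5) / 4
assert mean_age(english) == 3.5               # (3 + 4) / 2
assert mean_age(chinese) == 4.0               # (3 + 5) / 2
assert mean_age_times_label(overall) == 1.75  # (3*1 + 3*0 + 4*1 + 5*0) / 4
assert mean_age_times_label(english) == 3.5   # (3*1 + 4*1) / 2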
Example #5
    def testAggregateMultipleSlicesWithSampling(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)

        eval_saved_model = load.EvalSavedModel(eval_export_dir)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=eval_export_dir)

        with beam.Pipeline() as pipeline:
            example1 = self._makeExample(age=3.0,
                                         language='english',
                                         label=1.0)
            example2 = self._makeExample(age=3.0,
                                         language='chinese',
                                         label=0.0)
            example3 = self._makeExample(age=4.0,
                                         language='english',
                                         label=1.0)
            example4 = self._makeExample(age=5.0,
                                         language='chinese',
                                         label=0.0)

            predict_result_english_slice = (
                eval_saved_model.as_features_predictions_labels(
                    eval_saved_model.predict_list([
                        example1.SerializeToString(),
                        example3.SerializeToString()
                    ])))

            predict_result_chinese_slice = (
                eval_saved_model.as_features_predictions_labels(
                    eval_saved_model.predict_list([
                        example2.SerializeToString(),
                        example4.SerializeToString()
                    ])))

            test_input = (
                create_test_input(predict_result_english_slice, [(
                    ('language', 'english'))]) +
                create_test_input(predict_result_chinese_slice, [(
                    ('language', 'chinese'))]) +
                # Overall slice
                create_test_input(
                    predict_result_english_slice +
                    predict_result_chinese_slice, [()]))
            metrics, _ = (
                pipeline
                | 'CreateTestInput' >> beam.Create(test_input)
                | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
                    eval_shared_model=eval_shared_model,
                    desired_batch_size=3,
                    num_bootstrap_samples=10))

            def assert_almost_equal_to_value_with_t_distribution(
                    target,
                    unsampled_value,
                    sample_mean,
                    sample_standard_deviation,
                    sample_degrees_of_freedom,
                    delta=2):
                self.assertEqual(target.unsampled_value, unsampled_value)
                self.assertAlmostEqual(target.sample_mean,
                                       sample_mean,
                                       delta=delta)
                self.assertAlmostEqual(target.sample_standard_deviation,
                                       sample_standard_deviation,
                                       delta=delta)
                # The Poisson resampling could return [0, 0, ...], which would
                # reduce the effective number of samples.
                self.assertLessEqual(target.sample_degrees_of_freedom,
                                     sample_degrees_of_freedom)

            def check_overall_slice(slices):
                my_dict = slices[()]
                assert_almost_equal_to_value_with_t_distribution(
                    my_dict['my_mean_age'], 3.75, 3.64, 0.34, 9)
                assert_almost_equal_to_value_with_t_distribution(
                    my_dict['accuracy'], 1.0, 1.0, 0, 9)
                assert_almost_equal_to_value_with_t_distribution(
                    my_dict['label/mean'], 0.5, 0.59, 0.29, 9)
                assert_almost_equal_to_value_with_t_distribution(
                    my_dict['my_mean_age_times_label'], 1.75, 2.15, 1.06, 9)

            def check_english_slice(slices):
                my_dict = slices[(('language', 'english'))]
                assert_almost_equal_to_value_with_t_distribution(
                    my_dict['my_mean_age'], 3.5, 3.18, 0.28, 9)
                assert_almost_equal_to_value_with_t_distribution(
                    my_dict['accuracy'], 1.0, 1.0, 0, 9)
                assert_almost_equal_to_value_with_t_distribution(
                    my_dict['label/mean'], 1.0, 1.0, 0, 9)
                assert_almost_equal_to_value_with_t_distribution(
                    my_dict['my_mean_age_times_label'], 3.5, 3.18, 0.28, 9)

            def check_chinese_slice(slices):
                my_dict = slices[(('language', 'chinese'))]
                assert_almost_equal_to_value_with_t_distribution(
                    my_dict['my_mean_age'], 4.0, 4.12, 0.83, 9)
                assert_almost_equal_to_value_with_t_distribution(
                    my_dict['accuracy'], 1.0, 1.0, 0, 9)
                assert_almost_equal_to_value_with_t_distribution(
                    my_dict['label/mean'], 0, 0, 0, 9)
                assert_almost_equal_to_value_with_t_distribution(
                    my_dict['my_mean_age_times_label'], 0, 0, 0, 9)

            def check_result(got):
                self.assertEqual(3, len(got), 'got: %s' % got)
                slices = {}
                for slice_key, metrics in got:
                    slices[slice_key] = metrics
                check_overall_slice(slices)
                check_english_slice(slices)
                check_chinese_slice(slices)

            util.assert_that(metrics, check_result)
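
The asserted statistics (unsampled value, bootstrap sample mean, standard deviation, and degrees of freedom) are enough to build a t-distribution confidence interval around each metric. A sketch using scipy, as an illustration of the statistics rather than TFMA's own API; here the standard deviation of the bootstrap estimates is treated directly as the standard error of the metric:

from scipy import stats

def t_confidence_interval(sample_mean, sample_std, degrees_of_freedom,
                          confidence=0.95):
  # Two-sided interval around the bootstrap sample mean; the std of the
  # bootstrap estimates plays the role of the standard error.
  t_crit = stats.t.ppf(1.0 - (1.0 - confidence) / 2.0, degrees_of_freedom)
  margin = t_crit * sample_std
  return sample_mean - margin, sample_mean + margin

# E.g. my_mean_age on the overall slice above: mean 3.64, std 0.34, df 9.
low, high = t_confidence_interval(3.64, 0.34, 9)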
Example #6
    def testAggregateMultipleSlicesWithSampling(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)

        eval_saved_model = load.EvalSavedModel(eval_export_dir)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=eval_export_dir)

        with beam.Pipeline() as pipeline:
            example1 = self._makeExample(age=3.0,
                                         language='english',
                                         label=1.0)
            example2 = self._makeExample(age=3.0,
                                         language='chinese',
                                         label=0.0)
            example3 = self._makeExample(age=4.0,
                                         language='english',
                                         label=1.0)
            example4 = self._makeExample(age=5.0,
                                         language='chinese',
                                         label=0.0)

            predict_result_english_slice = (
                eval_saved_model.as_features_predictions_labels(
                    eval_saved_model.predict_list([
                        example1.SerializeToString(),
                        example3.SerializeToString()
                    ])))

            predict_result_chinese_slice = (
                eval_saved_model.as_features_predictions_labels(
                    eval_saved_model.predict_list([
                        example2.SerializeToString(),
                        example4.SerializeToString()
                    ])))

            test_input = (
                create_test_input(predict_result_english_slice, [(
                    ('language', 'english'))]) +
                create_test_input(predict_result_chinese_slice, [(
                    ('language', 'chinese'))]) +
                # Overall slice
                create_test_input(
                    predict_result_english_slice +
                    predict_result_chinese_slice, [()]))
            metrics, _ = (
                pipeline
                | 'CreateTestInput' >> beam.Create(test_input)
                | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
                    eval_shared_model=eval_shared_model,
                    desired_batch_size=3,
                    num_bootstrap_samples=10))

            def check_overall_slice(slices):
                my_dict = slices[()]
                self.assertAlmostEqual(3.75,
                                       my_dict['my_mean_age'].value,
                                       delta=1)
                self.assertAlmostEqual(3.75,
                                       my_dict['my_mean_age'].unsampled_value)
                for value in my_dict['accuracy']:
                    self.assertAlmostEqual(1.0, value)
                for value in my_dict['label/mean']:
                    self.assertAlmostEqual(0.5, value, delta=0.5)
                for value in my_dict['my_mean_age_times_label']:
                    self.assertAlmostEqual(2.5, value, delta=2.5)

            def check_english_slice(slices):
                my_dict = slices[(('language', 'english'))]
                self.assertAlmostEqual(3.5,
                                       my_dict['my_mean_age'].value,
                                       delta=1)
                self.assertAlmostEqual(3.5,
                                       my_dict['my_mean_age'].unsampled_value)
                for value in my_dict['accuracy']:
                    self.assertAlmostEqual(1.0, value)
                for value in my_dict['label/mean']:
                    self.assertAlmostEqual(1.0, value)
                for value in my_dict['my_mean_age_times_label']:
                    self.assertAlmostEqual(3.5, value, delta=1)

            def check_chinese_slice(slices):
                my_dict = slices[(('language', 'chinese'))]
                self.assertAlmostEqual(4.0,
                                       my_dict['my_mean_age'].value,
                                       delta=1)
                self.assertAlmostEqual(4.0,
                                       my_dict['my_mean_age'].unsampled_value)
                for value in my_dict['accuracy']:
                    self.assertAlmostEqual(1.0, value)
                for value in my_dict['label/mean']:
                    self.assertAlmostEqual(0, value)
                for value in my_dict['my_mean_age_times_label']:
                    self.assertAlmostEqual(0, value)

            def check_result(got):
                self.assertEqual(3, len(got), 'got: %s' % got)
                slices = {}
                for slice_key, metrics in got:
                    slices[slice_key] = metrics
                check_overall_slice(slices)
                check_english_slice(slices)
                check_chinese_slice(slices)

            util.assert_that(metrics, check_result)
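
As the comment in Example #5 notes, the sampling in these tests is Poisson bootstrap resampling: each example receives an independent Poisson(1) weight per replicate, and a replicate whose weights are all zero yields no estimate, which is why that test only asserts an upper bound on sample_degrees_of_freedom. A minimal numpy illustration, not TFMA's implementation:

import numpy as np

def poisson_bootstrap_means(values, num_samples=10, seed=None):
  # Each replicate weights every value by an independent Poisson(1) draw;
  # replicates whose weights are all zero are dropped, reducing the
  # effective number of samples (and hence the degrees of freedom).
  rng = np.random.RandomState(seed)
  values = np.asarray(values, dtype=float)
  means = []
  for _ in range(num_samples):
    weights = rng.poisson(1, size=len(values))
    if weights.sum() == 0:
      continue  # empty replicate: no estimate for this sample
    means.append(np.average(values, weights=weights))
  return means

ages = [3.0, 3.0, 4.0, 5.0]  # from example1..example4 above
bootstrap_means = poisson_bootstrap_means(ages, num_samples=10, seed=0)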