def ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_shared_model: types.EvalSharedModel,
    desired_batch_size: Optional[int] = None,
    num_bootstrap_samples: Optional[int] = 1,
    random_seed_for_testing: Optional[int] = None
) -> Tuple[beam.pvalue.DoOutputsTuple, beam.pvalue.PCollection]:
  """Computes metrics and plots using the EvalSavedModel.

  Args:
    extracts: PCollection of Extracts. Each extract MUST contain a
      FeaturesPredictionsLabels value keyed by
      tfma.FEATURE_PREDICTIONS_LABELS_KEY and a list of SliceKeyType values
      keyed by tfma.SLICE_KEY_TYPES_KEY. Typically these will be added by
      calling the default_extractors function.
    eval_shared_model: Shared model parameters for EvalSavedModel including
      any additional metrics (see EvalSharedModel for more information on how
      to configure additional metrics).
    desired_batch_size: Optional batch size for batching in Aggregate.
    num_bootstrap_samples: Set to value > 1 to run metrics analysis over
      multiple bootstrap samples and compute uncertainty intervals.
    random_seed_for_testing: Provide for deterministic tests only.

  Returns:
    Tuple of Tuple[PCollection of (slice key, metrics), PCollection of
    (slice key, plot metrics)] and PCollection of (slice_key and its example
    count).
  """
  # Side branch: bump counters describing the configured metrics callbacks.
  # The resulting PCollection is intentionally discarded.
  _ = (
      extracts.pipeline
      | counter_util.IncrementMetricsComputationCounters(
          eval_shared_model.add_metrics_callbacks))

  # pylint: disable=no-value-for-parameter

  # Input: one example at a time, with slice keys in extracts.
  # Output: one fpl example per slice key (the example turns into n logical
  # examples, references to which are replicated once per applicable slice
  # key).
  fanned_out_slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

  # Count how many (replicated) examples landed on each slice key.
  example_counts_per_slice = (
      fanned_out_slices
      | 'ExtractSliceKeys' >> beam.Keys()
      | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

  # Metrics are computed per slice key.
  # Output: Multi-outputs, a dict of slice key to computed metrics, and
  # plots if applicable.
  metrics_and_plots = (
      fanned_out_slices
      | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
          eval_shared_model=eval_shared_model,
          desired_batch_size=desired_batch_size,
          num_bootstrap_samples=num_bootstrap_samples,
          random_seed_for_testing=random_seed_for_testing))

  return (metrics_and_plots, example_counts_per_slice)
def testAggregateOverallSlice(self):
  """Aggregating over a single overall slice yields one metrics dict."""
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir)

  with beam.Pipeline() as pipeline:
    examples = [
        self._makeExample(age=3.0, language='english', label=1.0),
        self._makeExample(age=3.0, language='chinese', label=0.0),
        self._makeExample(age=4.0, language='english', label=1.0),
        self._makeExample(age=5.0, language='chinese', label=0.0),
    ]
    predict_result = eval_saved_model.as_features_predictions_labels(
        eval_saved_model.predict_list(
            [example.SerializeToString() for example in examples]))

    # All examples are assigned the overall (empty) slice key.
    metrics, _ = (
        pipeline
        | 'CreateTestInput' >> beam.Create(
            create_test_input(predict_result, [()]))
        | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
            eval_shared_model=eval_shared_model, desired_batch_size=3))

    def check_result(got):
      # Exactly one slice (the overall slice) is expected in the output.
      self.assertEqual(1, len(got), 'got: %s' % got)
      slice_key, slice_metrics = got[0]
      self.assertEqual(slice_key, ())
      self.assertDictElementsAlmostEqual(
          slice_metrics, {
              'accuracy': 1.0,
              'label/mean': 0.5,
              'my_mean_age': 3.75,
              'my_mean_age_times_label': 1.75,
          })

    util.assert_that(metrics, check_result)
def ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts,
    eval_shared_model,
    desired_batch_size=None,
    num_bootstrap_samples=1,
    random_seed=None,
):
  """Computes metrics and plots using the EvalSavedModel.

  Args:
    extracts: PCollection of Extracts. The extracts MUST contain a
      FeaturesPredictionsLabels extract keyed by
      tfma.FEATURE_PREDICTIONS_LABELS_KEY and a list of SliceKeyType extracts
      keyed by tfma.SLICE_KEY_TYPES_KEY. Typically these will be added by
      calling the default_extractors function.
    eval_shared_model: Shared model parameters for EvalSavedModel including
      any additional metrics (see EvalSharedModel for more information on how
      to configure additional metrics).
    desired_batch_size: Optional batch size for batching in Aggregate.
    num_bootstrap_samples: Set to value > 1 to run metrics analysis over
      multiple bootstrap samples and compute uncertainty intervals.
    random_seed: Provide for deterministic tests only.

  Returns:
    DoOutputsTuple. The tuple entries are
    PCollection of (slice key, metrics) and
    PCollection of (slice key, plot metrics).
  """
  # pylint: disable=no-value-for-parameter

  # Input: one example at a time, with slice keys in extracts.
  # Output: one fpl example per slice key (the example is replicated once per
  # applicable slice key).
  fanned_out = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

  # Each slice key lands on one shard where metrics are computed for all
  # examples in that shard -- the "map" and "reduce" parts of the computation
  # happen within this shard.
  # Output: Multi-outputs, a dict of slice key to computed metrics, and plots
  # if applicable.
  return (
      fanned_out
      | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
          eval_shared_model=eval_shared_model,
          desired_batch_size=desired_batch_size,
          num_bootstrap_samples=num_bootstrap_samples,
          random_seed=random_seed))
def testAggregateMultipleSlices(self):
  # Verifies per-slice metrics when examples are partitioned into an
  # 'english' slice, a 'chinese' slice, and the overall (empty) slice.
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir)
  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    predict_result_english_slice = ([
        example1.SerializeToString(),
        example3.SerializeToString()
    ])
    predict_result_chinese_slice = ([
        example2.SerializeToString(),
        example4.SerializeToString()
    ])
    # NOTE(review): the inner parens are redundant, so each slice key is the
    # plain tuple ('language', 'english') / ('language', 'chinese'); the same
    # form is used for lookups in check_result, so the keys match.
    test_input = (
        create_test_input(predict_result_english_slice, [(
            ('language', 'english'))]) +
        create_test_input(predict_result_chinese_slice, [(
            ('language', 'chinese'))]) +
        # Overall slice
        create_test_input(
            predict_result_english_slice + predict_result_chinese_slice,
            [()]))
    # NOTE(review): unlike the overall-slice test, the result is not unpacked
    # as `metrics, _` here -- confirm against the ComputePerSliceMetrics
    # return type used by this revision of the aggregate module.
    metrics = (
        pipeline
        | 'CreateTestInput' >> beam.Create(test_input)
        | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
            eval_shared_model=eval_shared_model, desired_batch_size=3))

    def check_result(got):
      # Expect exactly the three slices built above.
      self.assertEqual(3, len(got), 'got: %s' % got)
      slices = {}
      for slice_key, metrics in got:
        slices[slice_key] = metrics
      overall_slice = ()
      english_slice = (('language', 'english'))
      chinese_slice = (('language', 'chinese'))
      self.assertCountEqual(
          list(slices.keys()), [overall_slice, english_slice, chinese_slice])
      self.assertDictElementsAlmostEqual(
          slices[overall_slice], {
              'accuracy': 1.0,
              'label/mean': 0.5,
              'my_mean_age': 3.75,
              'my_mean_age_times_label': 1.75,
          })
      self.assertDictElementsAlmostEqual(
          slices[english_slice], {
              'accuracy': 1.0,
              'label/mean': 1.0,
              'my_mean_age': 3.5,
              'my_mean_age_times_label': 3.5,
          })
      self.assertDictElementsAlmostEqual(
          slices[chinese_slice], {
              'accuracy': 1.0,
              'label/mean': 0.0,
              'my_mean_age': 4.0,
              'my_mean_age_times_label': 0.0,
          })

    util.assert_that(metrics, check_result)
def testAggregateMultipleSlicesWithSampling(self):
  # Verifies bootstrap-sampled metrics (t-distribution values) across the
  # 'english', 'chinese', and overall slices.
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir)
  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    predict_result_english_slice = (
        eval_saved_model.as_features_predictions_labels(
            eval_saved_model.predict_list([
                example1.SerializeToString(),
                example3.SerializeToString()
            ])))
    predict_result_chinese_slice = (
        eval_saved_model.as_features_predictions_labels(
            eval_saved_model.predict_list([
                example2.SerializeToString(),
                example4.SerializeToString()
            ])))
    test_input = (
        create_test_input(predict_result_english_slice, [(
            ('language', 'english'))]) +
        create_test_input(predict_result_chinese_slice, [(
            ('language', 'chinese'))]) +
        # Overall slice
        create_test_input(
            predict_result_english_slice + predict_result_chinese_slice,
            [()]))
    metrics, _ = (
        pipeline
        | 'CreateTestInput' >> beam.Create(test_input)
        | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
            eval_shared_model=eval_shared_model,
            desired_batch_size=3,
            num_bootstrap_samples=10))

    def assert_almost_equal_to_value_with_t_distribution(
        target,
        unsampled_value,
        sample_mean,
        sample_standard_deviation,
        sample_degrees_of_freedom,
        delta=2):
      # Checks one bootstrapped metric value: exact match on the unsampled
      # value, loose (delta) match on sample statistics.
      self.assertEqual(target.unsampled_value, unsampled_value)
      self.assertAlmostEqual(target.sample_mean, sample_mean, delta=delta)
      self.assertAlmostEqual(
          target.sample_standard_deviation,
          sample_standard_deviation,
          delta=delta)
      # The Poisson resampling could return [0, 0, ... ], which will reduce
      # the number of samples.
      self.assertLessEqual(target.sample_degrees_of_freedom,
                           sample_degrees_of_freedom)

    def check_overall_slice(slices):
      my_dict = slices[()]
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['my_mean_age'], 3.75, 3.64, 0.34, 9)
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['accuracy'], 1.0, 1.0, 0, 9)
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['label/mean'], 0.5, 0.59, 0.29, 9)
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['my_mean_age_times_label'], 1.75, 2.15, 1.06, 9)

    def check_english_slice(slices):
      my_dict = slices[(('language', 'english'))]
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['my_mean_age'], 3.5, 3.18, 0.28, 9)
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['accuracy'], 1.0, 1.0, 0, 9)
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['label/mean'], 1.0, 1.0, 0, 9)
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['my_mean_age_times_label'], 3.5, 3.18, 0.28, 9)

    def check_chinese_slice(slices):
      my_dict = slices[(('language', 'chinese'))]
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['my_mean_age'], 4.0, 4.12, 0.83, 9)
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['accuracy'], 1.0, 1.0, 0, 9)
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['label/mean'], 0, 0, 0, 9)
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['my_mean_age_times_label'], 0, 0, 0, 9)

    def check_result(got):
      self.assertEqual(3, len(got), 'got: %s' % got)
      slices = {}
      for slice_key, metrics in got:
        slices[slice_key] = metrics
      check_overall_slice(slices)
      check_english_slice(slices)
      check_chinese_slice(slices)

    util.assert_that(metrics, check_result)
def testAggregateMultipleSlicesWithSampling(self):
  # NOTE(review): a method with this exact name also appears earlier in this
  # file; if both definitions live in the same class, this later one shadows
  # the earlier one so only this version runs -- confirm and remove one.
  # Verifies bootstrap-sampled metrics via .value/.unsampled_value and
  # per-sample iteration across the three slices.
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir)
  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    predict_result_english_slice = (
        eval_saved_model.as_features_predictions_labels(
            eval_saved_model.predict_list([
                example1.SerializeToString(),
                example3.SerializeToString()
            ])))
    predict_result_chinese_slice = (
        eval_saved_model.as_features_predictions_labels(
            eval_saved_model.predict_list([
                example2.SerializeToString(),
                example4.SerializeToString()
            ])))
    test_input = (
        create_test_input(predict_result_english_slice, [(
            ('language', 'english'))]) +
        create_test_input(predict_result_chinese_slice, [(
            ('language', 'chinese'))]) +
        # Overall slice
        create_test_input(
            predict_result_english_slice + predict_result_chinese_slice,
            [()]))
    metrics, _ = (
        pipeline
        | 'CreateTestInput' >> beam.Create(test_input)
        | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
            eval_shared_model=eval_shared_model,
            desired_batch_size=3,
            num_bootstrap_samples=10))

    def check_overall_slice(slices):
      my_dict = slices[()]
      # Loose deltas because bootstrap resampling is stochastic.
      self.assertAlmostEqual(3.75, my_dict['my_mean_age'].value, delta=1)
      self.assertAlmostEqual(3.75, my_dict['my_mean_age'].unsampled_value)
      for value in my_dict['accuracy']:
        self.assertAlmostEqual(1.0, value)
      for value in my_dict['label/mean']:
        self.assertAlmostEqual(0.5, value, delta=0.5)
      for value in my_dict['my_mean_age_times_label']:
        self.assertAlmostEqual(2.5, value, delta=2.5)

    def check_english_slice(slices):
      my_dict = slices[(('language', 'english'))]
      self.assertAlmostEqual(3.5, my_dict['my_mean_age'].value, delta=1)
      self.assertAlmostEqual(3.5, my_dict['my_mean_age'].unsampled_value)
      for value in my_dict['accuracy']:
        self.assertAlmostEqual(1.0, value)
      for value in my_dict['label/mean']:
        self.assertAlmostEqual(1.0, value)
      for value in my_dict['my_mean_age_times_label']:
        self.assertAlmostEqual(3.5, value, delta=1)

    def check_chinese_slice(slices):
      my_dict = slices[(('language', 'chinese'))]
      self.assertAlmostEqual(4.0, my_dict['my_mean_age'].value, delta=1)
      self.assertAlmostEqual(4.0, my_dict['my_mean_age'].unsampled_value)
      for value in my_dict['accuracy']:
        self.assertAlmostEqual(1.0, value)
      for value in my_dict['label/mean']:
        self.assertAlmostEqual(0, value)
      for value in my_dict['my_mean_age_times_label']:
        self.assertAlmostEqual(0, value)

    def check_result(got):
      # Expect exactly the three slices built above.
      self.assertEqual(3, len(got), 'got: %s' % got)
      slices = {}
      for slice_key, metrics in got:
        slices[slice_key] = metrics
      check_overall_slice(slices)
      check_english_slice(slices)
      check_chinese_slice(slices)

    util.assert_that(metrics, check_result)