def testSerializePlots(self):
  slice_key = _make_slice_key('fruit', 'apple')
  tfma_plots = {
      _full_key(metric_keys.CALIBRATION_PLOT_MATRICES):
          np.array([
              [0.0, 0.0, 0.0],
              [0.3, 1.0, 1.0],
              [0.7, 0.0, 1.0],
              [0.0, 0.0, 0.0],
          ]),
      _full_key(metric_keys.CALIBRATION_PLOT_BOUNDARIES):
          np.array([0.0, 0.5, 1.0]),
  }
  expected_plot_data = """
      slice_key {
        single_slice_keys {
          column: 'fruit'
          bytes_value: 'apple'
        }
      }
      plot_data {
        calibration_histogram_buckets {
          buckets {
            lower_threshold_inclusive: -inf
            upper_threshold_exclusive: 0.0
            num_weighted_examples { value: 0.0 }
            total_weighted_label { value: 0.0 }
            total_weighted_refined_prediction { value: 0.0 }
          }
          buckets {
            lower_threshold_inclusive: 0.0
            upper_threshold_exclusive: 0.5
            num_weighted_examples { value: 1.0 }
            total_weighted_label { value: 1.0 }
            total_weighted_refined_prediction { value: 0.3 }
          }
          buckets {
            lower_threshold_inclusive: 0.5
            upper_threshold_exclusive: 1.0
            num_weighted_examples { value: 1.0 }
            total_weighted_label { value: 0.0 }
            total_weighted_refined_prediction { value: 0.7 }
          }
          buckets {
            lower_threshold_inclusive: 1.0
            upper_threshold_exclusive: inf
            num_weighted_examples { value: 0.0 }
            total_weighted_label { value: 0.0 }
            total_weighted_refined_prediction { value: 0.0 }
          }
        }
      }
  """
  calibration_plot = (
      post_export_metrics.calibration_plot_and_prediction_histogram())
  serialized = metrics_and_plots_evaluator._serialize_plots(
      (slice_key, tfma_plots), [calibration_plot])
  self.assertProtoEquals(
      expected_plot_data,
      metrics_for_slice_pb2.PlotsForSlice.FromString(serialized))
def testCalibrationPlotSerialization(self):
  # Calibration plot for a model that outputs
  #   {prediction: 0.3, true_label: +},
  #   {prediction: 0.7, true_label: -}.
  #
  # These plots were generated by hand. For this test to make sense,
  # they must actually match the kind of output TFMA produces.
  tfma_plots = {
      metric_keys.CALIBRATION_PLOT_MATRICES:
          np.array([
              [0.0, 0.0, 0.0],
              [0.3, 1.0, 1.0],
              [0.7, 0.0, 1.0],
              [0.0, 0.0, 0.0],
          ]),
      metric_keys.CALIBRATION_PLOT_BOUNDARIES:
          np.array([0.0, 0.5, 1.0]),
  }
  expected_plot_data = """
      calibration_histogram_buckets {
        buckets {
          lower_threshold_inclusive: -inf
          upper_threshold_exclusive: 0.0
          num_weighted_examples { value: 0.0 }
          total_weighted_label { value: 0.0 }
          total_weighted_refined_prediction { value: 0.0 }
        }
        buckets {
          lower_threshold_inclusive: 0.0
          upper_threshold_exclusive: 0.5
          num_weighted_examples { value: 1.0 }
          total_weighted_label { value: 1.0 }
          total_weighted_refined_prediction { value: 0.3 }
        }
        buckets {
          lower_threshold_inclusive: 0.5
          upper_threshold_exclusive: 1.0
          num_weighted_examples { value: 1.0 }
          total_weighted_label { value: 0.0 }
          total_weighted_refined_prediction { value: 0.7 }
        }
        buckets {
          lower_threshold_inclusive: 1.0
          upper_threshold_exclusive: inf
          num_weighted_examples { value: 0.0 }
          total_weighted_label { value: 0.0 }
          total_weighted_refined_prediction { value: 0.0 }
        }
      }
  """
  plot_data = metrics_for_slice_pb2.PlotData()
  calibration_plot = (
      post_export_metrics.calibration_plot_and_prediction_histogram())
  calibration_plot.populate_plots_and_pop(tfma_plots, plot_data)
  self.assertProtoEquals(expected_plot_data, plot_data)
  self.assertNotIn(metric_keys.CALIBRATION_PLOT_MATRICES, tfma_plots)
  self.assertNotIn(metric_keys.CALIBRATION_PLOT_BOUNDARIES, tfma_plots)
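# A minimal sketch (not TFMA's implementation) of how the matrix and
# boundaries in the test above relate to the two hand-written examples: each
# row of the matrix is one histogram bucket, and the columns are
# [weighted prediction sum, weighted label sum, weight sum]. Bucket 0 is the
# underflow bucket (-inf, 0.0), bucket i covers
# [boundaries[i - 1], boundaries[i]), and the last bucket is the overflow
# bucket [1.0, inf). The helper name below is hypothetical.
def _hand_computed_calibration_matrix():
  # The two (prediction, label) pairs from the comment above.
  predictions_and_labels = [(0.3, 1.0), (0.7, 0.0)]
  boundaries = np.array([0.0, 0.5, 1.0])
  matrix = np.zeros((len(boundaries) + 1, 3))
  for prediction, label in predictions_and_labels:
    # side='right' makes the lower bucket edge inclusive and the upper
    # edge exclusive.
    bucket = np.searchsorted(boundaries, prediction, side='right')
    matrix[bucket] += [prediction, label, 1.0]
  # matrix now equals the CALIBRATION_PLOT_MATRICES value in the test:
  # [[0.0, 0.0, 0.0], [0.3, 1.0, 1.0], [0.7, 0.0, 1.0], [0.0, 0.0, 0.0]]
  return matrix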
def testSerializePlots_emptyPlot(self):
  slice_key = _make_slice_key('fruit', 'apple')
  tfma_plots = {metric_keys.ERROR_METRIC: 'error_message'}
  calibration_plot = (
      post_export_metrics.calibration_plot_and_prediction_histogram())
  actual_plot = metrics_and_plots_serialization._serialize_plots(
      (slice_key, tfma_plots), [calibration_plot])
  expected_plot = metrics_for_slice_pb2.PlotsForSlice()
  expected_plot.slice_key.CopyFrom(slicer.serialize_slice_key(slice_key))
  expected_plot.plots[
      metric_keys.ERROR_METRIC].debug_message = 'error_message'
  self.assertProtoEquals(
      expected_plot,
      metrics_for_slice_pb2.PlotsForSlice.FromString(actual_plot))
def testCalibrationPlotAndPredictionHistogramLinearClassifier(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      linear_classifier.simple_linear_classifier(None, temp_eval_export_dir))
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=3.0, language='chinese', label=0.0),
      self._makeExample(age=4.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=0.0)
  ]

  def check_result(got):  # pylint: disable=invalid-name
    try:
      self.assertEqual(1, len(got), 'got: %s' % got)
      (slice_key, value) = got[0]
      self.assertEqual((), slice_key)
      self.assertIn(metric_keys.CALIBRATION_PLOT_MATRICES, value)
      # We just check that the bucket sums look sane, since we don't know
      # the exact predictions of the model.
      #
      # Note that the correctness of the bucketing is tested in the other
      # two tests with the fixed prediction estimator. This test is more
      # for ensuring that this metric is compatible with the canned
      # Estimators, for which the prediction Tensor returned for a batch
      # of examples will be an N x 1 Tensor, rather than just an
      # N-element vector.
      buckets = value[metric_keys.CALIBRATION_PLOT_MATRICES]
      bucket_sums = np.sum(buckets, axis=0)
      self.assertAlmostEqual(bucket_sums[1], 2.0)  # label sum
      self.assertAlmostEqual(bucket_sums[2], 4.0)  # weight sum
    except AssertionError as err:
      raise util.BeamAssertException(err)

  self._runTestWithCustomCheck(
      examples,
      eval_export_dir,
      [post_export_metrics.calibration_plot_and_prediction_histogram()],
      custom_plots_check=check_result)
def testWriteMetricsAndPlots(self):
  metrics_file = os.path.join(self._getTempDir(), 'metrics')
  plots_file = os.path.join(self._getTempDir(), 'plots')
  temp_eval_export_dir = os.path.join(self._getTempDir(), 'eval_export_dir')
  _, eval_export_dir = (
      fixed_prediction_estimator.simple_fixed_prediction_estimator(
          None, temp_eval_export_dir))
  eval_config = config.EvalConfig(
      model_specs=[config.ModelSpec()],
      options=config.Options(
          disabled_outputs={'values': ['eval_config.json']}))
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir,
      add_metrics_callbacks=[
          post_export_metrics.example_count(),
          post_export_metrics.calibration_plot_and_prediction_histogram(
              num_buckets=2)
      ])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor()
  ]
  evaluators = [
      metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(eval_shared_model)
  ]
  output_paths = {
      constants.METRICS_KEY: metrics_file,
      constants.PLOTS_KEY: plots_file
  }
  writers = [
      metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
          output_paths, eval_shared_model.add_metrics_callbacks)
  ]

  with beam.Pipeline() as pipeline:
    # With num_buckets=2 the boundaries are [0.0, 0.5, 1.0]: example1
    # (prediction 0.0, label 1.0) lands in the [0.0, 0.5) bucket and
    # example2 (prediction 1.0, label 1.0) lands in the [1.0, inf)
    # overflow bucket, matching the expected plot proto below.
    example1 = self._makeExample(prediction=0.0, label=1.0)
    example2 = self._makeExample(prediction=1.0, label=1.0)

    # pylint: disable=no-value-for-parameter
    _ = (
        pipeline
        | 'Create' >> beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
        ])
        | 'ExtractEvaluateAndWriteResults' >>
        model_eval_lib.ExtractEvaluateAndWriteResults(
            eval_config=eval_config,
            eval_shared_model=eval_shared_model,
            extractors=extractors,
            evaluators=evaluators,
            writers=writers))
    # pylint: enable=no-value-for-parameter

  expected_metrics_for_slice = text_format.Parse(
      """
      slice_key {}
      metrics {
        key: "average_loss"
        value {
          double_value {
            value: 0.5
          }
        }
      }
      metrics {
        key: "post_export_metrics/example_count"
        value {
          double_value {
            value: 2.0
          }
        }
      }
      """, metrics_for_slice_pb2.MetricsForSlice())

  metric_records = []
  for record in tf.compat.v1.python_io.tf_record_iterator(metrics_file):
    metric_records.append(
        metrics_for_slice_pb2.MetricsForSlice.FromString(record))
  self.assertEqual(1, len(metric_records), 'metrics: %s' % metric_records)
  self.assertProtoEquals(expected_metrics_for_slice, metric_records[0])

  expected_plots_for_slice = text_format.Parse(
      """
      slice_key {}
      plots {
        key: "post_export_metrics"
        value {
          calibration_histogram_buckets {
            buckets {
              lower_threshold_inclusive: -inf
              num_weighted_examples {}
              total_weighted_label {}
              total_weighted_refined_prediction {}
            }
            buckets {
              upper_threshold_exclusive: 0.5
              num_weighted_examples { value: 1.0 }
              total_weighted_label { value: 1.0 }
              total_weighted_refined_prediction {}
            }
            buckets {
              lower_threshold_inclusive: 0.5
              upper_threshold_exclusive: 1.0
              num_weighted_examples {}
              total_weighted_label {}
              total_weighted_refined_prediction {}
            }
            buckets {
              lower_threshold_inclusive: 1.0
              upper_threshold_exclusive: inf
              num_weighted_examples { value: 1.0 }
              total_weighted_label { value: 1.0 }
              total_weighted_refined_prediction { value: 1.0 }
            }
          }
        }
      }
      """, metrics_for_slice_pb2.PlotsForSlice())

  plot_records = []
  for record in tf.compat.v1.python_io.tf_record_iterator(plots_file):
    plot_records.append(
        metrics_for_slice_pb2.PlotsForSlice.FromString(record))
  self.assertEqual(1, len(plot_records), 'plots: %s' % plot_records)
  self.assertProtoEquals(expected_plots_for_slice, plot_records[0])
def testConvertSlicePlotsToProtoLegacyStringKeys(self):
  slice_key = _make_slice_key('fruit', 'apple')
  tfma_plots = {
      metric_keys.CALIBRATION_PLOT_MATRICES:
          np.array([
              [0.0, 0.0, 0.0],
              [0.3, 1.0, 1.0],
              [0.7, 0.0, 1.0],
              [0.0, 0.0, 0.0],
          ]),
      metric_keys.CALIBRATION_PLOT_BOUNDARIES:
          np.array([0.0, 0.5, 1.0]),
  }
  expected_plot_data = """
      slice_key {
        single_slice_keys {
          column: 'fruit'
          bytes_value: 'apple'
        }
      }
      plots {
        key: "post_export_metrics"
        value {
          calibration_histogram_buckets {
            buckets {
              lower_threshold_inclusive: -inf
              upper_threshold_exclusive: 0.0
              num_weighted_examples { value: 0.0 }
              total_weighted_label { value: 0.0 }
              total_weighted_refined_prediction { value: 0.0 }
            }
            buckets {
              lower_threshold_inclusive: 0.0
              upper_threshold_exclusive: 0.5
              num_weighted_examples { value: 1.0 }
              total_weighted_label { value: 1.0 }
              total_weighted_refined_prediction { value: 0.3 }
            }
            buckets {
              lower_threshold_inclusive: 0.5
              upper_threshold_exclusive: 1.0
              num_weighted_examples { value: 1.0 }
              total_weighted_label { value: 0.0 }
              total_weighted_refined_prediction { value: 0.7 }
            }
            buckets {
              lower_threshold_inclusive: 1.0
              upper_threshold_exclusive: inf
              num_weighted_examples { value: 0.0 }
              total_weighted_label { value: 0.0 }
              total_weighted_refined_prediction { value: 0.0 }
            }
          }
        }
      }
  """
  calibration_plot = (
      post_export_metrics.calibration_plot_and_prediction_histogram())
  got = metrics_plots_and_validations_writer.convert_slice_plots_to_proto(
      (slice_key, tfma_plots), [calibration_plot])
  self.assertProtoEquals(expected_plot_data, got)
def testCalibrationPlotAndPredictionHistogramWeighted(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fixed_prediction_estimator_extra_fields
      .simple_fixed_prediction_estimator_extra_fields(None,
                                                      temp_eval_export_dir))
  examples = [
      # For each example, we set label to prediction + 1; fixed_float is
      # used as the example weight.
      self._makeExample(
          prediction=-10.0,
          label=-9.0,
          fixed_float=1.0,
          fixed_string='',
          fixed_int=0),
      self._makeExample(
          prediction=-9.0,
          label=-8.0,
          fixed_float=2.0,
          fixed_string='',
          fixed_int=0),
      self._makeExample(
          prediction=0.0000,
          label=1.0000,
          fixed_float=0.0,
          fixed_string='',
          fixed_int=0),
      self._makeExample(
          prediction=0.00100,
          label=1.00100,
          fixed_float=1.0,
          fixed_string='',
          fixed_int=0),
      self._makeExample(
          prediction=0.00101,
          label=1.00101,
          fixed_float=2.0,
          fixed_string='',
          fixed_int=0),
      self._makeExample(
          prediction=0.00102,
          label=1.00102,
          fixed_float=3.0,
          fixed_string='',
          fixed_int=0),
      self._makeExample(
          prediction=10.0,
          label=11.0,
          fixed_float=7.0,
          fixed_string='',
          fixed_int=0),
  ]

  def check_result(got):  # pylint: disable=invalid-name
    try:
      self.assertEqual(1, len(got), 'got: %s' % got)
      (slice_key, value) = got[0]
      self.assertEqual((), slice_key)
      self.assertIn(metric_keys.CALIBRATION_PLOT_MATRICES, value)
      buckets = value[metric_keys.CALIBRATION_PLOT_MATRICES]
      # Bucket 0 (underflow): predictions -10.0 * 1.0 + -9.0 * 2.0 = -28.0,
      # labels -9.0 * 1.0 + -8.0 * 2.0 = -25.0, total weight 3.0.
      self.assertSequenceAlmostEqual(buckets[0], [-28.0, -25.0, 3.0])
      # Bucket 1 holds only the prediction=0.0 example, which has weight 0.
      self.assertSequenceAlmostEqual(buckets[1], [0.0, 0.0, 0.0])
      # Bucket 11: 0.00100 * 1.0 + 0.00101 * 2.0 + 0.00102 * 3.0 = 0.00608,
      # 1.00100 * 1.0 + 1.00101 * 2.0 + 1.00102 * 3.0 = 6.00608, weight 6.0.
      self.assertSequenceAlmostEqual(buckets[11], [0.00608, 6.00608, 6.0])
      # Bucket 10001 (overflow): 10.0 * 7.0 = 70.0, 11.0 * 7.0 = 77.0,
      # weight 7.0.
      self.assertSequenceAlmostEqual(buckets[10001], [70.0, 77.0, 7.0])
    except AssertionError as err:
      raise util.BeamAssertException(err)

  self._runTestWithCustomCheck(
      examples,
      eval_export_dir,
      [
          post_export_metrics.calibration_plot_and_prediction_histogram(
              example_weight_key='fixed_float')
      ],
      custom_plots_check=check_result)
def testCalibrationPlotAndPredictionHistogramUnweighted(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fixed_prediction_estimator.simple_fixed_prediction_estimator(
          None, temp_eval_export_dir))
  examples = [
      # For each example, we set label to prediction + 1.
      # These two go in bucket 0: (-inf, 0)
      self._makeExample(prediction=-10.0, label=-9.0),
      self._makeExample(prediction=-9.0, label=-8.0),
      # This goes in bucket 1: [0.00000, 0.00010)
      self._makeExample(prediction=0.00000, label=1.00000),
      # These three go in bucket 11: [0.00100, 0.00110)
      self._makeExample(prediction=0.00100, label=1.00100),
      self._makeExample(prediction=0.00101, label=1.00101),
      self._makeExample(prediction=0.00102, label=1.00102),
      # These two go in bucket 10000: [0.99990, 1.00000)
      self._makeExample(prediction=0.99998, label=1.99998),
      self._makeExample(prediction=0.99999, label=1.99999),
      # These four go in bucket 10001: [1.00000, +inf)
      self._makeExample(prediction=1.0, label=2.0),
      self._makeExample(prediction=8.0, label=9.0),
      self._makeExample(prediction=9.0, label=10.0),
      self._makeExample(prediction=10.0, label=11.0),
  ]
  calibration_plot = (
      post_export_metrics.calibration_plot_and_prediction_histogram())

  def check_result(got):  # pylint: disable=invalid-name
    try:
      self.assertEqual(1, len(got), 'got: %s' % got)
      (slice_key, value) = got[0]
      self.assertEqual((), slice_key)
      self.assertIn(metric_keys.CALIBRATION_PLOT_MATRICES, value)
      buckets = value[metric_keys.CALIBRATION_PLOT_MATRICES]
      self.assertSequenceAlmostEqual(buckets[0], [-19.0, -17.0, 2.0])
      self.assertSequenceAlmostEqual(buckets[1], [0.0, 1.0, 1.0])
      self.assertSequenceAlmostEqual(buckets[11], [0.00303, 3.00303, 3.0])
      self.assertSequenceAlmostEqual(buckets[10000],
                                     [1.99997, 3.99997, 2.0])
      self.assertSequenceAlmostEqual(buckets[10001], [28.0, 32.0, 4.0])
      self.assertIn(metric_keys.CALIBRATION_PLOT_BOUNDARIES, value)
      boundaries = value[metric_keys.CALIBRATION_PLOT_BOUNDARIES]
      self.assertAlmostEqual(0.0, boundaries[0])
      self.assertAlmostEqual(0.001, boundaries[10])
      self.assertAlmostEqual(0.005, boundaries[50])
      self.assertAlmostEqual(0.010, boundaries[100])
      self.assertAlmostEqual(0.100, boundaries[1000])
      self.assertAlmostEqual(0.800, boundaries[8000])
      self.assertAlmostEqual(1.000, boundaries[10000])
      plot_data = metrics_for_slice_pb2.PlotData()
      calibration_plot.populate_plots_and_pop(value, plot_data)
      self.assertProtoEquals(
          """
          lower_threshold_inclusive: 1.0
          upper_threshold_exclusive: inf
          num_weighted_examples { value: 4.0 }
          total_weighted_label { value: 32.0 }
          total_weighted_refined_prediction { value: 28.0 }
          """, plot_data.calibration_histogram_buckets.buckets[10001])
    except AssertionError as err:
      raise util.BeamAssertException(err)

  self._runTestWithCustomCheck(
      examples,
      eval_export_dir, [calibration_plot],
      custom_plots_check=check_result)
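# A minimal numpy sketch (not TFMA's implementation) of the bucketing the two
# tests above assert against. With the default num_buckets=10000 over
# [0.0, 1.0], boundaries[i] = i / 10000; bucket 0 is the underflow bucket
# (-inf, 0.0) and bucket num_buckets + 1 is the overflow bucket [1.0, inf).
# The helper name is hypothetical, and exact float behavior at bucket edges
# may differ from TFMA's own bucketization.
def _hand_computed_bucket_sums(predictions, labels, weights=None,
                               num_buckets=10000):
  if weights is None:
    weights = [1.0] * len(predictions)  # the unweighted case
  boundaries = np.arange(num_buckets + 1) / num_buckets
  matrix = np.zeros((num_buckets + 2, 3))
  for prediction, label, weight in zip(predictions, labels, weights):
    # side='right' makes the lower bucket edge inclusive and the upper
    # edge exclusive.
    bucket = np.searchsorted(boundaries, prediction, side='right')
    matrix[bucket] += [weight * prediction, weight * label, weight]
  return matrix

# For the unweighted test, for example, bucket 11 collects the three
# predictions in [0.00100, 0.00110):
#   predictions: 0.00100 + 0.00101 + 0.00102 = 0.00303
#   labels:      1.00100 + 1.00101 + 1.00102 = 3.00303
#   count:       3.0
# matching assertSequenceAlmostEqual(buckets[11], [0.00303, 3.00303, 3.0]).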