def _runTestWithCustomCheck(self,
                            examples,
                            eval_export_dir,
                            metrics,
                            custom_metrics_check=None,
                            custom_plots_check=None):
  # Make sure we are doing some checks.
  self.assertTrue(custom_metrics_check is not None or
                  custom_plots_check is not None)
  serialized_examples = [ex.SerializeToString() for ex in examples]
  eval_shared_model = types.EvalSharedModel(
      model_path=eval_export_dir, add_metrics_callbacks=metrics)
  extractors = model_eval_lib.default_extractors(
      eval_shared_model=eval_shared_model)
  with beam.Pipeline() as pipeline:
    metrics, plots = (
        pipeline
        | 'Create' >> beam.Create(serialized_examples)
        | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
        | 'Extract' >> evaluate.Extract(extractors=extractors)
        | 'Evaluate' >> evaluate.Evaluate(eval_shared_model=eval_shared_model))
    if custom_metrics_check is not None:
      util.assert_that(metrics, custom_metrics_check, label='metrics')
    if custom_plots_check is not None:
      util.assert_that(plots, custom_plots_check, label='plot')
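# A minimal sketch (not part of the original suite) of how a test might drive
# _runTestWithCustomCheck with a custom metrics check. The test name and the
# exact expected value are illustrative assumptions; the helpers referenced
# (_getEvalExportDir, _makeExample, _addExampleCountMetricCallback) all appear
# elsewhere in this file.
def testEvaluateWithCustomMetricsCheck(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)

  def check_metrics(got):
    try:
      self.assertEqual(1, len(got), 'got: %s' % got)
      (slice_key, value) = got[0]
      self.assertEqual((), slice_key)
      # Two examples were fed in, so the example-count callback should
      # report 2.0 for the overall slice.
      self.assertDictElementsAlmostEqual(value, {'added_example_count': 2.0})
    except AssertionError as err:
      raise util.BeamAssertException(err)

  self._runTestWithCustomCheck(
      examples=[
          self._makeExample(age=3.0, language='english', label=1.0),
          self._makeExample(age=5.0, language='chinese', label=0.0),
      ],
      eval_export_dir=eval_export_dir,
      metrics=[_addExampleCountMetricCallback],
      custom_metrics_check=check_metrics)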
def testEvaluateNoSlicing(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_shared_model = types.EvalSharedModel(
      model_path=eval_export_dir,
      add_metrics_callbacks=[_addExampleCountMetricCallback])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor()
  ]

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    metrics, _ = (
        pipeline
        | 'Create' >> beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
            example3.SerializeToString(),
            example4.SerializeToString()
        ])
        | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
        | 'Extract' >> evaluate.Extract(extractors=extractors)
        | 'Evaluate' >> evaluate.Evaluate(eval_shared_model=eval_shared_model))

    def check_result(got):
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        self.assertDictElementsAlmostEqual(
            value, {
                'accuracy': 1.0,
                'label/mean': 0.5,
                'my_mean_age': 3.75,
                'my_mean_age_times_label': 1.75,
                'added_example_count': 4.0
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(metrics, check_result)
def assertMetricsComputedWithBeamAre(self, eval_saved_model_path,
                                     serialized_examples, expected_metrics):
  """Checks metrics computed using Beam.

  Metrics will be computed over all examples, without any slicing. If you
  want to provide your own PCollection (e.g. read a large number of examples
  from a file), if you want to check metrics over certain slices, or if you
  want to add additional post-export metrics, use the more general
  assertGeneralMetricsComputedWithBeamAre.

  Example usage:
    self.assertMetricsComputedWithBeamAre(
        eval_saved_model_path=path,
        serialized_examples=[self.makeExample(age=5, label=1.0),
                             self.makeExample(age=10, label=0.0)],
        expected_metrics={'average_loss': 0.1})

  Args:
    eval_saved_model_path: Path to the directory containing the
      EvalSavedModel.
    serialized_examples: List of serialized example bytes.
    expected_metrics: Dictionary of expected metric values.
  """

  def check_metrics(got):
    """Check metrics callback."""
    try:
      self.assertEqual(
          1, len(got), 'expecting metrics for exactly one slice, but got %d '
          'slices instead. metrics were: %s' % (len(got), got))
      (slice_key, value) = got[0]
      self.assertEqual((), slice_key)
      self.assertDictElementsWithinBounds(
          got_values_dict=value, expected_values_dict=expected_metrics)
    except AssertionError as err:
      raise beam_util.BeamAssertException(err)

  eval_shared_model = types.EvalSharedModel(
      model_path=eval_saved_model_path)
  extractors = model_eval_lib.default_extractors(
      eval_shared_model=eval_shared_model)

  with beam.Pipeline() as pipeline:
    metrics, _ = (
        pipeline
        | 'CreateExamples' >> beam.Create(serialized_examples)
        | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
        | 'Extract' >> evaluate.Extract(extractors=extractors)
        | 'Evaluate' >> evaluate.Evaluate(eval_shared_model=eval_shared_model))

    beam_util.assert_that(metrics, check_metrics)
def testEvaluateNoSlicingAddPostExportAndCustomMetricsUnsupervisedModel(self):
  # Mainly for testing that the ExampleCount post export metric works with
  # unsupervised models.
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fixed_prediction_estimator_no_labels
      .simple_fixed_prediction_estimator_no_labels(None,
                                                   temp_eval_export_dir))
  eval_shared_model = types.EvalSharedModel(
      model_path=eval_export_dir,
      add_metrics_callbacks=[
          post_export_metrics.example_count(),
          post_export_metrics.example_weight(example_weight_key='prediction')
      ])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor()
  ]

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(prediction=1.0)
    example2 = self._makeExample(prediction=2.0)

    metrics, plots = (
        pipeline
        | 'Create' >> beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
        ])
        | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
        | 'Extract' >> evaluate.Extract(extractors=extractors)
        | 'Evaluate' >> evaluate.Evaluate(eval_shared_model=eval_shared_model))

    def check_result(got):
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        self.assertDictElementsAlmostEqual(
            got_values_dict=value,
            expected_values_dict={
                'average_loss': 2.5,
                metric_keys.EXAMPLE_COUNT: 2.0,
                metric_keys.EXAMPLE_WEIGHT: 3.0
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(metrics, check_result, label='metrics')
    util.assert_that(plots, util.is_empty(), label='plots')
def assertGeneralMetricsComputedWithBeamAre(self, eval_saved_model_path,
                                            examples_pcollection, slice_spec,
                                            add_metrics_callbacks,
                                            expected_slice_metrics):
  """Checks metrics computed using Beam.

  A more general version of assertMetricsComputedWithBeamAre. Note that the
  caller is responsible for setting up and running the Beam pipeline.

  Example usage:
    def add_metrics(features, predictions, labels):
      metric_ops = {
          'mse': tf.metrics.mean_squared_error(labels, predictions['logits']),
          'mae': tf.metrics.mean_absolute_error(labels, predictions['logits']),
      }
      return metric_ops

    with beam.Pipeline() as pipeline:
      expected_slice_metrics = {
          (): {
              'mae': 0.1,
              'mse': 0.2,
              tfma.post_export_metrics.metric_keys.AUC:
                  tfma.test.BoundedValue(lower_bound=0.5)
          },
          (('age', 10),): {
              'mae': 0.2,
              'mse': 0.3,
              tfma.post_export_metrics.metric_keys.AUC:
                  tfma.test.BoundedValue(lower_bound=0.5)
          },
      }
      examples = pipeline | 'ReadExamples' >> beam.io.ReadFromTFRecord(path)
      self.assertGeneralMetricsComputedWithBeamAre(
          eval_saved_model_path=path,
          examples_pcollection=examples,
          slice_spec=[tfma.SingleSliceSpec(),
                      tfma.SingleSliceSpec(columns=['age'])],
          add_metrics_callbacks=[
              add_metrics, tfma.post_export_metrics.auc()],
          expected_slice_metrics=expected_slice_metrics)

  Args:
    eval_saved_model_path: Path to the directory containing the
      EvalSavedModel.
    examples_pcollection: A PCollection of serialized example bytes.
    slice_spec: List of slice specifications.
    add_metrics_callbacks: Callbacks for adding additional metrics.
    expected_slice_metrics: Dictionary of dictionaries describing the
      expected metrics for each slice. The outer dictionary maps slice keys
      to the expected metrics for that slice.
  """

  def check_metrics(got):
    """Check metrics callback."""
    try:
      slices = {}
      for slice_key, value in got:
        slices[slice_key] = value
      self.assertItemsEqual(
          list(slices.keys()), list(expected_slice_metrics.keys()))
      for slice_key, expected_metrics in expected_slice_metrics.items():
        self.assertDictElementsWithinBounds(
            got_values_dict=slices[slice_key],
            expected_values_dict=expected_metrics)
    except AssertionError as err:
      raise beam_util.BeamAssertException(err)

  eval_shared_model = types.EvalSharedModel(
      model_path=eval_saved_model_path,
      add_metrics_callbacks=add_metrics_callbacks)
  extractors = model_eval_lib.default_extractors(
      eval_shared_model=eval_shared_model, slice_spec=slice_spec)

  metrics, _ = (
      examples_pcollection
      | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
      | 'Extract' >> evaluate.Extract(extractors=extractors)
      | 'Evaluate' >> evaluate.Evaluate(eval_shared_model=eval_shared_model))

  beam_util.assert_that(metrics, check_metrics)
def ExtractEvaluateAndWriteResults(  # pylint: disable=invalid-name
    examples,
    eval_shared_model,
    output_path,
    display_only_data_location=None,
    slice_spec=None,
    desired_batch_size=None,
    extractors=None,
    fanout=16):
  """Public API version of evaluate.Evaluate that also performs extraction and writes the results to disk.

  Users who want to construct their own Beam pipelines instead of using the
  lightweight run_model_analysis functions should use this PTransform.

  Example usage:
    eval_shared_model = tfma.default_eval_shared_model(
        eval_saved_model_path=model_location,
        add_metrics_callbacks=[...],
        example_weight_key=example_weight_key)
    with beam.Pipeline(runner=...) as p:
      _ = (p
           | 'ReadData' >> beam.io.ReadFromTFRecord(data_location)
           | 'ExtractEvaluateAndWriteResults' >>
           tfma.ExtractEvaluateAndWriteResults(
               eval_shared_model=eval_shared_model,
               output_path=output_path,
               display_only_data_location=data_location,
               slice_spec=slice_spec,
               ...))
    result = tfma.load_eval_result(output_path=output_path)
    tfma.view.render_slicing_metrics(result)

  Note that the exact serialization format is an internal implementation
  detail and subject to change. Users should only use the TFMA functions to
  write and read the results.

  Args:
    examples: PCollection of input examples. Can be any format the model
      accepts (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_shared_model: Shared model parameters for EvalSavedModel including
      any additional metrics (see EvalSharedModel for more information on how
      to configure additional metrics).
    output_path: Path to output metrics and plots results.
    display_only_data_location: Optional path indicating where the examples
      were read from. This is used only for display purposes - data will not
      actually be read from this path.
    slice_spec: Optional list of SingleSliceSpec specifying the slices to
      slice the data into. If None, defaults to the overall slice.
    desired_batch_size: Optional batch size for batching in Predict and
      Aggregate.
    extractors: Optional list of Extractors to apply to ExampleAndExtracts.
      If provided, the extracts MUST contain a FeaturesPredictionsLabels
      extract with key 'fpl' and a list of SliceKeyType extracts with key
      'slice_keys'. Typically these will be added by calling the
      default_extractors function. If no extractors are provided,
      default_extractors (non-materialized) will be used.
    fanout: Amount of fanout to use when aggregating metrics; forwarded to
      evaluate.Evaluate.

  Raises:
    ValueError: If PredictExtractor or SliceKeyExtractor is not present in
      extractors.

  Returns:
    PDone.
  """
  if not extractors:
    extractors = default_extractors(
        eval_shared_model=eval_shared_model,
        slice_spec=slice_spec,
        desired_batch_size=desired_batch_size,
        materialize=False)

  metrics, plots = (
      examples
      | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
      | 'Extract' >> evaluate.Extract(extractors=extractors)
      | 'Evaluate' >> evaluate.Evaluate(
          eval_shared_model=eval_shared_model,
          desired_batch_size=desired_batch_size,
          fanout=fanout))

  data_location = '<user provided PCollection>'
  if display_only_data_location is not None:
    data_location = display_only_data_location

  example_weight_metric_key = metric_keys.EXAMPLE_COUNT
  if eval_shared_model.example_weight_key:
    example_weight_metric_key = metric_keys.EXAMPLE_WEIGHT

  eval_config = api_types.EvalConfig(
      model_location=eval_shared_model.model_path,
      data_location=data_location,
      slice_spec=slice_spec,
      example_weight_metric_key=example_weight_metric_key)

  _ = ((metrics, plots)
       | 'SerializeMetricsAndPlots' >> serialization.SerializeMetricsAndPlots(
           post_export_metrics=eval_shared_model.add_metrics_callbacks)
       | 'WriteMetricsPlotsAndConfig' >>
       serialization.WriteMetricsPlotsAndConfig(
           output_path=output_path, eval_config=eval_config))

  return beam.pvalue.PDone(examples.pipeline)
def testEvaluateWithPlots(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fixed_prediction_estimator.simple_fixed_prediction_estimator(
          None, temp_eval_export_dir))
  eval_shared_model = types.EvalSharedModel(
      model_path=eval_export_dir,
      add_metrics_callbacks=[
          post_export_metrics.example_count(),
          post_export_metrics.auc_plots()
      ])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor()
  ]

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(prediction=0.0, label=1.0)
    example2 = self._makeExample(prediction=0.7, label=0.0)
    example3 = self._makeExample(prediction=0.8, label=1.0)
    example4 = self._makeExample(prediction=1.0, label=1.0)

    metrics, plots = (
        pipeline
        | 'Create' >> beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
            example3.SerializeToString(),
            example4.SerializeToString()
        ])
        | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
        | 'Extract' >> evaluate.Extract(extractors=extractors)
        | 'Evaluate' >> evaluate.Evaluate(eval_shared_model=eval_shared_model))

    def check_metrics(got):
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        self.assertDictElementsAlmostEqual(
            got_values_dict=value,
            expected_values_dict={
                metric_keys.EXAMPLE_COUNT: 4.0,
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(metrics, check_metrics, label='metrics')

    def check_plots(got):
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        # At the highest threshold bucket only the prediction of 1.0 counts
        # as positive, so fn=2, tn=1, fp=0, tp=1, precision=1.0, recall=1/3.
        self.assertDictMatrixRowsAlmostEqual(
            got_values_dict=value,
            expected_values_dict={
                metric_keys.AUC_PLOTS_MATRICES: [
                    (8001, [2, 1, 0, 1, 1.0 / 1.0, 1.0 / 3.0])
                ],
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(plots, check_plots, label='plots')
def testEvaluateNoSlicingAddPostExportAndCustomMetrics(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_shared_model = types.EvalSharedModel(
      model_path=eval_export_dir,
      add_metrics_callbacks=[
          _addExampleCountMetricCallback,
          # Note that since everything runs in-process this doesn't
          # actually test that the py_func can be correctly recreated
          # on workers in a distributed context.
          _addPyFuncMetricCallback,
          post_export_metrics.example_count(),
          post_export_metrics.example_weight(example_weight_key='age')
      ])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor()
  ]

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    metrics, plots = (
        pipeline
        | 'Create' >> beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
            example3.SerializeToString(),
            example4.SerializeToString()
        ])
        | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
        | 'Extract' >> evaluate.Extract(extractors=extractors)
        | 'Evaluate' >> evaluate.Evaluate(eval_shared_model=eval_shared_model))

    def check_result(got):
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        self.assertDictElementsAlmostEqual(
            got_values_dict=value,
            expected_values_dict={
                'accuracy': 1.0,
                'label/mean': 0.5,
                'my_mean_age': 3.75,
                'my_mean_age_times_label': 1.75,
                'added_example_count': 4.0,
                'py_func_label_sum': 2.0,
                metric_keys.EXAMPLE_COUNT: 4.0,
                metric_keys.EXAMPLE_WEIGHT: 15.0
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(metrics, check_result, label='metrics')
    util.assert_that(plots, util.is_empty(), label='plots')
def testEvaluateWithSlicingAndDifferentBatchSizes(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_shared_model = types.EvalSharedModel(
      model_path=eval_export_dir,
      add_metrics_callbacks=[_addExampleCountMetricCallback])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor([
          slicer.SingleSliceSpec(),
          slicer.SingleSliceSpec(columns=['slice_key'])
      ])
  ]

  for batch_size in [1, 2, 4, 8]:
    with beam.Pipeline() as pipeline:
      example1 = self._makeExample(
          age=3.0, language='english', label=1.0, slice_key='first_slice')
      example2 = self._makeExample(
          age=3.0, language='chinese', label=0.0, slice_key='first_slice')
      example3 = self._makeExample(
          age=4.0, language='english', label=0.0, slice_key='second_slice')
      example4 = self._makeExample(
          age=5.0, language='chinese', label=1.0, slice_key='second_slice')
      example5 = self._makeExample(
          age=5.0, language='chinese', label=1.0, slice_key='second_slice')

      metrics, plots = (
          pipeline
          | 'Create' >> beam.Create([
              example1.SerializeToString(),
              example2.SerializeToString(),
              example3.SerializeToString(),
              example4.SerializeToString(),
              example5.SerializeToString(),
          ])
          | 'ToExampleAndExtracts' >> evaluate.ToExampleAndExtracts()
          | 'Extractors' >> evaluate.Extract(extractors=extractors)
          | 'Evaluate' >> evaluate.Evaluate(
              eval_shared_model=eval_shared_model,
              desired_batch_size=batch_size))

      def check_result(got):
        try:
          self.assertEqual(3, len(got), 'got: %s' % got)
          slices = {}
          for slice_key, value in got:
            slices[slice_key] = value
          overall_slice = ()
          first_slice = (('slice_key', b'first_slice'),)
          second_slice = (('slice_key', b'second_slice'),)
          self.assertItemsEqual(
              list(slices.keys()), [overall_slice, first_slice, second_slice])
          self.assertDictElementsAlmostEqual(
              slices[overall_slice], {
                  'accuracy': 0.4,
                  'label/mean': 0.6,
                  'my_mean_age': 4.0,
                  'my_mean_age_times_label': 2.6,
                  'added_example_count': 5.0
              })
          self.assertDictElementsAlmostEqual(
              slices[first_slice], {
                  'accuracy': 1.0,
                  'label/mean': 0.5,
                  'my_mean_age': 3.0,
                  'my_mean_age_times_label': 1.5,
                  'added_example_count': 2.0
              })
          self.assertDictElementsAlmostEqual(
              slices[second_slice], {
                  'accuracy': 0.0,
                  'label/mean': 2.0 / 3.0,
                  'my_mean_age': 14.0 / 3.0,
                  'my_mean_age_times_label': 10.0 / 3.0,
                  'added_example_count': 3.0
              })
        except AssertionError as err:
          # This function is redefined every iteration, so it will have the
          # right value of batch_size.
          raise util.BeamAssertException(
              'batch_size = %d, error: %s' %
              (batch_size, err))  # pylint: disable=cell-var-from-loop

      util.assert_that(metrics, check_result, label='metrics')
      util.assert_that(plots, util.is_empty(), label='plots')