def testEvaluateQueryBasedMetrics(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fixed_prediction_estimator_extra_fields
      .simple_fixed_prediction_estimator_extra_fields(None,
                                                      temp_eval_export_dir))
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir)
  extractors = [
      legacy_predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor()
  ]

  with beam.Pipeline() as pipeline:
    metrics = (
        pipeline
        | 'Create' >> beam.Create(self._get_examples())
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
        | 'EvaluateQueryBasedMetrics' >>
        query_based_metrics_evaluator.EvaluateQueryBasedMetrics(
            prediction_key='',
            query_id='fixed_string',
            combine_fns=[
                query_statistics.QueryStatisticsCombineFn(),
                ndcg.NdcgMetricCombineFn(
                    at_vals=[1, 2],
                    gain_key='fixed_float',
                    weight_key='fixed_int'),
                min_label_position.MinLabelPositionCombineFn(
                    label_key='', weight_key='fixed_int'),
            ]))

    def check_metrics(got):
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        self.assertDictElementsAlmostEqual(
            got_metrics, {
                'post_export_metrics/total_queries': 3.0,
                'post_export_metrics/total_documents': 6.0,
                'post_export_metrics/min_documents': 1.0,
                'post_export_metrics/max_documents': 3.0,
                'post_export_metrics/ndcg@1': 0.9166667,
                'post_export_metrics/ndcg@2': 0.9766198,
                'post_export_metrics/average_min_label_position/__labels':
                    0.6666667,
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(
        metrics[constants.METRICS_KEY], check_metrics, label='metrics')

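# Not part of the original test: a quick consistency check on the expected
# query statistics asserted above. The input comes from self._get_examples()
# (not shown here), but total_queries=3, total_documents=6, min_documents=1
# and max_documents=3 are only mutually consistent if the three query groups
# (keyed by 'fixed_string') contain 1, 2 and 3 documents respectively.
documents_per_query = [1, 2, 3]
assert len(documents_per_query) == 3      # total_queries
assert sum(documents_per_query) == 6      # total_documents
assert min(documents_per_query) == 1      # min_documents
assert max(documents_per_query) == 3      # max_documents
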
def testEvaluateNoSlicing(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir,
      add_metrics_callbacks=[_addExampleCountMetricCallback])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor()
  ]

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    metrics, _ = (
        pipeline
        | 'Create' >> beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
            example3.SerializeToString(),
            example4.SerializeToString()
        ])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
        | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator
        .ComputeMetricsAndPlots(eval_shared_model=eval_shared_model))

    def check_result(got):
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        self.assertDictElementsAlmostEqual(
            value, {
                'accuracy': 1.0,
                'label/mean': 0.5,
                'my_mean_age': 3.75,
                'my_mean_age_times_label': 1.75,
                'added_example_count': 4.0
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(metrics, check_result)

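# Not part of the original test: the expected values above follow from plain
# averages over the four examples (ages 3, 3, 4, 5 and labels 1, 0, 1, 0),
# assuming my_mean_age averages 'age' and my_mean_age_times_label averages
# age * label.
ages = [3.0, 3.0, 4.0, 5.0]
labels = [1.0, 0.0, 1.0, 0.0]
assert sum(labels) / len(labels) == 0.5    # label/mean
assert sum(ages) / len(ages) == 3.75       # my_mean_age
assert sum(a * l for a, l in zip(ages, labels)) / len(ages) == 1.75
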
def testEvaluateNoSlicingAddPostExportAndCustomMetricsUnsupervisedModel(
    self):
  # Mainly for testing that the ExampleCount post export metric works with
  # unsupervised models.
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fixed_prediction_estimator_no_labels
      .simple_fixed_prediction_estimator_no_labels(None,
                                                   temp_eval_export_dir))
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir,
      add_metrics_callbacks=[
          post_export_metrics.example_count(),
          post_export_metrics.example_weight(example_weight_key='prediction')
      ])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor()
  ]

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(prediction=1.0)
    example2 = self._makeExample(prediction=2.0)

    metrics, plots = (
        pipeline
        | 'Create' >> beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
        ])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
        | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator
        .ComputeMetricsAndPlots(eval_shared_model=eval_shared_model))

    def check_result(got):
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        self.assertDictElementsAlmostEqual(
            got_values_dict=value,
            expected_values_dict={
                'average_loss': 2.5,
                metric_keys.EXAMPLE_COUNT: 2.0,
                metric_keys.EXAMPLE_WEIGHT: 3.0
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(metrics, check_result, label='metrics')
    util.assert_that(plots, util.is_empty(), label='plots')

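# Not part of the original test: with example_weight_key='prediction', the
# expected post-export metrics above reduce to simple sums over the two
# examples (predictions 1.0 and 2.0). The average_loss of 2.5 is consistent
# with a mean-squared-prediction loss for this unlabeled estimator (an
# assumption about the test estimator, not stated in the test itself).
predictions = [1.0, 2.0]
assert len(predictions) == 2                             # EXAMPLE_COUNT
assert sum(predictions) == 3.0                           # EXAMPLE_WEIGHT
assert sum(p * p for p in predictions) / 2 == 2.5        # average_loss
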
def _runTestWithCustomCheck(self,
                            examples,
                            eval_export_dir,
                            metrics_callbacks,
                            slice_spec=None,
                            custom_metrics_check=None,
                            custom_plots_check=None,
                            custom_result_check=None):
  # make sure we are doing some checks
  self.assertTrue(custom_metrics_check is not None or
                  custom_plots_check is not None or
                  custom_result_check is not None)
  serialized_examples = [ex.SerializeToString() for ex in examples]
  slicing_specs = None
  if slice_spec:
    slicing_specs = [s.to_proto() for s in slice_spec]
  eval_config = config.EvalConfig(slicing_specs=slicing_specs)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir,
      add_metrics_callbacks=metrics_callbacks)
  extractors = model_eval_lib.default_extractors(
      eval_config=eval_config, eval_shared_model=eval_shared_model)
  tfx_io = raw_tf_record.RawBeamRecordTFXIO(
      physical_format='inmemory',
      raw_record_column_name=constants.ARROW_INPUT_COLUMN,
      telemetry_descriptors=['TFMATest'])

  with beam.Pipeline() as pipeline:
    (metrics, plots), _ = (
        pipeline
        | 'Create' >> beam.Create(serialized_examples)
        | 'BatchExamples' >> tfx_io.BeamSource()
        | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
        | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
        | 'ComputeMetricsAndPlots' >>
        legacy_metrics_and_plots_evaluator._ComputeMetricsAndPlots(  # pylint: disable=protected-access
            eval_shared_model=eval_shared_model,
            compute_confidence_intervals=self.compute_confidence_intervals,
            random_seed_for_testing=self.deterministic_test_seed))

    if custom_metrics_check is not None:
      util.assert_that(metrics, custom_metrics_check, label='metrics')
    if custom_plots_check is not None:
      util.assert_that(plots, custom_plots_check, label='plot')

  result = pipeline.run()
  if custom_result_check is not None:
    custom_result_check(result)

def _runTestWithCustomCheck(self,
                            examples,
                            eval_export_dir,
                            metrics_callbacks,
                            slice_spec=None,
                            custom_metrics_check=None,
                            custom_plots_check=None,
                            custom_result_check=None):
  # make sure we are doing some checks
  self.assertTrue(custom_metrics_check is not None or
                  custom_plots_check is not None or
                  custom_result_check is not None)
  serialized_examples = [ex.SerializeToString() for ex in examples]
  slicing_specs = None
  if slice_spec:
    slicing_specs = [s.to_proto() for s in slice_spec]
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec()],
      model_specs=[config.ModelSpec(location=eval_export_dir)],
      output_data_specs=[config.OutputDataSpec()],
      slicing_specs=slicing_specs)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir,
      add_metrics_callbacks=metrics_callbacks)
  extractors = model_eval_lib.default_extractors(
      eval_config=eval_config, eval_shared_models=[eval_shared_model])

  with beam.Pipeline() as pipeline:
    (metrics, plots), _ = (
        pipeline
        | 'Create' >> beam.Create(serialized_examples)
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
        | 'ComputeMetricsAndPlots' >>
        metrics_and_plots_evaluator.ComputeMetricsAndPlots(
            eval_shared_model=eval_shared_model,
            compute_confidence_intervals=self.compute_confidence_intervals,
            random_seed_for_testing=self.deterministic_test_seed))

    if custom_metrics_check is not None:
      util.assert_that(metrics, custom_metrics_check, label='metrics')
    if custom_plots_check is not None:
      util.assert_that(plots, custom_plots_check, label='plot')

  result = pipeline.run()
  if custom_result_check is not None:
    custom_result_check(result)

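# Not part of the original file: a sketch of how a test might call the helper
# above. The example features and the choice of export/estimator here are
# purely illustrative (a hypothetical test, not one from the TFMA suite).
def testExampleUsageOfRunTestWithCustomCheck(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fixed_prediction_estimator.simple_fixed_prediction_estimator(
          None, temp_eval_export_dir))

  def check_metrics(got):
    # Expect a single (overall) slice since no slice_spec is passed.
    self.assertEqual(1, len(got))

  self._runTestWithCustomCheck(
      examples=[self._makeExample(prediction=0.5, label=1.0)],
      eval_export_dir=eval_export_dir,
      metrics_callbacks=[post_export_metrics.example_count()],
      custom_metrics_check=check_metrics)
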
def testModelAgnosticConstructFn(self):
  # End to end test for the entire flow going from tf.Examples -> metrics
  # with slicing.
  with beam.Pipeline() as pipeline:
    # Set up the inputs. All we need are tf.Examples and an example parsing
    # spec with explicit mapping for key to (Features, Predictions, Labels).
    examples = [
        self._makeExample(
            age=3.0, language='english', probabilities=1.0, labels=1.0),
        self._makeExample(
            age=3.0, language='chinese', probabilities=3.0, labels=0.0),
        self._makeExample(
            age=4.0, language='english', probabilities=2.0, labels=1.0),
        self._makeExample(
            age=5.0, language='chinese', probabilities=3.0, labels=0.0),
        # Add some examples with no language.
        self._makeExample(age=5.0, probabilities=2.0, labels=10.0),
        self._makeExample(age=6.0, probabilities=1.0, labels=0.0)
    ]
    serialized_examples = [e.SerializeToString() for e in examples]

    # Set up a config to bucket our example keys.
    feature_map = {
        'age': tf.FixedLenFeature([], tf.float32),
        'language': tf.VarLenFeature(tf.string),
        'probabilities': tf.FixedLenFeature([], tf.float32),
        'labels': tf.FixedLenFeature([], tf.float32)
    }
    model_agnostic_config = agnostic_predict.ModelAgnosticConfig(
        label_keys=['labels'],
        prediction_keys=['probabilities'],
        feature_spec=feature_map)

    # Set up the Model Agnostic Extractor.
    extractors = [
        model_agnostic_extractor.ModelAgnosticExtractor(
            model_agnostic_config=model_agnostic_config,
            desired_batch_size=3),
        slice_key_extractor.SliceKeyExtractor([
            slicer.SingleSliceSpec(),
            slicer.SingleSliceSpec(columns=['language'])
        ])
    ]

    # Set up the metrics we wish to calculate via a metric callback. In
    # particular, this metric calculates the mean and sum of all labels.
    eval_shared_model = types.EvalSharedModel(
        add_metrics_callbacks=[add_mean_callback],
        construct_fn=model_agnostic_evaluate_graph.make_construct_fn(
            add_metrics_callbacks=[add_mean_callback],
            fpl_feed_config=model_agnostic_extractor
            .ModelAgnosticGetFPLFeedConfig(model_agnostic_config)))

    # Run our pipeline doing Extract -> Slice -> Fanout -> Calculate Metrics.
    metrics, _ = (
        pipeline
        | 'Create Examples' >> beam.Create(serialized_examples)
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
        | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator
        .ComputeMetricsAndPlots(eval_shared_model=eval_shared_model))

    # Verify our metrics are properly generated per slice.
    def check_result(got):
      self.assertEqual(3, len(got), 'got: %s' % got)
      slices = {}
      for slice_key, metrics in got:
        slices[slice_key] = metrics
      overall_slice = ()
      english_slice = (('language', b'english'),)
      chinese_slice = (('language', b'chinese'),)
      self.assertItemsEqual(
          list(slices.keys()), [overall_slice, english_slice, chinese_slice])
      # Overall slice has label/predictions sum = 24 and 12 elements.
      self.assertDictElementsAlmostEqual(slices[overall_slice], {
          'tf_metric_mean': 2.0,
          'py_func_total_label': 24.0,
      })
      # English slice has label/predictions sum = 5 and 4 elements.
      self.assertDictElementsAlmostEqual(slices[english_slice], {
          'tf_metric_mean': 1.25,
          'py_func_total_label': 5.0,
      })
      # Chinese slice has label/predictions sum = 6 and 4 elements.
      self.assertDictElementsAlmostEqual(slices[chinese_slice], {
          'tf_metric_mean': 1.5,
          'py_func_total_label': 6.0,
      })

    util.assert_that(metrics, check_result)

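# Not part of the original test: the per-slice expectations above come from
# pooling the labels and predictions of each slice, as the in-test comments
# say. For the overall slice (all six examples):
labels = [1.0, 0.0, 1.0, 0.0, 10.0, 0.0]
predictions = [1.0, 3.0, 2.0, 3.0, 2.0, 1.0]
assert sum(labels) + sum(predictions) == 24.0         # py_func_total_label
assert (sum(labels) + sum(predictions)) / 12 == 2.0   # tf_metric_mean
# The 'english' (sum 5 over 4 values) and 'chinese' (sum 6 over 4 values)
# slices follow the same arithmetic.
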
def testEvaluateWithPlots(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fixed_prediction_estimator.simple_fixed_prediction_estimator(
          None, temp_eval_export_dir))
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir,
      add_metrics_callbacks=[
          post_export_metrics.example_count(),
          post_export_metrics.auc_plots()
      ])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor()
  ]

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(prediction=0.0, label=1.0)
    example2 = self._makeExample(prediction=0.7, label=0.0)
    example3 = self._makeExample(prediction=0.8, label=1.0)
    example4 = self._makeExample(prediction=1.0, label=1.0)

    metrics, plots = (
        pipeline
        | 'Create' >> beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
            example3.SerializeToString(),
            example4.SerializeToString()
        ])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
        | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator
        .ComputeMetricsAndPlots(eval_shared_model=eval_shared_model))

    def check_metrics(got):
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        self.assertDictElementsAlmostEqual(
            got_values_dict=value,
            expected_values_dict={
                metric_keys.EXAMPLE_COUNT: 4.0,
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(metrics, check_metrics, label='metrics')

    def check_plots(got):
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        self.assertDictMatrixRowsAlmostEqual(
            got_values_dict=value,
            expected_values_dict={
                _full_key(metric_keys.AUC_PLOTS_MATRICES):
                    [(8001, [2, 1, 0, 1, 1.0 / 1.0, 1.0 / 3.0])],
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(plots, check_plots, label='plots')

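# Not part of the original test: the expected plot matrix row above is
# consistent with the confusion counts at a decision threshold of roughly 0.8,
# reading the row as [fn, tn, fp, tp, precision, recall] (an assumption about
# the row layout). With predictions [0.0, 0.7, 0.8, 1.0] and labels
# [1, 0, 1, 1], only the prediction of 1.0 clears that threshold:
tp, fp, tn, fn = 1.0, 0.0, 1.0, 2.0
assert [fn, tn, fp, tp] == [2, 1, 0, 1]
assert tp / (tp + fp) == 1.0                       # precision
assert abs(tp / (tp + fn) - 1.0 / 3.0) < 1e-9      # recall
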
def testEvaluateNoSlicingAddPostExportAndCustomMetrics(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir,
      add_metrics_callbacks=[
          _addExampleCountMetricCallback,
          # Note that since everything runs in-process this doesn't
          # actually test that the py_func can be correctly recreated
          # on workers in a distributed context.
          _addPyFuncMetricCallback,
          post_export_metrics.example_count(),
          post_export_metrics.example_weight(example_weight_key='age')
      ])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor()
  ]

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    metrics, plots = (
        pipeline
        | 'Create' >> beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
            example3.SerializeToString(),
            example4.SerializeToString()
        ])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
        | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator
        .ComputeMetricsAndPlots(eval_shared_model=eval_shared_model))

    def check_result(got):
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        self.assertDictElementsAlmostEqual(
            got_values_dict=value,
            expected_values_dict={
                'accuracy': 1.0,
                'label/mean': 0.5,
                'my_mean_age': 3.75,
                'my_mean_age_times_label': 1.75,
                'added_example_count': 4.0,
                'py_func_label_sum': 2.0,
                metric_keys.EXAMPLE_COUNT: 4.0,
                metric_keys.EXAMPLE_WEIGHT: 15.0
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(metrics, check_result, label='metrics')
    util.assert_that(plots, util.is_empty(), label='plots')

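# Not part of the original test: with example_weight_key='age', the expected
# post-export metrics above are simple counts and sums over the four examples.
ages = [3.0, 3.0, 4.0, 5.0]
labels = [1.0, 0.0, 1.0, 0.0]
assert len(ages) == 4          # EXAMPLE_COUNT / added_example_count
assert sum(ages) == 15.0       # EXAMPLE_WEIGHT
assert sum(labels) == 2.0      # py_func_label_sum
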
def testEvaluateWithSlicingAndDifferentBatchSizes(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir,
      add_metrics_callbacks=[_addExampleCountMetricCallback])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor([
          slicer.SingleSliceSpec(),
          slicer.SingleSliceSpec(columns=['slice_key'])
      ])
  ]

  for batch_size in [1, 2, 4, 8]:
    with beam.Pipeline() as pipeline:
      example1 = self._makeExample(
          age=3.0, language='english', label=1.0, slice_key='first_slice')
      example2 = self._makeExample(
          age=3.0, language='chinese', label=0.0, slice_key='first_slice')
      example3 = self._makeExample(
          age=4.0, language='english', label=0.0, slice_key='second_slice')
      example4 = self._makeExample(
          age=5.0, language='chinese', label=1.0, slice_key='second_slice')
      example5 = self._makeExample(
          age=5.0, language='chinese', label=1.0, slice_key='second_slice')

      metrics, plots = (
          pipeline
          | 'Create' >> beam.Create([
              example1.SerializeToString(),
              example2.SerializeToString(),
              example3.SerializeToString(),
              example4.SerializeToString(),
              example5.SerializeToString(),
          ])
          | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
          | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
          | 'ComputeMetricsAndPlots' >>
          metrics_and_plots_evaluator.ComputeMetricsAndPlots(
              eval_shared_model=eval_shared_model,
              desired_batch_size=batch_size))

      def check_result(got):
        try:
          self.assertEqual(3, len(got), 'got: %s' % got)
          slices = {}
          for slice_key, value in got:
            slices[slice_key] = value
          overall_slice = ()
          first_slice = (('slice_key', b'first_slice'),)
          second_slice = (('slice_key', b'second_slice'),)
          self.assertItemsEqual(
              list(slices.keys()),
              [overall_slice, first_slice, second_slice])
          self.assertDictElementsAlmostEqual(
              slices[overall_slice], {
                  'accuracy': 0.4,
                  'label/mean': 0.6,
                  'my_mean_age': 4.0,
                  'my_mean_age_times_label': 2.6,
                  'added_example_count': 5.0
              })
          self.assertDictElementsAlmostEqual(
              slices[first_slice], {
                  'accuracy': 1.0,
                  'label/mean': 0.5,
                  'my_mean_age': 3.0,
                  'my_mean_age_times_label': 1.5,
                  'added_example_count': 2.0
              })
          self.assertDictElementsAlmostEqual(
              slices[second_slice], {
                  'accuracy': 0.0,
                  'label/mean': 2.0 / 3.0,
                  'my_mean_age': 14.0 / 3.0,
                  'my_mean_age_times_label': 10.0 / 3.0,
                  'added_example_count': 3.0
              })
        except AssertionError as err:
          # This function is redefined every iteration, so it will have the
          # right value of batch_size.
          raise util.BeamAssertException('batch_size = %d, error: %s' %
                                         (batch_size, err))  # pylint: disable=cell-var-from-loop

      util.assert_that(metrics, check_result, label='metrics')
      util.assert_that(plots, util.is_empty(), label='plots')

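# Not part of the original test: the per-slice expectations above follow from
# plain averages within each slice, independent of batch_size. For example,
# the 'second_slice' examples have ages 4, 5, 5 and labels 0, 1, 1:
ages = [4.0, 5.0, 5.0]
labels = [0.0, 1.0, 1.0]
assert abs(sum(labels) / 3 - 2.0 / 3.0) < 1e-9    # label/mean
assert abs(sum(ages) / 3 - 14.0 / 3.0) < 1e-9     # my_mean_age
assert abs(
    sum(a * l for a, l in zip(ages, labels)) / 3 - 10.0 / 3.0) < 1e-9
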