def testMultiModelPredict(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, model1_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  model1 = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model1_dir)
  _, model2_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  model2 = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model2_dir)
  eval_shared_model = {'model1': model1, 'model2': model2}
  eval_config = config.EvalConfig(model_specs=[
      config.ModelSpec(name='model1', example_weight_key='age'),
      config.ModelSpec(name='model2', example_weight_key='age')
  ])
  tfx_io = raw_tf_record.RawBeamRecordTFXIO(
      physical_format='inmemory',
      raw_record_column_name=constants.ARROW_INPUT_COLUMN,
      telemetry_descriptors=['TFMATest'])
  extractor = predict_extractor.PredictExtractor(
      eval_shared_model, eval_config=eval_config)

  with beam.Pipeline() as pipeline:
    examples = [
        self._makeExample(age=3.0, language='english', label=1.0),
        self._makeExample(age=3.0, language='chinese', label=0.0),
        self._makeExample(age=4.0, language='english', label=1.0),
        self._makeExample(age=5.0, language='chinese', label=0.0),
    ]
    serialized_examples = [e.SerializeToString() for e in examples]

    predict_extracts = (
        pipeline
        | beam.Create(serialized_examples, reshuffle=False)
        | 'BatchExamples' >> tfx_io.BeamSource(batch_size=2)
        | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
        | 'Predict' >> extractor.ptransform)

    def check_result(got):
      try:
        self.assertLen(got, 2)
        for item in got:
          self.assertIn(constants.FEATURES_KEY, item)
          for feature in ('language', 'age'):
            for features_dict in item[constants.FEATURES_KEY]:
              self.assertIn(feature, features_dict)
          self.assertIn(constants.LABELS_KEY, item)
          self.assertIn(constants.PREDICTIONS_KEY, item)
          for model in ('model1', 'model2'):
            for predictions_dict in item[constants.PREDICTIONS_KEY]:
              self.assertIn(model, predictions_dict)
          self.assertIn(constants.EXAMPLE_WEIGHTS_KEY, item)
          for i in range(len(item[constants.FEATURES_KEY])):
            self.assertAlmostEqual(item[constants.FEATURES_KEY][i]['age'],
                                   item[constants.EXAMPLE_WEIGHTS_KEY][i])
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(predict_extracts, check_result)
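# The tests in this file rely on a _makeExample helper from the shared test
# base class, which is not shown here. Below is a minimal sketch of what such
# a helper could look like; the name _make_example_sketch and the type
# handling are assumptions for illustration only (floats, ints, and strings
# each map to the corresponding tf.train.Feature list type).
import tensorflow as tf


def _make_example_sketch(**kwargs):
  """Builds a tf.train.Example from keyword feature values (illustrative)."""
  features = {}
  for name, value in kwargs.items():
    if isinstance(value, float):
      features[name] = tf.train.Feature(
          float_list=tf.train.FloatList(value=[value]))
    elif isinstance(value, int):
      features[name] = tf.train.Feature(
          int64_list=tf.train.Int64List(value=[value]))
    else:
      features[name] = tf.train.Feature(
          bytes_list=tf.train.BytesList(value=[tf.compat.as_bytes(value)]))
  return tf.train.Example(features=tf.train.Features(feature=features))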
def testMultiModelPredict(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, model1_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  model1 = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model1_dir)
  _, model2_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  model2 = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model2_dir)
  eval_config = config.EvalConfig(model_specs=[
      config.ModelSpec(name='model1', example_weight_key='age'),
      config.ModelSpec(name='model2', example_weight_key='age')
  ])

  with beam.Pipeline() as pipeline:
    examples = [
        self._makeExample(age=3.0, language='english', label=1.0),
        self._makeExample(age=3.0, language='chinese', label=0.0),
        self._makeExample(age=4.0, language='english', label=1.0),
        self._makeExample(age=5.0, language='chinese', label=0.0),
    ]
    serialized_examples = [e.SerializeToString() for e in examples]

    predict_extracts = (
        pipeline
        | beam.Create(serialized_examples, reshuffle=False)
        # Our diagnostic outputs pass types.Extracts throughout; however, our
        # aggregating functions do not use this interface.
        | beam.Map(lambda x: {constants.INPUT_KEY: x})
        | 'Predict' >> predict_extractor._TFMAPredict(
            eval_shared_models={
                'model1': model1,
                'model2': model2
            },
            desired_batch_size=3,
            eval_config=eval_config))

    def check_result(got):
      try:
        self.assertLen(got, 4)
        for item in got:
          self.assertIn(constants.FEATURES_KEY, item)
          for feature in ('language', 'age'):
            self.assertIn(feature, item[constants.FEATURES_KEY])
          self.assertIn(constants.LABELS_KEY, item)
          self.assertIn(constants.PREDICTIONS_KEY, item)
          for model in ('model1', 'model2'):
            self.assertIn(model, item[constants.PREDICTIONS_KEY])
          self.assertIn(constants.EXAMPLE_WEIGHTS_KEY, item)
          self.assertAlmostEqual(item[constants.FEATURES_KEY]['age'],
                                 item[constants.EXAMPLE_WEIGHTS_KEY])
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(predict_extracts, check_result)
def testEvaluateExistingMetricsWithExportedCustomMetrics(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)

  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  example1 = self._makeExample(age=3.0, language='english', label=1.0)
  features_predictions_labels = self.predict_injective_single_example(
      eval_saved_model, example1.SerializeToString())
  eval_saved_model.perform_metrics_update(features_predictions_labels)

  example2 = self._makeExample(age=2.0, language='chinese', label=0.0)
  features_predictions_labels = self.predict_injective_single_example(
      eval_saved_model, example2.SerializeToString())
  eval_saved_model.perform_metrics_update(features_predictions_labels)

  metric_values = eval_saved_model.get_metric_values()

  self.assertDictElementsAlmostEqual(
      metric_values, {
          'accuracy': 1.0,
          'auc': 1.0,
          'my_mean_age': 2.5,
          'my_mean_label': 0.5,
          'my_mean_age_times_label': 1.5
      })

  self.assertIn('my_mean_prediction', metric_values)
  self.assertIn('prediction/mean', metric_values)
  self.assertAlmostEqual(
      metric_values['prediction/mean'],
      metric_values['my_mean_prediction'],
      places=5)
def testNativeEvalSavedModelMetricComputations(self):
  temp_export_dir = self._getExportDir()
  _, export_dir = linear_classifier.simple_linear_classifier(
      None, temp_export_dir)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir)

  computation = (
      eval_saved_model_util.metric_computations_using_eval_saved_model(
          '', eval_shared_model.model_loader)[0])

  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=3.0, language='chinese', label=0.0),
      self._makeExample(age=4.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=0.0)
  ]
  extracts = []
  for e in examples:
    extracts.append({constants.INPUT_KEY: e.SerializeToString()})

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create(extracts)
        | 'Process' >> beam.ParDo(computation.preprocessor)
        | 'ToStandardMetricInputs' >> beam.Map(
            metric_types.StandardMetricInputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeMetric' >> beam.CombinePerKey(computation.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        self.assertDictElementsAlmostEqual(
            got_metrics, {
                metric_types.MetricKey(name='accuracy'): 1.0,
                metric_types.MetricKey(name='label/mean'): 0.5,
                metric_types.MetricKey(name='my_mean_age'): 3.75,
                metric_types.MetricKey(name='my_mean_age_times_label'): 1.75
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def testEvaluateNoSlicingAddPostExportAndCustomMetrics(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    metrics, plots = (
        pipeline
        | beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
            example3.SerializeToString(),
            example4.SerializeToString()
        ])
        | evaluate.Evaluate(
            eval_saved_model_path=eval_export_dir,
            add_metrics_callbacks=[
                _addExampleCountMetricCallback,
                post_export_metrics.example_count(),
                post_export_metrics.example_weight(example_weight_key='age')
            ]))

    def check_result(got):
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        self.assertDictElementsAlmostEqual(
            got_values_dict=value,
            expected_values_dict={
                'accuracy': 1.0,
                'label/mean': 0.5,
                'my_mean_age': 3.75,
                'my_mean_age_times_label': 1.75,
                'added_example_count': 4.0,
                metric_keys.EXAMPLE_COUNT: 4.0,
                metric_keys.EXAMPLE_WEIGHT: 15.0
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(metrics, check_result, label='metrics')
    util.assert_that(plots, util.is_empty(), label='plots')
def testEvaluateNoSlicing(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir,
      add_metrics_callbacks=[_addExampleCountMetricCallback])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor()
  ]

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    metrics, _ = (
        pipeline
        | 'Create' >> beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
            example3.SerializeToString(),
            example4.SerializeToString()
        ])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
        | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator
        .ComputeMetricsAndPlots(eval_shared_model=eval_shared_model))

    def check_result(got):
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        self.assertDictElementsAlmostEqual(
            value, {
                'accuracy': 1.0,
                'label/mean': 0.5,
                'my_mean_age': 3.75,
                'my_mean_age_times_label': 1.75,
                'added_example_count': 4.0
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(metrics, check_result)
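# Several tests above and below register an _addExampleCountMetricCallback
# that is not defined in this file. A minimal sketch of such a callback is
# given below, assuming the usual TF1-style (value_op, update_op) metric
# protocol; the use of the 'probabilities' prediction key and the
# variable-based counter are assumptions for illustration, not the test
# suite's actual implementation.
import tensorflow as tf


def _add_example_count_metric_callback_sketch(features_dict, predictions_dict,
                                              labels_dict):
  """Counts examples seen and reports them as 'added_example_count'."""
  del features_dict, labels_dict  # Unused by this metric.
  count = tf.compat.v1.Variable(
      0.0,
      trainable=False,
      name='added_example_count',
      collections=[
          tf.compat.v1.GraphKeys.LOCAL_VARIABLES,
          tf.compat.v1.GraphKeys.METRIC_VARIABLES,
      ])
  # Add the batch size (taken from the predictions) on every update.
  batch_size = tf.cast(
      tf.shape(predictions_dict['probabilities'])[0], tf.float32)
  update_op = tf.compat.v1.assign_add(count, batch_size)
  return {'added_example_count': (count, update_op)}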
def testPredict(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    predict_extracts = (
        pipeline
        | beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
            example3.SerializeToString(),
            example4.SerializeToString()
        ])
        # Our diagnostic outputs pass types.ExampleAndExtracts throughout;
        # however, our aggregating functions do not use this interface.
        | beam.Map(lambda x: types.ExampleAndExtracts(example=x, extracts={}))
        | 'Predict' >> predict_extractor.TFMAPredict(
            eval_saved_model_path=eval_export_dir,
            add_metrics_callbacks=None,
            shared_handle=shared.Shared(),
            desired_batch_size=3))

    def check_result(got):
      try:
        self.assertEqual(4, len(got), 'got: %s' % got)
        for item in got:
          extracts_dict = item.extracts
          self.assertIn('fpl', extracts_dict)
          fpl = extracts_dict['fpl']
          # Verify fpl contains features, probabilities, and correct labels.
          self.assertIn('language', fpl.features)
          self.assertIn('age', fpl.features)
          self.assertIn('label', fpl.features)
          self.assertIn('probabilities', fpl.predictions)
          self.assertAlmostEqual(fpl.features['label'], fpl.labels['__labels'])
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(predict_extracts, check_result)
def testAggregateOverallSlice(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)

  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir)

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    predict_result = eval_saved_model.as_features_predictions_labels(
        eval_saved_model.predict_list([
            example1.SerializeToString(),
            example2.SerializeToString(),
            example3.SerializeToString(),
            example4.SerializeToString()
        ]))

    metrics, _ = (
        pipeline
        | 'CreateTestInput' >> beam.Create(
            create_test_input(predict_result, [()]))
        | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
            eval_shared_model=eval_shared_model, desired_batch_size=3))

    def check_result(got):
      self.assertEqual(1, len(got), 'got: %s' % got)
      slice_key, metrics = got[0]
      self.assertEqual(slice_key, ())
      self.assertDictElementsAlmostEqual(
          metrics, {
              'accuracy': 1.0,
              'label/mean': 0.5,
              'my_mean_age': 3.75,
              'my_mean_age_times_label': 1.75,
          })

    util.assert_that(metrics, check_result)
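# The aggregation tests call a create_test_input helper that is not defined in
# this file. A plausible sketch is shown below: it pairs each prediction
# result with each slice key so that aggregate.ComputePerSliceMetrics receives
# (slice_key, value) tuples. The name and exact behavior are assumptions for
# illustration only.
def create_test_input_sketch(predict_list, slice_keys):
  """Pairs every prediction result with every slice key (illustrative)."""
  test_input = []
  for prediction in predict_list:
    for slice_key in slice_keys:
      test_input.append((slice_key, prediction))
  return test_input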
def testPredict(self, features_blacklist):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=eval_export_dir,
      blacklist_feature_fetches=features_blacklist)

  with beam.Pipeline() as pipeline:
    examples = [
        self._makeExample(age=3.0, language='english', label=1.0),
        self._makeExample(age=3.0, language='chinese', label=0.0),
        self._makeExample(age=4.0, language='english', label=1.0),
        self._makeExample(age=5.0, language='chinese', label=0.0),
    ]
    serialized_examples = [e.SerializeToString() for e in examples]

    predict_extracts = (
        pipeline
        | beam.Create(serialized_examples, reshuffle=False)
        # Our diagnostic outputs pass types.Extracts throughout; however, our
        # aggregating functions do not use this interface.
        | beam.Map(lambda x: {constants.INPUT_KEY: x})
        | 'Predict' >> predict_extractor._TFMAPredict(
            eval_shared_models={'': eval_shared_model},
            desired_batch_size=3))

    def check_result(got):
      try:
        self.assertLen(got, 4)
        for item in got:
          self.assertIn(constants.FEATURES_PREDICTIONS_LABELS_KEY, item)
          fpl = item[constants.FEATURES_PREDICTIONS_LABELS_KEY]
          # Verify fpl contains features, probabilities, and correct labels.
          blacklisted_features = set(features_blacklist or [])
          expected_features = (
              set(['language', 'age', 'label']) - blacklisted_features)
          for feature in expected_features:
            self.assertIn(feature, fpl.features)
          for feature in blacklisted_features:
            self.assertNotIn(feature, fpl.features)
          self.assertAlmostEqual(fpl.features['label'], fpl.labels['__labels'])
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(predict_extracts, check_result)
def testEvaluateNoSlicing(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    metrics, _ = (
        pipeline
        | beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
            example3.SerializeToString(),
            example4.SerializeToString()
        ])
        | evaluate.Evaluate(
            eval_saved_model_path=eval_export_dir,
            add_metrics_callbacks=[_addExampleCountMetricCallback]))

    def check_result(got):
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        self.assertDictElementsAlmostEqual(
            value, {
                'accuracy': 1.0,
                'label/mean': 0.5,
                'my_mean_age': 3.75,
                'my_mean_age_times_label': 1.75,
                'added_example_count': 4.0
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(metrics, check_result)
def testPostExportMetricsLinearClassifier(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=3.0, language='chinese', label=0.0),
      self._makeExample(age=4.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=0.0)
  ]
  metrics_to_check = [
      (metric_keys.EXAMPLE_COUNT, post_export_metrics.example_count(), 4.0),
      (metric_keys.EXAMPLE_WEIGHT, post_export_metrics.example_weight('age'),
       15.0),
  ]
  self._runTest(examples, eval_export_dir, metrics_to_check)
def testPostExportMetricsLinearClassifier(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=3.0, language='chinese', label=0.0),
      self._makeExample(age=4.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=0.0)
  ]
  expected_values_dict = {
      metric_keys.EXAMPLE_COUNT: 4.0,
      metric_keys.EXAMPLE_WEIGHT: 15.0,
  }
  self._runTest(examples, eval_export_dir, [
      post_export_metrics.example_count(),
      post_export_metrics.example_weight('age')
  ], expected_values_dict)
def testPredict(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_shared_model = types.EvalSharedModel(model_path=eval_export_dir)

  with beam.Pipeline() as pipeline:
    examples = [
        self._makeExample(age=3.0, language='english', label=1.0),
        self._makeExample(age=3.0, language='chinese', label=0.0),
        self._makeExample(age=4.0, language='english', label=1.0),
        self._makeExample(age=5.0, language='chinese', label=0.0),
    ]
    serialized_examples = [e.SerializeToString() for e in examples]

    predict_extracts = (
        pipeline
        | beam.Create(serialized_examples)
        # Our diagnostic outputs pass types.Extracts throughout; however, our
        # aggregating functions do not use this interface.
        | beam.Map(lambda x: {constants.INPUT_KEY: x})
        | 'Predict' >> predict_extractor._TFMAPredict(
            eval_shared_model=eval_shared_model, desired_batch_size=3))

    def check_result(got):
      try:
        self.assertEqual(4, len(got), 'got: %s' % got)
        for item in got:
          self.assertIn(constants.FEATURES_PREDICTIONS_LABELS_KEY, item)
          fpl = item[constants.FEATURES_PREDICTIONS_LABELS_KEY]
          # Verify fpl contains features, probabilities, and correct labels.
          self.assertIn('language', fpl.features)
          self.assertIn('age', fpl.features)
          self.assertIn('label', fpl.features)
          self.assertIn('probabilities', fpl.predictions)
          self.assertAlmostEqual(fpl.features['label'], fpl.labels['__labels'])
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(predict_extracts, check_result)
def testCalibrationPlotAndPredictionHistogramLinearClassifier(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir))
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=3.0, language='chinese', label=0.0),
      self._makeExample(age=4.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=0.0)
  ]

  def check_result(got):  # pylint: disable=invalid-name
    try:
      self.assertEqual(1, len(got), 'got: %s' % got)
      (slice_key, value) = got[0]
      self.assertEqual((), slice_key)
      self.assertIn(metric_keys.CALIBRATION_PLOT_MATRICES, value)
      # We just check that the bucket sums look sane, since we don't know
      # the exact predictions of the model.
      #
      # Note that the correctness of the bucketing is tested in the other
      # two tests with the fixed prediction estimator. This test is more
      # for ensuring that this metric is compatible with the canned
      # Estimators, for which the prediction Tensor returned for a batch
      # of examples will be an N x 1 Tensor, rather than just an N element
      # vector.
      buckets = value[metric_keys.CALIBRATION_PLOT_MATRICES]
      bucket_sums = np.sum(buckets, axis=0)
      self.assertAlmostEqual(bucket_sums[1], 2.0)  # label sum
      self.assertAlmostEqual(bucket_sums[2], 4.0)  # weight sum
    except AssertionError as err:
      raise util.BeamAssertException(err)

  self._runTestWithCustomCheck(
      examples,
      eval_export_dir,
      [post_export_metrics.calibration_plot_and_prediction_histogram()],
      custom_plots_check=check_result)
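# The post-export-metric tests delegate to _runTest / _runTestWithCustomCheck
# helpers from the test base class that are not shown here. The sketch below
# illustrates one plausible shape for such a helper, reusing only transforms
# that already appear in this file (beam.Create, evaluate.Evaluate,
# util.assert_that); the name, signature, and wiring are assumptions, not the
# suite's actual implementation.
def _run_test_with_custom_check_sketch(examples, eval_export_dir,
                                       metrics_callbacks,
                                       custom_metrics_check=None,
                                       custom_plots_check=None):
  """Runs an in-process evaluation and applies the supplied check functions."""
  with beam.Pipeline() as pipeline:
    metrics, plots = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'Evaluate' >> evaluate.Evaluate(
            eval_saved_model_path=eval_export_dir,
            add_metrics_callbacks=metrics_callbacks))
    if custom_metrics_check is not None:
      util.assert_that(metrics, custom_metrics_check, label='metrics')
    if custom_plots_check is not None:
      util.assert_that(plots, custom_plots_check, label='plots')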
def testEvaluateNoSlicingAddPostExportAndCustomMetrics(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir,
      add_metrics_callbacks=[
          _addExampleCountMetricCallback,
          # Note that since everything runs in-process this doesn't
          # actually test that the py_func can be correctly recreated
          # on workers in a distributed context.
          _addPyFuncMetricCallback,
          post_export_metrics.example_count(),
          post_export_metrics.example_weight(example_weight_key='age')
      ])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor()
  ]

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    metrics, plots = (
        pipeline
        | 'Create' >> beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
            example3.SerializeToString(),
            example4.SerializeToString()
        ])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
        | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator
        .ComputeMetricsAndPlots(eval_shared_model=eval_shared_model))

    def check_result(got):
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        (slice_key, value) = got[0]
        self.assertEqual((), slice_key)
        self.assertDictElementsAlmostEqual(
            got_values_dict=value,
            expected_values_dict={
                'accuracy': 1.0,
                'label/mean': 0.5,
                'my_mean_age': 3.75,
                'my_mean_age_times_label': 1.75,
                'added_example_count': 4.0,
                'py_func_label_sum': 2.0,
                metric_keys.EXAMPLE_COUNT: 4.0,
                metric_keys.EXAMPLE_WEIGHT: 15.0
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(metrics, check_result, label='metrics')
    util.assert_that(plots, util.is_empty(), label='plots')
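# The test above also registers an _addPyFuncMetricCallback that is not
# defined in this file; its expected output is 'py_func_label_sum': 2.0. A
# minimal sketch of such a callback is shown below, assuming labels_dict is a
# dense float tensor of labels; the variable name and exact wiring are
# assumptions for illustration only.
import numpy as np
import tensorflow as tf


def _add_py_func_metric_callback_sketch(features_dict, predictions_dict,
                                        labels_dict):
  """Sums the labels via a py_func and reports 'py_func_label_sum'."""
  del features_dict, predictions_dict  # Unused by this metric.
  total = tf.compat.v1.Variable(
      0.0,
      dtype=tf.float64,
      trainable=False,
      name='py_func_label_sum',
      collections=[
          tf.compat.v1.GraphKeys.LOCAL_VARIABLES,
          tf.compat.v1.GraphKeys.METRIC_VARIABLES,
      ])

  def _sum(x):
    return np.sum(x, dtype=np.float64)

  # Accumulate the per-batch label sum computed in Python via py_func.
  update_op = tf.compat.v1.assign_add(
      total, tf.compat.v1.py_func(_sum, [labels_dict], tf.float64))
  return {'py_func_label_sum': (tf.identity(total), update_op)}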
def testEvaluateWithSlicingAndDifferentBatchSizes(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir,
      add_metrics_callbacks=[_addExampleCountMetricCallback])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor([
          slicer.SingleSliceSpec(),
          slicer.SingleSliceSpec(columns=['slice_key'])
      ])
  ]

  for batch_size in [1, 2, 4, 8]:

    with beam.Pipeline() as pipeline:
      example1 = self._makeExample(
          age=3.0, language='english', label=1.0, slice_key='first_slice')
      example2 = self._makeExample(
          age=3.0, language='chinese', label=0.0, slice_key='first_slice')
      example3 = self._makeExample(
          age=4.0, language='english', label=0.0, slice_key='second_slice')
      example4 = self._makeExample(
          age=5.0, language='chinese', label=1.0, slice_key='second_slice')
      example5 = self._makeExample(
          age=5.0, language='chinese', label=1.0, slice_key='second_slice')

      metrics, plots = (
          pipeline
          | 'Create' >> beam.Create([
              example1.SerializeToString(),
              example2.SerializeToString(),
              example3.SerializeToString(),
              example4.SerializeToString(),
              example5.SerializeToString(),
          ])
          | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
          | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
          | 'ComputeMetricsAndPlots' >>
          metrics_and_plots_evaluator.ComputeMetricsAndPlots(
              eval_shared_model=eval_shared_model,
              desired_batch_size=batch_size))

      def check_result(got):
        try:
          self.assertEqual(3, len(got), 'got: %s' % got)
          slices = {}
          for slice_key, value in got:
            slices[slice_key] = value
          overall_slice = ()
          first_slice = (('slice_key', b'first_slice'),)
          second_slice = (('slice_key', b'second_slice'),)
          self.assertItemsEqual(
              list(slices.keys()), [overall_slice, first_slice, second_slice])
          self.assertDictElementsAlmostEqual(
              slices[overall_slice], {
                  'accuracy': 0.4,
                  'label/mean': 0.6,
                  'my_mean_age': 4.0,
                  'my_mean_age_times_label': 2.6,
                  'added_example_count': 5.0
              })
          self.assertDictElementsAlmostEqual(
              slices[first_slice], {
                  'accuracy': 1.0,
                  'label/mean': 0.5,
                  'my_mean_age': 3.0,
                  'my_mean_age_times_label': 1.5,
                  'added_example_count': 2.0
              })
          self.assertDictElementsAlmostEqual(
              slices[second_slice], {
                  'accuracy': 0.0,
                  'label/mean': 2.0 / 3.0,
                  'my_mean_age': 14.0 / 3.0,
                  'my_mean_age_times_label': 10.0 / 3.0,
                  'added_example_count': 3.0
              })
        except AssertionError as err:
          # This function is redefined every iteration, so it will have the
          # right value of batch_size.
          raise util.BeamAssertException('batch_size = %d, error: %s' %
                                         (batch_size, err))  # pylint: disable=cell-var-from-loop

      util.assert_that(metrics, check_result, label='metrics')
      util.assert_that(plots, util.is_empty(), label='plots')
def testAggregateMultipleSlices(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir)

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    predict_result_english_slice = ([
        example1.SerializeToString(),
        example3.SerializeToString()
    ])
    predict_result_chinese_slice = ([
        example2.SerializeToString(),
        example4.SerializeToString()
    ])

    test_input = (
        create_test_input(predict_result_english_slice, [(
            ('language', 'english'))]) +
        create_test_input(predict_result_chinese_slice, [(
            ('language', 'chinese'))]) +
        # Overall slice
        create_test_input(
            predict_result_english_slice + predict_result_chinese_slice,
            [()]))

    metrics = (
        pipeline
        | 'CreateTestInput' >> beam.Create(test_input)
        | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
            eval_shared_model=eval_shared_model, desired_batch_size=3))

    def check_result(got):
      self.assertEqual(3, len(got), 'got: %s' % got)
      slices = {}
      for slice_key, metrics in got:
        slices[slice_key] = metrics
      overall_slice = ()
      english_slice = (('language', 'english'))
      chinese_slice = (('language', 'chinese'))
      self.assertCountEqual(
          list(slices.keys()), [overall_slice, english_slice, chinese_slice])
      self.assertDictElementsAlmostEqual(
          slices[overall_slice], {
              'accuracy': 1.0,
              'label/mean': 0.5,
              'my_mean_age': 3.75,
              'my_mean_age_times_label': 1.75,
          })
      self.assertDictElementsAlmostEqual(
          slices[english_slice], {
              'accuracy': 1.0,
              'label/mean': 1.0,
              'my_mean_age': 3.5,
              'my_mean_age_times_label': 3.5,
          })
      self.assertDictElementsAlmostEqual(
          slices[chinese_slice], {
              'accuracy': 1.0,
              'label/mean': 0.0,
              'my_mean_age': 4.0,
              'my_mean_age_times_label': 0.0,
          })

    util.assert_that(metrics, check_result)
def testAggregateMultipleSlicesWithSampling(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir)

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    predict_result_english_slice = ([
        example1.SerializeToString(),
        example3.SerializeToString()
    ])
    predict_result_chinese_slice = ([
        example2.SerializeToString(),
        example4.SerializeToString()
    ])

    test_input = (
        create_test_input(predict_result_english_slice, [(
            ('language', 'english'))]) +
        create_test_input(predict_result_chinese_slice, [(
            ('language', 'chinese'))]) +
        # Overall slice
        create_test_input(
            predict_result_english_slice + predict_result_chinese_slice,
            [()]))

    metrics = (
        pipeline
        | 'CreateTestInput' >> beam.Create(test_input)
        | 'ComputePerSliceMetrics' >>
        poisson_bootstrap.ComputeWithConfidenceIntervals(
            aggregate.ComputePerSliceMetrics,
            num_bootstrap_samples=10,
            eval_shared_model=eval_shared_model,
            desired_batch_size=3))

    def assert_almost_equal_to_value_with_t_distribution(
        target,
        unsampled_value,
        sample_mean,
        sample_standard_deviation,
        sample_degrees_of_freedom,
        delta=2):
      self.assertEqual(target.unsampled_value, unsampled_value)
      self.assertAlmostEqual(target.sample_mean, sample_mean, delta=delta)
      self.assertAlmostEqual(
          target.sample_standard_deviation,
          sample_standard_deviation,
          delta=delta)
      # The Poisson resampling could return [0, 0, ...], which would reduce
      # the number of samples.
      self.assertLessEqual(target.sample_degrees_of_freedom,
                           sample_degrees_of_freedom)

    def check_overall_slice(slices):
      my_dict = slices[()]
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['my_mean_age'], 3.75, 3.64, 0.34, 19)
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['accuracy'], 1.0, 1.0, 0, 19)
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['label/mean'], 0.5, 0.59, 0.29, 19)
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['my_mean_age_times_label'], 1.75, 2.15, 1.06, 19)

    def check_english_slice(slices):
      my_dict = slices[(('language', 'english'))]
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['my_mean_age'], 3.5, 3.18, 0.28, 19)
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['accuracy'], 1.0, 1.0, 0, 19)
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['label/mean'], 1.0, 1.0, 0, 19)
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['my_mean_age_times_label'], 3.5, 3.18, 0.28, 19)

    def check_chinese_slice(slices):
      my_dict = slices[(('language', 'chinese'))]
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['my_mean_age'], 4.0, 4.12, 0.83, 19)
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['accuracy'], 1.0, 1.0, 0, 19)
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['label/mean'], 0, 0, 0, 19)
      assert_almost_equal_to_value_with_t_distribution(
          my_dict['my_mean_age_times_label'], 0, 0, 0, 19)

    def check_result(got):
      self.assertEqual(3, len(got), 'got: %s' % got)
      slices = {}
      for slice_key, metrics in got:
        slices[slice_key] = metrics
      check_overall_slice(slices)
      check_english_slice(slices)
      check_chinese_slice(slices)

    util.assert_that(metrics, check_result)
def testEvaluateWithEvalSavedModel(self):
  temp_export_dir = self._getExportDir()
  _, export_dir = linear_classifier.simple_linear_classifier(
      None, temp_export_dir)
  eval_config = config.EvalConfig(
      model_specs=[config.ModelSpec(signature_name='eval')],
      slicing_specs=[
          config.SlicingSpec(),
          config.SlicingSpec(feature_keys=['slice_key']),
      ])
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir,
      add_metrics_callbacks=[_addExampleCountMetricCallback])
  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_model=eval_shared_model)
  ]

  examples = [
      self._makeExample(
          age=3.0, language='english', label=1.0, slice_key='first_slice'),
      self._makeExample(
          age=3.0, language='chinese', label=0.0, slice_key='first_slice'),
      self._makeExample(
          age=4.0, language='english', label=0.0, slice_key='second_slice'),
      self._makeExample(
          age=5.0, language='chinese', label=1.0, slice_key='second_slice'),
      self._makeExample(
          age=5.0, language='chinese', label=1.0, slice_key='second_slice')
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    metrics = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
            extractors=extractors, evaluators=evaluators))
    # pylint: enable=no-value-for-parameter

    def check_metrics(got):
      try:
        self.assertLen(got, 3)
        slices = {}
        for slice_key, value in got:
          slices[slice_key] = value
        overall_slice = ()
        first_slice = (('slice_key', b'first_slice'),)
        second_slice = (('slice_key', b'second_slice'),)
        self.assertCountEqual(
            list(slices.keys()), [overall_slice, first_slice, second_slice])
        self.assertDictElementsAlmostEqual(
            slices[overall_slice], {
                metric_types.MetricKey(name='accuracy'): 0.4,
                metric_types.MetricKey(name='label/mean'): 0.6,
                metric_types.MetricKey(name='my_mean_age'): 4.0,
                metric_types.MetricKey(name='my_mean_age_times_label'): 2.6,
                metric_types.MetricKey(name='added_example_count'): 5.0
            })
        self.assertDictElementsAlmostEqual(
            slices[first_slice], {
                metric_types.MetricKey(name='accuracy'): 1.0,
                metric_types.MetricKey(name='label/mean'): 0.5,
                metric_types.MetricKey(name='my_mean_age'): 3.0,
                metric_types.MetricKey(name='my_mean_age_times_label'): 1.5,
                metric_types.MetricKey(name='added_example_count'): 2.0
            })
        self.assertDictElementsAlmostEqual(
            slices[second_slice], {
                metric_types.MetricKey(name='accuracy'): 0.0,
                metric_types.MetricKey(name='label/mean'): 2.0 / 3.0,
                metric_types.MetricKey(name='my_mean_age'): 14.0 / 3.0,
                metric_types.MetricKey(name='my_mean_age_times_label'):
                    10.0 / 3.0,
                metric_types.MetricKey(name='added_example_count'): 3.0
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(
        metrics[constants.METRICS_KEY], check_metrics, label='metrics')
def testAggregateMultipleSlicesWithSampling(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)

  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir)

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    predict_result_english_slice = (
        eval_saved_model.as_features_predictions_labels(
            eval_saved_model.predict_list(
                [example1.SerializeToString(),
                 example3.SerializeToString()])))

    predict_result_chinese_slice = (
        eval_saved_model.as_features_predictions_labels(
            eval_saved_model.predict_list(
                [example2.SerializeToString(),
                 example4.SerializeToString()])))

    test_input = (
        create_test_input(predict_result_english_slice, [(
            ('language', 'english'))]) +
        create_test_input(predict_result_chinese_slice, [(
            ('language', 'chinese'))]) +
        # Overall slice
        create_test_input(
            predict_result_english_slice + predict_result_chinese_slice,
            [()]))

    metrics, _ = (
        pipeline
        | 'CreateTestInput' >> beam.Create(test_input)
        | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
            eval_shared_model=eval_shared_model,
            desired_batch_size=3,
            num_bootstrap_samples=10))

    def check_overall_slice(slices):
      my_dict = slices[()]
      self.assertAlmostEqual(3.75, my_dict['my_mean_age'].value, delta=1)
      self.assertAlmostEqual(3.75, my_dict['my_mean_age'].unsampled_value)
      for value in my_dict['accuracy']:
        self.assertAlmostEqual(1.0, value)
      for value in my_dict['label/mean']:
        self.assertAlmostEqual(0.5, value, delta=0.5)
      for value in my_dict['my_mean_age_times_label']:
        self.assertAlmostEqual(2.5, value, delta=2.5)

    def check_english_slice(slices):
      my_dict = slices[(('language', 'english'))]
      self.assertAlmostEqual(3.5, my_dict['my_mean_age'].value, delta=1)
      self.assertAlmostEqual(3.5, my_dict['my_mean_age'].unsampled_value)
      for value in my_dict['accuracy']:
        self.assertAlmostEqual(1.0, value)
      for value in my_dict['label/mean']:
        self.assertAlmostEqual(1.0, value)
      for value in my_dict['my_mean_age_times_label']:
        self.assertAlmostEqual(3.5, value, delta=1)

    def check_chinese_slice(slices):
      my_dict = slices[(('language', 'chinese'))]
      self.assertAlmostEqual(4.0, my_dict['my_mean_age'].value, delta=1)
      self.assertAlmostEqual(4.0, my_dict['my_mean_age'].unsampled_value)
      for value in my_dict['accuracy']:
        self.assertAlmostEqual(1.0, value)
      for value in my_dict['label/mean']:
        self.assertAlmostEqual(0, value)
      for value in my_dict['my_mean_age_times_label']:
        self.assertAlmostEqual(0, value)

    def check_result(got):
      self.assertEqual(3, len(got), 'got: %s' % got)
      slices = {}
      for slice_key, metrics in got:
        slices[slice_key] = metrics
      check_overall_slice(slices)
      check_english_slice(slices)
      check_chinese_slice(slices)

    util.assert_that(metrics, check_result)