def construct_fn():  # pylint: disable=invalid-name
  """Function for constructing shared models."""
  # If we are evaluating on TPU, initialize the TPU.
  # TODO(b/143484017): Add model warmup for TPU.
  if tf.saved_model.TPU in tags:
    tf.tpu.experimental.initialize_tpu_system()
  if (model_type == constants.TF_ESTIMATOR and
      eval_constants.EVAL_TAG in tags):
    model = load.EvalSavedModel(
        eval_saved_model_path,
        include_default_metrics,
        additional_fetches=additional_fetches,
        blacklist_feature_fetches=blacklist_feature_fetches,
        tags=tags)
    if add_metrics_callbacks:
      model.register_add_metric_callbacks(add_metrics_callbacks)
    model.graph_finalize()
  elif model_type == constants.TF_KERAS:
    # TODO(b/141524386, b/141566408): TPU Inference is not supported
    # for Keras saved_model yet.
    model = tf.keras.models.load_model(eval_saved_model_path)
  elif model_type == constants.TF_LITE:
    # The tf.lite.Interpreter is not thread-safe, so we only load the model
    # file's contents and leave construction of the Interpreter up to the
    # PTransform using it.
    model_filename = os.path.join(eval_saved_model_path, _TFLITE_FILE_NAME)
    with tf.io.gfile.GFile(model_filename, 'rb') as model_file:
      model = ModelContents(model_file.read())
  elif model_type == constants.TF_JS:
    # We invoke TFJS models via a subprocess call, so this is a no-op.
    return None
  else:
    model = tf.compat.v1.saved_model.load_v2(eval_saved_model_path, tags=tags)
  return model
def testEvaluateExistingMetricsBasic(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = multi_head.simple_multi_head(None,
                                                    temp_eval_export_dir)
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  example1 = self._makeMultiHeadExample('english').SerializeToString()
  example2 = self._makeMultiHeadExample('chinese').SerializeToString()
  example3 = self._makeMultiHeadExample('other').SerializeToString()
  eval_saved_model.metrics_reset_update_get_list(
      [example1, example2, example3])
  metric_values = eval_saved_model.get_metric_values()
  self.assertDictElementsAlmostEqual(
      metric_values, {
          'accuracy/english_head': 1.0,
          'accuracy/chinese_head': 1.0,
          'accuracy/other_head': 1.0,
          'auc/english_head': 1.0,
          'auc/chinese_head': 1.0,
          'auc/other_head': 1.0,
          'label/mean/english_head': 1.0 / 3.0,
          'label/mean/chinese_head': 1.0 / 3.0,
          'label/mean/other_head': 1.0 / 3.0
      })
def testEvaluateExistingMetricsWithExportedCustomMetricsDNN(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = dnn_classifier.simple_dnn_classifier(
      None, temp_eval_export_dir)
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  example1 = self._makeExample(age=3.0, language='english', label=1.0)
  features_predictions_labels = self.predict_injective_single_example(
      eval_saved_model, example1.SerializeToString())
  eval_saved_model.perform_metrics_update(features_predictions_labels)

  example2 = self._makeExample(age=2.0, language='chinese', label=0.0)
  features_predictions_labels = self.predict_injective_single_example(
      eval_saved_model, example2.SerializeToString())
  eval_saved_model.perform_metrics_update(features_predictions_labels)

  metric_values = eval_saved_model.get_metric_values()
  self.assertDictElementsAlmostEqual(
      metric_values,
      {
          # We don't check accuracy and AUC here because they vary from run
          # to run due to DNN initialization.
          'my_mean_age': 2.5,
          'my_mean_label': 0.5,
          'my_mean_age_times_label': 1.5
      })
  self.assertIn('my_mean_prediction', metric_values)
  self.assertIn('prediction/mean', metric_values)
  self.assertAlmostEqual(
      metric_values['prediction/mean'],
      metric_values['my_mean_prediction'],
      places=5)
def construct():  # pylint: disable=invalid-name
  """Function for constructing an EvalSavedModel."""
  start_time = datetime.datetime.now()
  result = load.EvalSavedModel(eval_saved_model_path)
  if add_metrics_callbacks:
    features_dict, predictions_dict, labels_dict = (
        result.get_features_predictions_labels_dicts())
    features_dict = util.wrap_tensor_or_dict_of_tensors_in_identity(
        features_dict)
    predictions_dict = util.wrap_tensor_or_dict_of_tensors_in_identity(
        predictions_dict)
    labels_dict = util.wrap_tensor_or_dict_of_tensors_in_identity(labels_dict)
    with result.graph_as_default():
      metric_ops = {}
      for add_metrics_callback in add_metrics_callbacks:
        new_metric_ops = add_metrics_callback(features_dict, predictions_dict,
                                              labels_dict)
        overlap = set(new_metric_ops.keys()) & set(metric_ops.keys())
        if overlap:
          raise ValueError('metric keys should not conflict, but an '
                           'earlier callback already added the metrics '
                           'named %s' % overlap)
        metric_ops.update(new_metric_ops)
      result.register_additional_metric_ops(metric_ops)
  end_time = datetime.datetime.now()
  model_load_seconds_distribution.update(
      int((end_time - start_time).total_seconds()))
  return result
def testServingGraphAlsoExportedIfSpecified(self):
  # Most of the example trainers also pass serving_input_receiver_fn to
  # export_eval_savedmodel, so the serving graph should be included.
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fixed_prediction_estimator.simple_fixed_prediction_estimator(
          None, temp_eval_export_dir))

  # Check the eval graph.
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  example1 = self._makeExample(prediction=0.9, label=0.0)
  features_predictions_labels = self.predict_injective_single_example(
      eval_saved_model, example1.SerializeToString())
  eval_saved_model.perform_metrics_update(features_predictions_labels)
  metric_values = eval_saved_model.get_metric_values()
  self.assertDictElementsAlmostEqual(metric_values, {'average_loss': 0.81})

  # Check the serving graph.
  estimator = tf.contrib.estimator.SavedModelEstimator(eval_export_dir)

  def predict_input_fn():
    return {'inputs': tf.constant([example1.SerializeToString()])}

  predictions = next(estimator.predict(predict_input_fn))
  self.assertAllClose(predictions['outputs'], np.array([0.9]))
def _computeMetricsWithoutBeamNoBatching(
    self, eval_saved_model_path: Text,
    serialized_examples: List[bytes]) -> Dict[Text, Any]:
  """Computes metrics in-memory using the low-level APIs without Beam.

  This is the non-batched version of computeMetricsWithoutBeam. It can be
  useful for debugging batching issues with TFMA or with your model
  (e.g. your model or metrics only work with a fixed batch size - TFMA
  requires that your model can accept batches of any size).

  Args:
    eval_saved_model_path: Path to the directory containing the
      EvalSavedModel.
    serialized_examples: List of serialized example bytes.

  Returns:
    Metrics computed by TFMA using your model on the given examples.
  """
  eval_saved_model = load.EvalSavedModel(eval_saved_model_path)
  for example in serialized_examples:
    for fpl in eval_saved_model.as_features_predictions_labels(
        eval_saved_model.predict(example)):
      eval_saved_model.perform_metrics_update(fpl)
  return eval_saved_model.get_metric_values()
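# For contrast with the non-batched helper above, a minimal sketch of the
# batched equivalent, using the batch-oriented APIs that appear elsewhere in
# this codebase (predict_list / metrics_reset_update_get_list). The helper
# name and its free-standing form are illustrative, not part of the source.
def _compute_metrics_batched_sketch(eval_saved_model_path, serialized_examples):
  eval_saved_model = load.EvalSavedModel(eval_saved_model_path)
  # One pass over the whole list; requires the model to accept any batch size.
  eval_saved_model.metrics_reset_update_get_list(serialized_examples)
  return eval_saved_model.get_metric_values()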
def testLoadSavedModelDisallowsAdditionalFetchesWithLabels(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = multi_head.simple_multi_head(None,
                                                    temp_eval_export_dir)
  with self.assertRaisesRegexp(
      ValueError, 'additional_fetches should not contain "labels"'):
    load.EvalSavedModel(eval_export_dir, additional_fetches=['labels'])
def testEvaluateExistingMetricsWithExportedCustomMetrics(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  example1 = self._makeExample(age=3.0, language='english', label=1.0)
  example2 = self._makeExample(age=2.0, language='chinese', label=0.0)
  eval_saved_model.metrics_reset_update_get_list(
      [example1.SerializeToString(),
       example2.SerializeToString()])
  metric_values = eval_saved_model.get_metric_values()
  self.assertDictElementsAlmostEqual(
      metric_values, {
          'accuracy': 1.0,
          'auc': 1.0,
          'my_mean_age': 2.5,
          'my_mean_label': 0.5,
          'my_mean_age_times_label': 1.5
      })
  self.assertIn('my_mean_prediction', metric_values)
  self.assertIn('prediction/mean', metric_values)
  self.assertAlmostEqual(
      metric_values['prediction/mean'],
      metric_values['my_mean_prediction'],
      places=5)
def construct():  # pylint: disable=invalid-name
  """Function for constructing shared ModelTypes."""
  start_time = datetime.datetime.now()
  saved_model = None
  keras_model = None
  eval_saved_model = None
  if model_path:
    if tf.version.VERSION.split('.')[0] == '1':
      saved_model = tf.compat.v1.saved_model.load_v2(model_path, tags=[tag])
    else:
      saved_model = tf.saved_model.load(model_path, tags=[tag])
    try:
      keras_model = tf.keras.experimental.load_from_saved_model(model_path)
    except tf.errors.NotFoundError:
      pass
  if eval_saved_model_path:
    eval_saved_model = load.EvalSavedModel(
        eval_saved_model_path,
        include_default_metrics,
        additional_fetches=additional_fetches,
        blacklist_feature_fetches=blacklist_feature_fetches)
    if add_metrics_callbacks:
      eval_saved_model.register_add_metric_callbacks(add_metrics_callbacks)
    eval_saved_model.graph_finalize()
  end_time = datetime.datetime.now()
  model_load_seconds_callback(int((end_time - start_time).total_seconds()))
  return types.ModelTypes(
      saved_model=saved_model,
      keras_model=keras_model,
      eval_saved_model=eval_saved_model)
def testServingGraphAlsoExportedIfSpecified(self):
  # Most of the example trainers also pass serving_input_receiver_fn to
  # export_eval_savedmodel, so the serving graph should be included.
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fixed_prediction_estimator.simple_fixed_prediction_estimator(
          None, temp_eval_export_dir))

  # Check the eval graph.
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  example1 = self._makeExample(prediction=0.9, label=0.0).SerializeToString()
  eval_saved_model.metrics_reset_update_get(example1)
  metric_values = eval_saved_model.get_metric_values()
  self.assertDictElementsAlmostEqual(metric_values, {'average_loss': 0.81})

  # Check the serving graph.
  # TODO(b/124466113): Remove tf.compat.v2 once TF 2.0 is the default.
  if hasattr(tf, 'compat') and hasattr(tf.compat, 'v2'):
    imported = tf.compat.v2.saved_model.load(
        eval_export_dir, tags=tf.saved_model.SERVING)
    # example1 is already serialized above, so pass it through directly.
    predictions = imported.signatures[
        tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY](
            inputs=tf.constant([example1]))
    self.assertAllClose(predictions['outputs'], np.array([[0.9]]))
def construct():  # pylint: disable=invalid-name
  """Function for constructing shared models."""
  start_time = datetime.datetime.now()
  # If we are evaluating on TPU, initialize the TPU.
  # TODO(b/143484017): Add model warmup for TPU.
  if tf.saved_model.TPU in tags:
    tf.tpu.experimental.initialize_tpu_system()
  if (model_type == constants.TF_ESTIMATOR and
      eval_constants.EVAL_TAG in tags):
    model = load.EvalSavedModel(
        eval_saved_model_path,
        include_default_metrics,
        additional_fetches=additional_fetches,
        blacklist_feature_fetches=blacklist_feature_fetches,
        tags=tags)
    if add_metrics_callbacks:
      model.register_add_metric_callbacks(add_metrics_callbacks)
    model.graph_finalize()
  elif model_type == constants.TF_KERAS:
    # TODO(b/141524386, b/141566408): TPU Inference is not supported
    # for Keras saved_model yet.
    model = tf.keras.models.load_model(eval_saved_model_path)
  else:
    model = tf.compat.v1.saved_model.load_v2(eval_saved_model_path, tags=tags)
  end_time = datetime.datetime.now()
  model_load_seconds_callback(int((end_time - start_time).total_seconds()))
  return model
def construct():  # pylint: disable=invalid-name
  """Function for constructing shared ModelTypes."""
  start_time = datetime.datetime.now()
  saved_model = None
  keras_model = None
  eval_saved_model = None
  if tags == [eval_constants.EVAL_TAG]:
    eval_saved_model = load.EvalSavedModel(
        eval_saved_model_path,
        include_default_metrics,
        additional_fetches=additional_fetches,
        blacklist_feature_fetches=blacklist_feature_fetches)
    if add_metrics_callbacks:
      eval_saved_model.register_add_metric_callbacks(add_metrics_callbacks)
    eval_saved_model.graph_finalize()
  else:
    try:
      keras_model = tf.keras.models.load_model(eval_saved_model_path)
    except Exception:  # pylint: disable=broad-except
      saved_model = tf.compat.v1.saved_model.load_v2(
          eval_saved_model_path, tags=tags)
  end_time = datetime.datetime.now()
  model_load_seconds_callback(int((end_time - start_time).total_seconds()))
  return types.ModelTypes(
      saved_model=saved_model,
      keras_model=keras_model,
      eval_saved_model=eval_saved_model)
def testEvaluateExistingMetricsBasicForUnsupervisedModel(self):
  # Test that we can export and load unsupervised models (models which
  # don't take a labels parameter in their model_fn).
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fixed_prediction_estimator_no_labels
      .simple_fixed_prediction_estimator_no_labels(None,
                                                   temp_eval_export_dir))
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  example1 = self._makeExample(prediction=1.0)
  features_predictions_labels = self.predict_injective_single_example(
      eval_saved_model, example1.SerializeToString())
  eval_saved_model.perform_metrics_update(features_predictions_labels)

  example2 = self._makeExample(prediction=0.0)
  features_predictions_labels = self.predict_injective_single_example(
      eval_saved_model, example2.SerializeToString())
  eval_saved_model.perform_metrics_update(features_predictions_labels)

  metric_values = eval_saved_model.get_metric_values()
  self.assertDictElementsAlmostEqual(metric_values, {
      'average_loss': 0.5,
  })
def testEvaluateExistingMetricsCustomEstimatorBasic(self):
  # The custom estimator aims to predict age * 3 + 1.
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = custom_estimator.simple_custom_estimator(
      None, temp_eval_export_dir)

  example1 = example_pb2.Example()
  example1.features.feature['age'].float_list.value[:] = [1.0]
  example1.features.feature['label'].float_list.value[:] = [3.0]

  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  features_predictions_labels = self.predict_injective_single_example(
      eval_saved_model, example1.SerializeToString())
  eval_saved_model.perform_metrics_update(features_predictions_labels)

  example2 = example_pb2.Example()
  example2.features.feature['age'].float_list.value[:] = [2.0]
  example2.features.feature['label'].float_list.value[:] = [7.0]
  features_predictions_labels = self.predict_injective_single_example(
      eval_saved_model, example2.SerializeToString())
  eval_saved_model.perform_metrics_update(features_predictions_labels)

  metric_values = eval_saved_model.get_metric_values()

  # We don't control the trained model's weights fully, but it should
  # predict close to what it aims to. The "target" mean prediction is 5.5.
  self.assertIn('mean_prediction', metric_values)
  self.assertGreater(metric_values['mean_prediction'], 5.4)
  self.assertLess(metric_values['mean_prediction'], 5.6)

  # The "target" mean absolute error is 0.5.
  self.assertIn('mean_absolute_error', metric_values)
  self.assertGreater(metric_values['mean_absolute_error'], 0.4)
  self.assertLess(metric_values['mean_absolute_error'], 0.6)

  self.assertHasKeyWithValueAlmostEqual(metric_values, 'mean_label', 5.0)
def testEvaluateExistingMetricsBasicForControlDependencyEstimator(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      control_dependency_estimator.simple_control_dependency_estimator(
          None, temp_eval_export_dir))
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  example1 = self._makeExample(
      prediction=0.9,
      label=0.0,
      fixed_float=1.0,
      fixed_string='apple',
      fixed_int=2,
      var_float=10.0,
      var_string='banana',
      var_int=20).SerializeToString()
  example2 = self._makeExample(
      prediction=0.1,
      label=0.0,
      fixed_float=5.0,
      fixed_string='avocado',
      fixed_int=6,
      var_float=50.0,
      var_string='berry',
      var_int=60).SerializeToString()
  eval_saved_model.metrics_reset_update_get_list([example1, example2])
  metric_values = eval_saved_model.get_metric_values()
  self.assertDictElementsAlmostEqual(
      metric_values, {
          'control_dependency_on_fixed_float': 1.0,
          'control_dependency_on_var_float': 10.0,
          'control_dependency_on_actual_label': 100.0,
          'control_dependency_on_var_int_label': 1000.0,
          'control_dependency_on_prediction': 10000.0,
      })
def benchmarkEvalSavedModelPredict(self):
  """Benchmark using the EvalSavedModel to make predictions.

  Runs EvalSavedModel.predict_list and records the wall time taken.
  """
  batch_size = 1000
  eval_saved_model = load.EvalSavedModel(
      path=self._dataset.tfma_saved_model_path(),
      include_default_metrics=True)
  records = self._dataset.read_raw_dataset(
      deserialize=False, limit=MAX_NUM_EXAMPLES)
  start = time.time()
  for batch in benchmark_utils.batched_iterator(records, batch_size):
    eval_saved_model.predict_list(batch)
  end = time.time()
  delta = end - start
  self.report_benchmark(
      iters=1,
      wall_time=delta,
      extras={
          "batch_size": batch_size,
          "num_examples": self._dataset.num_examples(limit=MAX_NUM_EXAMPLES)
      })
def testEvaluateExistingMetricsBasic(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = multi_head.simple_multi_head(None,
                                                    temp_eval_export_dir)
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  example1 = self._makeMultiHeadExample('english')
  features_predictions_labels = self.predict_injective_single_example(
      eval_saved_model, example1.SerializeToString())
  eval_saved_model.perform_metrics_update(features_predictions_labels)

  example2 = self._makeMultiHeadExample('chinese')
  features_predictions_labels = self.predict_injective_single_example(
      eval_saved_model, example2.SerializeToString())
  eval_saved_model.perform_metrics_update(features_predictions_labels)

  metric_values = eval_saved_model.get_metric_values()
  self.assertDictElementsAlmostEqual(
      metric_values, {
          'accuracy/english_head': 1.0,
          'accuracy/chinese_head': 1.0,
          'accuracy/other_head': 1.0,
          'auc/english_head': 1.0,
          'auc/chinese_head': 1.0,
          'auc/other_head': 1.0,
          'label/mean/english_head': 0.5,
          'label/mean/chinese_head': 0.5,
          'label/mean/other_head': 0.0
      })
def testPredictListMultipleExamplesPerInputModelNoExampleInInput(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fake_multi_examples_per_input_estimator
      .fake_multi_examples_per_input_estimator(None, temp_eval_export_dir))
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  fetched_list = eval_saved_model.predict_list(['0', '0'])
  self.assertFalse(fetched_list)
def testEvaluateWithAdditionalMetricsBasic(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = multi_head.simple_multi_head(None,
                                                    temp_eval_export_dir)
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  _, prediction_dict, label_dict = (
      eval_saved_model.get_features_predictions_labels_dicts())
  with eval_saved_model.graph_as_default():
    metric_ops = {}
    value_op, update_op = tf.metrics.mean_absolute_error(
        label_dict['english_head'][0][0],
        prediction_dict['english_head/probabilities'][0][1])
    metric_ops['mean_absolute_error/english_head'] = (value_op, update_op)

    value_op, update_op = tf.contrib.metrics.count(
        prediction_dict['english_head/logits'])
    metric_ops['example_count/english_head'] = (value_op, update_op)

    eval_saved_model.register_additional_metric_ops(metric_ops)

  example1 = self._makeMultiHeadExample('english')
  features_predictions_labels = self.predict_injective_single_example(
      eval_saved_model, example1.SerializeToString())
  eval_saved_model.perform_metrics_update(features_predictions_labels)

  example2 = self._makeMultiHeadExample('chinese')
  features_predictions_labels = self.predict_injective_single_example(
      eval_saved_model, example2.SerializeToString())
  eval_saved_model.perform_metrics_update(features_predictions_labels)

  metric_values = eval_saved_model.get_metric_values()

  # Check that the original metrics are still there.
  self.assertDictElementsAlmostEqual(
      metric_values, {
          'accuracy/english_head': 1.0,
          'accuracy/chinese_head': 1.0,
          'accuracy/other_head': 1.0,
          'auc/english_head': 1.0,
          'auc/chinese_head': 1.0,
          'auc/other_head': 1.0,
          'label/mean/english_head': 0.5,
          'label/mean/chinese_head': 0.5,
          'label/mean/other_head': 0.0
      })

  # Check the added metrics.
  # We don't control the trained model's weights fully, but it should
  # predict probabilities > 0.7.
  self.assertIn('mean_absolute_error/english_head', metric_values)
  self.assertLess(metric_values['mean_absolute_error/english_head'], 0.3)
  self.assertHasKeyWithValueAlmostEqual(metric_values,
                                        'example_count/english_head', 2.0)
def _sharedTestForPredictListMultipleExamplesPerInputModel(
    self, use_legacy, use_iterator):
  temp_eval_export_dir = self._getEvalExportDir()
  if use_legacy:
    _, eval_export_dir = (
        fake_multi_examples_per_input_estimator
        .legacy_fake_multi_examples_per_input_estimator(
            None, temp_eval_export_dir))
  else:
    _, eval_export_dir = (
        fake_multi_examples_per_input_estimator
        .fake_multi_examples_per_input_estimator(None, temp_eval_export_dir,
                                                 use_iterator))
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  fetched_list = eval_saved_model.predict_list([b'0', b'1', b'3', b'0', b'2'])
  self.assertEqual(6, len(fetched_list))

  input_index = []
  example_count = []
  labels = []
  predictions = []
  intra_input_index = []
  annotation = []

  def _check_and_append_feature(feature_name, one_fetch, feature_values):
    self.assertEqual((1,), one_fetch.values['features'][feature_name].shape)
    feature_values.append(one_fetch.values['features'][feature_name][0])

  for fetched in fetched_list:
    _check_and_append_feature('input_index', fetched, input_index)
    _check_and_append_feature('example_count', fetched, example_count)
    _check_and_append_feature('intra_input_index', fetched, intra_input_index)
    _check_and_append_feature('annotation', fetched, annotation)

    self.assertAllEqual((1,), fetched.values['labels'].shape)
    labels.append(fetched.values['labels'])

    self.assertAllEqual((1,), fetched.values['predictions'].shape)
    predictions.append(fetched.values['predictions'])

  self.assertSequenceEqual([1, 3, 3, 3, 2, 2], example_count)
  self.assertSequenceEqual([1, 2, 2, 2, 4, 4], input_index)
  self.assertSequenceEqual([0, 0, 1, 2, 0, 1], intra_input_index)
  self.assertAllEqual([
      b'raw_input: 1; index: 0', b'raw_input: 3; index: 0',
      b'raw_input: 3; index: 1', b'raw_input: 3; index: 2',
      b'raw_input: 2; index: 0', b'raw_input: 2; index: 1'
  ], annotation)
  self.assertSequenceEqual([1, 2, 2, 2, 4, 4], labels)
  self.assertSequenceEqual([1, 2, 2, 2, 4, 4], predictions)
def testPredictListOutOfRangeInputRefs(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fake_multi_examples_per_input_estimator
      .bad_multi_examples_per_input_estimator_out_of_range_input_refs(
          None, temp_eval_export_dir))
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  with self.assertRaisesRegexp(ValueError,
                               'An index in input_refs is out of range'):
    eval_saved_model.predict_list(['1'])
def testPredictListMisalignedInputRef(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fake_multi_examples_per_input_estimator
      .bad_multi_examples_per_input_estimator_misaligned_input_refs(
          None, temp_eval_export_dir))
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  with self.assertRaisesRegexp(ValueError,
                               'input_refs should be batch-aligned'):
    eval_saved_model.predict_list(['1'])
def testPredictListMultipleExamplesPerInputModel(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      fake_multi_examples_per_input_estimator
      .fake_multi_examples_per_input_estimator(None, temp_eval_export_dir))
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  fpls = eval_saved_model.predict_list([b'0', b'1', b'3', b'0', b'2'])
  self.assertEqual(6, len(fpls))

  input_index = []
  example_count = []
  labels = []
  predictions = []
  intra_input_index = []
  annotation = []

  def _check_and_append_feature(feature_name, one_fpl, feature_values):
    self.assertEqual(
        (1,), one_fpl.features[feature_name][encoding.NODE_SUFFIX].shape)
    feature_values.append(
        one_fpl.features[feature_name][encoding.NODE_SUFFIX][0])

  for fpl in fpls:
    _check_and_append_feature('input_index', fpl, input_index)
    _check_and_append_feature('example_count', fpl, example_count)
    _check_and_append_feature('intra_input_index', fpl, intra_input_index)
    _check_and_append_feature('annotation', fpl, annotation)

    self.assertAllEqual(
        (1,),
        fpl.labels[encoding.DEFAULT_LABELS_DICT_KEY][encoding.NODE_SUFFIX]
        .shape)
    labels.append(
        fpl.labels[encoding.DEFAULT_LABELS_DICT_KEY][encoding.NODE_SUFFIX][0])

    self.assertAllEqual(
        (1,), fpl.predictions['predictions'][encoding.NODE_SUFFIX].shape)
    predictions.append(
        fpl.predictions['predictions'][encoding.NODE_SUFFIX][0])

  self.assertSequenceEqual([1, 3, 3, 3, 2, 2], example_count)
  self.assertSequenceEqual([1, 2, 2, 2, 4, 4], input_index)
  self.assertSequenceEqual([0, 0, 1, 2, 0, 1], intra_input_index)
  self.assertAllEqual([
      b'raw_input: 1; index: 0', b'raw_input: 3; index: 0',
      b'raw_input: 3; index: 1', b'raw_input: 3; index: 2',
      b'raw_input: 2; index: 0', b'raw_input: 2; index: 1'
  ], annotation)
  self.assertSequenceEqual([1, 2, 2, 2, 4, 4], labels)
  self.assertSequenceEqual([1, 2, 2, 2, 4, 4], predictions)
def construct():  # pylint: disable=invalid-name
  """Function for constructing an EvalSavedModel."""
  start_time = datetime.datetime.now()
  result = load.EvalSavedModel(eval_saved_model_path, include_default_metrics)
  if add_metrics_callbacks:
    result.register_add_metric_callbacks(add_metrics_callbacks)
  result.graph_finalize()
  end_time = datetime.datetime.now()
  model_load_seconds.update(int((end_time - start_time).total_seconds()))
  return result
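# A minimal sketch of how a construct function like the one above is usually
# consumed: handed to a shared-handle abstraction so a single EvalSavedModel
# is built once per process and reused across workers. Using apache_beam's
# Shared here is an assumption for illustration; the surrounding code may
# provide its own sharing mechanism.
from apache_beam.utils import shared

shared_handle = shared.Shared()
eval_saved_model = shared_handle.acquire(construct)  # built once, then reused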
def testGetAndSetMetricVariables(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = multi_head.simple_multi_head(None,
                                                    temp_eval_export_dir)
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  _, prediction_dict, _ = (
      eval_saved_model.get_features_predictions_labels_dicts())
  with eval_saved_model.graph_as_default():
    metric_ops = {}
    value_op, update_op = tf.contrib.metrics.count(
        prediction_dict['english_head/logits'])
    metric_ops['example_count/english_head'] = (value_op, update_op)
    eval_saved_model.register_additional_metric_ops(metric_ops)

  example1 = self._makeMultiHeadExample('english')
  features_predictions_labels = self.predict_injective_single_example(
      eval_saved_model, example1.SerializeToString())
  eval_saved_model.perform_metrics_update(features_predictions_labels)
  metric_values = eval_saved_model.get_metric_values()
  self.assertDictElementsAlmostEqual(
      metric_values, {
          'label/mean/english_head': 1.0,
          'label/mean/chinese_head': 0.0,
          'label/mean/other_head': 0.0,
          'example_count/english_head': 1.0
      })
  metric_variables = eval_saved_model.get_metric_variables()

  example2 = self._makeMultiHeadExample('chinese')
  features_predictions_labels = self.predict_injective_single_example(
      eval_saved_model, example2.SerializeToString())
  eval_saved_model.perform_metrics_update(features_predictions_labels)
  metric_values = eval_saved_model.get_metric_values()
  self.assertDictElementsAlmostEqual(
      metric_values, {
          'label/mean/english_head': 0.5,
          'label/mean/chinese_head': 0.5,
          'label/mean/other_head': 0.0,
          'example_count/english_head': 2.0
      })

  # Now set the metric variables to what they were after the first example.
  eval_saved_model.set_metric_variables(metric_variables)
  metric_values = eval_saved_model.get_metric_values()
  self.assertDictElementsAlmostEqual(
      metric_values, {
          'label/mean/english_head': 1.0,
          'label/mean/chinese_head': 0.0,
          'label/mean/other_head': 0.0,
          'example_count/english_head': 1.0
      })
def testAggregateOverallSlice(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = linear_classifier.simple_linear_classifier(
      None, temp_eval_export_dir)
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir)

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(age=3.0, language='english', label=1.0)
    example2 = self._makeExample(age=3.0, language='chinese', label=0.0)
    example3 = self._makeExample(age=4.0, language='english', label=1.0)
    example4 = self._makeExample(age=5.0, language='chinese', label=0.0)

    predict_result = eval_saved_model.as_features_predictions_labels(
        eval_saved_model.predict_list([
            example1.SerializeToString(),
            example2.SerializeToString(),
            example3.SerializeToString(),
            example4.SerializeToString()
        ]))

    metrics, _ = (
        pipeline
        | 'CreateTestInput' >> beam.Create(
            create_test_input(predict_result, [()]))
        | 'ComputePerSliceMetrics' >> aggregate.ComputePerSliceMetrics(
            eval_shared_model=eval_shared_model, desired_batch_size=3))

    def check_result(got):
      self.assertEqual(1, len(got), 'got: %s' % got)
      slice_key, metrics = got[0]
      self.assertEqual(slice_key, ())
      self.assertDictElementsAlmostEqual(
          metrics, {
              'accuracy': 1.0,
              'label/mean': 0.5,
              'my_mean_age': 3.75,
              'my_mean_age_times_label': 1.75,
          })

    util.assert_that(metrics, check_result)
def construct_fn():  # pylint: disable=invalid-name
  """Function for constructing shared models."""
  # If we are evaluating on TPU, initialize the TPU.
  # TODO(b/143484017): Add model warmup for TPU.
  if tf.saved_model.TPU in tags:
    tf.tpu.experimental.initialize_tpu_system()
  if (model_type == constants.TF_ESTIMATOR and
      eval_constants.EVAL_TAG in tags):
    model = load.EvalSavedModel(
        eval_saved_model_path,
        include_default_metrics,
        additional_fetches=additional_fetches,
        blacklist_feature_fetches=blacklist_feature_fetches,
        tags=tags)
    if add_metrics_callbacks:
      model.register_add_metric_callbacks(add_metrics_callbacks)
    model.graph_finalize()
  elif model_type == constants.TF_KERAS:
    model = tf.keras.models.load_model(eval_saved_model_path)
  elif model_type == constants.TF_LITE:
    # The tf.lite.Interpreter is not thread-safe, so we only load the model
    # file's contents and leave construction of the Interpreter up to the
    # PTransform using it.
    model_filename = os.path.join(eval_saved_model_path, _TFLITE_FILE_NAME)
    with tf.io.gfile.GFile(model_filename, 'rb') as model_file:
      model_bytes = model_file.read()

    # If a SavedModel is present in the same directory, load it as well.
    # This allows the SavedModel to be used for computing the transformed
    # features and labels.
    if (tf.io.gfile.exists(
        os.path.join(eval_saved_model_path,
                     tf.saved_model.SAVED_MODEL_FILENAME_PB)) or
        tf.io.gfile.exists(
            os.path.join(eval_saved_model_path,
                         tf.saved_model.SAVED_MODEL_FILENAME_PBTXT))):
      model = tf.compat.v1.saved_model.load_v2(eval_saved_model_path,
                                               tags=tags)
      model.contents = model_bytes
    else:
      model = ModelContents(model_bytes)
  elif model_type == constants.TF_JS:
    # We invoke TFJS models via a subprocess call, so this is a no-op.
    return None
  else:
    model = tf.compat.v1.saved_model.load_v2(eval_saved_model_path, tags=tags)
  return model
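# A minimal sketch (an assumption, not from the source) of what a consuming
# PTransform might do with the deferred TFLite model contents loaded above.
# It assumes the loaded object exposes the raw flatbuffer bytes via a
# .contents attribute, matching the convention used in the branch above.
def make_interpreter(model_contents):
  # tf.lite.Interpreter accepts the serialized model via model_content.
  interpreter = tf.lite.Interpreter(model_content=model_contents.contents)
  interpreter.allocate_tensors()
  return interpreter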
def benchmarkEvalSavedModelMetricsResetUpdateGetList(self):
  """Benchmark using the EvalSavedModel to compute metrics.

  Runs EvalSavedModel.metrics_reset_update_get_list and records the wall
  time taken.
  """
  batch_size = 1000
  eval_saved_model = load.EvalSavedModel(
      path=self._dataset.tfma_saved_model_path(),
      include_default_metrics=True)
  records = self._dataset.read_raw_dataset(
      deserialize=False, limit=self._max_num_examples())
  start = time.time()
  accumulators = []
  for batch in benchmark_utils.batched_iterator(records, batch_size):
    accumulators.append(eval_saved_model.metrics_reset_update_get_list(batch))
  end = time.time()
  delta = end - start

  # Sanity check: metric variables are additive accumulators, so summing the
  # per-batch values elementwise and loading the result back should yield
  # valid overall metric values.
  metric_variables_sum = accumulators[0]
  for acc in accumulators[1:]:
    if len(metric_variables_sum) != len(acc):
      raise ValueError(
          "all metric variable value lists should have the same length, but "
          "got lists with different lengths: %d and %d" %
          (len(metric_variables_sum), len(acc)))
    metric_variables_sum = [a + b for a, b in zip(metric_variables_sum, acc)]

  metrics = eval_saved_model.metrics_set_variables_and_get_values(
      metric_variables_sum)
  if "average_loss" not in metrics:
    raise ValueError(
        "metrics should contain average_loss metric, but it did not. "
        "metrics were: %s" % metrics)

  self.report_benchmark(
      iters=1,
      wall_time=delta,
      extras={
          "batch_size": batch_size,
          "num_examples": self._dataset.num_examples(
              limit=self._max_num_examples())
      })
def testEvaluateWithOnlyAdditionalMetricsBasic(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = multi_head.simple_multi_head(None,
                                                    temp_eval_export_dir)
  eval_saved_model = load.EvalSavedModel(
      eval_export_dir, include_default_metrics=False)
  _, prediction_dict, label_dict = (
      eval_saved_model.get_features_predictions_labels_dicts())
  with eval_saved_model.graph_as_default():
    metric_ops = {}
    value_op, update_op = tf.compat.v1.metrics.mean_absolute_error(
        label_dict['english_head'][0][0],
        prediction_dict['english_head/probabilities'][0][1])
    metric_ops['mean_absolute_error/english_head'] = (value_op, update_op)

    value_op, update_op = metrics.total(
        tf.shape(input=prediction_dict['english_head/logits'])[0])
    metric_ops['example_count/english_head'] = (value_op, update_op)

    eval_saved_model.register_additional_metric_ops(metric_ops)

  example1 = self._makeMultiHeadExample('english').SerializeToString()
  example2 = self._makeMultiHeadExample('chinese').SerializeToString()
  eval_saved_model.metrics_reset_update_get_list([example1, example2])
  metric_values = eval_saved_model.get_metric_values()

  # Check that the original metrics are not there.
  self.assertNotIn('accuracy/english_head', metric_values)
  self.assertNotIn('accuracy/chinese_head', metric_values)
  self.assertNotIn('accuracy/other_head', metric_values)
  self.assertNotIn('auc/english_head', metric_values)
  self.assertNotIn('auc/chinese_head', metric_values)
  self.assertNotIn('auc/other_head', metric_values)
  self.assertNotIn('label/mean/english_head', metric_values)
  self.assertNotIn('label/mean/chinese_head', metric_values)
  self.assertNotIn('label/mean/other_head', metric_values)

  # Check the added metrics.
  # We don't control the trained model's weights fully, but it should
  # predict probabilities > 0.7.
  self.assertIn('mean_absolute_error/english_head', metric_values)
  self.assertLess(metric_values['mean_absolute_error/english_head'], 0.3)
  self.assertHasKeyWithValueAlmostEqual(metric_values,
                                        'example_count/english_head', 2.0)
def testEvaluateExistingMetricsCSVInputBasic(self):
  temp_eval_export_dir = self._getEvalExportDir()
  _, eval_export_dir = (
      csv_linear_classifier.simple_csv_linear_classifier(
          None, temp_eval_export_dir))
  eval_saved_model = load.EvalSavedModel(eval_export_dir)
  eval_saved_model.metrics_reset_update_get_list(
      ['3.0,english,1.0', '3.0,chinese,0.0'])
  metric_values = eval_saved_model.get_metric_values()
  self.assertDictElementsAlmostEqual(metric_values, {
      'accuracy': 1.0,
      'auc': 1.0
  })