def testRunModelAnalysis(self):
    """Checks an end-to-end run sliced by language with 'age' as weight key.

    Exports the simple linear classifier, writes four examples to TFRecords,
    runs the analysis and compares a subset of the per-slice metrics as well
    as the configuration echoed back on the result. No plots are expected.
    """

    def _double(value):
        # Expected metric values are spelled in their {'doubleValue': x} form.
        return {'doubleValue': value}

    export_dir = self._exportEvalSavedModel(
        linear_classifier.simple_linear_classifier)
    records_path = self._writeTFExamplesToTFRecords([
        self._makeExample(age=3.0, language='english', label=1.0),
        self._makeExample(age=3.0, language='chinese', label=0.0),
        self._makeExample(age=4.0, language='english', label=1.0),
        self._makeExample(age=5.0, language='chinese', label=1.0),
    ])
    by_language = [slicer.SingleSliceSpec(columns=['language'])]
    shared_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=export_dir, example_weight_key='age')

    result = model_eval_lib.run_model_analysis(
        shared_model, records_path, slice_spec=by_language)

    # Only a subset of the metrics is checked; the goal is to confirm that
    # the end-to-end pipeline works.
    expected = {
        ((b'language', b'chinese'),): {
            'accuracy': _double(0.5),
            'my_mean_label': _double(0.5),
            metric_keys.EXAMPLE_WEIGHT: _double(8.0),
            metric_keys.EXAMPLE_COUNT: _double(2.0),
        },
        ((b'language', b'english'),): {
            'accuracy': _double(1.0),
            'my_mean_label': _double(1.0),
            metric_keys.EXAMPLE_WEIGHT: _double(7.0),
            metric_keys.EXAMPLE_COUNT: _double(2.0),
        },
    }

    self.assertEqual(result.config.model_location, export_dir)
    self.assertEqual(result.config.data_location, records_path)
    self.assertEqual(result.config.slice_spec, by_language)
    self.assertMetricsAlmostEqual(result.slicing_metrics, expected)
    self.assertFalse(result.plots)
def testRunModelAnalysisWithMultiplePlots(self):
    """Checks that two AUC plot callbacks both yield confusion matrices.

    The fixed-prediction estimator is evaluated through an EvalConfig with
    one default and one 'test'-tagged auc_plots callback. Both plot entries
    are expected to hold the same confusion matrix at bucket index 8001.
    """
    export_dir = self._exportEvalSavedModel(
        fixed_prediction_estimator.simple_fixed_prediction_estimator)
    records_path = self._writeTFExamplesToTFRecords([
        self._makeExample(prediction=0.0, label=1.0),
        self._makeExample(prediction=0.7, label=0.0),
        self._makeExample(prediction=0.8, label=1.0),
        self._makeExample(prediction=1.0, label=1.0),
        self._makeExample(prediction=1.0, label=1.0),
    ])
    eval_config = config.EvalConfig(
        input_data_specs=[config.InputDataSpec(location=records_path)],
        model_specs=[config.ModelSpec(location=export_dir)],
        output_data_specs=[
            config.OutputDataSpec(default_location=self._getTempDir())
        ])
    plot_callbacks = [
        post_export_metrics.auc_plots(),
        post_export_metrics.auc_plots(metric_tag='test'),
    ]
    shared_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=export_dir,
        add_metrics_callbacks=plot_callbacks)

    result = model_eval_lib.run_model_analysis(
        eval_config=eval_config, eval_shared_models=[shared_model])

    # Only a subset of the metrics is checked; the goal is to confirm that
    # the end-to-end pipeline works.
    expected_metrics = {
        (): {
            metric_keys.EXAMPLE_COUNT: {'doubleValue': 5.0},
        }
    }
    expected_matrix = {
        'threshold': 0.8,
        'falseNegatives': 2.0,
        'trueNegatives': 1.0,
        'truePositives': 2.0,
        'precision': 1.0,
        'recall': 0.5,
    }

    self.assertMetricsAlmostEqual(result.slicing_metrics, expected_metrics)
    self.assertEqual(len(result.plots), 1)
    slice_key, plots = result.plots[0]
    self.assertEqual((), slice_key)
    # The untagged callback is checked first, then the 'test'-tagged one.
    for plot_key in ('post_export_metrics', 'post_export_metrics/test'):
        matrices = (
            plots[''][''][plot_key]['confusionMatrixAtThresholds']['matrices'])
        self.assertDictElementsAlmostEqual(matrices[8001], expected_matrix)
def testNoConstructFn(self):
    """Checks that an EvalSharedModel without a construct_fn fails the run.

    A bare types.EvalSharedModel built only from a model path is expected to
    raise an AttributeError mentioning a 'NoneType' object, while the model
    returned by default_eval_shared_model is expected to run cleanly.
    """
    model_location = self._exportEvalSavedModel(
        linear_classifier.simple_linear_classifier)
    examples = [self._makeExample(age=3.0, language='english', label=1.0)]
    data_location = self._writeTFExamplesToTFRecords(examples)
    eval_config = config.EvalConfig()

    def _analyze(shared_model):
        # Both runs use identical arguments apart from the shared model; a
        # temp dir is requested per call, exactly as the inline calls did.
        return model_eval_lib.run_model_analysis(
            eval_config=eval_config,
            eval_shared_model=shared_model,
            data_location=data_location,
            output_path=self._getTempDir())

    # No construct_fn should fail when Beam attempts to call the construct_fn.
    # assertRaisesRegex replaces the assertRaisesRegexp alias, which is
    # deprecated and no longer exists in Python 3.12.
    eval_shared_model = types.EvalSharedModel(model_path=model_location)
    with self.assertRaisesRegex(AttributeError,
                                '\'NoneType\' object has no attribute'):
        _analyze(eval_shared_model)

    # Using the default_eval_shared_model should pass as it has a construct_fn.
    eval_shared_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=model_location)
    _analyze(eval_shared_model)
def testRunModelAnalysisForCSVText(self):
    """Checks an end-to-end run over CSV rows read with file_format='text'.

    The exported model path is handed to run_model_analysis directly, along
    with the path of the text file holding the CSV rows.
    """
    export_dir = self._exportEvalSavedModel(
        csv_linear_classifier.simple_csv_linear_classifier)
    csv_rows = [
        '3.0,english,1.0',
        '3.0,chinese,0.0',
        '4.0,english,1.0',
        '5.0,chinese,1.0',
    ]
    text_path = self._writeCSVToTextFile(csv_rows)

    result = model_eval_lib.run_model_analysis(
        export_dir, text_path, file_format='text')

    # Only a subset of the metrics is checked; the goal is to confirm that
    # the end-to-end pipeline works.
    overall_slice = ()
    expected = {
        overall_slice: {
            'accuracy': 0.75,
            metric_keys.EXAMPLE_COUNT: 4.0,
        }
    }
    self.assertMetricsAlmostEqual(result.slicing_metrics, expected)
def testRunModelAnalysisWithMultiplePlots(self):
    """Checks that two AUC plot callbacks both yield confusion matrices.

    The shared model and data location are passed positionally to
    run_model_analysis. The keys of the resulting plots are logged, then the
    confusion matrix at bucket index 8001 is compared for both the untagged
    and the 'test'-tagged plot entry.
    """
    export_dir = self._exportEvalSavedModel(
        fixed_prediction_estimator.simple_fixed_prediction_estimator)
    examples = [
        self._makeExample(prediction=0.0, label=1.0),
        self._makeExample(prediction=0.7, label=0.0),
        self._makeExample(prediction=0.8, label=1.0),
        self._makeExample(prediction=1.0, label=1.0),
        self._makeExample(prediction=1.0, label=1.0),
    ]
    plot_callbacks = [
        post_export_metrics.auc_plots(),
        post_export_metrics.auc_plots(metric_tag='test'),
    ]
    shared_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=export_dir,
        add_metrics_callbacks=plot_callbacks)
    records_path = self._writeTFExamplesToTFRecords(examples)

    result = model_eval_lib.run_model_analysis(shared_model, records_path)

    # Only a subset of the metrics is checked; the goal is to confirm that
    # the end-to-end pipeline works.
    expected_metrics = {
        (): {
            metric_keys.EXAMPLE_COUNT: {'doubleValue': 5.0},
        }
    }
    expected_matrix = {
        'threshold': 0.8,
        'falseNegatives': 2.0,
        'trueNegatives': 1.0,
        'truePositives': 2.0,
        'precision': 1.0,
        'recall': 0.5,
    }

    self.assertMetricsAlmostEqual(result.slicing_metrics, expected_metrics)
    self.assertEqual(len(result.plots), 1)
    slice_key, plots = result.plots[0]
    self.assertEqual((), slice_key)
    tf.logging.info(plots.keys())
    for plot_key in ('post_export_metrics', 'post_export_metrics/test'):
        matrices = plots[plot_key]['confusionMatrixAtThresholds']['matrices']
        self.assertDictElementsAlmostEqual(matrices[8001], expected_matrix)
def testRunModelAnalysisForCSVText(self):
    """Checks an EvalConfig-driven run over CSV rows stored as text.

    The input data spec declares file_format='text'; the overall slice is
    expected to report an accuracy of 0.75 over four examples.
    """
    export_dir = self._exportEvalSavedModel(
        csv_linear_classifier.simple_csv_linear_classifier)
    csv_rows = [
        '3.0,english,1.0',
        '3.0,chinese,0.0',
        '4.0,english,1.0',
        '5.0,chinese,1.0',
    ]
    text_path = self._writeCSVToTextFile(csv_rows)

    input_spec = config.InputDataSpec(location=text_path, file_format='text')
    model_spec = config.ModelSpec(location=export_dir)
    output_spec = config.OutputDataSpec(default_location=self._getTempDir())
    eval_config = config.EvalConfig(
        input_data_specs=[input_spec],
        model_specs=[model_spec],
        output_data_specs=[output_spec])
    shared_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=export_dir)

    result = model_eval_lib.run_model_analysis(
        eval_config=eval_config, eval_shared_models=[shared_model])

    # Only a subset of the metrics is checked; the goal is to confirm that
    # the end-to-end pipeline works.
    expected = {
        (): {
            'accuracy': {'doubleValue': 0.75},
            metric_keys.EXAMPLE_COUNT: {'doubleValue': 4.0},
        }
    }
    self.assertMetricsAlmostEqual(result.slicing_metrics, expected)
def testRunModelAnalysisWithUncertainty(self):
    """Checks a run with 20 bootstrap samples and k_anonymization_count=2.

    The single-example 'hindi' slice is expected to carry an __ERROR__ debug
    message instead of metrics, while the 'english' slice is expected to
    report bounded values tagged with the POISSON_BOOTSTRAP methodology.
    """

    def _double(value):
        return {'doubleValue': value}

    def _bounded_one():
        # A bounded value whose estimate and both bounds are exactly 1.0.
        return {
            'boundedValue': {
                'value': 1.0,
                'lowerBound': 1.0,
                'upperBound': 1.0,
                'methodology': 'POISSON_BOOTSTRAP'
            }
        }

    export_dir = self._exportEvalSavedModel(
        linear_classifier.simple_linear_classifier)
    records_path = self._writeTFExamplesToTFRecords([
        self._makeExample(age=3.0, language='english', label=1.0),
        self._makeExample(age=3.0, language='chinese', label=0.0),
        self._makeExample(age=4.0, language='english', label=1.0),
        self._makeExample(age=5.0, language='chinese', label=1.0),
        self._makeExample(age=5.0, language='hindi', label=1.0),
    ])
    by_language = [slicer.SingleSliceSpec(columns=['language'])]
    shared_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=export_dir, example_weight_key='age')

    result = model_eval_lib.run_model_analysis(
        shared_model,
        records_path,
        slice_spec=by_language,
        num_bootstrap_samples=20,
        k_anonymization_count=2)

    # Only a subset of the metrics is checked; the goal is to confirm that
    # the end-to-end pipeline works.
    expected = {
        (('language', b'hindi'),): {
            u'__ERROR__': {
                'debugMessage':
                    u'Example count for this slice key is lower than the '
                    u'minimum required value: 2. No data is aggregated for '
                    u'this slice.'
            },
        },
        (('language', b'chinese'),): {
            metric_keys.EXAMPLE_WEIGHT: _double(8.0),
            metric_keys.EXAMPLE_COUNT: _double(2.0),
        },
        (('language', b'english'),): {
            'accuracy': _bounded_one(),
            'my_mean_label': _bounded_one(),
            metric_keys.EXAMPLE_WEIGHT: _double(7.0),
            metric_keys.EXAMPLE_COUNT: _double(2.0),
        },
    }

    self.assertEqual(result.config.model_location, export_dir)
    self.assertEqual(result.config.data_location, records_path)
    self.assertEqual(result.config.slice_spec, by_language)
    self.assertMetricsAlmostEqual(result.slicing_metrics, expected)
    self.assertFalse(result.plots)
def testRunModelAnalysisWithQueryExtractor(self):
    """Checks a run that adds a query-based evaluator keyed on 'language'.

    Two evaluators are supplied explicitly: the metrics-and-plots evaluator
    and a query-based evaluator combining query statistics with NDCG at 1.
    Their metrics are expected together under the overall slice.
    """

    def _double(value):
        return {'doubleValue': value}

    export_dir = self._exportEvalSavedModel(
        linear_classifier.simple_linear_classifier)
    records_path = self._writeTFExamplesToTFRecords([
        self._makeExample(age=3.0, language='english', label=1.0),
        self._makeExample(age=3.0, language='chinese', label=0.0),
        self._makeExample(age=4.0, language='english', label=0.0),
        self._makeExample(age=5.0, language='chinese', label=1.0),
    ])
    overall_only = [slicer.SingleSliceSpec()]
    shared_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=export_dir, example_weight_key='age')

    standard_evaluator = metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
        shared_model)
    query_evaluator = query_based_metrics_evaluator.QueryBasedMetricsEvaluator(
        query_id='language',
        prediction_key='logistic',
        combine_fns=[
            query_statistics.QueryStatisticsCombineFn(),
            ndcg.NdcgMetricCombineFn(
                at_vals=[1], gain_key='label', weight_key=''),
        ])

    result = model_eval_lib.run_model_analysis(
        eval_shared_model=shared_model,
        data_location=records_path,
        slice_spec=overall_only,
        evaluators=[standard_evaluator, query_evaluator])

    # Only a subset of the metrics is checked; the goal is to confirm that
    # the end-to-end pipeline works.
    expected = {
        (): {
            'post_export_metrics/total_queries': _double(2.0),
            'post_export_metrics/min_documents': _double(2.0),
            'post_export_metrics/max_documents': _double(2.0),
            'post_export_metrics/total_documents': _double(4.0),
            'post_export_metrics/ndcg@1': _double(0.5),
            'post_export_metrics/example_weight': _double(15.0),
            'post_export_metrics/example_count': _double(4.0),
        }
    }

    self.assertEqual(result.config.model_location, export_dir)
    self.assertEqual(result.config.data_location, records_path)
    self.assertEqual(result.config.slice_spec, overall_only)
    self.assertMetricsAlmostEqual(result.slicing_metrics, expected)
    self.assertFalse(result.plots)
def testRunModelAnalysisWithUncertainty(self):
    """Checks an EvalConfig run with confidence intervals and k-anonymity 2.

    The single-example 'hindi' slice is expected to carry an __ERROR__ debug
    message instead of metrics, while the 'english' slice is expected to
    report bounded values tagged with the POISSON_BOOTSTRAP methodology.
    """

    def _double(value):
        return {'doubleValue': value}

    def _bounded_one():
        # A bounded value whose estimate and both bounds are exactly 1.0.
        return {
            'boundedValue': {
                'value': 1.0,
                'lowerBound': 1.0,
                'upperBound': 1.0,
                'methodology': 'POISSON_BOOTSTRAP'
            }
        }

    export_dir = self._exportEvalSavedModel(
        linear_classifier.simple_linear_classifier)
    records_path = self._writeTFExamplesToTFRecords([
        self._makeExample(age=3.0, language='english', label=1.0),
        self._makeExample(age=3.0, language='chinese', label=0.0),
        self._makeExample(age=4.0, language='english', label=1.0),
        self._makeExample(age=5.0, language='chinese', label=1.0),
        self._makeExample(age=5.0, language='hindi', label=1.0),
    ])

    slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
    options = config.Options()
    options.compute_confidence_intervals.value = True
    options.k_anonymization_count.value = 2
    eval_config = config.EvalConfig(
        input_data_specs=[config.InputDataSpec(location=records_path)],
        model_specs=[config.ModelSpec(location=export_dir)],
        output_data_specs=[
            config.OutputDataSpec(default_location=self._getTempDir())
        ],
        slicing_specs=slicing_specs,
        options=options)
    shared_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=export_dir, example_weight_key='age')

    result = model_eval_lib.run_model_analysis(
        eval_config=eval_config, eval_shared_models=[shared_model])

    # Only a subset of the metrics is checked; the goal is to confirm that
    # the end-to-end pipeline works.
    expected = {
        (('language', 'hindi'),): {
            u'__ERROR__': {
                'debugMessage':
                    u'Example count for this slice key is lower than the '
                    u'minimum required value: 2. No data is aggregated for '
                    u'this slice.'
            },
        },
        (('language', 'chinese'),): {
            metric_keys.EXAMPLE_WEIGHT: _double(8.0),
            metric_keys.EXAMPLE_COUNT: _double(2.0),
        },
        (('language', 'english'),): {
            'accuracy': _bounded_one(),
            'my_mean_label': _bounded_one(),
            metric_keys.EXAMPLE_WEIGHT: _double(7.0),
            metric_keys.EXAMPLE_COUNT: _double(2.0),
        },
    }

    # The echoed model location is compared against the decoded export path.
    self.assertEqual(result.config.model_specs[0].location,
                     export_dir.decode())
    self.assertEqual(result.config.input_data_specs[0].location,
                     records_path)
    self.assertEqual(result.config.slicing_specs[0],
                     config.SlicingSpec(feature_keys=['language']))
    self.assertMetricsAlmostEqual(result.slicing_metrics, expected)
    self.assertFalse(result.plots)
def testRunModelAnalysisWithQueryBasedMetrics(self):
    """Checks NDCG, grouped by the 'language' query key, on a Keras model.

    A one-feature Keras model is fitted for a single step, saved in the 'tf'
    format and evaluated with the v2 metrics-and-plots evaluator. Only the
    presence of the expected metric names is asserted, not their values.
    """
    input_layer = tf.keras.layers.Input(shape=(1,), name='age')
    output_layer = tf.keras.layers.Dense(
        1, activation=tf.nn.sigmoid)(input_layer)
    model = tf.keras.models.Model(input_layer, output_layer)
    # `learning_rate` is the supported keyword; the `lr` alias is deprecated
    # and rejected by newer Keras releases.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=.001),
        loss=tf.keras.losses.binary_crossentropy)

    features = {'age': [[20.0]]}
    labels = [[1]]
    example_weights = [1.0]
    dataset = tf.data.Dataset.from_tensor_slices(
        (features, labels, example_weights))
    dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
    model.fit(dataset, steps_per_epoch=1)

    model_location = os.path.join(self._getTempDir(), 'export_dir')
    model.save(model_location, save_format='tf')

    examples = [
        self._makeExample(age=3.0, language='english', label=1.0),
        self._makeExample(age=5.0, language='chinese', label=0.0),
        self._makeExample(age=3.0, language='english', label=0.0),
        self._makeExample(age=5.0, language='chinese', label=1.0),
    ]
    data_location = self._writeTFExamplesToTFRecords(examples)
    slicing_specs = [config.SlicingSpec()]
    eval_config = config.EvalConfig(
        input_data_specs=[config.InputDataSpec(location=data_location)],
        model_specs=[
            config.ModelSpec(location=model_location, label_key='label')
        ],
        output_data_specs=[
            config.OutputDataSpec(default_location=self._getTempDir())
        ],
        slicing_specs=slicing_specs,
        metrics_specs=metric_specs.specs_from_metrics(
            [ndcg.NDCG(gain_key='age', name='ndcg')],
            binarize=config.BinarizationOptions(top_k_list=[1]),
            query_key='language'))
    eval_shared_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=model_location, tags=[tf.saved_model.SERVING])
    eval_result = model_eval_lib.run_model_analysis(
        eval_config=eval_config,
        eval_shared_models=[eval_shared_model],
        evaluators=[
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config,
                eval_shared_models=[eval_shared_model])
        ])

    self.assertEqual(eval_result.config.model_specs[0].location,
                     model_location)
    self.assertEqual(eval_result.config.input_data_specs[0].location,
                     data_location)
    self.assertLen(eval_result.slicing_metrics, 1)
    got_slice_key, got_metrics = eval_result.slicing_metrics[0]
    self.assertEqual(got_slice_key, ())
    self.assertIn('', got_metrics)  # output_name
    got_metrics = got_metrics['']

    # Maps each expected sub-key group to the metric names it must contain.
    expected_metrics = {
        '': ('example_count', 'weighted_example_count'),
        'topK:1': ('ndcg',),
    }
    for group, metric_names in expected_metrics.items():
        self.assertIn(group, got_metrics)
        for metric_name in metric_names:
            self.assertIn(metric_name, got_metrics[group])
def testRunModelAnalysisWithKerasModel(self):
    """Checks per-class AUC on a ten-way softmax Keras model.

    The model is fitted for a single step, saved in the 'tf' format and
    evaluated with an AUC metric binarized on class ids 0, 5 and 9. Only the
    presence of an 'auc' entry per class id is asserted, not its value.
    """
    input_layer = tf.keras.layers.Input(shape=(28 * 28,), name='data')
    output_layer = tf.keras.layers.Dense(
        10, activation=tf.nn.softmax)(input_layer)
    model = tf.keras.models.Model(input_layer, output_layer)
    # `learning_rate` is the supported keyword; the `lr` alias is deprecated
    # and rejected by newer Keras releases.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=.001),
        loss=tf.keras.losses.categorical_crossentropy)

    features = {'data': [[0.0] * 28 * 28]}
    labels = [[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]]
    example_weights = [1.0]
    dataset = tf.data.Dataset.from_tensor_slices(
        (features, labels, example_weights))
    dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
    model.fit(dataset, steps_per_epoch=1)

    model_location = os.path.join(self._getTempDir(), 'export_dir')
    model.save(model_location, save_format='tf')

    examples = [
        self._makeExample(data=[0.0] * 28 * 28, label=1.0),
        self._makeExample(data=[1.0] * 28 * 28, label=5.0),
        self._makeExample(data=[1.0] * 28 * 28, label=9.0),
    ]
    data_location = self._writeTFExamplesToTFRecords(examples)

    # Each Keras metric is serialized into a MetricConfig carrying its class
    # name and JSON-encoded config.
    metrics_spec = config.MetricsSpec()
    for metric in (tf.keras.metrics.AUC(),):
        cfg = tf.keras.utils.serialize_keras_object(metric)
        metrics_spec.metrics.append(
            config.MetricConfig(
                class_name=cfg['class_name'],
                config=json.dumps(cfg['config'])))
    for class_id in (0, 5, 9):
        metrics_spec.binarize.class_ids.append(class_id)

    eval_config = config.EvalConfig(
        input_data_specs=[config.InputDataSpec(location=data_location)],
        model_specs=[
            config.ModelSpec(location=model_location, label_key='label')
        ],
        output_data_specs=[
            config.OutputDataSpec(default_location=self._getTempDir())
        ],
        metrics_specs=[metrics_spec])
    eval_result = model_eval_lib.run_model_analysis(
        eval_config=eval_config,
        eval_shared_models=[
            model_eval_lib.default_eval_shared_model(
                eval_saved_model_path=model_location,
                tags=[tf.saved_model.SERVING])
        ])

    self.assertEqual(eval_result.config.model_specs[0].location,
                     model_location)
    self.assertEqual(eval_result.config.input_data_specs[0].location,
                     data_location)
    self.assertLen(eval_result.slicing_metrics, 1)
    got_slice_key, got_metrics = eval_result.slicing_metrics[0]
    self.assertEqual(got_slice_key, ())
    self.assertIn('', got_metrics)  # output_name
    got_metrics = got_metrics['']

    # Maps each expected class-id sub-key to the metric names it must contain.
    expected_metrics = {
        'classId:0': ('auc',),
        'classId:5': ('auc',),
        'classId:9': ('auc',),
    }
    for class_key, metric_names in expected_metrics.items():
        self.assertIn(class_key, got_metrics)
        for metric_name in metric_names:
            self.assertIn(metric_name, got_metrics[class_key])
def testRunModelAnalysisExtraFieldsPlusFeatureExtraction(self):
    """Checks slicing on 'my_slice' with a custom extractor chain.

    The run is given an explicit list of extractors (predict, feature and
    slice-key) instead of the defaults. Metrics are expected for the 'a',
    'b' and 'c' values of 'my_slice'; the fifth example is created without
    a 'my_slice' value.
    """

    def _double(value):
        return {'doubleValue': value}

    export_dir = self._exportEvalSavedModel(
        linear_classifier.simple_linear_classifier)
    records_path = self._writeTFExamplesToTFRecords([
        self._makeExample(age=3.0, language='english', label=1.0,
                          my_slice='a'),
        self._makeExample(age=3.0, language='chinese', label=0.0,
                          my_slice='a'),
        self._makeExample(age=4.0, language='english', label=1.0,
                          my_slice='b'),
        self._makeExample(age=5.0, language='chinese', label=1.0,
                          my_slice='c'),
        self._makeExample(age=5.0, language='hindi', label=1.0),
    ])

    slicing_specs = [config.SlicingSpec(feature_keys=['my_slice'])]
    eval_config = config.EvalConfig(
        input_data_specs=[config.InputDataSpec(location=records_path)],
        model_specs=[config.ModelSpec(location=export_dir)],
        output_data_specs=[
            config.OutputDataSpec(default_location=self._getTempDir())
        ],
        slicing_specs=slicing_specs)

    # This shared model instance feeds the predict extractor; the run itself
    # receives a second, identically configured instance built further down.
    extractor_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=export_dir, example_weight_key='age')
    single_slice_specs = [slicer.SingleSliceSpec(spec=slicing_specs[0])]
    extractor_chain = [
        predict_extractor.PredictExtractor(
            extractor_model, desired_batch_size=3, materialize=False),
        feature_extractor.FeatureExtractor(
            extract_source=constants.INPUT_KEY,
            extract_dest=constants.FEATURES_PREDICTIONS_LABELS_KEY),
        slice_key_extractor.SliceKeyExtractor(
            single_slice_specs, materialize=False),
    ]
    run_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=export_dir, example_weight_key='age')

    result = model_eval_lib.run_model_analysis(
        eval_config=eval_config,
        eval_shared_models=[run_model],
        extractors=extractor_chain)

    # Only a subset of the metrics is checked; the goal is to confirm that
    # the end-to-end pipeline works.
    expected = {
        (('my_slice', 'a'),): {
            'accuracy': _double(1.0),
            'my_mean_label': _double(0.5),
            metric_keys.EXAMPLE_WEIGHT: _double(6.0),
            metric_keys.EXAMPLE_COUNT: _double(2.0),
        },
        (('my_slice', 'b'),): {
            'accuracy': _double(1.0),
            'my_mean_label': _double(1.0),
            metric_keys.EXAMPLE_WEIGHT: _double(4.0),
            metric_keys.EXAMPLE_COUNT: _double(1.0),
        },
        (('my_slice', 'c'),): {
            'accuracy': _double(0.0),
            'my_mean_label': _double(1.0),
            metric_keys.EXAMPLE_WEIGHT: _double(5.0),
            metric_keys.EXAMPLE_COUNT: _double(1.0),
        },
    }

    # The echoed model location is compared against the decoded export path.
    self.assertEqual(result.config.model_specs[0].location,
                     export_dir.decode())
    self.assertEqual(result.config.input_data_specs[0].location,
                     records_path)
    self.assertEqual(result.config.slicing_specs[0],
                     config.SlicingSpec(feature_keys=['my_slice']))
    self.assertMetricsAlmostEqual(result.slicing_metrics, expected)
    self.assertFalse(result.plots)
def testRunModelAnalysisWithDeterministicConfidenceIntervals(self):
    """Checks confidence intervals when the run is given a fixed test seed.

    Besides the usual per-slice comparison, the 'english' slice's
    average_loss and auc_precision_recall point estimates are compared
    against reference values with a tolerance of 0.1.
    """

    def _double(value):
        return {'doubleValue': value}

    def _bounded_one():
        # A bounded value whose estimate and both bounds are exactly 1.0.
        return {
            'boundedValue': {
                'value': 1.0,
                'lowerBound': 1.0,
                'upperBound': 1.0,
                'methodology': 'POISSON_BOOTSTRAP'
            }
        }

    export_dir = self._exportEvalSavedModel(
        linear_classifier.simple_linear_classifier)
    records_path = self._writeTFExamplesToTFRecords([
        self._makeExample(age=3.0, language='english', label=1.0),
        self._makeExample(age=3.0, language='chinese', label=0.0),
        self._makeExample(age=4.0, language='english', label=1.0),
        self._makeExample(age=5.0, language='chinese', label=1.0),
        self._makeExample(age=5.0, language='hindi', label=1.0),
    ])

    slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
    options = config.Options()
    options.compute_confidence_intervals.value = True
    options.k_anonymization_count.value = 2
    eval_config = config.EvalConfig(
        slicing_specs=slicing_specs, options=options)
    shared_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=export_dir, example_weight_key='age')

    result = model_eval_lib.run_model_analysis(
        eval_config=eval_config,
        eval_shared_model=shared_model,
        data_location=records_path,
        output_path=self._getTempDir(),
        random_seed_for_testing=_TEST_SEED)

    # Only a subset of the metrics is checked; the goal is to confirm that
    # the end-to-end pipeline works.
    english_key = (('language', 'english'),)
    expected = {
        (('language', 'hindi'),): {
            u'__ERROR__': {
                'debugMessage':
                    u'Example count for this slice key is lower than the '
                    u'minimum required value: 2. No data is aggregated for '
                    u'this slice.'
            },
        },
        (('language', 'chinese'),): {
            metric_keys.EXAMPLE_WEIGHT: _double(8.0),
            metric_keys.EXAMPLE_COUNT: _double(2.0),
        },
        english_key: {
            'accuracy': _bounded_one(),
            'my_mean_label': _bounded_one(),
            metric_keys.EXAMPLE_WEIGHT: _double(7.0),
            metric_keys.EXAMPLE_COUNT: _double(2.0),
        },
    }

    self.assertEqual(result.model_location, export_dir.decode())
    self.assertEqual(result.data_location, records_path)
    self.assertEqual(result.config.slicing_specs[0],
                     config.SlicingSpec(feature_keys=['language']))
    self.assertMetricsAlmostEqual(result.slicing_metrics, expected)

    # Reference point estimates for the 'english' slice, checked loosely.
    for key, value in result.slicing_metrics:
        if english_key != key:
            continue
        per_metric = value['']['']
        average_loss = per_metric['average_loss']
        self.assertAlmostEqual(
            0.171768754720, average_loss['boundedValue']['value'], delta=0.1)
        auc_pr = per_metric['auc_precision_recall']
        self.assertAlmostEqual(
            0.99999940395, auc_pr['boundedValue']['value'], delta=0.1)

    self.assertFalse(result.plots)
def testRunModelAnalysisWithModelAgnosticPredictions(self):
    """Checks a run that reads predictions straight from the examples.

    No shared model is passed to run_model_analysis; the ModelSpec only names
    the prediction, label and example-weight keys. Example counts, weighted
    example counts and binary accuracy are compared per language slice.
    """

    def _double(value):
        return {'doubleValue': value}

    records_path = self._writeTFExamplesToTFRecords([
        self._makeExample(age=3.0, language='english', label=1.0,
                          prediction=0.9),
        self._makeExample(age=3.0, language='chinese', label=0.0,
                          prediction=0.4),
        self._makeExample(age=4.0, language='english', label=1.0,
                          prediction=0.7),
        self._makeExample(age=5.0, language='chinese', label=1.0,
                          prediction=0.2),
    ])

    model_specs = [
        config.ModelSpec(
            prediction_key='prediction',
            label_key='label',
            example_weight_key='age')
    ]
    metric_class_names = (
        'ExampleCount', 'WeightedExampleCount', 'BinaryAccuracy')
    metrics = [
        config.MetricConfig(class_name=name) for name in metric_class_names
    ]
    slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
    eval_config = config.EvalConfig(
        model_specs=model_specs,
        metrics_specs=[config.MetricsSpec(metrics=metrics)],
        slicing_specs=slicing_specs)

    result = model_eval_lib.run_model_analysis(
        eval_config=eval_config,
        data_location=records_path,
        output_path=self._getTempDir())

    expected = {
        (('language', 'chinese'),): {
            'binary_accuracy': _double(0.375),
            'weighted_example_count': _double(8.0),
            'example_count': _double(2.0),
        },
        (('language', 'english'),): {
            'binary_accuracy': _double(1.0),
            'weighted_example_count': _double(7.0),
            'example_count': _double(2.0),
        },
    }

    self.assertEqual(result.data_location, records_path)
    self.assertEqual(result.config.slicing_specs[0],
                     config.SlicingSpec(feature_keys=['language']))
    self.assertMetricsAlmostEqual(result.slicing_metrics, expected)