Code Example #1
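All of the snippets in this section are drawn from TensorFlow Model Analysis's model_eval_lib_test.py (several API generations are represented) and omit the surrounding test-class boilerplate. As a rough sketch, they presume an import block along the following lines; the module paths are assumptions based on the TFMA source tree around v0.15 and move between releases, so verify them against the version you target:

# Presumed imports for the snippets in this section. These paths are
# assumptions and shift across tensorflow_model_analysis releases.
import json
import os

import tensorflow as tf
from tensorflow_model_analysis import config
from tensorflow_model_analysis import constants
from tensorflow_model_analysis import types
from tensorflow_model_analysis.api import model_eval_lib
from tensorflow_model_analysis.eval_saved_model.example_trainers import linear_classifier
from tensorflow_model_analysis.post_export_metrics import metric_keys
from tensorflow_model_analysis.post_export_metrics import post_export_metrics
# Older releases expose this module as slicer; newer ones as slicer_lib.
from tensorflow_model_analysis.slicer import slicer_lib as slicer

This first example is an end-to-end run of run_model_analysis on a simple linear classifier: the examples are written to TFRecords, results are sliced by the 'language' feature, 'age' serves as the example weight, and a few per-slice metrics are spot-checked.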
 def testRunModelAnalysis(self):
   model_location = self._exportEvalSavedModel(
       linear_classifier.simple_linear_classifier)
   examples = [
       self._makeExample(age=3.0, language='english', label=1.0),
       self._makeExample(age=3.0, language='chinese', label=0.0),
       self._makeExample(age=4.0, language='english', label=1.0),
       self._makeExample(age=5.0, language='chinese', label=1.0)
   ]
   data_location = self._writeTFExamplesToTFRecords(examples)
   slice_spec = [slicer.SingleSliceSpec(columns=['language'])]
   eval_result = model_eval_lib.run_model_analysis(
       model_eval_lib.default_eval_shared_model(
           eval_saved_model_path=model_location, example_weight_key='age'),
       data_location,
       slice_spec=slice_spec)
   # We only check some of the metrics to ensure that the end-to-end
   # pipeline works.
   expected = {
       ((b'language', b'chinese'),): {
           'accuracy': {
               'doubleValue': 0.5
           },
           'my_mean_label': {
               'doubleValue': 0.5
           },
           metric_keys.EXAMPLE_WEIGHT: {
               'doubleValue': 8.0
           },
           metric_keys.EXAMPLE_COUNT: {
               'doubleValue': 2.0
           },
       },
       ((b'language', b'english'),): {
           'accuracy': {
               'doubleValue': 1.0
           },
           'my_mean_label': {
               'doubleValue': 1.0
           },
           metric_keys.EXAMPLE_WEIGHT: {
               'doubleValue': 7.0
           },
           metric_keys.EXAMPLE_COUNT: {
               'doubleValue': 2.0
           },
       }
   }
   self.assertEqual(eval_result.config.model_location, model_location)
   self.assertEqual(eval_result.config.data_location, data_location)
   self.assertEqual(eval_result.config.slice_spec, slice_spec)
   self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
   self.assertFalse(eval_result.plots)
Code Example #2
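The same entry point driven by an EvalConfig (input, model, and output specs) with two auc_plots callbacks, one tagged 'test'; the test then checks that both confusion-matrix-at-thresholds plots are emitted for the overall (empty) slice.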
    def testRunModelAnalysisWithMultiplePlots(self):
        model_location = self._exportEvalSavedModel(
            fixed_prediction_estimator.simple_fixed_prediction_estimator)
        examples = [
            self._makeExample(prediction=0.0, label=1.0),
            self._makeExample(prediction=0.7, label=0.0),
            self._makeExample(prediction=0.8, label=1.0),
            self._makeExample(prediction=1.0, label=1.0),
            self._makeExample(prediction=1.0, label=1.0)
        ]
        data_location = self._writeTFExamplesToTFRecords(examples)
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec(location=data_location)],
            model_specs=[config.ModelSpec(location=model_location)],
            output_data_specs=[
                config.OutputDataSpec(default_location=self._getTempDir())
            ])
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=model_location,
            add_metrics_callbacks=[
                post_export_metrics.auc_plots(),
                post_export_metrics.auc_plots(metric_tag='test')
            ])
        eval_result = model_eval_lib.run_model_analysis(
            eval_config=eval_config, eval_shared_models=[eval_shared_model])

        # We only check some of the metrics to ensure that the end-to-end
        # pipeline works.
        expected_metrics = {
            (): {
                metric_keys.EXAMPLE_COUNT: {
                    'doubleValue': 5.0
                },
            }
        }
        expected_matrix = {
            'threshold': 0.8,
            'falseNegatives': 2.0,
            'trueNegatives': 1.0,
            'truePositives': 2.0,
            'precision': 1.0,
            'recall': 0.5
        }
        self.assertMetricsAlmostEqual(eval_result.slicing_metrics,
                                      expected_metrics)
        self.assertEqual(len(eval_result.plots), 1)
        slice_key, plots = eval_result.plots[0]
        self.assertEqual((), slice_key)
        self.assertDictElementsAlmostEqual(
            plots['']['']['post_export_metrics']['confusionMatrixAtThresholds']
            ['matrices'][8001], expected_matrix)
        self.assertDictElementsAlmostEqual(
            plots['']['']['post_export_metrics/test']
            ['confusionMatrixAtThresholds']['matrices'][8001], expected_matrix)
Code Example #3
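Shows that an EvalSharedModel constructed directly, without a construct_fn, fails with an AttributeError once Beam tries to call it, while default_eval_shared_model, which supplies a construct_fn, runs cleanly.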
    def testNoConstructFn(self):
        model_location = self._exportEvalSavedModel(
            linear_classifier.simple_linear_classifier)
        examples = [self._makeExample(age=3.0, language='english', label=1.0)]
        data_location = self._writeTFExamplesToTFRecords(examples)
        eval_config = config.EvalConfig()
        # No construct_fn should fail when Beam attempts to call the construct_fn.
        eval_shared_model = types.EvalSharedModel(model_path=model_location)
        with self.assertRaisesRegexp(AttributeError,
                                     '\'NoneType\' object has no attribute'):
            model_eval_lib.run_model_analysis(
                eval_config=eval_config,
                eval_shared_model=eval_shared_model,
                data_location=data_location,
                output_path=self._getTempDir())

        # Using the default_eval_shared_model should pass as it has a construct_fn.
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=model_location)
        model_eval_lib.run_model_analysis(eval_config=eval_config,
                                          eval_shared_model=eval_shared_model,
                                          data_location=data_location,
                                          output_path=self._getTempDir())
Code Example #4
File: model_eval_lib_test.py  Project: hakanhp/cahnel
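Evaluates a CSV-based linear classifier on plain-text input by passing file_format='text', then spot-checks the overall accuracy and example count.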
 def testRunModelAnalysisForCSVText(self):
     model_location = self._exportEvalSavedModel(
         csv_linear_classifier.simple_csv_linear_classifier)
     examples = [
         '3.0,english,1.0', '3.0,chinese,0.0', '4.0,english,1.0',
         '5.0,chinese,1.0'
     ]
     data_location = self._writeCSVToTextFile(examples)
     eval_result = model_eval_lib.run_model_analysis(model_location,
                                                     data_location,
                                                     file_format='text')
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     expected = {(): {'accuracy': 0.75, metric_keys.EXAMPLE_COUNT: 4.0}}
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
Code Example #5
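The same multiple-plots test as example #2, but against what appears to be an older API: run_model_analysis is called positionally without an EvalConfig, and the plots are keyed directly by metric name rather than nested under output and sub-key names.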
 def testRunModelAnalysisWithMultiplePlots(self):
     model_location = self._exportEvalSavedModel(
         fixed_prediction_estimator.simple_fixed_prediction_estimator)
     examples = [
         self._makeExample(prediction=0.0, label=1.0),
         self._makeExample(prediction=0.7, label=0.0),
         self._makeExample(prediction=0.8, label=1.0),
         self._makeExample(prediction=1.0, label=1.0),
         self._makeExample(prediction=1.0, label=1.0)
     ]
     eval_shared_model = model_eval_lib.default_eval_shared_model(
         eval_saved_model_path=model_location,
         add_metrics_callbacks=[
             post_export_metrics.auc_plots(),
             post_export_metrics.auc_plots(metric_tag='test')
         ])
     data_location = self._writeTFExamplesToTFRecords(examples)
     eval_result = model_eval_lib.run_model_analysis(
         eval_shared_model, data_location)
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     expected_metrics = {
         (): {
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 5.0
             },
         }
     }
     expected_matrix = {
         'threshold': 0.8,
         'falseNegatives': 2.0,
         'trueNegatives': 1.0,
         'truePositives': 2.0,
         'precision': 1.0,
         'recall': 0.5
     }
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics,
                                   expected_metrics)
     self.assertEqual(len(eval_result.plots), 1)
     slice_key, plots = eval_result.plots[0]
     self.assertEqual((), slice_key)
     tf.logging.info(plots.keys())
     self.assertDictElementsAlmostEqual(
         plots['post_export_metrics']['confusionMatrixAtThresholds']
         ['matrices'][8001], expected_matrix)
     self.assertDictElementsAlmostEqual(
         plots['post_export_metrics/test']['confusionMatrixAtThresholds']
         ['matrices'][8001], expected_matrix)
Code Example #6
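An EvalConfig-driven variant of example #4, with the text file format declared on the InputDataSpec instead of passed as an argument.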
 def testRunModelAnalysisForCSVText(self):
     model_location = self._exportEvalSavedModel(
         csv_linear_classifier.simple_csv_linear_classifier)
     examples = [
         '3.0,english,1.0', '3.0,chinese,0.0', '4.0,english,1.0',
         '5.0,chinese,1.0'
     ]
     data_location = self._writeCSVToTextFile(examples)
     eval_config = config.EvalConfig(
         input_data_specs=[
             config.InputDataSpec(location=data_location,
                                  file_format='text')
         ],
         model_specs=[config.ModelSpec(location=model_location)],
         output_data_specs=[
             config.OutputDataSpec(default_location=self._getTempDir())
         ])
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config,
         eval_shared_models=[
             model_eval_lib.default_eval_shared_model(
                 eval_saved_model_path=model_location)
         ])
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     expected = {
         (): {
             'accuracy': {
                 'doubleValue': 0.75
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 4.0
             }
         }
     }
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
Code Example #7
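Enables bootstrap confidence intervals (num_bootstrap_samples=20) and k-anonymization (k_anonymization_count=2): the single-example 'hindi' slice is suppressed with an __ERROR__ entry, and the 'english' metrics come back as boundedValue ranges computed via Poisson bootstrap.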
 def testRunModelAnalysisWithUncertainty(self):
   model_location = self._exportEvalSavedModel(
       linear_classifier.simple_linear_classifier)
   examples = [
       self._makeExample(age=3.0, language='english', label=1.0),
       self._makeExample(age=3.0, language='chinese', label=0.0),
       self._makeExample(age=4.0, language='english', label=1.0),
       self._makeExample(age=5.0, language='chinese', label=1.0),
       self._makeExample(age=5.0, language='hindi', label=1.0)
   ]
   data_location = self._writeTFExamplesToTFRecords(examples)
   slice_spec = [slicer.SingleSliceSpec(columns=['language'])]
   eval_result = model_eval_lib.run_model_analysis(
       model_eval_lib.default_eval_shared_model(
           eval_saved_model_path=model_location, example_weight_key='age'),
       data_location,
       slice_spec=slice_spec,
       num_bootstrap_samples=20,
       k_anonymization_count=2)
   # We only check some of the metrics to ensure that the end-to-end
   # pipeline works.
   expected = {
       (('language', b'hindi'),): {
           u'__ERROR__': {
               'debugMessage':
                   u'Example count for this slice key is lower than the '
                   u'minimum required value: 2. No data is aggregated for '
                   u'this slice.'
           },
       },
       (('language', b'chinese'),): {
           metric_keys.EXAMPLE_WEIGHT: {
               'doubleValue': 8.0
           },
           metric_keys.EXAMPLE_COUNT: {
               'doubleValue': 2.0
           },
       },
       (('language', b'english'),): {
           'accuracy': {
               'boundedValue': {
                   'value': 1.0,
                   'lowerBound': 1.0,
                   'upperBound': 1.0,
                   'methodology': 'POISSON_BOOTSTRAP'
               }
           },
           'my_mean_label': {
               'boundedValue': {
                   'value': 1.0,
                   'lowerBound': 1.0,
                   'upperBound': 1.0,
                   'methodology': 'POISSON_BOOTSTRAP'
               }
           },
           metric_keys.EXAMPLE_WEIGHT: {
               'doubleValue': 7.0
           },
           metric_keys.EXAMPLE_COUNT: {
               'doubleValue': 2.0
           },
       }
   }
   self.assertEqual(eval_result.config.model_location, model_location)
   self.assertEqual(eval_result.config.data_location, data_location)
   self.assertEqual(eval_result.config.slice_spec, slice_spec)
   self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
   self.assertFalse(eval_result.plots)
Code Example #8
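Runs a QueryBasedMetricsEvaluator keyed on 'language' alongside the standard metrics evaluator, producing query statistics and NDCG@1 for the overall slice.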
 def testRunModelAnalysisWithQueryExtractor(self):
   model_location = self._exportEvalSavedModel(
       linear_classifier.simple_linear_classifier)
   examples = [
       self._makeExample(age=3.0, language='english', label=1.0),
       self._makeExample(age=3.0, language='chinese', label=0.0),
       self._makeExample(age=4.0, language='english', label=0.0),
       self._makeExample(age=5.0, language='chinese', label=1.0)
   ]
   data_location = self._writeTFExamplesToTFRecords(examples)
   slice_spec = [slicer.SingleSliceSpec()]
   eval_shared_model = model_eval_lib.default_eval_shared_model(
       eval_saved_model_path=model_location, example_weight_key='age')
   eval_result = model_eval_lib.run_model_analysis(
       eval_shared_model=eval_shared_model,
       data_location=data_location,
       slice_spec=slice_spec,
       evaluators=[
           metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
               eval_shared_model),
           query_based_metrics_evaluator.QueryBasedMetricsEvaluator(
               query_id='language',
               prediction_key='logistic',
               combine_fns=[
                   query_statistics.QueryStatisticsCombineFn(),
                   ndcg.NdcgMetricCombineFn(
                       at_vals=[1], gain_key='label', weight_key='')
               ]),
       ])
   # We only check some of the metrics to ensure that the end-to-end
   # pipeline works.
   expected = {
       (): {
           'post_export_metrics/total_queries': {
               'doubleValue': 2.0
           },
           'post_export_metrics/min_documents': {
               'doubleValue': 2.0
           },
           'post_export_metrics/max_documents': {
               'doubleValue': 2.0
           },
           'post_export_metrics/total_documents': {
               'doubleValue': 4.0
           },
           'post_export_metrics/ndcg@1': {
               'doubleValue': 0.5
           },
           'post_export_metrics/example_weight': {
               'doubleValue': 15.0
           },
           'post_export_metrics/example_count': {
               'doubleValue': 4.0
           },
       }
   }
   self.assertEqual(eval_result.config.model_location, model_location)
   self.assertEqual(eval_result.config.data_location, data_location)
   self.assertEqual(eval_result.config.slice_spec, slice_spec)
   self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
   self.assertFalse(eval_result.plots)
Code Example #9
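The EvalConfig counterpart of example #7: confidence intervals and k-anonymization are requested through config.Options rather than keyword arguments.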
 def testRunModelAnalysisWithUncertainty(self):
     model_location = self._exportEvalSavedModel(
         linear_classifier.simple_linear_classifier)
     examples = [
         self._makeExample(age=3.0, language='english', label=1.0),
         self._makeExample(age=3.0, language='chinese', label=0.0),
         self._makeExample(age=4.0, language='english', label=1.0),
         self._makeExample(age=5.0, language='chinese', label=1.0),
         self._makeExample(age=5.0, language='hindi', label=1.0)
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)
     slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
     options = config.Options()
     options.compute_confidence_intervals.value = True
     options.k_anonymization_count.value = 2
     eval_config = config.EvalConfig(
         input_data_specs=[config.InputDataSpec(location=data_location)],
         model_specs=[config.ModelSpec(location=model_location)],
         output_data_specs=[
             config.OutputDataSpec(default_location=self._getTempDir())
         ],
         slicing_specs=slicing_specs,
         options=options)
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config,
         eval_shared_models=[
             model_eval_lib.default_eval_shared_model(
                 eval_saved_model_path=model_location,
                 example_weight_key='age')
         ])
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     expected = {
         (('language', 'hindi'), ): {
             u'__ERROR__': {
                 'debugMessage':
                 u'Example count for this slice key is lower than the '
                 u'minimum required value: 2. No data is aggregated for '
                 u'this slice.'
             },
         },
         (('language', 'chinese'), ): {
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 8.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 2.0
             },
         },
         (('language', 'english'), ): {
             'accuracy': {
                 'boundedValue': {
                     'value': 1.0,
                     'lowerBound': 1.0,
                     'upperBound': 1.0,
                     'methodology': 'POISSON_BOOTSTRAP'
                 }
             },
             'my_mean_label': {
                 'boundedValue': {
                     'value': 1.0,
                     'lowerBound': 1.0,
                     'upperBound': 1.0,
                     'methodology': 'POISSON_BOOTSTRAP'
                 }
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 7.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 2.0
             },
         }
     }
     self.assertEqual(eval_result.config.model_specs[0].location,
                      model_location.decode())
     self.assertEqual(eval_result.config.input_data_specs[0].location,
                      data_location)
     self.assertEqual(eval_result.config.slicing_specs[0],
                      config.SlicingSpec(feature_keys=['language']))
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
     self.assertFalse(eval_result.plots)
Code Example #10
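Trains and saves a small Keras model, then evaluates it with a query-based NDCG metric (query_key='language', binarized at top-k 1) through metrics_and_plots_evaluator_v2.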
    def testRunModelAnalysisWithQueryBasedMetrics(self):
        input_layer = tf.keras.layers.Input(shape=(1, ), name='age')
        output_layer = tf.keras.layers.Dense(
            1, activation=tf.nn.sigmoid)(input_layer)
        model = tf.keras.models.Model(input_layer, output_layer)
        model.compile(optimizer=tf.keras.optimizers.Adam(lr=.001),
                      loss=tf.keras.losses.binary_crossentropy)

        features = {'age': [[20.0]]}
        labels = [[1]]
        example_weights = [1.0]
        dataset = tf.data.Dataset.from_tensor_slices(
            (features, labels, example_weights))
        dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
        model.fit(dataset, steps_per_epoch=1)

        model_location = os.path.join(self._getTempDir(), 'export_dir')
        model.save(model_location, save_format='tf')

        examples = [
            self._makeExample(age=3.0, language='english', label=1.0),
            self._makeExample(age=5.0, language='chinese', label=0.0),
            self._makeExample(age=3.0, language='english', label=0.0),
            self._makeExample(age=5.0, language='chinese', label=1.0)
        ]
        data_location = self._writeTFExamplesToTFRecords(examples)
        slicing_specs = [config.SlicingSpec()]
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec(location=data_location)],
            model_specs=[
                config.ModelSpec(location=model_location, label_key='label')
            ],
            output_data_specs=[
                config.OutputDataSpec(default_location=self._getTempDir())
            ],
            slicing_specs=slicing_specs,
            metrics_specs=metric_specs.specs_from_metrics(
                [ndcg.NDCG(gain_key='age', name='ndcg')],
                binarize=config.BinarizationOptions(top_k_list=[1]),
                query_key='language'))
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=model_location,
            tags=[tf.saved_model.SERVING])
        eval_result = model_eval_lib.run_model_analysis(
            eval_config=eval_config,
            eval_shared_models=[eval_shared_model],
            evaluators=[
                metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                    eval_config=eval_config,
                    eval_shared_models=[eval_shared_model])
            ])

        self.assertEqual(eval_result.config.model_specs[0].location,
                         model_location)
        self.assertEqual(eval_result.config.input_data_specs[0].location,
                         data_location)
        self.assertLen(eval_result.slicing_metrics, 1)
        got_slice_key, got_metrics = eval_result.slicing_metrics[0]
        self.assertEqual(got_slice_key, ())
        self.assertIn('', got_metrics)  # output_name
        got_metrics = got_metrics['']
        expected_metrics = {
            '': {
                'example_count': True,
                'weighted_example_count': True,
            },
            'topK:1': {
                'ndcg': True,
            },
        }
        for group in expected_metrics:
            self.assertIn(group, got_metrics)
            for k in expected_metrics[group]:
                self.assertIn(k, got_metrics[group])
Code Example #11
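Saves a Keras softmax classifier and evaluates it with a serialized tf.keras.metrics.AUC metric binarized over class IDs 0, 5, and 9.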
    def testRunModelAnalysisWithKerasModel(self):
        input_layer = tf.keras.layers.Input(shape=(28 * 28, ), name='data')
        output_layer = tf.keras.layers.Dense(
            10, activation=tf.nn.softmax)(input_layer)
        model = tf.keras.models.Model(input_layer, output_layer)
        model.compile(optimizer=tf.keras.optimizers.Adam(lr=.001),
                      loss=tf.keras.losses.categorical_crossentropy)

        features = {'data': [[0.0] * 28 * 28]}
        labels = [[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]]
        example_weights = [1.0]
        dataset = tf.data.Dataset.from_tensor_slices(
            (features, labels, example_weights))
        dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
        model.fit(dataset, steps_per_epoch=1)

        model_location = os.path.join(self._getTempDir(), 'export_dir')
        model.save(model_location, save_format='tf')

        examples = [
            self._makeExample(data=[0.0] * 28 * 28, label=1.0),
            self._makeExample(data=[1.0] * 28 * 28, label=5.0),
            self._makeExample(data=[1.0] * 28 * 28, label=9.0),
        ]
        data_location = self._writeTFExamplesToTFRecords(examples)
        metrics_spec = config.MetricsSpec()
        for metric in (tf.keras.metrics.AUC(), ):
            cfg = tf.keras.utils.serialize_keras_object(metric)
            metrics_spec.metrics.append(
                config.MetricConfig(class_name=cfg['class_name'],
                                    config=json.dumps(cfg['config'])))
        for class_id in (0, 5, 9):
            metrics_spec.binarize.class_ids.append(class_id)
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec(location=data_location)],
            model_specs=[
                config.ModelSpec(location=model_location, label_key='label')
            ],
            output_data_specs=[
                config.OutputDataSpec(default_location=self._getTempDir())
            ],
            metrics_specs=[metrics_spec])
        eval_result = model_eval_lib.run_model_analysis(
            eval_config=eval_config,
            eval_shared_models=[
                model_eval_lib.default_eval_shared_model(
                    eval_saved_model_path=model_location,
                    tags=[tf.saved_model.SERVING])
            ])
        self.assertEqual(eval_result.config.model_specs[0].location,
                         model_location)
        self.assertEqual(eval_result.config.input_data_specs[0].location,
                         data_location)
        self.assertLen(eval_result.slicing_metrics, 1)
        got_slice_key, got_metrics = eval_result.slicing_metrics[0]
        self.assertEqual(got_slice_key, ())
        self.assertIn('', got_metrics)  # output_name
        got_metrics = got_metrics['']
        expected_metrics = {
            'classId:0': {
                'auc': True,
            },
            'classId:5': {
                'auc': True,
            },
            'classId:9': {
                'auc': True,
            },
        }
        for class_id in expected_metrics:
            self.assertIn(class_id, got_metrics)
            for k in expected_metrics[class_id]:
                self.assertIn(k, got_metrics[class_id])
Code Example #12
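Supplies a custom extractor chain (PredictExtractor, FeatureExtractor, SliceKeyExtractor) so results can be sliced on my_slice, a field the model itself never consumes.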
 def testRunModelAnalysisExtraFieldsPlusFeatureExtraction(self):
     model_location = self._exportEvalSavedModel(
         linear_classifier.simple_linear_classifier)
     examples = [
         self._makeExample(age=3.0,
                           language='english',
                           label=1.0,
                           my_slice='a'),
         self._makeExample(age=3.0,
                           language='chinese',
                           label=0.0,
                           my_slice='a'),
         self._makeExample(age=4.0,
                           language='english',
                           label=1.0,
                           my_slice='b'),
         self._makeExample(age=5.0,
                           language='chinese',
                           label=1.0,
                           my_slice='c'),
         self._makeExample(age=5.0, language='hindi', label=1.0)
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)
     slicing_specs = [config.SlicingSpec(feature_keys=['my_slice'])]
     eval_config = config.EvalConfig(
         input_data_specs=[config.InputDataSpec(location=data_location)],
         model_specs=[config.ModelSpec(location=model_location)],
         output_data_specs=[
             config.OutputDataSpec(default_location=self._getTempDir())
         ],
         slicing_specs=slicing_specs)
     eval_shared_model = model_eval_lib.default_eval_shared_model(
         eval_saved_model_path=model_location, example_weight_key='age')
     slice_spec = [slicer.SingleSliceSpec(spec=slicing_specs[0])]
     extractors_with_feature_extraction = [
         predict_extractor.PredictExtractor(eval_shared_model,
                                            desired_batch_size=3,
                                            materialize=False),
         feature_extractor.FeatureExtractor(
             extract_source=constants.INPUT_KEY,
             extract_dest=constants.FEATURES_PREDICTIONS_LABELS_KEY),
         slice_key_extractor.SliceKeyExtractor(slice_spec,
                                               materialize=False)
     ]
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config,
         eval_shared_models=[
             model_eval_lib.default_eval_shared_model(
                 eval_saved_model_path=model_location,
                 example_weight_key='age')
         ],
         extractors=extractors_with_feature_extraction)
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     expected = {
         (('my_slice', 'a'), ): {
             'accuracy': {
                 'doubleValue': 1.0
             },
             'my_mean_label': {
                 'doubleValue': 0.5
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 6.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 2.0
             },
         },
         (('my_slice', 'b'), ): {
             'accuracy': {
                 'doubleValue': 1.0
             },
             'my_mean_label': {
                 'doubleValue': 1.0
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 4.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 1.0
             },
         },
         (('my_slice', 'c'), ): {
             'accuracy': {
                 'doubleValue': 0.0
             },
             'my_mean_label': {
                 'doubleValue': 1.0
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 5.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 1.0
             },
         },
     }
     self.assertEqual(eval_result.config.model_specs[0].location,
                      model_location.decode())
     self.assertEqual(eval_result.config.input_data_specs[0].location,
                      data_location)
     self.assertEqual(eval_result.config.slicing_specs[0],
                      config.SlicingSpec(feature_keys=['my_slice']))
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
     self.assertFalse(eval_result.plots)
Code Example #13
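Like example #9, but seeds the bootstrap with random_seed_for_testing=_TEST_SEED so the bounded average_loss and auc_precision_recall values are reproducible to within the asserted deltas.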
    def testRunModelAnalysisWithDeterministicConfidenceIntervals(self):
        model_location = self._exportEvalSavedModel(
            linear_classifier.simple_linear_classifier)
        examples = [
            self._makeExample(age=3.0, language='english', label=1.0),
            self._makeExample(age=3.0, language='chinese', label=0.0),
            self._makeExample(age=4.0, language='english', label=1.0),
            self._makeExample(age=5.0, language='chinese', label=1.0),
            self._makeExample(age=5.0, language='hindi', label=1.0)
        ]
        data_location = self._writeTFExamplesToTFRecords(examples)
        slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
        options = config.Options()
        options.compute_confidence_intervals.value = True
        options.k_anonymization_count.value = 2
        eval_config = config.EvalConfig(slicing_specs=slicing_specs,
                                        options=options)
        eval_result = model_eval_lib.run_model_analysis(
            eval_config=eval_config,
            eval_shared_model=model_eval_lib.default_eval_shared_model(
                eval_saved_model_path=model_location,
                example_weight_key='age'),
            data_location=data_location,
            output_path=self._getTempDir(),
            random_seed_for_testing=_TEST_SEED)
        # We only check some of the metrics to ensure that the end-to-end
        # pipeline works.
        expected = {
            (('language', 'hindi'), ): {
                u'__ERROR__': {
                    'debugMessage':
                    u'Example count for this slice key is lower than the '
                    u'minimum required value: 2. No data is aggregated for '
                    u'this slice.'
                },
            },
            (('language', 'chinese'), ): {
                metric_keys.EXAMPLE_WEIGHT: {
                    'doubleValue': 8.0
                },
                metric_keys.EXAMPLE_COUNT: {
                    'doubleValue': 2.0
                },
            },
            (('language', 'english'), ): {
                'accuracy': {
                    'boundedValue': {
                        'value': 1.0,
                        'lowerBound': 1.0,
                        'upperBound': 1.0,
                        'methodology': 'POISSON_BOOTSTRAP'
                    }
                },
                'my_mean_label': {
                    'boundedValue': {
                        'value': 1.0,
                        'lowerBound': 1.0,
                        'upperBound': 1.0,
                        'methodology': 'POISSON_BOOTSTRAP'
                    }
                },
                metric_keys.EXAMPLE_WEIGHT: {
                    'doubleValue': 7.0
                },
                metric_keys.EXAMPLE_COUNT: {
                    'doubleValue': 2.0
                },
            }
        }
        self.assertEqual(eval_result.model_location, model_location.decode())
        self.assertEqual(eval_result.data_location, data_location)
        self.assertEqual(eval_result.config.slicing_specs[0],
                         config.SlicingSpec(feature_keys=['language']))
        self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)

        for key, value in eval_result.slicing_metrics:
            if (('language', 'english'), ) == key:
                metric = value['']['']['average_loss']
                self.assertAlmostEqual(0.171768754720,
                                       metric['boundedValue']['value'],
                                       delta=0.1)

                metric = value['']['']['auc_precision_recall']
                self.assertAlmostEqual(0.99999940395,
                                       metric['boundedValue']['value'],
                                       delta=0.1)

        self.assertFalse(eval_result.plots)
Code Example #14
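Model-agnostic evaluation: no model is exported or loaded; predictions already stored on the examples are read via the ModelSpec's prediction_key, and ExampleCount, WeightedExampleCount, and BinaryAccuracy are computed per language slice.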
 def testRunModelAnalysisWithModelAgnosticPredictions(self):
     examples = [
         self._makeExample(age=3.0,
                           language='english',
                           label=1.0,
                           prediction=0.9),
         self._makeExample(age=3.0,
                           language='chinese',
                           label=0.0,
                           prediction=0.4),
         self._makeExample(age=4.0,
                           language='english',
                           label=1.0,
                           prediction=0.7),
         self._makeExample(age=5.0,
                           language='chinese',
                           label=1.0,
                           prediction=0.2)
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)
     model_specs = [
         config.ModelSpec(prediction_key='prediction',
                          label_key='label',
                          example_weight_key='age')
     ]
     metrics = [
         config.MetricConfig(class_name='ExampleCount'),
         config.MetricConfig(class_name='WeightedExampleCount'),
         config.MetricConfig(class_name='BinaryAccuracy')
     ]
     slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
     eval_config = config.EvalConfig(
         model_specs=model_specs,
         metrics_specs=[config.MetricsSpec(metrics=metrics)],
         slicing_specs=slicing_specs)
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config,
         data_location=data_location,
         output_path=self._getTempDir())
     expected = {
         (('language', 'chinese'), ): {
             'binary_accuracy': {
                 'doubleValue': 0.375
             },
             'weighted_example_count': {
                 'doubleValue': 8.0
             },
             'example_count': {
                 'doubleValue': 2.0
             },
         },
         (('language', 'english'), ): {
             'binary_accuracy': {
                 'doubleValue': 1.0
             },
             'weighted_example_count': {
                 'doubleValue': 7.0
             },
             'example_count': {
                 'doubleValue': 2.0
             },
         }
     }
     self.assertEqual(eval_result.data_location, data_location)
     self.assertEqual(eval_result.config.slicing_specs[0],
                      config.SlicingSpec(feature_keys=['language']))
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)