コード例 #1
0
 def testValidateMetricsValueThresholdUpperBoundFail(self):
     """Validation must fail when a metric exceeds its upper bound."""
     # The only threshold caps the metric at 1 while the reported value is
     # 1.5, i.e. 1.5 > 1, which is NOT OK.
     capped_at_one = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             upper_bound={'value': 1}))
     example_count_metric = config.MetricConfig(
         class_name='WeightedExampleCount', threshold=capped_at_one)
     eval_config = config.EvalConfig(
         model_specs=[config.ModelSpec()],
         slicing_specs=[config.SlicingSpec()],
         metrics_specs=[
             config.MetricsSpec(metrics=[example_count_metric],
                                model_names=['']),
         ],
     )
     # Metric values reported under the empty slice key.
     metric_values = {
         metric_types.MetricKey(name='weighted_example_count'): 1.5,
     }
     sliced_metrics = ((), metric_values)
     outcome = metrics_validator.validate_metrics(sliced_metrics, eval_config)
     self.assertFalse(outcome.validation_ok)
コード例 #2
0
 def testValidateMetricsChangeThresholdAbsoluteFail(self):
   """Validation must fail when the absolute change misses its threshold."""
   # The threshold is LOWER_IS_BETTER with an absolute bound of -1, but the
   # reported diff is 0 - .333 = -.333, which is not below -1: NOT OK.
   absolute_change = config.GenericChangeThreshold(
       direction=config.MetricDirection.LOWER_IS_BETTER,
       absolute={'value': -1})
   mean_prediction_metric = config.MetricConfig(
       class_name='MeanPrediction',
       threshold=config.MetricThreshold(change_threshold=absolute_change))
   candidate_spec = config.ModelSpec()
   baseline_spec = config.ModelSpec(name='baseline', is_baseline=True)
   eval_config = config.EvalConfig(
       model_specs=[candidate_spec, baseline_spec],
       slicing_specs=[config.SlicingSpec()],
       metrics_specs=[
           config.MetricsSpec(
               metrics=[mean_prediction_metric], model_names=['']),
       ],
   )
   # One entry for the baseline value and one for the candidate's diff.
   baseline_key = metric_types.MetricKey(
       name='mean_prediction', model_name='baseline')
   diff_key = metric_types.MetricKey(name='mean_prediction', is_diff=True)
   sliced_metrics = ((), {baseline_key: 0.333, diff_key: -0.333})
   outcome = metrics_validator.validate_metrics(sliced_metrics, eval_config)
   self.assertFalse(outcome.validation_ok)
コード例 #3
0
def get_missing_slices(
    slicing_details: Iterable[validation_result_pb2.SlicingDetails],
    eval_config: config.EvalConfig
) -> List[Union[config.SlicingSpec, config.CrossSlicingSpec]]:
    """Finds slicing specs with thresholds that have no matching details.

    Every slice spec that carries a threshold in the EvalConfig's metrics
    specs is looked up (by its serialized form) among the given slicing
    details. Thresholds keyed on the baseline model are ignored.

    Args:
      slicing_details: Slicing details.
      eval_config: Eval config.

    Returns:
      The specs that were not found, each listed once, or an empty list if
      none are missing.
    """
    seen_hashes = _hashed_slicing_details(slicing_details)
    thresholds_by_key = metric_specs.metric_thresholds_from_metrics_specs(
        eval_config.metrics_specs)
    baseline = model_util.get_baseline_model_spec(eval_config)
    baseline_name = baseline.name if baseline else None

    not_found = []
    for key, per_slice_thresholds in thresholds_by_key.items():
        if key.model_name == baseline_name:
            # Thresholds attached to the baseline model are not checked.
            continue
        for spec, _ in per_slice_thresholds:
            # A falsy spec is replaced by a default SlicingSpec.
            spec = spec or config.SlicingSpec()
            spec_hash = spec.SerializeToString()
            if spec_hash in seen_hashes:
                continue
            not_found.append(spec)
            # Record the hash so the same slice, reused by other
            # metrics/thresholds, is only reported once.
            seen_hashes[spec_hash] = validation_result_pb2.SlicingDetails()
    return not_found
コード例 #4
0
 def testValidateMetricsMetricTDistributionValueAndThreshold(
     self, slicing_specs, slice_key):
   """Checks threshold validation of a ValueWithTDistribution metric.

   The AUC metric is reported as a ValueWithTDistribution whose sample_mean
   (0.91) is above the 0.9 lower bound but whose unsampled_value (0.8) is
   below it. Validation is expected to fail and the failure is expected to
   report 0.8 as the metric value.

   Args:
     slicing_specs: Slicing specs to attach the threshold to; may be None, in
       which case the threshold is set directly on the metric. Presumably
       supplied by a parameterized-test decorator that is not visible here.
     slice_key: Slice key under which the metric value is reported.
   """
   threshold = config.MetricThreshold(
       value_threshold=config.GenericValueThreshold(
           lower_bound={'value': 0.9}))
   eval_config = config.EvalConfig(
       model_specs=[
           config.ModelSpec(),
       ],
       slicing_specs=slicing_specs,
       metrics_specs=[
           config.MetricsSpec(
               metrics=[
                   config.MetricConfig(
                       class_name='AUC',
                       # The metric-level threshold is only used when no
                       # slicing specs are given; otherwise the per-slice
                       # threshold below carries it.
                       threshold=threshold if slicing_specs is None else None,
                       per_slice_thresholds=[
                           config.PerSliceMetricThreshold(
                               slicing_specs=slicing_specs,
                               threshold=threshold)
                       ]),
               ],
               model_names=['']),
       ],
   )
   sliced_metrics = (slice_key, {
       metric_types.MetricKey(name='auc'):
           types.ValueWithTDistribution(sample_mean=0.91, unsampled_value=0.8)
   })
   result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
   self.assertFalse(result.validation_ok)
   # Build the expected result: start from the static text proto, then fill in
   # the parts that depend on the test parameters.
   expected = text_format.Parse(
       """
       metric_validations_per_slice {
         failures {
           metric_key {
             name: "auc"
           }
           metric_value {
             double_value {
               value: 0.8
             }
           }
         }
       }""", validation_result_pb2.ValidationResult())
   expected.metric_validations_per_slice[0].failures[
       0].metric_threshold.CopyFrom(threshold)
   expected.metric_validations_per_slice[0].slice_key.CopyFrom(
       slicer.serialize_slice_key(slice_key))
   # One slicing_details entry is expected for every spec that applies to
   # slice_key; with no specs at all, a single default SlicingSpec entry.
   for spec in slicing_specs or [None]:
     if (spec is None or
         slicer.SingleSliceSpec(spec=spec).is_slice_applicable(slice_key)):
       slicing_details = expected.validation_details.slicing_details.add()
       if spec is not None:
         slicing_details.slicing_spec.CopyFrom(spec)
       else:
         slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
       slicing_details.num_matching_slices = 1
   self.assertEqual(result, expected)
コード例 #5
0
 def testSerializeDeserializeLegacyEvalConfig(self):
     """Checks that a pickled legacy config loads as the equivalent EvalConfig.

     A LegacyConfig is pickled into an 'eval_config' TFRecord file, read back
     with model_eval_lib.load_eval_config, and compared against a
     hand-built config.EvalConfig.
     """
     output_path = self._getTempDir()
     old_config = LegacyConfig(
         model_location='/path/to/model',
         data_location='/path/to/data',
         slice_spec=[
             slicer.SingleSliceSpec(columns=['country'],
                                    features=[('age', 5), ('gender', 'f')]),
             slicer.SingleSliceSpec(columns=['interest'],
                                    features=[('age', 6), ('gender', 'm')])
         ],
         example_count_metric_key=None,
         example_weight_metric_key='key',
         compute_confidence_intervals=False,
         k_anonymization_count=1)
     # Write the legacy on-disk layout: one pickled dict holding the version
     # string and the legacy config object.
     final_dict = {}
     final_dict['tfma_version'] = tfma_version.VERSION_STRING
     final_dict['eval_config'] = old_config
     with tf.io.TFRecordWriter(os.path.join(output_path,
                                            'eval_config')) as w:
         w.write(pickle.dumps(final_dict))
     got_eval_config = model_eval_lib.load_eval_config(output_path)
     # Expected conversion. Note that the integer feature values 5 and 6 of the
     # legacy slice specs are expected as the strings '5' and '6'.
     eval_config = config.EvalConfig(
         input_data_specs=[
             config.InputDataSpec(location=old_config.data_location)
         ],
         model_specs=[config.ModelSpec(location=old_config.model_location)],
         output_data_specs=[
             config.OutputDataSpec(default_location=output_path)
         ],
         slicing_specs=[
             config.SlicingSpec(feature_keys=['country'],
                                feature_values={
                                    'age': '5',
                                    'gender': 'f'
                                }),
             config.SlicingSpec(feature_keys=['interest'],
                                feature_values={
                                    'age': '6',
                                    'gender': 'm'
                                })
         ],
         compute_confidence_intervals=old_config.
         compute_confidence_intervals,
         k_anonymization_count=old_config.k_anonymization_count)
     self.assertEqual(eval_config, got_eval_config)
コード例 #6
0
 def testSerializeDeserializeLegacyEvalConfig(self):
     """Checks that a pickled legacy config loads via load_eval_run.

     A LegacyConfig is pickled into an 'eval_config' TFRecord file and read
     back with eval_config_writer.load_eval_run. The returned EvalConfig, data
     location and model locations are compared against the legacy values.
     """
     output_path = self._getTempDir()
     old_config = LegacyConfig(
         model_location='/path/to/model',
         data_location='/path/to/data',
         slice_spec=[
             slicer.SingleSliceSpec(columns=['country'],
                                    features=[('age', 5), ('gender', 'f')]),
             slicer.SingleSliceSpec(columns=['interest'],
                                    features=[('age', 6), ('gender', 'm')])
         ],
         example_count_metric_key=None,
         example_weight_metric_key='key',
         compute_confidence_intervals=False,
         k_anonymization_count=1)
     # Write the legacy on-disk layout: one pickled dict holding the version
     # and the legacy config object.
     final_dict = {}
     final_dict['tfma_version'] = tfma_version.VERSION
     final_dict['eval_config'] = old_config
     with tf.io.TFRecordWriter(os.path.join(output_path,
                                            'eval_config')) as w:
         w.write(pickle.dumps(final_dict))
     # load_eval_run returns a 4-tuple; the third element is not checked here.
     got_eval_config, got_data_location, _, got_model_locations = (
         eval_config_writer.load_eval_run(output_path))
     # The legacy k_anonymization_count is expected to surface as
     # options.min_slice_size in the loaded config.
     options = config.Options()
     options.compute_confidence_intervals.value = (
         old_config.compute_confidence_intervals)
     options.min_slice_size.value = old_config.k_anonymization_count
     # The integer feature values 5 and 6 of the legacy slice specs are
     # expected as the strings '5' and '6'.
     eval_config = config.EvalConfig(slicing_specs=[
         config.SlicingSpec(feature_keys=['country'],
                            feature_values={
                                'age': '5',
                                'gender': 'f'
                            }),
         config.SlicingSpec(feature_keys=['interest'],
                            feature_values={
                                'age': '6',
                                'gender': 'm'
                            })
     ],
                                     options=options)
     self.assertEqual(eval_config, got_eval_config)
     self.assertEqual(old_config.data_location, got_data_location)
     # Exactly one model location is expected, equal to the legacy one.
     self.assertLen(got_model_locations, 1)
     self.assertEqual(old_config.model_location,
                      list(got_model_locations.values())[0])
コード例 #7
0
 def testValidateMetricsMetricValueAndThreshold(self):
     """A failed validation must report the metric's value and threshold."""
     # The only threshold caps the metric at 1 while the reported value is
     # 1.5, i.e. 1.5 > 1, which is NOT OK.
     capped_at_one = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             upper_bound={'value': 1}))
     example_count_metric = config.MetricConfig(
         class_name='WeightedExampleCount', threshold=capped_at_one)
     eval_config = config.EvalConfig(
         model_specs=[config.ModelSpec()],
         slicing_specs=[config.SlicingSpec()],
         metrics_specs=[
             config.MetricsSpec(metrics=[example_count_metric],
                                model_names=['']),
         ],
     )
     metric_values = {
         metric_types.MetricKey(name='weighted_example_count'): 1.5,
     }
     sliced_metrics = ((), metric_values)
     result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
     self.assertFalse(result.validation_ok)
     # The failure entry should carry the metric key, the violated threshold
     # and the offending value for the empty slice key.
     expected = text_format.Parse(
         """
     metric_validations_per_slice {
       slice_key {
       }
       failures {
         metric_key {
           name: "weighted_example_count"
         }
         metric_threshold {
           value_threshold {
             upper_bound {
               value: 1.0
             }
           }
         }
         metric_value {
           double_value {
             value: 1.5
           }
         }
       }
     }""", validation_result_pb2.ValidationResult())
     self.assertEqual(result, expected)
コード例 #8
0
 def testValidateMetricsInvalidThreshold(self):
     """A threshold on a metric that was never computed must fail validation."""
     # The threshold is keyed on 'invalid_threshold', a name that does not
     # appear among the reported metrics below.
     lower_bound_threshold = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             lower_bound={'value': 0.2}))
     metrics_spec = config.MetricsSpec(
         thresholds={'invalid_threshold': lower_bound_threshold})
     eval_config = config.EvalConfig(
         model_specs=[config.ModelSpec()],
         slicing_specs=[config.SlicingSpec()],
         metrics_specs=[metrics_spec],
     )
     metric_values = {
         metric_types.MetricKey(name='weighted_example_count'): 1.5,
     }
     sliced_metrics = ((), metric_values)
     result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
     self.assertFalse(result.validation_ok)
     # The failure should name the missing metric, echo its threshold and
     # carry the 'Metric not found.' message.
     expected = text_format.Parse(
         """
     metric_validations_per_slice {
       slice_key {
       }
       failures {
         metric_key {
           name: "invalid_threshold"
         }
         metric_threshold {
           value_threshold {
             lower_bound {
               value: 0.2
             }
           }
         }
         message: 'Metric not found.'
       }
     }""", validation_result_pb2.ValidationResult())
     self.assertProtoEquals(expected, result)
コード例 #9
0
 def testValidateMetricsDivByZero(self):
   """Validation must fail for a relative change against a 0.0 baseline."""
   relative_change = config.GenericChangeThreshold(
       direction=config.MetricDirection.HIGHER_IS_BETTER,
       relative={'value': 0.1})
   threshold = config.MetricThreshold(change_threshold=relative_change)
   slicing_specs = [config.SlicingSpec()]
   # The metric-level threshold is only set when there are no slicing specs;
   # the per-slice threshold always carries it.
   metric_level_threshold = threshold if slicing_specs is None else None
   per_slice_threshold = config.PerSliceMetricThreshold(
       slicing_specs=slicing_specs, threshold=threshold)
   mean_prediction_metric = config.MetricConfig(
       class_name='MeanPrediction',
       threshold=metric_level_threshold,
       per_slice_thresholds=[per_slice_threshold])
   candidate_spec = config.ModelSpec(name='candidate')
   baseline_spec = config.ModelSpec(name='baseline', is_baseline=True)
   eval_config = config.EvalConfig(
       model_specs=[candidate_spec, baseline_spec],
       slicing_specs=slicing_specs,
       metrics_specs=[
           config.MetricsSpec(
               metrics=[mean_prediction_metric],
               model_names=['baseline', 'candidate']),
       ],
   )
   # The baseline value is exactly zero, the candidate diff is 0.1.
   baseline_key = metric_types.MetricKey(
       name='mean_prediction', model_name='baseline')
   candidate_diff_key = metric_types.MetricKey(
       name='mean_prediction', model_name='candidate', is_diff=True)
   sliced_metrics = ((), {baseline_key: 0.0, candidate_diff_key: 0.1})
   outcome = metrics_validator.validate_metrics(sliced_metrics, eval_config)
   self.assertFalse(outcome.validation_ok)
コード例 #10
0
 def testRunModelAnalysisWithUncertainty(self):
     """Runs an end-to-end analysis with confidence intervals enabled.

     Five examples are sliced by 'language' with confidence intervals turned
     on and k_anonymization_count set to 2. The 'hindi' slice has a single
     example and is expected to yield an error entry instead of metrics.
     """
     model_location = self._exportEvalSavedModel(
         linear_classifier.simple_linear_classifier)
     examples = [
         self._makeExample(age=3.0, language='english', label=1.0),
         self._makeExample(age=3.0, language='chinese', label=0.0),
         self._makeExample(age=4.0, language='english', label=1.0),
         self._makeExample(age=5.0, language='chinese', label=1.0),
         self._makeExample(age=5.0, language='hindi', label=1.0)
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)
     slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
     options = config.Options()
     options.compute_confidence_intervals.value = True
     options.k_anonymization_count.value = 2
     eval_config = config.EvalConfig(
         input_data_specs=[config.InputDataSpec(location=data_location)],
         model_specs=[config.ModelSpec(location=model_location)],
         output_data_specs=[
             config.OutputDataSpec(default_location=self._getTempDir())
         ],
         slicing_specs=slicing_specs,
         options=options)
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config,
         eval_shared_models=[
             model_eval_lib.default_eval_shared_model(
                 eval_saved_model_path=model_location,
                 example_weight_key='age')
         ])
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     # The expected example weights are consistent with summing 'age' per
     # slice (chinese: 3 + 5 = 8, english: 3 + 4 = 7).
     expected = {
         (('language', 'hindi'), ): {
             u'__ERROR__': {
                 'debugMessage':
                 u'Example count for this slice key is lower than the '
                 u'minimum required value: 2. No data is aggregated for '
                 u'this slice.'
             },
         },
         (('language', 'chinese'), ): {
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 8.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 2.0
             },
         },
         (('language', 'english'), ): {
             'accuracy': {
                 'boundedValue': {
                     'value': 1.0,
                     'lowerBound': 1.0,
                     'upperBound': 1.0,
                     'methodology': 'POISSON_BOOTSTRAP'
                 }
             },
             'my_mean_label': {
                 'boundedValue': {
                     'value': 1.0,
                     'lowerBound': 1.0,
                     'upperBound': 1.0,
                     'methodology': 'POISSON_BOOTSTRAP'
                 }
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 7.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 2.0
             },
         }
     }
     # model_location is decoded before comparison, so it is presumably
     # returned as bytes by _exportEvalSavedModel.
     self.assertEqual(eval_result.config.model_specs[0].location,
                      model_location.decode())
     self.assertEqual(eval_result.config.input_data_specs[0].location,
                      data_location)
     self.assertEqual(eval_result.config.slicing_specs[0],
                      config.SlicingSpec(feature_keys=['language']))
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
     # No plots are expected from this run.
     self.assertFalse(eval_result.plots)
コード例 #11
0
 def testRunModelAnalysisWithLegacyQueryExtractor(self):
     """Runs an end-to-end analysis with the legacy query-based evaluator.

     Four examples are evaluated without slicing, using both the standard
     metrics-and-plots evaluator and a QueryBasedMetricsEvaluator that groups
     examples by 'language' and computes query statistics and NDCG@1.
     """
     model_location = self._exportEvalSavedModel(
         linear_classifier.simple_linear_classifier)
     examples = [
         self._makeExample(age=3.0, language='english', label=1.0),
         self._makeExample(age=3.0, language='chinese', label=0.0),
         self._makeExample(age=4.0, language='english', label=0.0),
         self._makeExample(age=5.0, language='chinese', label=1.0)
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)
     slicing_specs = [config.SlicingSpec()]
     eval_config = config.EvalConfig(
         input_data_specs=[config.InputDataSpec(location=data_location)],
         model_specs=[config.ModelSpec(location=model_location)],
         output_data_specs=[
             config.OutputDataSpec(default_location=self._getTempDir())
         ],
         slicing_specs=slicing_specs)
     eval_shared_model = model_eval_lib.default_eval_shared_model(
         eval_saved_model_path=model_location, example_weight_key='age')
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config,
         eval_shared_models=[eval_shared_model],
         evaluators=[
             metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
                 eval_shared_model),
             query_based_metrics_evaluator.QueryBasedMetricsEvaluator(
                 query_id='language',
                 prediction_key='logistic',
                 combine_fns=[
                     query_statistics.QueryStatisticsCombineFn(),
                     legacy_ndcg.NdcgMetricCombineFn(at_vals=[1],
                                                     gain_key='label',
                                                     weight_key='')
                 ]),
         ])
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     # The expected counts match the input: two 'language' values with two
     # examples each, and an example weight equal to the sum of the ages
     # (3 + 3 + 4 + 5 = 15).
     expected = {
         (): {
             'post_export_metrics/total_queries': {
                 'doubleValue': 2.0
             },
             'post_export_metrics/min_documents': {
                 'doubleValue': 2.0
             },
             'post_export_metrics/max_documents': {
                 'doubleValue': 2.0
             },
             'post_export_metrics/total_documents': {
                 'doubleValue': 4.0
             },
             'post_export_metrics/ndcg@1': {
                 'doubleValue': 0.5
             },
             'post_export_metrics/example_weight': {
                 'doubleValue': 15.0
             },
             'post_export_metrics/example_count': {
                 'doubleValue': 4.0
             },
         }
     }
     # model_location is decoded before comparison, so it is presumably
     # returned as bytes by _exportEvalSavedModel.
     self.assertEqual(eval_result.config.model_specs[0].location,
                      model_location.decode())
     self.assertEqual(eval_result.config.input_data_specs[0].location,
                      data_location)
     self.assertEqual(eval_result.config.slicing_specs[0],
                      config.SlicingSpec())
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
     # No plots are expected from this run.
     self.assertFalse(eval_result.plots)
コード例 #12
0
    def testRunModelAnalysisWithQueryBasedMetrics(self):
        """Runs an end-to-end analysis of a Keras model with query-based NDCG.

        A one-layer Keras model is fitted for a single step, saved in the
        'tf' format and evaluated on four examples with 'language' as the
        query key. Only the presence of the expected metric names is
        checked, not their values.
        """
        input_layer = tf.keras.layers.Input(shape=(1, ), name='age')
        output_layer = tf.keras.layers.Dense(
            1, activation=tf.nn.sigmoid)(input_layer)
        model = tf.keras.models.Model(input_layer, output_layer)
        # Use `learning_rate`: the `lr` keyword is a deprecated alias that
        # newer Keras optimizers no longer accept.
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=.001),
                      loss=tf.keras.losses.binary_crossentropy)

        # A single training example is enough; the model only has to exist
        # and be exportable.
        features = {'age': [[20.0]]}
        labels = [[1]]
        example_weights = [1.0]
        dataset = tf.data.Dataset.from_tensor_slices(
            (features, labels, example_weights))
        dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
        model.fit(dataset, steps_per_epoch=1)

        model_location = os.path.join(self._getTempDir(), 'export_dir')
        model.save(model_location, save_format='tf')

        examples = [
            self._makeExample(age=3.0, language='english', label=1.0),
            self._makeExample(age=5.0, language='chinese', label=0.0),
            self._makeExample(age=3.0, language='english', label=0.0),
            self._makeExample(age=5.0, language='chinese', label=1.0)
        ]
        data_location = self._writeTFExamplesToTFRecords(examples)
        slicing_specs = [config.SlicingSpec()]
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec(location=data_location)],
            model_specs=[
                config.ModelSpec(location=model_location, label_key='label')
            ],
            output_data_specs=[
                config.OutputDataSpec(default_location=self._getTempDir())
            ],
            slicing_specs=slicing_specs,
            metrics_specs=metric_specs.specs_from_metrics(
                [ndcg.NDCG(gain_key='age', name='ndcg')],
                binarize=config.BinarizationOptions(top_k_list=[1]),
                query_key='language'))
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=model_location,
            tags=[tf.saved_model.SERVING])
        eval_result = model_eval_lib.run_model_analysis(
            eval_config=eval_config,
            eval_shared_models=[eval_shared_model],
            evaluators=[
                metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                    eval_config=eval_config,
                    eval_shared_models=[eval_shared_model])
            ])

        self.assertEqual(eval_result.config.model_specs[0].location,
                         model_location)
        self.assertEqual(eval_result.config.input_data_specs[0].location,
                         data_location)
        # A single (empty) slice is expected since no slicing is configured.
        self.assertLen(eval_result.slicing_metrics, 1)
        got_slice_key, got_metrics = eval_result.slicing_metrics[0]
        self.assertEqual(got_slice_key, ())
        self.assertIn('', got_metrics)  # output_name
        got_metrics = got_metrics['']
        # Metric names expected under each sub-key group; only their presence
        # is verified.
        expected_metrics = {
            '': ['example_count', 'weighted_example_count'],
            'topK:1': ['ndcg'],
        }
        for group, metric_names in expected_metrics.items():
            self.assertIn(group, got_metrics)
            for metric_name in metric_names:
                self.assertIn(metric_name, got_metrics[group])
コード例 #13
0
 def testRunModelAnalysisExtraFieldsPlusFeatureExtraction(self):
     """Runs an end-to-end analysis slicing on an extra, extracted feature.

     The examples carry an additional 'my_slice' field. A custom extractor
     list (predict, feature extraction, slice key) is passed to
     run_model_analysis and the metrics are checked per 'my_slice' value.
     """
     model_location = self._exportEvalSavedModel(
         linear_classifier.simple_linear_classifier)
     # The last example has no 'my_slice' value; no slice is expected for it
     # in the results below.
     examples = [
         self._makeExample(age=3.0,
                           language='english',
                           label=1.0,
                           my_slice='a'),
         self._makeExample(age=3.0,
                           language='chinese',
                           label=0.0,
                           my_slice='a'),
         self._makeExample(age=4.0,
                           language='english',
                           label=1.0,
                           my_slice='b'),
         self._makeExample(age=5.0,
                           language='chinese',
                           label=1.0,
                           my_slice='c'),
         self._makeExample(age=5.0, language='hindi', label=1.0)
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)
     slicing_specs = [config.SlicingSpec(feature_keys=['my_slice'])]
     eval_config = config.EvalConfig(
         input_data_specs=[config.InputDataSpec(location=data_location)],
         model_specs=[config.ModelSpec(location=model_location)],
         output_data_specs=[
             config.OutputDataSpec(default_location=self._getTempDir())
         ],
         slicing_specs=slicing_specs)
     # This shared model is used by the predict extractor below; a second,
     # identically configured one is created for run_model_analysis.
     eval_shared_model = model_eval_lib.default_eval_shared_model(
         eval_saved_model_path=model_location, example_weight_key='age')
     slice_spec = [slicer.SingleSliceSpec(spec=slicing_specs[0])]
     extractors_with_feature_extraction = [
         predict_extractor.PredictExtractor(eval_shared_model,
                                            desired_batch_size=3,
                                            materialize=False),
         feature_extractor.FeatureExtractor(
             extract_source=constants.INPUT_KEY,
             extract_dest=constants.FEATURES_PREDICTIONS_LABELS_KEY),
         slice_key_extractor.SliceKeyExtractor(slice_spec,
                                               materialize=False)
     ]
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config,
         eval_shared_models=[
             model_eval_lib.default_eval_shared_model(
                 eval_saved_model_path=model_location,
                 example_weight_key='age')
         ],
         extractors=extractors_with_feature_extraction)
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     # The expected example weights are consistent with summing 'age' per
     # slice (a: 3 + 3 = 6, b: 4, c: 5).
     expected = {
         (('my_slice', 'a'), ): {
             'accuracy': {
                 'doubleValue': 1.0
             },
             'my_mean_label': {
                 'doubleValue': 0.5
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 6.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 2.0
             },
         },
         (('my_slice', 'b'), ): {
             'accuracy': {
                 'doubleValue': 1.0
             },
             'my_mean_label': {
                 'doubleValue': 1.0
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 4.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 1.0
             },
         },
         (('my_slice', 'c'), ): {
             'accuracy': {
                 'doubleValue': 0.0
             },
             'my_mean_label': {
                 'doubleValue': 1.0
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 5.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 1.0
             },
         },
     }
     # model_location is decoded before comparison, so it is presumably
     # returned as bytes by _exportEvalSavedModel.
     self.assertEqual(eval_result.config.model_specs[0].location,
                      model_location.decode())
     self.assertEqual(eval_result.config.input_data_specs[0].location,
                      data_location)
     self.assertEqual(eval_result.config.slicing_specs[0],
                      config.SlicingSpec(feature_keys=['my_slice']))
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
     # No plots are expected from this run.
     self.assertFalse(eval_result.plots)
コード例 #14
0
    def testRunModelAnalysisWithDeterministicConfidenceIntervals(self):
        """Runs an end-to-end analysis with confidence intervals and a seed.

        Five examples are sliced by 'language' with confidence intervals
        turned on and k_anonymization_count set to 2. A fixed
        random_seed_for_testing is passed to run_model_analysis, and two
        bootstrapped metric values of the 'english' slice are checked
        against reference numbers.
        """
        model_location = self._exportEvalSavedModel(
            linear_classifier.simple_linear_classifier)
        examples = [
            self._makeExample(age=3.0, language='english', label=1.0),
            self._makeExample(age=3.0, language='chinese', label=0.0),
            self._makeExample(age=4.0, language='english', label=1.0),
            self._makeExample(age=5.0, language='chinese', label=1.0),
            self._makeExample(age=5.0, language='hindi', label=1.0)
        ]
        data_location = self._writeTFExamplesToTFRecords(examples)
        slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
        options = config.Options()
        options.compute_confidence_intervals.value = True
        options.k_anonymization_count.value = 2
        eval_config = config.EvalConfig(slicing_specs=slicing_specs,
                                        options=options)
        # NOTE(review): the seed presumably makes the bootstrap sampling
        # reproducible - confirm against run_model_analysis.
        eval_result = model_eval_lib.run_model_analysis(
            eval_config=eval_config,
            eval_shared_model=model_eval_lib.default_eval_shared_model(
                eval_saved_model_path=model_location,
                example_weight_key='age'),
            data_location=data_location,
            output_path=self._getTempDir(),
            random_seed_for_testing=_TEST_SEED)
        # We only check some of the metrics to ensure that the end-to-end
        # pipeline works.
        # The 'hindi' slice has a single example, fewer than the minimum of
        # 2, so an error entry is expected instead of metrics.
        expected = {
            (('language', 'hindi'), ): {
                u'__ERROR__': {
                    'debugMessage':
                    u'Example count for this slice key is lower than the '
                    u'minimum required value: 2. No data is aggregated for '
                    u'this slice.'
                },
            },
            (('language', 'chinese'), ): {
                metric_keys.EXAMPLE_WEIGHT: {
                    'doubleValue': 8.0
                },
                metric_keys.EXAMPLE_COUNT: {
                    'doubleValue': 2.0
                },
            },
            (('language', 'english'), ): {
                'accuracy': {
                    'boundedValue': {
                        'value': 1.0,
                        'lowerBound': 1.0,
                        'upperBound': 1.0,
                        'methodology': 'POISSON_BOOTSTRAP'
                    }
                },
                'my_mean_label': {
                    'boundedValue': {
                        'value': 1.0,
                        'lowerBound': 1.0,
                        'upperBound': 1.0,
                        'methodology': 'POISSON_BOOTSTRAP'
                    }
                },
                metric_keys.EXAMPLE_WEIGHT: {
                    'doubleValue': 7.0
                },
                metric_keys.EXAMPLE_COUNT: {
                    'doubleValue': 2.0
                },
            }
        }
        # model_location is decoded before comparison, so it is presumably
        # returned as bytes by _exportEvalSavedModel.
        self.assertEqual(eval_result.model_location, model_location.decode())
        self.assertEqual(eval_result.data_location, data_location)
        self.assertEqual(eval_result.config.slicing_specs[0],
                         config.SlicingSpec(feature_keys=['language']))
        self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)

        # Spot-check two bounded metric values of the 'english' slice; the
        # metrics are read from the '' output name and '' sub-key group.
        for key, value in eval_result.slicing_metrics:
            if (('language', 'english'), ) == key:
                metric = value['']['']['average_loss']
                self.assertAlmostEqual(0.171768754720,
                                       metric['boundedValue']['value'],
                                       delta=0.1)

                metric = value['']['']['auc_precision_recall']
                self.assertAlmostEqual(0.99999940395,
                                       metric['boundedValue']['value'],
                                       delta=0.1)

        # No plots are expected from this run.
        self.assertFalse(eval_result.plots)
    def testEvaluateWithBinaryClassificationModel(self):
        """Evaluates a two-class DNN classifier over the overall slice.

        Wires the extractors and evaluator together by hand, pushes three
        weighted examples through a Beam pipeline, and verifies both the
        metrics and the calibration plot produced for the single slice.
        """
        _, export_dir = dnn_classifier.simple_dnn_classifier(
            None, self._getExportDir(), n_classes=2)

        # Requested explicitly: mean_label and calibration_plot. The checks
        # below also expect example_count and weighted_example_count.
        requested_metrics = [
            calibration.MeanLabel('mean_label'),
            calibration_plot.CalibrationPlot(name='calibration_plot',
                                             num_buckets=10),
        ]
        model_spec = config.ModelSpec(location=export_dir,
                                      label_key='label',
                                      example_weight_key='age')
        eval_config = config.EvalConfig(
            model_specs=[model_spec],
            slicing_specs=[config.SlicingSpec()],
            metrics_specs=metric_specs.specs_from_metrics(requested_metrics))
        shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

        single_slice_specs = [
            slicer.SingleSliceSpec(spec=spec)
            for spec in eval_config.slicing_specs
        ]
        extractors = [
            input_extractor.InputExtractor(eval_config=eval_config),
            predict_extractor_v2.PredictExtractor(
                eval_config=eval_config, eval_shared_models=[shared_model]),
            slice_key_extractor.SliceKeyExtractor(
                slice_spec=single_slice_specs),
        ]
        evaluators = [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config, eval_shared_models=[shared_model]),
        ]

        # (age, language, label); 'age' is the example weight key above.
        rows = [
            (1.0, 'english', 0.0),
            (2.0, 'chinese', 1.0),
            (3.0, 'chinese', 0.0),
        ]
        examples = [
            self._makeExample(age=age, language=language, label=label)
            for age, language, label in rows
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            serialized = [example.SerializeToString() for example in examples]
            results = (
                pipeline
                | 'Create' >> beam.Create(serialized)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                    extractors=extractors, evaluators=evaluators))

            # pylint: enable=no-value-for-parameter

            def check_metrics(got):
                """Asserts the overall-slice metric values."""
                try:
                    self.assertLen(got, 1)
                    slice_key, slice_metrics = got[0]
                    self.assertEqual(slice_key, ())
                    total_weight = 1.0 + 2.0 + 3.0
                    weighted_label_sum = 0 * 1.0 + 1 * 2.0 + 0 * 3.0
                    expected = {
                        metric_types.MetricKey(name='example_count'):
                        3,
                        metric_types.MetricKey(name='weighted_example_count'):
                        total_weight,
                        metric_types.MetricKey(name='mean_label'):
                        weighted_label_sum / total_weight,
                    }
                    self.assertDictElementsAlmostEqual(slice_metrics,
                                                       expected)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            def check_plots(got):
                """Asserts the calibration plot exists with all buckets."""
                try:
                    self.assertLen(got, 1)
                    slice_key, slice_plots = got[0]
                    self.assertEqual(slice_key, ())
                    calibration_key = metric_types.PlotKey('calibration_plot')
                    self.assertIn(calibration_key, slice_plots)
                    # 10 buckets + 2 for edge cases
                    self.assertLen(slice_plots[calibration_key].buckets, 12)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(results[constants.METRICS_KEY],
                             check_metrics,
                             label='metrics')
            util.assert_that(results[constants.PLOTS_KEY],
                             check_plots,
                             label='plots')
コード例 #16
0
from absl.testing import parameterized
import tensorflow as tf

from tensorflow_model_analysis import config
from tensorflow_model_analysis import types
from tensorflow_model_analysis.eval_saved_model import testutil
from tensorflow_model_analysis.evaluators import metrics_validator
from tensorflow_model_analysis.metrics import metric_types
from tensorflow_model_analysis.proto import validation_result_pb2
from tensorflow_model_analysis.slicer import slicer_lib as slicer
from google.protobuf import text_format

# Tests involving slices: (<test_name>, <slice_config>, <slice_key>)
_NO_SLICE_TEST = ('no_slice', None, (()))
_GLOBAL_SLICE_TEST = ('global_slice', [config.SlicingSpec()], (()))
_FEATURE_SLICE_TEST = ('feature_slice',
                       [config.SlicingSpec(feature_keys=['feature1'])
                        ], (('feature1', 'value1'), ))
_FEATURE_VALUE_SLICE_TEST = ('feature_value_slice', [
    config.SlicingSpec(feature_values={'feature1': 'value1'})
], (('feature1', 'value1'), ))
_MULTIPLE_SLICES_TEST = ('multiple_slices', [
    config.SlicingSpec(feature_values={'feature1': 'value1'}),
    config.SlicingSpec(feature_values={'feature2': 'value2'})
], (('feature1', 'value1'), ))
# feature_keys is given as a list of key names, matching _FEATURE_SLICE_TEST
# above; the slice key below deliberately uses a feature the spec does not
# name.
_UNMATCHED_SINGLE_SLICE_TEST = ('single_slice',
                                [config.SlicingSpec(feature_keys=['feature1'])],
                                (('unmatched_feature', 'unmatched_value'), ))
_UNMATCHED_MULTIPLE_SLICES_TEST = ('multiple_slices', [
    config.SlicingSpec(feature_values={'feature1': 'value1'}),
コード例 #17
0
    def testEvaluateWithEvalSavedModel(self):
        """Evaluates an exported linear classifier overall and per slice.

        Five examples are split across two 'slice_key' values. The test
        expects three result rows (overall plus one per slice value) and
        checks five metrics on each of them.
        """
        _, export_dir = linear_classifier.simple_linear_classifier(
            None, self._getExportDir())
        eval_config = config.EvalConfig(
            model_specs=[config.ModelSpec(signature_name='eval')],
            slicing_specs=[
                config.SlicingSpec(),
                config.SlicingSpec(feature_keys=['slice_key']),
            ])
        shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir,
            add_metrics_callbacks=[_addExampleCountMetricCallback])
        single_slice_specs = [
            slicer.SingleSliceSpec(spec=spec)
            for spec in eval_config.slicing_specs
        ]
        extractors = [
            predict_extractor.PredictExtractor(shared_model),
            slice_key_extractor.SliceKeyExtractor(
                slice_spec=single_slice_specs),
        ]
        evaluators = [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config, eval_shared_model=shared_model),
        ]

        # (age, language, label, slice_key)
        rows = [
            (3.0, 'english', 1.0, 'first_slice'),
            (3.0, 'chinese', 0.0, 'first_slice'),
            (4.0, 'english', 0.0, 'second_slice'),
            (5.0, 'chinese', 1.0, 'second_slice'),
            (5.0, 'chinese', 1.0, 'second_slice'),
        ]
        examples = [
            self._makeExample(age=age,
                              language=language,
                              label=label,
                              slice_key=slice_name)
            for age, language, label, slice_name in rows
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            serialized = [example.SerializeToString() for example in examples]
            results = (
                pipeline
                | 'Create' >> beam.Create(serialized)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                    extractors=extractors, evaluators=evaluators))

            # pylint: enable=no-value-for-parameter

            def check_metrics(got):
                """Asserts the metric values of all three slices."""
                try:
                    self.assertLen(got, 3)
                    by_slice = {
                        slice_key: value
                        for slice_key, value in got
                    }
                    overall_slice = ()
                    first_slice = (('slice_key', b'first_slice'), )
                    second_slice = (('slice_key', b'second_slice'), )
                    self.assertCountEqual(
                        list(by_slice.keys()),
                        [overall_slice, first_slice, second_slice])

                    # Expected metric values per slice, keyed by metric name.
                    expected_by_slice = {
                        overall_slice: {
                            'accuracy': 0.4,
                            'label/mean': 0.6,
                            'my_mean_age': 4.0,
                            'my_mean_age_times_label': 2.6,
                            'added_example_count': 5.0,
                        },
                        first_slice: {
                            'accuracy': 1.0,
                            'label/mean': 0.5,
                            'my_mean_age': 3.0,
                            'my_mean_age_times_label': 1.5,
                            'added_example_count': 2.0,
                        },
                        second_slice: {
                            'accuracy': 0.0,
                            'label/mean': 2.0 / 3.0,
                            'my_mean_age': 14.0 / 3.0,
                            'my_mean_age_times_label': 10.0 / 3.0,
                            'added_example_count': 3.0,
                        },
                    }
                    for slice_key, by_name in expected_by_slice.items():
                        self.assertDictElementsAlmostEqual(
                            by_slice[slice_key], {
                                metric_types.MetricKey(name=name): value
                                for name, value in by_name.items()
                            })

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(results[constants.METRICS_KEY],
                             check_metrics,
                             label='metrics')
    def testEvaluateWithKerasModel(self):
        """Evaluates a small two-input Keras model saved in 'tf' format.

        The model is fit for a single step, saved to an export directory, and
        then run through the extract-and-evaluate pipeline with two weighted
        examples. Only the overall slice is evaluated.
        """
        input1 = tf.keras.layers.Input(shape=(1, ), name='input1')
        input2 = tf.keras.layers.Input(shape=(1, ), name='input2')
        inputs = [input1, input2]
        input_layer = tf.keras.layers.concatenate(inputs)
        output_layer = tf.keras.layers.Dense(1,
                                             activation=tf.nn.sigmoid,
                                             name='output')(input_layer)
        model = tf.keras.models.Model(inputs, output_layer)
        # `learning_rate` is the supported argument name; `lr` is a deprecated
        # alias.
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=.001),
                      loss=tf.keras.losses.binary_crossentropy,
                      metrics=['accuracy'])

        features = {'input1': [[0.0], [1.0]], 'input2': [[1.0], [0.0]]}
        labels = [[1], [0]]
        example_weights = [1.0, 0.5]
        dataset = tf.data.Dataset.from_tensor_slices(
            (features, labels, example_weights))
        dataset = dataset.shuffle(buffer_size=1).repeat().batch(2)
        model.fit(dataset, steps_per_epoch=1)

        export_dir = self._getExportDir()
        model.save(export_dir, save_format='tf')

        eval_config = config.EvalConfig(
            model_specs=[
                config.ModelSpec(location=export_dir,
                                 label_key='label',
                                 example_weight_key='example_weight')
            ],
            slicing_specs=[config.SlicingSpec()],
            metrics_specs=metric_specs.specs_from_metrics(
                [calibration.MeanLabel('mean_label')]))
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

        slice_spec = [
            slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
        ]
        extractors = [
            input_extractor.InputExtractor(eval_config=eval_config),
            predict_extractor_v2.PredictExtractor(
                eval_config=eval_config,
                eval_shared_models=[eval_shared_model]),
            slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
        ]
        evaluators = [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config,
                eval_shared_models=[eval_shared_model])
        ]

        # extra_feature is not one of the model's inputs (input1, input2).
        examples = [
            self._makeExample(input1=0.0,
                              input2=1.0,
                              label=1.0,
                              example_weight=1.0,
                              extra_feature='non_model_feature'),
            self._makeExample(input1=1.0,
                              input2=0.0,
                              label=0.0,
                              example_weight=0.5,
                              extra_feature='non_model_feature'),
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            metrics = (
                pipeline
                | 'Create' >> beam.Create(
                    [e.SerializeToString() for e in examples])
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                    extractors=extractors, evaluators=evaluators))

            # pylint: enable=no-value-for-parameter

            def check_metrics(got):
                """Asserts the overall-slice counts and weighted mean label."""
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())
                    example_count_key = metric_types.MetricKey(
                        name='example_count')
                    weighted_example_count_key = metric_types.MetricKey(
                        name='weighted_example_count')
                    label_key = metric_types.MetricKey(name='mean_label')
                    self.assertDictElementsAlmostEqual(
                        got_metrics, {
                            example_count_key: 2,
                            weighted_example_count_key: (1.0 + 0.5),
                            label_key: (1.0 * 1.0 + 0.0 * 0.5) / (1.0 + 0.5),
                        })

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics[constants.METRICS_KEY],
                             check_metrics,
                             label='metrics')
コード例 #19
0
 def testValidateMetricsMetricTDistributionChangeAndThreshold(
         self, slicing_specs, slice_key):
     """Checks a failing change threshold on t-distribution metric values.

     Args:
       slicing_specs: Slicing specs for the eval config, or None. When None
         the threshold is attached directly to the metric; the per-slice
         threshold entry is built from this value in either case.
       slice_key: Slice key of the metrics passed to the validator.
     """
     threshold = config.MetricThreshold(
         change_threshold=config.GenericChangeThreshold(
             direction=config.MetricDirection.LOWER_IS_BETTER,
             absolute={'value': -1}))
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
             config.ModelSpec(name='baseline', is_baseline=True)
         ],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(
                 metrics=[
                     config.MetricConfig(
                         class_name='AUC',
                         threshold=threshold
                         if slicing_specs is None else None,
                         per_slice_thresholds=[
                             config.PerSliceMetricThreshold(
                                 slicing_specs=slicing_specs,
                                 threshold=threshold)
                         ]),
                 ],
                 model_names=['']),
         ],
     )
     sliced_metrics = (
         slice_key,
         {
             metric_types.MetricKey(name='auc', model_name='baseline'):
             types.ValueWithTDistribution(sample_mean=0.91,
                                          unsampled_value=0.6),
             # Diff entry: the expected failure below reports this key with
             # value 0.1.
             metric_types.MetricKey(name='auc', is_diff=True):
             types.ValueWithTDistribution(sample_mean=0.1,
                                          unsampled_value=0.1),
         })
     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertFalse(result.validation_ok)
     expected = text_format.Parse(
         """
     metric_validations_per_slice {
       failures {
         metric_key {
           name: "auc"
           is_diff: true
         }
         metric_value {
           double_value {
             value: 0.1
           }
         }
       }
     }""", validation_result_pb2.ValidationResult())
     expected.metric_validations_per_slice[0].failures[
         0].metric_threshold.CopyFrom(threshold)
     expected.metric_validations_per_slice[0].slice_key.CopyFrom(
         slicer.serialize_slice_key(slice_key))
     # One slicing_details entry is expected per spec that applies to
     # slice_key; with no specs a single default SlicingSpec is recorded.
     for spec in slicing_specs or [None]:
         if (spec is None or slicer.SingleSliceSpec(
                 spec=spec).is_slice_applicable(slice_key)):
             slicing_details = (
                 expected.validation_details.slicing_details.add())
             if spec is not None:
                 slicing_details.slicing_spec.CopyFrom(spec)
             else:
                 slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
             slicing_details.num_matching_slices = 1
     # Plain equality: assertAlmostEqual would try to subtract the two
     # messages on a mismatch and raise TypeError instead of failing cleanly.
     self.assertEqual(result, expected)
    def testEvaluateWithMultiOutputModel(self):
        """Evaluates a three-head model and checks metrics per output.

        Each head has its own label feature and uses 'age' as the example
        weight. Four examples are evaluated over the overall slice only.
        """
        _, export_dir = multi_head.simple_multi_head(None,
                                                     self._getExportDir())

        # Output head name -> label feature name.
        head_to_label = {
            'chinese_head': 'chinese_label',
            'english_head': 'english_label',
            'other_head': 'other_label'
        }
        eval_config = config.EvalConfig(
            model_specs=[
                config.ModelSpec(
                    location=export_dir,
                    label_keys=head_to_label,
                    example_weight_keys={
                        head: 'age'
                        for head in head_to_label
                    })
            ],
            slicing_specs=[config.SlicingSpec()],
            metrics_specs=metric_specs.specs_from_metrics({
                head: [calibration.MeanLabel('mean_label')]
                for head in head_to_label
            }))
        shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

        single_slice_specs = [
            slicer.SingleSliceSpec(spec=spec)
            for spec in eval_config.slicing_specs
        ]
        extractors = [
            input_extractor.InputExtractor(eval_config=eval_config),
            predict_extractor_v2.PredictExtractor(
                eval_config=eval_config, eval_shared_models=[shared_model]),
            slice_key_extractor.SliceKeyExtractor(
                slice_spec=single_slice_specs),
        ]
        evaluators = [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config, eval_shared_models=[shared_model]),
        ]

        # (age, language, english_label, chinese_label, other_label)
        rows = [
            (1.0, 'english', 1.0, 0.0, 0.0),
            (1.0, 'chinese', 0.0, 1.0, 0.0),
            (2.0, 'english', 1.0, 0.0, 0.0),
            (2.0, 'other', 0.0, 1.0, 1.0),
        ]
        examples = [
            self._makeExample(age=age,
                              language=language,
                              english_label=english,
                              chinese_label=chinese,
                              other_label=other)
            for age, language, english, chinese, other in rows
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            serialized = [example.SerializeToString() for example in examples]
            results = (
                pipeline
                | 'Create' >> beam.Create(serialized)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                    extractors=extractors, evaluators=evaluators))

            # pylint: enable=no-value-for-parameter

            def check_metrics(got):
                """Asserts the example count and the per-head metrics."""
                try:
                    self.assertLen(got, 1)
                    slice_key, slice_metrics = got[0]
                    self.assertEqual(slice_key, ())

                    total_weight = 1.0 + 1.0 + 2.0 + 2.0
                    # Weight-scaled label sums per head, in example order.
                    weighted_label_sums = {
                        'chinese_head': 0.0 + 1.0 + 2 * 0.0 + 2 * 1.0,
                        'english_head': 1.0 + 0.0 + 2 * 1.0 + 2 * 0.0,
                        'other_head': 0.0 + 0.0 + 2 * 0.0 + 2 * 1.0,
                    }
                    expected = {
                        metric_types.MetricKey(name='example_count'): 4,
                    }
                    for head, label_sum in weighted_label_sums.items():
                        mean_label_key = metric_types.MetricKey(
                            name='mean_label', output_name=head)
                        weighted_count_key = metric_types.MetricKey(
                            name='weighted_example_count', output_name=head)
                        expected[mean_label_key] = label_sum / total_weight
                        expected[weighted_count_key] = total_weight
                    self.assertDictElementsAlmostEqual(slice_metrics,
                                                       expected)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(results[constants.METRICS_KEY],
                             check_metrics,
                             label='metrics')
    def testEvaluateWithSlicing(self):
        """Evaluates a fixed-prediction estimator overall and per slice.

        Three examples are sliced by 'fixed_string', with 'fixed_float' as
        the example weight. The test expects three result rows (overall plus
        one per 'fixed_string' value) and checks the counts, weighted mean
        label and weighted mean prediction of each.
        """
        temp_export_dir = self._getExportDir()
        _, export_dir = (fixed_prediction_estimator_extra_fields.
                         simple_fixed_prediction_estimator_extra_fields(
                             None, temp_export_dir))
        eval_config = config.EvalConfig(
            model_specs=[
                config.ModelSpec(location=export_dir,
                                 label_key='label',
                                 example_weight_key='fixed_float')
            ],
            slicing_specs=[
                config.SlicingSpec(),
                config.SlicingSpec(feature_keys=['fixed_string']),
            ],
            metrics_specs=metric_specs.specs_from_metrics([
                calibration.MeanLabel('mean_label'),
                calibration.MeanPrediction('mean_prediction')
            ]))
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir)
        slice_spec = [
            slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
        ]
        extractors = [
            predict_extractor.PredictExtractor(
                eval_shared_model=eval_shared_model),
            slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
        ]
        evaluators = [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config,
                eval_shared_models=[eval_shared_model])
        ]

        # fixed_float used as example_weight key
        examples = [
            self._makeExample(prediction=0.2,
                              label=1.0,
                              fixed_int=1,
                              fixed_float=1.0,
                              fixed_string='fixed_string1'),
            self._makeExample(prediction=0.8,
                              label=0.0,
                              fixed_int=1,
                              fixed_float=1.0,
                              fixed_string='fixed_string1'),
            self._makeExample(prediction=0.5,
                              label=0.0,
                              fixed_int=2,
                              fixed_float=2.0,
                              fixed_string='fixed_string2')
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            metrics = (
                pipeline
                | 'Create' >> beam.Create(
                    [e.SerializeToString() for e in examples])
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                    extractors=extractors, evaluators=evaluators))

            # pylint: enable=no-value-for-parameter

            def check_metrics(got):
                """Asserts the metric values of all three slices."""
                try:
                    self.assertLen(got, 3)
                    slices = {}
                    for slice_key, value in got:
                        slices[slice_key] = value
                    overall_slice = ()
                    fixed_string1_slice = (('fixed_string',
                                            b'fixed_string1'), )
                    fixed_string2_slice = (('fixed_string',
                                            b'fixed_string2'), )
                    self.assertCountEqual(list(slices.keys()), [
                        overall_slice, fixed_string1_slice, fixed_string2_slice
                    ])
                    example_count_key = metric_types.MetricKey(
                        name='example_count')
                    weighted_example_count_key = metric_types.MetricKey(
                        name='weighted_example_count')
                    label_key = metric_types.MetricKey(name='mean_label')
                    pred_key = metric_types.MetricKey(name='mean_prediction')
                    self.assertDictElementsAlmostEqual(
                        slices[overall_slice], {
                            example_count_key: 3,
                            weighted_example_count_key: 4.0,
                            label_key:
                            (1.0 + 0.0 + 2 * 0.0) / (1.0 + 1.0 + 2.0),
                            pred_key:
                            (0.2 + 0.8 + 2 * 0.5) / (1.0 + 1.0 + 2.0),
                        })
                    self.assertDictElementsAlmostEqual(
                        slices[fixed_string1_slice], {
                            example_count_key: 2,
                            weighted_example_count_key: 2.0,
                            label_key: (1.0 + 0.0) / (1.0 + 1.0),
                            pred_key: (0.2 + 0.8) / (1.0 + 1.0),
                        })
                    self.assertDictElementsAlmostEqual(
                        slices[fixed_string2_slice], {
                            example_count_key: 1,
                            weighted_example_count_key: 2.0,
                            label_key: (2 * 0.0) / 2.0,
                            pred_key: (2 * 0.5) / 2.0,
                        })

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            # Registered at pipeline level so check_metrics is actually run
            # against the metrics output.
            util.assert_that(metrics[constants.METRICS_KEY],
                             check_metrics,
                             label='metrics')
    def testEvaluateWithMultiClassModel(self):
        """Evaluates a 3-class DNN classifier with per-class mean_label.

        Four examples, weighted by their 'age' feature, are pushed through
        the extract/evaluate pipeline for the overall slice only. The
        resulting metrics must contain example_count, weighted_example_count
        and one binarized mean_label value for each class id.
        """
        num_classes = 3
        _, export_dir = dnn_classifier.simple_dnn_classifier(
            None, self._getExportDir(), n_classes=num_classes)

        # Only mean_label is requested explicitly; the assertions below also
        # expect example_count and weighted_example_count in the output.
        model_spec = config.ModelSpec(location=export_dir,
                                      label_key='label',
                                      example_weight_key='age')
        per_class = config.BinarizationOptions(class_ids=range(num_classes))
        eval_config = config.EvalConfig(
            model_specs=[model_spec],
            slicing_specs=[config.SlicingSpec()],
            metrics_specs=metric_specs.specs_from_metrics(
                [calibration.MeanLabel('mean_label')], binarize=per_class))
        shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

        single_slice_specs = [
            slicer.SingleSliceSpec(spec=spec)
            for spec in eval_config.slicing_specs
        ]
        extractors = [
            input_extractor.InputExtractor(eval_config=eval_config),
            predict_extractor_v2.PredictExtractor(
                eval_config=eval_config, eval_shared_models=[shared_model]),
            slice_key_extractor.SliceKeyExtractor(
                slice_spec=single_slice_specs),
        ]
        evaluators = [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config, eval_shared_models=[shared_model]),
        ]

        # (age, language, label); 'age' doubles as the example weight.
        rows = [
            (1.0, 'english', 0),
            (2.0, 'chinese', 1),
            (3.0, 'english', 2),
            (4.0, 'chinese', 1),
        ]
        serialized_examples = [
            self._makeExample(age=age, language=language,
                              label=label).SerializeToString()
            for age, language, label in rows
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            create = 'Create' >> beam.Create(serialized_examples)
            to_extracts = (
                'InputsToExtracts' >> model_eval_lib.InputsToExtracts())
            extract_and_evaluate = (
                'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                    extractors=extractors, evaluators=evaluators))
            metrics = pipeline | create | to_extracts | extract_and_evaluate
            # pylint: enable=no-value-for-parameter

            def check_metrics(got):
                """Asserts the single overall slice carries the expected values."""
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    mean_label_keys = {
                        class_id: metric_types.MetricKey(
                            name='mean_label',
                            sub_key=metric_types.SubKey(class_id=class_id))
                        for class_id in range(num_classes)
                    }
                    total_weight = 1.0 + 2.0 + 3.0 + 4.0
                    # Per-class mean label = (sum of weights of the examples
                    # labelled with that class) / (sum of all weights).
                    expected = {
                        metric_types.MetricKey(name='example_count'):
                        4,
                        metric_types.MetricKey(name='weighted_example_count'):
                        total_weight,
                        mean_label_keys[0]:
                        1.0 / total_weight,
                        mean_label_keys[1]:
                        (2.0 + 4.0) / total_weight,
                        mean_label_keys[2]:
                        3.0 / total_weight,
                    }
                    self.assertEqual(got_slice_key, ())
                    self.assertDictElementsAlmostEqual(got_metrics, expected)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics[constants.METRICS_KEY],
                             check_metrics,
                             label='metrics')
コード例 #23
0
 def to_proto(self) -> config.SlicingSpec:
     """Builds a config.SlicingSpec proto from this object's columns/features.

     Each (key, value) pair in self._features is copied into feature_values
     with the value converted via str(); self._columns is passed through as
     feature_keys.
     """
     stringified_values = {}
     for feature_key, feature_value in self._features:
         stringified_values[feature_key] = str(feature_value)
     return config.SlicingSpec(
         feature_keys=self._columns, feature_values=stringified_values)
コード例 #24
0
  def testWriteValidationResults(self, output_file_format):
    """Writes validation results for candidate vs. baseline and checks them.

    Two Keras models are built with self._build_keras_model (mul=0 for the
    candidate, mul=1 for the baseline), two examples are evaluated through a
    Beam pipeline, and the validation output is written and read back. The
    test expects validation to fail with exactly three failures on the single
    slice: weighted_example_count, example_count and the mean_label diff.
    MeanPrediction is configured with thresholds too but is not among the
    expected failures.

    Args:
      output_file_format: Forwarded to MetricsPlotsAndValidationsWriter as
        its output_file_format argument.
    """
    model_dir, baseline_dir = self._getExportDir(), self._getBaselineDir()
    eval_shared_model = self._build_keras_model(model_dir, mul=0)
    baseline_eval_shared_model = self._build_keras_model(baseline_dir, mul=1)
    validations_file = os.path.join(self._getTempDir(),
                                    constants.VALIDATIONS_KEY)
    # Schema declaring a dense [1] "input" tensor plus the float "label" and
    # "example_weight" features and a bytes "extra_feature".
    schema = text_format.Parse(
        """
        tensor_representation_group {
          key: ""
          value {
            tensor_representation {
              key: "input"
              value {
                dense_tensor {
                  column_name: "input"
                  shape { dim { size: 1 } }
                }
              }
            }
          }
        }
        feature {
          name: "input"
          type: FLOAT
        }
        feature {
          name: "label"
          type: FLOAT
        }
        feature {
          name: "example_weight"
          type: FLOAT
        }
        feature {
          name: "extra_feature"
          type: BYTES
        }
        """, schema_pb2.Schema())
    tfx_io = test_util.InMemoryTFExampleRecord(
        schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN)
    tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
        arrow_schema=tfx_io.ArrowSchema(),
        tensor_representations=tfx_io.TensorRepresentations())
    # Two examples; their example weights sum to 1.0 + 0.5 = 1.5.
    examples = [
        self._makeExample(
            input=0.0,
            label=1.0,
            example_weight=1.0,
            extra_feature='non_model_feature'),
        self._makeExample(
            input=1.0,
            label=0.0,
            example_weight=0.5,
            extra_feature='non_model_feature'),
    ]

    eval_config = config.EvalConfig(
        model_specs=[
            config.ModelSpec(
                name='candidate',
                label_key='label',
                example_weight_key='example_weight'),
            config.ModelSpec(
                name='baseline',
                label_key='label',
                example_weight_key='example_weight',
                is_baseline=True)
        ],
        slicing_specs=[config.SlicingSpec()],
        metrics_specs=[
            config.MetricsSpec(
                metrics=[
                    config.MetricConfig(
                        class_name='WeightedExampleCount',
                        # Value is 1.5; 1.5 < 1 does not hold, NOT OK.
                        threshold=config.MetricThreshold(
                            value_threshold=config.GenericValueThreshold(
                                upper_bound={'value': 1}))),
                    config.MetricConfig(
                        class_name='ExampleCount',
                        # Value is 2; 2 > 10 does not hold, NOT OK.
                        threshold=config.MetricThreshold(
                            value_threshold=config.GenericValueThreshold(
                                lower_bound={'value': 10}))),
                    config.MetricConfig(
                        class_name='MeanLabel',
                        # Diff is 0; 0 > 0 and 0 > 0%?: NOT OK.
                        threshold=config.MetricThreshold(
                            change_threshold=config.GenericChangeThreshold(
                                direction=config.MetricDirection
                                .HIGHER_IS_BETTER,
                                relative={'value': 0},
                                absolute={'value': 0}))),
                    config.MetricConfig(
                        # MeanPrediction = (0+0)/(1+0.5) = 0
                        class_name='MeanPrediction',
                        # -.01 < 0 < .01, OK.
                        # Diff% = -.333/.333 = -100% < -99%, OK.
                        # Diff = 0 - .333 = -.333 < 0, OK.
                        threshold=config.MetricThreshold(
                            value_threshold=config.GenericValueThreshold(
                                upper_bound={'value': .01},
                                lower_bound={'value': -.01}),
                            change_threshold=config.GenericChangeThreshold(
                                direction=config.MetricDirection
                                .LOWER_IS_BETTER,
                                relative={'value': -.99},
                                absolute={'value': 0})))
                ],
                model_names=['candidate', 'baseline']),
        ],
        options=config.Options(
            disabled_outputs={'values': ['eval_config.json']}),
    )
    slice_spec = [
        slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
    ]
    # Keys match the model names declared in eval_config.model_specs.
    eval_shared_models = {
        'candidate': eval_shared_model,
        'baseline': baseline_eval_shared_model
    }
    extractors = [
        batched_input_extractor.BatchedInputExtractor(eval_config),
        batched_predict_extractor_v2.BatchedPredictExtractor(
            eval_shared_model=eval_shared_models,
            eval_config=eval_config,
            tensor_adapter_config=tensor_adapter_config),
        unbatch_extractor.UnbatchExtractor(),
        slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
    ]
    evaluators = [
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=eval_config, eval_shared_model=eval_shared_models)
    ]
    # Only the validations output path is configured for the writer.
    output_paths = {
        constants.VALIDATIONS_KEY: validations_file,
    }
    writers = [
        metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
            output_paths,
            add_metrics_callbacks=[],
            output_file_format=output_file_format)
    ]

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      _ = (
          pipeline
          | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
          | 'BatchExamples' >> tfx_io.BeamSource()
          | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
          | 'ExtractEvaluate' >> model_eval_lib.ExtractAndEvaluate(
              extractors=extractors, evaluators=evaluators)
          | 'WriteResults' >> model_eval_lib.WriteResults(writers=writers))
      # pylint: enable=no-value-for-parameter

    # Read the result back from the directory containing validations_file.
    validation_result = (
        metrics_plots_and_validations_writer
        .load_and_deserialize_validation_result(
            os.path.dirname(validations_file)))

    # One expected failure per threshold marked NOT OK in eval_config above.
    expected_validations = [
        text_format.Parse(
            """
            metric_key {
              name: "weighted_example_count"
              model_name: "candidate"
            }
            metric_threshold {
              value_threshold {
                upper_bound {
                  value: 1.0
                }
              }
            }
            metric_value {
              double_value {
                value: 1.5
              }
            }
            """, validation_result_pb2.ValidationFailure()),
        text_format.Parse(
            """
            metric_key {
              name: "example_count"
              model_name: "candidate"
            }
            metric_threshold {
              value_threshold {
                lower_bound {
                  value: 10.0
                }
              }
            }
            metric_value {
              double_value {
                value: 2.0
              }
            }
            """, validation_result_pb2.ValidationFailure()),
        text_format.Parse(
            """
            metric_key {
              name: "mean_label"
              model_name: "candidate"
              is_diff: true
            }
            metric_threshold {
              change_threshold {
                absolute {
                  value: 0.0
                }
                relative {
                  value: 0.0
                }
                direction: HIGHER_IS_BETTER
              }
            }
            metric_value {
              double_value {
                value: 0.0
              }
            }
            """, validation_result_pb2.ValidationFailure()),
    ]
    self.assertFalse(validation_result.validation_ok)
    self.assertLen(validation_result.metric_validations_per_slice, 1)
    # Order of failures is not significant, only the set of them.
    self.assertCountEqual(
        expected_validations,
        validation_result.metric_validations_per_slice[0].failures)
コード例 #25
0
 def testIsCrossSliceApplicable(self):
     """Table-driven check of slicer.is_cross_slice_applicable.

     Each row is (expected_result, case name, cross slice key,
     CrossSlicingSpec). The cross slice key is a (baseline slice key,
     comparison slice key) pair. Every row runs inside its own subTest so
     that one failing case does not hide the results of the rows after it.
     """
     test_cases = [
         (True, 'overall pass', ((), (('b', 2), )),
          config.CrossSlicingSpec(
              baseline_spec=config.SlicingSpec(),
              slicing_specs=[config.SlicingSpec(feature_values={'b': '2'})
                             ])),
         (True, 'value pass', ((('a', 1), ), (('b', 2), )),
          config.CrossSlicingSpec(
              baseline_spec=config.SlicingSpec(feature_values={'a': '1'}),
              slicing_specs=[config.SlicingSpec(feature_values={'b': '2'})
                             ])),
         (True, 'baseline key pass', ((('a', 1), ), (('b', 2), )),
          config.CrossSlicingSpec(
              baseline_spec=config.SlicingSpec(feature_keys=['a']),
              slicing_specs=[config.SlicingSpec(feature_values={'b': '2'})
                             ])),
         (True, 'comparison key pass', ((('a', 1), ), (('b', 2), )),
          config.CrossSlicingSpec(
              baseline_spec=config.SlicingSpec(feature_values={'a': '1'}),
              slicing_specs=[config.SlicingSpec(feature_keys=['b'])])),
         (True, 'comparison multiple key pass',
          ((('a', 1), ), (('c', 3), )),
          config.CrossSlicingSpec(
              baseline_spec=config.SlicingSpec(feature_values={'a': '1'}),
              slicing_specs=[
                  config.SlicingSpec(feature_keys=['b']),
                  config.SlicingSpec(feature_keys=['c'])
              ])),
         (False, 'overall fail', ((('a', 1), ), (('b', 2), )),
          config.CrossSlicingSpec(
              baseline_spec=config.SlicingSpec(),
              slicing_specs=[config.SlicingSpec(feature_values={'b': '2'})
                             ])),
         (False, 'value fail', ((('a', 1), ), (('b', 3), )),
          config.CrossSlicingSpec(
              baseline_spec=config.SlicingSpec(feature_values={'a': '1'}),
              slicing_specs=[config.SlicingSpec(feature_values={'b': '2'})
                             ])),
         (False, 'baseline key fail', ((('c', 1), ), (('b', 2), )),
          config.CrossSlicingSpec(
              baseline_spec=config.SlicingSpec(feature_keys=['a']),
              slicing_specs=[config.SlicingSpec(feature_values={'b': '2'})
                             ])),
         (False, 'comparison key fail', ((('a', 1), ), (('c', 3), )),
          config.CrossSlicingSpec(
              baseline_spec=config.SlicingSpec(feature_values={'a': '1'}),
              slicing_specs=[config.SlicingSpec(feature_keys=['b'])])),
         (False, 'comparison multiple key fail',
          ((('a', 1), ), (('d', 3), )),
          config.CrossSlicingSpec(
              baseline_spec=config.SlicingSpec(feature_values={'a': '1'}),
              slicing_specs=[
                  config.SlicingSpec(feature_keys=['b']),
                  config.SlicingSpec(feature_keys=['c'])
              ])),
     ]  # pyformat: disable
     for (expected_result, name, sliced_key, slicing_spec) in test_cases:
         # subTest isolates each row: a failure is recorded under the case
         # name and the loop continues with the remaining rows.
         with self.subTest(name):
             self.assertEqual(expected_result,
                              slicer.is_cross_slice_applicable(
                                  cross_slice_key=sliced_key,
                                  cross_slicing_spec=slicing_spec),
                              msg=name)
コード例 #26
0
 def testRunModelAnalysisWithModelAgnosticPredictions(self):
     """Runs run_model_analysis on examples that carry their own predictions.

     No model location is configured: the ModelSpec only names the
     prediction, label and example-weight features. Metrics are sliced by
     'language' and compared against hand-computed values.
     """

     def _double(value):
         """Wraps a number the way the expected metric values are shaped."""
         return {'doubleValue': value}

     # (age, language, label, prediction); 'age' is the example weight.
     rows = [
         (3.0, 'english', 1.0, 0.9),
         (3.0, 'chinese', 0.0, 0.4),
         (4.0, 'english', 1.0, 0.7),
         (5.0, 'chinese', 1.0, 0.2),
     ]
     examples = [
         self._makeExample(age=age,
                           language=language,
                           label=label,
                           prediction=prediction)
         for age, language, label, prediction in rows
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)

     metric_class_names = ('ExampleCount', 'WeightedExampleCount',
                           'BinaryAccuracy')
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(prediction_key='prediction',
                              label_key='label',
                              example_weight_key='age')
         ],
         metrics_specs=[
             config.MetricsSpec(metrics=[
                 config.MetricConfig(class_name=class_name)
                 for class_name in metric_class_names
             ])
         ],
         slicing_specs=[config.SlicingSpec(feature_keys=['language'])])
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config,
         data_location=data_location,
         output_path=self._getTempDir())

     # chinese: weights 3 + 5 = 8, expected weighted accuracy 0.375 (= 3/8).
     # english: weights 3 + 4 = 7, expected accuracy 1.0.
     expected = {
         (('language', 'chinese'), ): {
             'binary_accuracy': _double(0.375),
             'weighted_example_count': _double(8.0),
             'example_count': _double(2.0),
         },
         (('language', 'english'), ): {
             'binary_accuracy': _double(1.0),
             'weighted_example_count': _double(7.0),
             'example_count': _double(2.0),
         },
     }
     self.assertEqual(eval_result.data_location, data_location)
     self.assertEqual(eval_result.config.slicing_specs[0],
                      config.SlicingSpec(feature_keys=['language']))
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
    def testEvaluateWithQueryBasedMetrics(self):
        """Evaluates query-grouped NDCG@1 and NDCG@2 overall and per query.

        Examples are grouped by 'fixed_string' (the query key), ranked by
        prediction, scored with 'fixed_float' as the gain and weighted by
        'fixed_int'. Metrics are checked for the overall slice and for each
        of the three query slices.
        """
        _, export_dir = (fixed_prediction_estimator_extra_fields.
                         simple_fixed_prediction_estimator_extra_fields(
                             None, self._getExportDir()))
        ndcg_specs = metric_specs.specs_from_metrics(
            [ndcg.NDCG(gain_key='fixed_float', name='ndcg')],
            binarize=config.BinarizationOptions(top_k_list=[1, 2]),
            query_key='fixed_string')
        eval_config = config.EvalConfig(
            model_specs=[
                config.ModelSpec(location=export_dir,
                                 label_key='label',
                                 example_weight_key='fixed_int')
            ],
            slicing_specs=[
                config.SlicingSpec(),
                config.SlicingSpec(feature_keys=['fixed_string']),
            ],
            metrics_specs=ndcg_specs)
        shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
        single_slice_specs = [
            slicer.SingleSliceSpec(spec=spec)
            for spec in eval_config.slicing_specs
        ]
        extractors = [
            input_extractor.InputExtractor(eval_config=eval_config),
            predict_extractor_v2.PredictExtractor(
                eval_config=eval_config, eval_shared_models=[shared_model]),
            slice_key_extractor.SliceKeyExtractor(
                slice_spec=single_slice_specs),
        ]
        evaluators = [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config, eval_shared_models=[shared_model]),
        ]

        # Columns: (prediction, label, fixed_float, fixed_string, fixed_int)
        #   fixed_string -> query_key
        #   fixed_float  -> gain_key for NDCG
        #   fixed_int    -> example_weight_key for NDCG
        rows = [
            (0.2, 1.0, 1.0, 'query1', 1),
            (0.8, 0.0, 0.5, 'query1', 1),
            (0.5, 0.0, 0.5, 'query2', 2),
            (0.9, 1.0, 1.0, 'query2', 2),
            (0.1, 0.0, 0.1, 'query2', 2),
            (0.9, 1.0, 1.0, 'query3', 3),
        ]
        serialized_examples = [
            self._makeExample(prediction=prediction,
                              label=label,
                              fixed_float=gain,
                              fixed_string=query,
                              fixed_int=weight).SerializeToString()
            for prediction, label, gain, query, weight in rows
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            create = 'Create' >> beam.Create(serialized_examples)
            to_extracts = (
                'InputsToExtracts' >> model_eval_lib.InputsToExtracts())
            extract_and_evaluate = (
                'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                    extractors=extractors, evaluators=evaluators))
            metrics = pipeline | create | to_extracts | extract_and_evaluate
            # pylint: enable=no-value-for-parameter

            def check_metrics(got):
                """Asserts the overall and per-query slices hold expected values."""
                try:
                    self.assertLen(got, 4)
                    metrics_by_slice = {
                        slice_key: value
                        for slice_key, value in got
                    }
                    overall_slice = ()
                    query1_slice = (('fixed_string', b'query1'), )
                    query2_slice = (('fixed_string', b'query2'), )
                    query3_slice = (('fixed_string', b'query3'), )
                    self.assertCountEqual(list(metrics_by_slice), [
                        overall_slice, query1_slice, query2_slice, query3_slice
                    ])
                    count_key = metric_types.MetricKey(name='example_count')
                    weighted_count_key = metric_types.MetricKey(
                        name='weighted_example_count')
                    ndcg_at_1_key = metric_types.MetricKey(
                        name='ndcg', sub_key=metric_types.SubKey(top_k=1))
                    ndcg_at_2_key = metric_types.MetricKey(
                        name='ndcg', sub_key=metric_types.SubKey(top_k=2))
                    # Ranked by prediction (p), with gain (g):
                    #   Query1 (weight=1): (p=0.8, g=0.5) (p=0.2, g=1.0)
                    #   Query2 (weight=2): (p=0.9, g=1.0) (p=0.5, g=0.5)
                    #                      (p=0.1, g=0.1)
                    #   Query3 (weight=3): (p=0.9, g=1.0)
                    #
                    # DCG@1:  0.5, 1.0, 1.0
                    # NDCG@1: 0.5, 1.0, 1.0
                    # Average NDCG@1:
                    #   (1 * 0.5 + 2 * 1.0 + 3 * 1.0) / (1 + 2 + 3) ~ 0.92
                    #
                    # DCG@2: 0.5 + 1.0/log2(3) ~ 1.130930
                    #        1.0 + 0.5/log2(3) ~ 1.315465
                    #        1.0
                    # NDCG@2: (0.5 + 1.0/log2(3)) / (1.0 + 0.5/log2(3))
                    #           ~ 0.85972
                    #         (1.0 + 0.5/log2(3)) / (1.0 + 0.5/log2(3)) = 1.0
                    #         1.0
                    # Average NDCG@2:
                    #   (1 * 0.860 + 2 * 1.0 + 3 * 1.0) / (1 + 2 + 3) ~ 0.97
                    #
                    # Rows: (slice, example_count, weighted_example_count,
                    #        ndcg@1, ndcg@2)
                    expected_rows = [
                        (overall_slice, 6, 11.0, 0.9166667, 0.9766198),
                        (query1_slice, 2, 2.0, 0.5, 0.85972),
                        (query2_slice, 3, 6.0, 1.0, 1.0),
                        (query3_slice, 1, 3.0, 1.0, 1.0),
                    ]
                    for (slice_key, count, weighted_count, ndcg_at_1,
                         ndcg_at_2) in expected_rows:
                        self.assertDictElementsAlmostEqual(
                            metrics_by_slice[slice_key], {
                                count_key: count,
                                weighted_count_key: weighted_count,
                                ndcg_at_1_key: ndcg_at_1,
                                ndcg_at_2_key: ndcg_at_2
                            })

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics[constants.METRICS_KEY],
                             check_metrics,
                             label='metrics')
コード例 #28
0
def validate_metrics(
        sliced_metrics: Tuple[Union[slicer.SliceKeyType,
                                    slicer.CrossSliceKeyType],
                              Dict['metric_types.MetricKey',
                                   Any]], eval_config: config.EvalConfig
) -> validation_result_pb2.ValidationResult:
    """Validate the metrics of one slice against the configured thresholds.

    Every threshold derived from ``eval_config.metrics_specs`` whose slice
    spec applies to the given slice (or cross slice) is checked against the
    corresponding metric value. Thresholds whose metric key is absent from
    the metrics are reported as failures with the message
    'Metric not found.'. Metrics and thresholds that belong to the baseline
    model are never checked.

    Args:
      sliced_metrics: Tuple of (slice key or cross slice key, dict mapping
        metric keys to metric values).
      eval_config: Eval config providing the model specs (used to find the
        baseline model) and the metrics specs (used to derive thresholds).

    Returns:
      A ValidationResult. ``validation_ok`` is False if at least one
      threshold failed; in that case all failures are attached as a single
      MetricsValidationForSlice keyed by the slice. Slice specs for which a
      check was carried out are recorded in
      ``validation_details.slicing_details``.

    Raises:
      ValueError: If a change threshold has an unknown direction.
      KeyError: If a relative change threshold is configured but the
        baseline value of the metric is missing from the metrics dict.
    """
    # Find out which model is baseline.
    baseline_spec = model_util.get_baseline_model_spec(eval_config)
    baseline_model_name = baseline_spec.name if baseline_spec else None

    sliced_key, metrics = sliced_metrics
    thresholds = metric_specs.metric_thresholds_from_metrics_specs(
        eval_config.metrics_specs)  # pytype: disable=wrong-arg-types
    is_cross_slice = slicer.is_cross_slice_key(sliced_key)

    def _relative_change(key: metric_types.MetricKey, diff: float) -> float:
        """Returns diff divided by the baseline value of the metric."""
        baseline = metrics[key.make_baseline_key(baseline_model_name)]
        if baseline == 0:
            # Plain Python floats raise ZeroDivisionError here. Use the
            # IEEE-754 result instead: a signed infinity for a non-zero
            # diff and NaN (which fails every comparison) otherwise.
            if diff > 0:
                return np.inf
            if diff < 0:
                return -np.inf
            return np.nan
        return diff / baseline

    def _check_threshold(key: metric_types.MetricKey,
                         threshold: _ThresholdType, metric: Any) -> bool:
        """Verify a metric given its metric key and (float) metric value.

        All bounds are exclusive: a value equal to a bound fails the check.
        """
        if isinstance(threshold, config.GenericValueThreshold):
            lower_bound, upper_bound = -np.inf, np.inf
            if threshold.HasField('lower_bound'):
                lower_bound = threshold.lower_bound.value
            if threshold.HasField('upper_bound'):
                upper_bound = threshold.upper_bound.value
            return lower_bound < metric < upper_bound
        if isinstance(threshold, config.GenericChangeThreshold):
            lower_is_better = (
                threshold.direction == config.MetricDirection.LOWER_IS_BETTER)
            if not lower_is_better and (
                    threshold.direction !=
                    config.MetricDirection.HIGHER_IS_BETTER):
                raise ValueError('"UNKNOWN" direction for change threshold.')
            # The metric stored under a diff key already is the difference
            # to the baseline.
            diff = metric
            # An unset absolute bound never rejects a finite diff.
            absolute = np.inf if lower_is_better else -np.inf
            if threshold.HasField('absolute'):
                absolute = threshold.absolute.value
            if not (diff < absolute if lower_is_better else diff > absolute):
                return False
            # Only look up the baseline and divide when a relative bound is
            # actually configured; otherwise a zero or missing baseline
            # would break checks that never asked for a ratio.
            if not threshold.HasField('relative'):
                return True
            ratio = _relative_change(key, diff)
            relative = threshold.relative.value
            return bool(
                ratio < relative if lower_is_better else ratio > relative)
        # Unsupported threshold type: treated as a failed check.
        return False

    def _applies_to_slice(slice_spec) -> bool:
        """Whether a threshold's slice spec matches the slice being checked."""
        if slice_spec is None:
            return True
        if isinstance(slice_spec, config.SlicingSpec):
            return (not is_cross_slice and slicer.SingleSliceSpec(
                spec=slice_spec).is_slice_applicable(sliced_key))
        if isinstance(slice_spec, config.CrossSlicingSpec):
            return (is_cross_slice and slicer.is_cross_slice_applicable(
                cross_slice_key=sliced_key, cross_slicing_spec=slice_spec))
        return True

    def _copy_metric(metric, to):
        # Will add more types when more MetricValue are supported.
        to.double_value.value = float(metric)

    def _copy_threshold(threshold, to):
        if isinstance(threshold, config.GenericValueThreshold):
            to.value_threshold.CopyFrom(threshold)
        if isinstance(threshold, config.GenericChangeThreshold):
            to.change_threshold.CopyFrom(threshold)

    def _add_to_set(s, v):
        """Adds value to set. Returns true if didn't exist."""
        if v in s:
            return False
        s.add(v)
        return True

    # Empty metrics per slice is considered validated.
    result = validation_result_pb2.ValidationResult(validation_ok=True)
    validation_for_slice = validation_result_pb2.MetricsValidationForSlice()
    unchecked_thresholds = dict(thresholds)
    for metric_key, metric in metrics.items():
        if metric_key not in thresholds:
            continue
        del unchecked_thresholds[metric_key]
        # Not meaningful to check threshold for baseline model, thus such
        # thresholds are always considered satisfied.
        if metric_key.model_name == baseline_model_name:
            continue
        # Thresholds can only be compared with float-convertible values. Any
        # other value fails every applicable threshold and the message below
        # explains why; it must not reach the comparison code.
        msg = ''
        try:
            metric_value = float(metric)
        except (TypeError, ValueError):
            metric_value = None
            msg = """
        Invalid threshold config: This metric is not comparable to the
        threshold. The type of the metric is: {}, and the metric value is:
        \n{}""".format(type(metric), metric)
        existing_failures = set()
        for slice_spec, threshold in thresholds[metric_key]:
            if not _applies_to_slice(slice_spec):
                continue
            if metric_value is None or not _check_threshold(
                    metric_key, threshold, metric_value):
                # The same threshold values could be set for multiple matching
                # slice specs. Only store the first match.
                #
                # Note that hashing by SerializeToString() is only safe if used
                # within the same process.
                if not _add_to_set(existing_failures,
                                   threshold.SerializeToString()):
                    continue
                failure = validation_for_slice.failures.add()
                failure.metric_key.CopyFrom(metric_key.to_proto())
                if metric_value is not None:
                    _copy_metric(metric_value, failure.metric_value)
                _copy_threshold(threshold, failure.metric_threshold)
                failure.message = msg
            # Track we have completed a validation check for slice spec and
            # metric.
            slicing_details = result.validation_details.slicing_details.add()
            if slice_spec is not None:
                if isinstance(slice_spec, config.SlicingSpec):
                    slicing_details.slicing_spec.CopyFrom(slice_spec)
                else:
                    slicing_details.cross_slicing_spec.CopyFrom(slice_spec)
            else:
                slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
            slicing_details.num_matching_slices = 1
    # All unchecked thresholds are considered failures.
    for metric_key, missing_thresholds in unchecked_thresholds.items():
        if metric_key.model_name == baseline_model_name:
            continue
        existing_failures = set()
        for _, threshold in missing_thresholds:
            # The same threshold values could be set for multiple matching slice
            # specs. Only store the first match.
            #
            # Note that hashing by SerializeToString() is only safe if used
            # within the same process.
            if not _add_to_set(existing_failures,
                               threshold.SerializeToString()):
                continue
            failure = validation_for_slice.failures.add()
            failure.metric_key.CopyFrom(metric_key.to_proto())
            _copy_threshold(threshold, failure.metric_threshold)
            failure.message = 'Metric not found.'
    # Any failure leads to overall failure.
    if validation_for_slice.failures:
        if not is_cross_slice:
            validation_for_slice.slice_key.CopyFrom(
                slicer.serialize_slice_key(sliced_key))
        else:
            validation_for_slice.cross_slice_key.CopyFrom(
                slicer.serialize_cross_slice_key(sliced_key))
        result.validation_ok = False
        result.metric_validations_per_slice.append(validation_for_slice)
    return result
  def testWriteValidationResults(self):
    """Checks the validation failures written for a candidate vs. baseline run.

    Two keras models (candidate built with mul=0, baseline with mul=1) are
    evaluated on two weighted examples. Four thresholds are configured; the
    comments next to each one state whether it is expected to hold. The
    validation result is then loaded back from disk and compared with the
    three expected failures.
    """
    candidate_dir, baseline_dir = self._getExportDir(), self._getBaselineDir()
    candidate_model = self._build_keras_model(candidate_dir, mul=0)
    baseline_model = self._build_keras_model(baseline_dir, mul=1)
    validations_file = os.path.join(self._getTempDir(),
                                    constants.VALIDATIONS_KEY)
    examples = [
        self._makeExample(
            input=feature,
            label=label,
            example_weight=weight,
            extra_feature='non_model_feature')
        for feature, label, weight in ((0.0, 1.0, 1.0), (1.0, 0.0, 0.5))
    ]

    # 1.5 < 1, NOT OK.
    weighted_example_count_threshold = config.MetricThreshold(
        value_threshold=config.GenericValueThreshold(
            upper_bound={'value': 1}))
    # 2 > 10, NOT OK.
    example_count_threshold = config.MetricThreshold(
        value_threshold=config.GenericValueThreshold(
            lower_bound={'value': 10}))
    # 0 > 0 and 0 > 0%?: NOT OK.
    mean_label_threshold = config.MetricThreshold(
        change_threshold=config.GenericChangeThreshold(
            direction=config.MetricDirection.HIGHER_IS_BETTER,
            relative={'value': 0},
            absolute={'value': 0}))
    # MeanPrediction = (0+0)/(1+0.5) = 0
    # -.01 < 0 < .01, OK.
    # Diff% = -.333/.333 = -100% < -99%, OK.
    # Diff = 0 - .333 = -.333 < 0, OK.
    mean_prediction_threshold = config.MetricThreshold(
        value_threshold=config.GenericValueThreshold(
            upper_bound={'value': .01}, lower_bound={'value': -.01}),
        change_threshold=config.GenericChangeThreshold(
            direction=config.MetricDirection.LOWER_IS_BETTER,
            relative={'value': -.99},
            absolute={'value': 0}))

    candidate_spec = config.ModelSpec(
        name='candidate',
        label_key='label',
        example_weight_key='example_weight')
    baseline_spec = config.ModelSpec(
        name='baseline',
        label_key='label',
        example_weight_key='example_weight',
        is_baseline=True)
    thresholded_metrics = [
        config.MetricConfig(
            class_name='WeightedExampleCount',
            threshold=weighted_example_count_threshold),
        config.MetricConfig(
            class_name='ExampleCount', threshold=example_count_threshold),
        config.MetricConfig(
            class_name='MeanLabel', threshold=mean_label_threshold),
        config.MetricConfig(
            class_name='MeanPrediction',
            threshold=mean_prediction_threshold),
    ]
    eval_config = config.EvalConfig(
        model_specs=[candidate_spec, baseline_spec],
        slicing_specs=[config.SlicingSpec()],
        metrics_specs=[
            config.MetricsSpec(
                metrics=thresholded_metrics,
                model_names=['candidate', 'baseline']),
        ],
        options=config.Options(
            disabled_outputs={'values': ['eval_config.json']}),
    )

    shared_models_by_name = {
        'candidate': candidate_model,
        'baseline': baseline_model
    }
    single_slice_specs = [
        slicer.SingleSliceSpec(spec=spec) for spec in eval_config.slicing_specs
    ]
    extractors = [
        input_extractor.InputExtractor(eval_config),
        predict_extractor_v2.PredictExtractor(
            eval_shared_model=shared_models_by_name, eval_config=eval_config),
        slice_key_extractor.SliceKeyExtractor(slice_spec=single_slice_specs)
    ]
    evaluators = [
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=eval_config, eval_shared_model=shared_models_by_name)
    ]
    writers = [
        metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
            {constants.VALIDATIONS_KEY: validations_file},
            add_metrics_callbacks=[])
    ]

    with beam.Pipeline() as pipeline:
      serialized_examples = [e.SerializeToString() for e in examples]

      # pylint: disable=no-value-for-parameter
      _ = (
          pipeline
          | 'Create' >> beam.Create(serialized_examples)
          | 'ExtractEvaluateAndWriteResults' >>
          model_eval_lib.ExtractEvaluateAndWriteResults(
              eval_config=eval_config,
              eval_shared_model=candidate_model,
              extractors=extractors,
              evaluators=evaluators,
              writers=writers))
      # pylint: enable=no-value-for-parameter

    validation_result = model_eval_lib.load_validation_result(
        os.path.dirname(validations_file))

    def parse_failure(text_proto):
      # Parses a text-format ValidationFailure proto.
      return text_format.Parse(text_proto,
                               validation_result_pb2.ValidationFailure())

    expected_validations = [
        parse_failure(
            """
            metric_key {
              name: "weighted_example_count"
              model_name: "candidate"
            }
            metric_threshold {
              value_threshold {
                upper_bound {
                  value: 1.0
                }
              }
            }
            metric_value {
              double_value {
                value: 1.5
              }
            }
            """),
        parse_failure(
            """
            metric_key {
              name: "example_count"
            }
            metric_threshold {
              value_threshold {
                lower_bound {
                  value: 10.0
                }
              }
            }
            metric_value {
              double_value {
                value: 2.0
              }
            }
            """),
        parse_failure(
            """
            metric_key {
              name: "mean_label"
              model_name: "candidate"
              is_diff: true
            }
            metric_threshold {
              change_threshold {
                absolute {
                  value: 0.0
                }
                relative {
                  value: 0.0
                }
                direction: HIGHER_IS_BETTER
              }
            }
            metric_value {
              double_value {
                value: 0.0
              }
            }
            """),
    ]
    self.assertFalse(validation_result.validation_ok)
    validations_per_slice = validation_result.metric_validations_per_slice
    self.assertLen(validations_per_slice, 1)
    self.assertCountEqual(expected_validations,
                          validations_per_slice[0].failures)
コード例 #30
0
    def testMetricThresholdsFromMetricsSpecs(self):
        """Checks the keys and threshold counts derived from metrics specs.

        Builds several MetricsSpecs that declare thresholds in different
        places (spec-level maps, per-metric, per-slice and cross-slice),
        passes them to metric_specs.metric_thresholds_from_metrics_specs and
        asserts that the result contains exactly the expected metric keys,
        each with the expected number of thresholds.
        """
        slice_specs = [
            config.SlicingSpec(feature_keys=['feature1']),
            config.SlicingSpec(feature_values={'feature2': 'value1'})
        ]

        # For cross slice tests.
        baseline_slice_spec = config.SlicingSpec(feature_keys=['feature3'])

        metrics_specs = [
            # Thresholds supplied through the spec-level maps (thresholds,
            # per_slice_thresholds and cross_slice_thresholds), keyed by
            # metric name.
            config.MetricsSpec(
                thresholds={
                    'auc':
                    config.MetricThreshold(
                        value_threshold=config.GenericValueThreshold()),
                    'mean/label':
                    config.MetricThreshold(
                        value_threshold=config.GenericValueThreshold(),
                        change_threshold=config.GenericChangeThreshold()),
                    'mse':
                    config.MetricThreshold(
                        change_threshold=config.GenericChangeThreshold())
                },
                per_slice_thresholds={
                    'auc':
                    config.PerSliceMetricThresholds(thresholds=[
                        config.PerSliceMetricThreshold(
                            slicing_specs=slice_specs,
                            threshold=config.MetricThreshold(
                                value_threshold=config.GenericValueThreshold(
                                )))
                    ]),
                    'mean/label':
                    config.PerSliceMetricThresholds(thresholds=[
                        config.PerSliceMetricThreshold(
                            slicing_specs=slice_specs,
                            threshold=config.MetricThreshold(
                                value_threshold=config.GenericValueThreshold(),
                                change_threshold=config.GenericChangeThreshold(
                                )))
                    ])
                },
                cross_slice_thresholds={
                    'auc':
                    config.CrossSliceMetricThresholds(thresholds=[
                        config.CrossSliceMetricThreshold(
                            cross_slicing_specs=[
                                config.CrossSlicingSpec(
                                    baseline_spec=baseline_slice_spec,
                                    slicing_specs=slice_specs)
                            ],
                            threshold=config.MetricThreshold(
                                value_threshold=config.GenericValueThreshold(),
                                change_threshold=config.GenericChangeThreshold(
                                )))
                    ]),
                    'mse':
                    config.CrossSliceMetricThresholds(thresholds=[
                        config.CrossSliceMetricThreshold(
                            cross_slicing_specs=[
                                config.CrossSlicingSpec(
                                    baseline_spec=baseline_slice_spec,
                                    slicing_specs=slice_specs)
                            ],
                            threshold=config.MetricThreshold(
                                change_threshold=config.GenericChangeThreshold(
                                ))),
                        # Test for duplicate cross_slicing_spec.
                        config.CrossSliceMetricThreshold(
                            cross_slicing_specs=[
                                config.CrossSlicingSpec(
                                    baseline_spec=baseline_slice_spec,
                                    slicing_specs=slice_specs)
                            ],
                            threshold=config.MetricThreshold(
                                value_threshold=config.GenericValueThreshold())
                        )
                    ])
                },
                model_names=['model_name'],
                output_names=['output_name']),
            # Per-metric value thresholds applied to two model names and two
            # output names.
            config.MetricsSpec(metrics=[
                config.MetricConfig(
                    class_name='ExampleCount',
                    config=json.dumps({'name': 'example_count'}),
                    threshold=config.MetricThreshold(
                        value_threshold=config.GenericValueThreshold()))
            ],
                               model_names=['model_name1', 'model_name2'],
                               output_names=['output_name1', 'output_name2']),
            config.MetricsSpec(metrics=[
                config.MetricConfig(
                    class_name='WeightedExampleCount',
                    config=json.dumps({'name': 'weighted_example_count'}),
                    threshold=config.MetricThreshold(
                        value_threshold=config.GenericValueThreshold()))
            ],
                               model_names=['model_name1', 'model_name2'],
                               output_names=['output_name1', 'output_name2']),
            # Per-metric change thresholds combined with binarization over
            # class ids 0 and 1 and macro-average aggregation.
            config.MetricsSpec(metrics=[
                config.MetricConfig(
                    class_name='MeanSquaredError',
                    config=json.dumps({'name': 'mse'}),
                    threshold=config.MetricThreshold(
                        change_threshold=config.GenericChangeThreshold())),
                config.MetricConfig(
                    class_name='MeanLabel',
                    config=json.dumps({'name': 'mean_label'}),
                    threshold=config.MetricThreshold(
                        change_threshold=config.GenericChangeThreshold()),
                    per_slice_thresholds=[
                        config.PerSliceMetricThreshold(
                            slicing_specs=slice_specs,
                            threshold=config.MetricThreshold(
                                change_threshold=config.GenericChangeThreshold(
                                ))),
                    ],
                    cross_slice_thresholds=[
                        config.CrossSliceMetricThreshold(
                            cross_slicing_specs=[
                                config.CrossSlicingSpec(
                                    baseline_spec=baseline_slice_spec,
                                    slicing_specs=slice_specs)
                            ],
                            threshold=config.MetricThreshold(
                                change_threshold=config.GenericChangeThreshold(
                                )))
                    ]),
            ],
                               model_names=['model_name'],
                               output_names=['output_name'],
                               binarize=config.BinarizationOptions(
                                   class_ids={'values': [0, 1]}),
                               aggregate=config.AggregationOptions(
                                   macro_average=True,
                                   class_weights={
                                       0: 1.0,
                                       1: 1.0
                                   }))
        ]

        thresholds = metric_specs.metric_thresholds_from_metrics_specs(
            metrics_specs)

        # Maps every metric key expected in the result to the number of
        # thresholds expected to be stored under that key.
        expected_keys_and_threshold_counts = {
            metric_types.MetricKey(name='auc',
                                   model_name='model_name',
                                   output_name='output_name',
                                   is_diff=False):
            4,
            metric_types.MetricKey(name='auc',
                                   model_name='model_name',
                                   output_name='output_name',
                                   is_diff=True):
            1,
            metric_types.MetricKey(name='mean/label',
                                   model_name='model_name',
                                   output_name='output_name',
                                   is_diff=True):
            3,
            metric_types.MetricKey(name='mean/label',
                                   model_name='model_name',
                                   output_name='output_name',
                                   is_diff=False):
            3,
            metric_types.MetricKey(name='example_count',
                                   model_name='model_name1',
                                   output_name='output_name1'):
            1,
            metric_types.MetricKey(name='example_count',
                                   model_name='model_name1',
                                   output_name='output_name2'):
            1,
            metric_types.MetricKey(name='example_count',
                                   model_name='model_name2',
                                   output_name='output_name1'):
            1,
            metric_types.MetricKey(name='example_count',
                                   model_name='model_name2',
                                   output_name='output_name2'):
            1,
            metric_types.MetricKey(name='weighted_example_count',
                                   model_name='model_name1',
                                   output_name='output_name1'):
            1,
            metric_types.MetricKey(name='weighted_example_count',
                                   model_name='model_name1',
                                   output_name='output_name2'):
            1,
            metric_types.MetricKey(name='weighted_example_count',
                                   model_name='model_name2',
                                   output_name='output_name1'):
            1,
            metric_types.MetricKey(name='weighted_example_count',
                                   model_name='model_name2',
                                   output_name='output_name2'):
            1,
            metric_types.MetricKey(name='mse',
                                   model_name='model_name',
                                   output_name='output_name',
                                   sub_key=metric_types.SubKey(class_id=0),
                                   is_diff=True):
            1,
            metric_types.MetricKey(name='mse',
                                   model_name='model_name',
                                   output_name='output_name',
                                   sub_key=metric_types.SubKey(class_id=1),
                                   is_diff=True):
            1,
            metric_types.MetricKey(name='mse',
                                   model_name='model_name',
                                   output_name='output_name',
                                   is_diff=True):
            2,
            metric_types.MetricKey(name='mse',
                                   model_name='model_name',
                                   output_name='output_name',
                                   is_diff=False):
            1,
            metric_types.MetricKey(name='mse',
                                   model_name='model_name',
                                   output_name='output_name',
                                   aggregation_type=metric_types.AggregationType(macro_average=True),
                                   is_diff=True):
            1,
            metric_types.MetricKey(name='mean_label',
                                   model_name='model_name',
                                   output_name='output_name',
                                   sub_key=metric_types.SubKey(class_id=0),
                                   is_diff=True):
            4,
            metric_types.MetricKey(name='mean_label',
                                   model_name='model_name',
                                   output_name='output_name',
                                   sub_key=metric_types.SubKey(class_id=1),
                                   is_diff=True):
            4,
            metric_types.MetricKey(name='mean_label',
                                   model_name='model_name',
                                   output_name='output_name',
                                   aggregation_type=metric_types.AggregationType(macro_average=True),
                                   is_diff=True):
            4
        }
        # The result must contain exactly the expected keys, and each key
        # must carry the expected number of thresholds.
        self.assertLen(thresholds, len(expected_keys_and_threshold_counts))
        for key, count in expected_keys_and_threshold_counts.items():
            self.assertIn(key, thresholds)
            self.assertLen(thresholds[key], count,
                           'failed for key {}'.format(key))