def testValidateMetricsValueThresholdUpperBoundFail(self):
  """Expects validation to fail when a value is above its upper bound."""
  # The observed value is 1.5 and the upper bound is 1, so 1.5 > 1: NOT OK.
  upper_bound_threshold = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(upper_bound={'value': 1}))
  weighted_count_metric = config.MetricConfig(
      class_name='WeightedExampleCount', threshold=upper_bound_threshold)
  eval_config = config.EvalConfig(
      model_specs=[config.ModelSpec()],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              metrics=[weighted_count_metric], model_names=['']),
      ],
  )

  # An empty tuple is the slice key used for this single metrics entry.
  metric_values = {
      metric_types.MetricKey(name='weighted_example_count'): 1.5,
  }
  sliced_metrics = ((), metric_values)

  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
def testValidateMetricsChangeThresholdAbsoluteFail(self):
  """Expects validation to fail for an absolute change threshold of -1."""
  # Diff = 0 - .333 = -.333, which is greater than -1: NOT OK.
  absolute_threshold = config.MetricThreshold(
      change_threshold=config.GenericChangeThreshold(
          direction=config.MetricDirection.LOWER_IS_BETTER,
          absolute={'value': -1}))
  mean_prediction_metric = config.MetricConfig(
      class_name='MeanPrediction', threshold=absolute_threshold)
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
          config.ModelSpec(name='baseline', is_baseline=True),
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              metrics=[mean_prediction_metric], model_names=['']),
      ],
  )

  baseline_key = metric_types.MetricKey(
      name='mean_prediction', model_name='baseline')
  diff_key = metric_types.MetricKey(name='mean_prediction', is_diff=True)
  sliced_metrics = ((), {baseline_key: 0.333, diff_key: -0.333})

  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
def get_missing_slices(
    slicing_details: Iterable[validation_result_pb2.SlicingDetails],
    eval_config: config.EvalConfig
) -> List[Union[config.SlicingSpec, config.CrossSlicingSpec]]:
  """Collects threshold slice specs that have no entry in slicing_details.

  Thresholds are read from `eval_config.metrics_specs`. Thresholds whose
  metric key belongs to the baseline model are ignored, and a falsy slice spec
  is treated as an empty `config.SlicingSpec()`.

  Args:
    slicing_details: Slicing details.
    eval_config: Eval config.

  Returns:
    List of missing slices (each reported once) or empty list if none are
    missing.
  """
  details_by_hash = _hashed_slicing_details(slicing_details)
  thresholds_by_key = metric_specs.metric_thresholds_from_metrics_specs(
      eval_config.metrics_specs)
  baseline = model_util.get_baseline_model_spec(eval_config)
  baseline_name = baseline.name if baseline else None

  missing = []
  for key, per_slice_thresholds in thresholds_by_key.items():
    if key.model_name == baseline_name:
      # Skip baseline.
      continue
    for spec, _ in per_slice_thresholds:
      spec = spec or config.SlicingSpec()
      spec_hash = spec.SerializeToString()
      if spec_hash in details_by_hash:
        continue
      missing.append(spec)
      # The same slice may be used by other metrics/thresholds; registering a
      # placeholder under its hash ensures it is only added once.
      details_by_hash[spec_hash] = validation_result_pb2.SlicingDetails()
  return missing
def testValidateMetricsMetricTDistributionValueAndThreshold(
    self, slicing_specs, slice_key):
  """Checks the failure reported for a t-distribution metric value.

  The AUC metric is given as a `ValueWithTDistribution` with
  unsampled_value=0.8 against a lower bound of 0.9, and the expected failure
  carries 0.8 as the metric value.

  Args:
    slicing_specs: Slicing specs to configure, or None.
    slice_key: Slice key of the metrics being validated.
  """
  threshold = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(
          lower_bound={'value': 0.9}))
  auc_metric = config.MetricConfig(
      class_name='AUC',
      threshold=threshold if slicing_specs is None else None,
      per_slice_thresholds=[
          config.PerSliceMetricThreshold(
              slicing_specs=slicing_specs, threshold=threshold)
      ])
  eval_config = config.EvalConfig(
      model_specs=[config.ModelSpec()],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(metrics=[auc_metric], model_names=['']),
      ],
  )
  auc_value = types.ValueWithTDistribution(
      sample_mean=0.91, unsampled_value=0.8)
  sliced_metrics = (slice_key, {
      metric_types.MetricKey(name='auc'): auc_value,
  })

  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)

  expected_text = (
      ' metric_validations_per_slice { failures { '
      'metric_key { name: "auc" } '
      'metric_value { double_value { value: 0.8 } } } }')
  expected = text_format.Parse(expected_text,
                               validation_result_pb2.ValidationResult())
  slice_validation = expected.metric_validations_per_slice[0]
  slice_validation.failures[0].metric_threshold.CopyFrom(threshold)
  slice_validation.slice_key.CopyFrom(slicer.serialize_slice_key(slice_key))

  # One slicing_details entry is expected per spec that applies to slice_key;
  # with no specs configured a single default SlicingSpec entry is expected.
  for spec in slicing_specs or [None]:
    if spec is not None and not slicer.SingleSliceSpec(
        spec=spec).is_slice_applicable(slice_key):
      continue
    details = expected.validation_details.slicing_details.add()
    details.slicing_spec.CopyFrom(
        spec if spec is not None else config.SlicingSpec())
    details.num_matching_slices = 1
  self.assertEqual(result, expected)
def testSerializeDeserializeLegacyEvalConfig(self):
  """Writes a pickled legacy config and checks the EvalConfig loaded from it."""
  output_path = self._getTempDir()
  legacy_slices = [
      slicer.SingleSliceSpec(
          columns=['country'], features=[('age', 5), ('gender', 'f')]),
      slicer.SingleSliceSpec(
          columns=['interest'], features=[('age', 6), ('gender', 'm')]),
  ]
  old_config = LegacyConfig(
      model_location='/path/to/model',
      data_location='/path/to/data',
      slice_spec=legacy_slices,
      example_count_metric_key=None,
      example_weight_metric_key='key',
      compute_confidence_intervals=False,
      k_anonymization_count=1)
  legacy_payload = {
      'tfma_version': tfma_version.VERSION_STRING,
      'eval_config': old_config,
  }
  record_path = os.path.join(output_path, 'eval_config')
  with tf.io.TFRecordWriter(record_path) as writer:
    writer.write(pickle.dumps(legacy_payload))

  got_eval_config = model_eval_lib.load_eval_config(output_path)

  # The expected slicing specs carry the legacy feature values as strings.
  expected_slicing_specs = [
      config.SlicingSpec(
          feature_keys=['country'],
          feature_values={
              'age': '5',
              'gender': 'f'
          }),
      config.SlicingSpec(
          feature_keys=['interest'],
          feature_values={
              'age': '6',
              'gender': 'm'
          }),
  ]
  eval_config = config.EvalConfig(
      input_data_specs=[
          config.InputDataSpec(location=old_config.data_location)
      ],
      model_specs=[config.ModelSpec(location=old_config.model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=output_path)
      ],
      slicing_specs=expected_slicing_specs,
      compute_confidence_intervals=old_config.compute_confidence_intervals,
      k_anonymization_count=old_config.k_anonymization_count)
  self.assertEqual(eval_config, got_eval_config)
def testSerializeDeserializeLegacyEvalConfig(self):
  """Writes a pickled legacy config and checks what load_eval_run returns."""
  output_path = self._getTempDir()
  legacy_slices = [
      slicer.SingleSliceSpec(
          columns=['country'], features=[('age', 5), ('gender', 'f')]),
      slicer.SingleSliceSpec(
          columns=['interest'], features=[('age', 6), ('gender', 'm')]),
  ]
  old_config = LegacyConfig(
      model_location='/path/to/model',
      data_location='/path/to/data',
      slice_spec=legacy_slices,
      example_count_metric_key=None,
      example_weight_metric_key='key',
      compute_confidence_intervals=False,
      k_anonymization_count=1)
  legacy_payload = {
      'tfma_version': tfma_version.VERSION,
      'eval_config': old_config,
  }
  record_path = os.path.join(output_path, 'eval_config')
  with tf.io.TFRecordWriter(record_path) as writer:
    writer.write(pickle.dumps(legacy_payload))

  loaded = eval_config_writer.load_eval_run(output_path)
  got_eval_config, got_data_location, _, got_model_locations = loaded

  # The legacy confidence-interval flag and k-anonymization count are expected
  # to be reported through Options.
  options = config.Options()
  options.compute_confidence_intervals.value = (
      old_config.compute_confidence_intervals)
  options.min_slice_size.value = old_config.k_anonymization_count
  expected_slicing_specs = [
      config.SlicingSpec(
          feature_keys=['country'],
          feature_values={
              'age': '5',
              'gender': 'f'
          }),
      config.SlicingSpec(
          feature_keys=['interest'],
          feature_values={
              'age': '6',
              'gender': 'm'
          }),
  ]
  eval_config = config.EvalConfig(
      slicing_specs=expected_slicing_specs, options=options)

  self.assertEqual(eval_config, got_eval_config)
  self.assertEqual(old_config.data_location, got_data_location)
  self.assertLen(got_model_locations, 1)
  first_model_location = list(got_model_locations.values())[0]
  self.assertEqual(old_config.model_location, first_model_location)
def testValidateMetricsMetricValueAndThreshold(self):
  """Checks the reported failure includes the metric value and threshold."""
  # The observed value is 1.5 and the upper bound is 1, so 1.5 > 1: NOT OK.
  upper_bound_threshold = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(upper_bound={'value': 1}))
  weighted_count_metric = config.MetricConfig(
      class_name='WeightedExampleCount', threshold=upper_bound_threshold)
  eval_config = config.EvalConfig(
      model_specs=[config.ModelSpec()],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              metrics=[weighted_count_metric], model_names=['']),
      ],
  )
  metric_values = {
      metric_types.MetricKey(name='weighted_example_count'): 1.5,
  }
  sliced_metrics = ((), metric_values)

  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)

  expected_text = (
      ' metric_validations_per_slice { slice_key { } failures { '
      'metric_key { name: "weighted_example_count" } '
      'metric_threshold { value_threshold { upper_bound { value: 1.0 } } } '
      'metric_value { double_value { value: 1.5 } } } }')
  expected = text_format.Parse(expected_text,
                               validation_result_pb2.ValidationResult())
  self.assertEqual(result, expected)
def testValidateMetricsInvalidThreshold(self):
  """Expects a 'Metric not found.' failure for a threshold with no metric."""
  # The threshold is keyed by 'invalid_threshold', which does not appear in
  # the metric values supplied below.
  lower_bound_threshold = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(
          lower_bound={'value': 0.2}))
  eval_config = config.EvalConfig(
      model_specs=[config.ModelSpec()],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              thresholds={'invalid_threshold': lower_bound_threshold})
      ],
  )
  metric_values = {
      metric_types.MetricKey(name='weighted_example_count'): 1.5,
  }
  sliced_metrics = ((), metric_values)

  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)

  expected_text = (
      ' metric_validations_per_slice { slice_key { } failures { '
      'metric_key { name: "invalid_threshold" } '
      'metric_threshold { value_threshold { lower_bound { value: 0.2 } } } '
      "message: 'Metric not found.' } }")
  expected = text_format.Parse(expected_text,
                               validation_result_pb2.ValidationResult())
  self.assertProtoEquals(expected, result)
def testValidateMetricsDivByZero(self):
  """Expects a relative change threshold to fail with a 0.0 baseline value."""
  threshold = config.MetricThreshold(
      change_threshold=config.GenericChangeThreshold(
          direction=config.MetricDirection.HIGHER_IS_BETTER,
          relative={'value': 0.1}))
  slicing_specs = [config.SlicingSpec()]
  mean_prediction_metric = config.MetricConfig(
      class_name='MeanPrediction',
      threshold=threshold if slicing_specs is None else None,
      per_slice_thresholds=[
          config.PerSliceMetricThreshold(
              slicing_specs=slicing_specs, threshold=threshold)
      ])
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(name='candidate'),
          config.ModelSpec(name='baseline', is_baseline=True),
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[mean_prediction_metric],
              model_names=['baseline', 'candidate']),
      ],
  )

  # The baseline value is exactly 0.0 while the candidate diff is 0.1.
  baseline_key = metric_types.MetricKey(
      name='mean_prediction', model_name='baseline')
  candidate_diff_key = metric_types.MetricKey(
      name='mean_prediction', model_name='candidate', is_diff=True)
  sliced_metrics = ((), {baseline_key: 0.0, candidate_diff_key: 0.1})

  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
def testRunModelAnalysisWithUncertainty(self):
  """Runs analysis per language with confidence intervals enabled."""
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  example_rows = [
      (3.0, 'english', 1.0),
      (3.0, 'chinese', 0.0),
      (4.0, 'english', 1.0),
      (5.0, 'chinese', 1.0),
      (5.0, 'hindi', 1.0),
  ]
  examples = [
      self._makeExample(age=age, language=language, label=label)
      for age, language, label in example_rows
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
  options = config.Options()
  options.compute_confidence_intervals.value = True
  options.k_anonymization_count.value = 2
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      slicing_specs=slicing_specs,
      options=options)
  shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location, example_weight_key='age')
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config, eval_shared_models=[shared_model])

  def bounded_one():
    # Bounded value whose point estimate and both bounds are 1.0.
    return {
        'boundedValue': {
            'value': 1.0,
            'lowerBound': 1.0,
            'upperBound': 1.0,
            'methodology': 'POISSON_BOOTSTRAP'
        }
    }

  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  hindi_key = (('language', 'hindi'),)
  chinese_key = (('language', 'chinese'),)
  english_key = (('language', 'english'),)
  expected = {
      hindi_key: {
          u'__ERROR__': {
              'debugMessage':
                  u'Example count for this slice key is lower than the '
                  u'minimum required value: 2. No data is aggregated for '
                  u'this slice.'
          },
      },
      chinese_key: {
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 8.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 2.0
          },
      },
      english_key: {
          'accuracy': bounded_one(),
          'my_mean_label': bounded_one(),
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 7.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 2.0
          },
      }
  }

  result_config = eval_result.config
  self.assertEqual(result_config.model_specs[0].location,
                   model_location.decode())
  self.assertEqual(result_config.input_data_specs[0].location, data_location)
  self.assertEqual(result_config.slicing_specs[0],
                   config.SlicingSpec(feature_keys=['language']))
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
  self.assertFalse(eval_result.plots)
def testRunModelAnalysisWithLegacyQueryExtractor(self):
  """Runs analysis with the legacy query-based metrics evaluator."""
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  example_rows = [
      (3.0, 'english', 1.0),
      (3.0, 'chinese', 0.0),
      (4.0, 'english', 0.0),
      (5.0, 'chinese', 1.0),
  ]
  examples = [
      self._makeExample(age=age, language=language, label=label)
      for age, language, label in example_rows
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slicing_specs = [config.SlicingSpec()]
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      slicing_specs=slicing_specs)
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location, example_weight_key='age')

  # Queries are keyed by the 'language' feature.
  query_evaluator = query_based_metrics_evaluator.QueryBasedMetricsEvaluator(
      query_id='language',
      prediction_key='logistic',
      combine_fns=[
          query_statistics.QueryStatisticsCombineFn(),
          legacy_ndcg.NdcgMetricCombineFn(
              at_vals=[1], gain_key='label', weight_key='')
      ])
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[eval_shared_model],
      evaluators=[
          metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
              eval_shared_model),
          query_evaluator,
      ])

  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected_doubles = {
      'post_export_metrics/total_queries': 2.0,
      'post_export_metrics/min_documents': 2.0,
      'post_export_metrics/max_documents': 2.0,
      'post_export_metrics/total_documents': 4.0,
      'post_export_metrics/ndcg@1': 0.5,
      'post_export_metrics/example_weight': 15.0,
      'post_export_metrics/example_count': 4.0,
  }
  expected = {
      (): {
          metric_name: {
              'doubleValue': metric_value
          } for metric_name, metric_value in expected_doubles.items()
      }
  }

  result_config = eval_result.config
  self.assertEqual(result_config.model_specs[0].location,
                   model_location.decode())
  self.assertEqual(result_config.input_data_specs[0].location, data_location)
  self.assertEqual(result_config.slicing_specs[0], config.SlicingSpec())
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
  self.assertFalse(eval_result.plots)
def testRunModelAnalysisWithQueryBasedMetrics(self):
  """Runs analysis on a small Keras model with a query-based NDCG metric.

  A single-input Keras model is trained for one step, saved in the 'tf'
  format, and evaluated with NDCG (gain_key='age', top_k=1) using 'language'
  as the query key. Only the presence of the expected metric names is checked.
  """
  input_layer = tf.keras.layers.Input(shape=(1,), name='age')
  output_layer = tf.keras.layers.Dense(
      1, activation=tf.nn.sigmoid)(
          input_layer)
  model = tf.keras.models.Model(input_layer, output_layer)
  # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
  model.compile(
      optimizer=tf.keras.optimizers.Adam(learning_rate=.001),
      loss=tf.keras.losses.binary_crossentropy)

  features = {'age': [[20.0]]}
  labels = [[1]]
  example_weights = [1.0]
  dataset = tf.data.Dataset.from_tensor_slices(
      (features, labels, example_weights))
  dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
  model.fit(dataset, steps_per_epoch=1)

  model_location = os.path.join(self._getTempDir(), 'export_dir')
  model.save(model_location, save_format='tf')

  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=0.0),
      self._makeExample(age=3.0, language='english', label=0.0),
      self._makeExample(age=5.0, language='chinese', label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slicing_specs = [config.SlicingSpec()]
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[
          config.ModelSpec(location=model_location, label_key='label')
      ],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      slicing_specs=slicing_specs,
      metrics_specs=metric_specs.specs_from_metrics(
          [ndcg.NDCG(gain_key='age', name='ndcg')],
          binarize=config.BinarizationOptions(top_k_list=[1]),
          query_key='language'))
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location, tags=[tf.saved_model.SERVING])
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[eval_shared_model],
      evaluators=[
          metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
              eval_config=eval_config, eval_shared_models=[eval_shared_model])
      ])

  self.assertEqual(eval_result.config.model_specs[0].location, model_location)
  self.assertEqual(eval_result.config.input_data_specs[0].location,
                   data_location)
  self.assertLen(eval_result.slicing_metrics, 1)
  got_slice_key, got_metrics = eval_result.slicing_metrics[0]
  self.assertEqual(got_slice_key, ())
  self.assertIn('', got_metrics)  # output_name
  got_metrics = got_metrics['']

  # Metric names expected under each sub-key group; values are not compared.
  expected_metrics = {
      '': {
          'example_count': True,
          'weighted_example_count': True,
      },
      'topK:1': {
          'ndcg': True,
      },
  }
  for group, metric_names in expected_metrics.items():
    self.assertIn(group, got_metrics)
    for metric_name in metric_names:
      self.assertIn(metric_name, got_metrics[group])
def testRunModelAnalysisExtraFieldsPlusFeatureExtraction(self):
  """Runs analysis sliced on an extra field using custom extractors."""
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  # The last example deliberately carries no 'my_slice' field.
  examples = [
      self._makeExample(
          age=3.0, language='english', label=1.0, my_slice='a'),
      self._makeExample(
          age=3.0, language='chinese', label=0.0, my_slice='a'),
      self._makeExample(
          age=4.0, language='english', label=1.0, my_slice='b'),
      self._makeExample(
          age=5.0, language='chinese', label=1.0, my_slice='c'),
      self._makeExample(age=5.0, language='hindi', label=1.0),
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slicing_specs = [config.SlicingSpec(feature_keys=['my_slice'])]
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      slicing_specs=slicing_specs)
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location, example_weight_key='age')
  slice_spec = [slicer.SingleSliceSpec(spec=slicing_specs[0])]
  extractors_with_feature_extraction = [
      predict_extractor.PredictExtractor(
          eval_shared_model, desired_batch_size=3, materialize=False),
      feature_extractor.FeatureExtractor(
          extract_source=constants.INPUT_KEY,
          extract_dest=constants.FEATURES_PREDICTIONS_LABELS_KEY),
      slice_key_extractor.SliceKeyExtractor(slice_spec, materialize=False),
  ]
  # A second shared model with the same arguments is built for the run itself.
  run_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location, example_weight_key='age')
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[run_shared_model],
      extractors=extractors_with_feature_extraction)

  def slice_metrics(accuracy, mean_label, example_weight, example_count):
    # Expected metrics for one slice, each wrapped as a doubleValue.
    return {
        'accuracy': {
            'doubleValue': accuracy
        },
        'my_mean_label': {
            'doubleValue': mean_label
        },
        metric_keys.EXAMPLE_WEIGHT: {
            'doubleValue': example_weight
        },
        metric_keys.EXAMPLE_COUNT: {
            'doubleValue': example_count
        },
    }

  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected = {
      (('my_slice', 'a'),): slice_metrics(1.0, 0.5, 6.0, 2.0),
      (('my_slice', 'b'),): slice_metrics(1.0, 1.0, 4.0, 1.0),
      (('my_slice', 'c'),): slice_metrics(0.0, 1.0, 5.0, 1.0),
  }

  result_config = eval_result.config
  self.assertEqual(result_config.model_specs[0].location,
                   model_location.decode())
  self.assertEqual(result_config.input_data_specs[0].location, data_location)
  self.assertEqual(result_config.slicing_specs[0],
                   config.SlicingSpec(feature_keys=['my_slice']))
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
  self.assertFalse(eval_result.plots)
def testRunModelAnalysisWithDeterministicConfidenceIntervals(self):
  """Runs analysis with confidence intervals and a fixed random seed."""
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  example_rows = [
      (3.0, 'english', 1.0),
      (3.0, 'chinese', 0.0),
      (4.0, 'english', 1.0),
      (5.0, 'chinese', 1.0),
      (5.0, 'hindi', 1.0),
  ]
  examples = [
      self._makeExample(age=age, language=language, label=label)
      for age, language, label in example_rows
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
  options = config.Options()
  options.compute_confidence_intervals.value = True
  options.k_anonymization_count.value = 2
  eval_config = config.EvalConfig(
      slicing_specs=slicing_specs, options=options)
  shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location, example_weight_key='age')
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_model=shared_model,
      data_location=data_location,
      output_path=self._getTempDir(),
      random_seed_for_testing=_TEST_SEED)

  def bounded_one():
    # Bounded value whose point estimate and both bounds are 1.0.
    return {
        'boundedValue': {
            'value': 1.0,
            'lowerBound': 1.0,
            'upperBound': 1.0,
            'methodology': 'POISSON_BOOTSTRAP'
        }
    }

  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  english_key = (('language', 'english'),)
  expected = {
      (('language', 'hindi'),): {
          u'__ERROR__': {
              'debugMessage':
                  u'Example count for this slice key is lower than the '
                  u'minimum required value: 2. No data is aggregated for '
                  u'this slice.'
          },
      },
      (('language', 'chinese'),): {
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 8.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 2.0
          },
      },
      english_key: {
          'accuracy': bounded_one(),
          'my_mean_label': bounded_one(),
          metric_keys.EXAMPLE_WEIGHT: {
              'doubleValue': 7.0
          },
          metric_keys.EXAMPLE_COUNT: {
              'doubleValue': 2.0
          },
      }
  }
  self.assertEqual(eval_result.model_location, model_location.decode())
  self.assertEqual(eval_result.data_location, data_location)
  self.assertEqual(eval_result.config.slicing_specs[0],
                   config.SlicingSpec(feature_keys=['language']))
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)

  # Spot-check two bounded metrics on the english slice within delta=0.1.
  approximate_values = (
      ('average_loss', 0.171768754720),
      ('auc_precision_recall', 0.99999940395),
  )
  for key, value in eval_result.slicing_metrics:
    if english_key == key:
      for metric_name, approximate in approximate_values:
        metric = value[''][''][metric_name]
        self.assertAlmostEqual(
            approximate, metric['boundedValue']['value'], delta=0.1)
  self.assertFalse(eval_result.plots)
def testEvaluateWithBinaryClassificationModel(self):
  """Evaluates a two-class DNN classifier and checks metrics and plots."""
  n_classes = 2
  temp_export_dir = self._getExportDir()
  _, export_dir = dnn_classifier.simple_dnn_classifier(
      None, temp_export_dir, n_classes=n_classes)

  # Add mean_label, example_count, weighted_example_count, calibration_plot
  model_spec = config.ModelSpec(
      location=export_dir, label_key='label', example_weight_key='age')
  eval_config = config.EvalConfig(
      model_specs=[model_spec],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=metric_specs.specs_from_metrics([
          calibration.MeanLabel('mean_label'),
          calibration_plot.CalibrationPlot(
              name='calibration_plot', num_buckets=10)
      ]))
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  extractors = [
      input_extractor.InputExtractor(eval_config=eval_config),
      predict_extractor_v2.PredictExtractor(
          eval_config=eval_config, eval_shared_models=[eval_shared_model]),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_models=[eval_shared_model])
  ]

  example_rows = [
      (1.0, 'english', 0.0),
      (2.0, 'chinese', 1.0),
      (3.0, 'chinese', 0.0),
  ]
  examples = [
      self._makeExample(age=age, language=language, label=label)
      for age, language, label in example_rows
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    serialized_examples = [e.SerializeToString() for e in examples]
    metrics_and_plots = (
        pipeline
        | 'Create' >> beam.Create(serialized_examples)
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
            extractors=extractors, evaluators=evaluators))

    # pylint: enable=no-value-for-parameter

    def check_metrics(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        example_count_key = metric_types.MetricKey(name='example_count')
        weighted_example_count_key = metric_types.MetricKey(
            name='weighted_example_count')
        label_key = metric_types.MetricKey(name='mean_label')
        # The 'age' feature is the example weight, so weights are 1, 2, 3.
        total_weight = 1.0 + 2.0 + 3.0
        weighted_label_sum = 0 * 1.0 + 1 * 2.0 + 0 * 3.0
        self.assertDictElementsAlmostEqual(
            got_metrics, {
                example_count_key: 3,
                weighted_example_count_key: total_weight,
                label_key: weighted_label_sum / total_weight,
            })

      except AssertionError as err:
        raise util.BeamAssertException(err)

    def check_plots(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_plots = got[0]
        self.assertEqual(got_slice_key, ())
        plot_key = metric_types.PlotKey('calibration_plot')
        self.assertIn(plot_key, got_plots)
        # 10 buckets + 2 for edge cases
        self.assertLen(got_plots[plot_key].buckets, 12)

      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(
        metrics_and_plots[constants.METRICS_KEY],
        check_metrics,
        label='metrics')
    util.assert_that(
        metrics_and_plots[constants.PLOTS_KEY], check_plots, label='plots')
from absl.testing import parameterized import tensorflow as tf from tensorflow_model_analysis import config from tensorflow_model_analysis import types from tensorflow_model_analysis.eval_saved_model import testutil from tensorflow_model_analysis.evaluators import metrics_validator from tensorflow_model_analysis.metrics import metric_types from tensorflow_model_analysis.proto import validation_result_pb2 from tensorflow_model_analysis.slicer import slicer_lib as slicer from google.protobuf import text_format # Tests involiving slices: (<test_name>, <slice_config> , <slice_key>) _NO_SLICE_TEST = ('no_slice', None, (())) _GLOBAL_SLICE_TEST = ('global_slice', [config.SlicingSpec()], (())) _FEATURE_SLICE_TEST = ('feature_slice', [config.SlicingSpec(feature_keys=['feature1']) ], (('feature1', 'value1'), )) _FEATURE_VALUE_SLICE_TEST = ('feature_value_slice', [ config.SlicingSpec(feature_values={'feature1': 'value1'}) ], (('feature1', 'value1'), )) _MULTIPLE_SLICES_TEST = ('multiple_slices', [ config.SlicingSpec(feature_values={'feature1': 'value1'}), config.SlicingSpec(feature_values={'feature2': 'value2'}) ], (('feature1', 'value1'), )) _UNMATCHED_SINGLE_SLICE_TEST = ('single_slice', [config.SlicingSpec(feature_keys='feature1')], (('unmatched_feature', 'unmatched_value'), )) _UNMATCHED_MULTIPLE_SLICES_TEST = ('multiple_slices', [ config.SlicingSpec(feature_values={'feature1': 'value1'}),
def testEvaluateWithEvalSavedModel(self):
  """Evaluates an eval saved model on the overall slice and two key slices."""
  temp_export_dir = self._getExportDir()
  _, export_dir = linear_classifier.simple_linear_classifier(
      None, temp_export_dir)
  eval_config = config.EvalConfig(
      model_specs=[config.ModelSpec(signature_name='eval')],
      slicing_specs=[
          config.SlicingSpec(),
          config.SlicingSpec(feature_keys=['slice_key']),
      ])
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir,
      add_metrics_callbacks=[_addExampleCountMetricCallback])
  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_model=eval_shared_model)
  ]

  example_rows = [
      (3.0, 'english', 1.0, 'first_slice'),
      (3.0, 'chinese', 0.0, 'first_slice'),
      (4.0, 'english', 0.0, 'second_slice'),
      (5.0, 'chinese', 1.0, 'second_slice'),
      (5.0, 'chinese', 1.0, 'second_slice'),
  ]
  examples = [
      self._makeExample(
          age=age, language=language, label=label, slice_key=slice_name)
      for age, language, label, slice_name in example_rows
  ]

  def keyed_by_name(values_by_name):
    # Re-keys a {metric name: value} dict by MetricKey(name=...).
    return {
        metric_types.MetricKey(name=metric_name): metric_value
        for metric_name, metric_value in values_by_name.items()
    }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    serialized_examples = [e.SerializeToString() for e in examples]
    metrics = (
        pipeline
        | 'Create' >> beam.Create(serialized_examples)
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
            extractors=extractors, evaluators=evaluators))

    # pylint: enable=no-value-for-parameter

    def check_metrics(got):
      try:
        self.assertLen(got, 3)
        slices = {slice_key: value for slice_key, value in got}
        overall_slice = ()
        first_slice = (('slice_key', b'first_slice'),)
        second_slice = (('slice_key', b'second_slice'),)
        self.assertCountEqual(
            list(slices.keys()), [overall_slice, first_slice, second_slice])
        self.assertDictElementsAlmostEqual(
            slices[overall_slice],
            keyed_by_name({
                'accuracy': 0.4,
                'label/mean': 0.6,
                'my_mean_age': 4.0,
                'my_mean_age_times_label': 2.6,
                'added_example_count': 5.0
            }))
        self.assertDictElementsAlmostEqual(
            slices[first_slice],
            keyed_by_name({
                'accuracy': 1.0,
                'label/mean': 0.5,
                'my_mean_age': 3.0,
                'my_mean_age_times_label': 1.5,
                'added_example_count': 2.0
            }))
        self.assertDictElementsAlmostEqual(
            slices[second_slice],
            keyed_by_name({
                'accuracy': 0.0,
                'label/mean': 2.0 / 3.0,
                'my_mean_age': 14.0 / 3.0,
                'my_mean_age_times_label': 10.0 / 3.0,
                'added_example_count': 3.0
            }))

      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(
        metrics[constants.METRICS_KEY], check_metrics, label='metrics')
def testEvaluateWithKerasModel(self):
  """Evaluates a two-input Keras model and checks count and label metrics.

  The model is trained for one step, saved in the 'tf' format, and evaluated
  on two weighted examples; the expected metrics are the example count, the
  weighted example count and the weighted mean label.
  """
  input1 = tf.keras.layers.Input(shape=(1,), name='input1')
  input2 = tf.keras.layers.Input(shape=(1,), name='input2')
  inputs = [input1, input2]
  input_layer = tf.keras.layers.concatenate(inputs)
  output_layer = tf.keras.layers.Dense(
      1, activation=tf.nn.sigmoid, name='output')(
          input_layer)
  model = tf.keras.models.Model(inputs, output_layer)
  # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
  model.compile(
      optimizer=tf.keras.optimizers.Adam(learning_rate=.001),
      loss=tf.keras.losses.binary_crossentropy,
      metrics=['accuracy'])

  features = {'input1': [[0.0], [1.0]], 'input2': [[1.0], [0.0]]}
  labels = [[1], [0]]
  example_weights = [1.0, 0.5]
  dataset = tf.data.Dataset.from_tensor_slices(
      (features, labels, example_weights))
  dataset = dataset.shuffle(buffer_size=1).repeat().batch(2)
  model.fit(dataset, steps_per_epoch=1)

  export_dir = self._getExportDir()
  model.save(export_dir, save_format='tf')

  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(
              location=export_dir,
              label_key='label',
              example_weight_key='example_weight')
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=metric_specs.specs_from_metrics(
          [calibration.MeanLabel('mean_label')]))
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  extractors = [
      input_extractor.InputExtractor(eval_config=eval_config),
      predict_extractor_v2.PredictExtractor(
          eval_config=eval_config, eval_shared_models=[eval_shared_model]),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_models=[eval_shared_model])
  ]

  # Each example also carries a feature that is not one of the model inputs.
  examples = [
      self._makeExample(
          input1=0.0,
          input2=1.0,
          label=1.0,
          example_weight=1.0,
          extra_feature='non_model_feature'),
      self._makeExample(
          input1=1.0,
          input2=0.0,
          label=0.0,
          example_weight=0.5,
          extra_feature='non_model_feature'),
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    metrics = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
            extractors=extractors, evaluators=evaluators))

    # pylint: enable=no-value-for-parameter

    def check_metrics(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        example_count_key = metric_types.MetricKey(name='example_count')
        weighted_example_count_key = metric_types.MetricKey(
            name='weighted_example_count')
        label_key = metric_types.MetricKey(name='mean_label')
        self.assertDictElementsAlmostEqual(
            got_metrics, {
                example_count_key: 2,
                weighted_example_count_key: (1.0 + 0.5),
                label_key: (1.0 * 1.0 + 0.0 * 0.5) / (1.0 + 0.5),
            })

      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(
        metrics[constants.METRICS_KEY], check_metrics, label='metrics')
def testValidateMetricsMetricTDistributionChangeAndThreshold(
        self, slicing_specs, slice_key):
    """Checks a failing change threshold on t-distribution metric values.

    The AUC diff is 0.1, which does not satisfy a LOWER_IS_BETTER absolute
    change threshold of -1, so validation must fail with a single failure
    that reports the diff's unsampled value.

    Args:
      slicing_specs: Slicing specs the threshold is attached to, or None to
        attach it as the metric's plain (non per-slice) threshold.
      slice_key: Slice key of the metrics being validated.
    """
    threshold = config.MetricThreshold(
        change_threshold=config.GenericChangeThreshold(
            direction=config.MetricDirection.LOWER_IS_BETTER,
            absolute={'value': -1}))
    eval_config = config.EvalConfig(
        model_specs=[
            config.ModelSpec(),
            config.ModelSpec(name='baseline', is_baseline=True)
        ],
        slicing_specs=slicing_specs,
        metrics_specs=[
            config.MetricsSpec(
                metrics=[
                    config.MetricConfig(
                        class_name='AUC',
                        threshold=(threshold
                                   if slicing_specs is None else None),
                        per_slice_thresholds=[
                            config.PerSliceMetricThreshold(
                                slicing_specs=slicing_specs,
                                threshold=threshold)
                        ]),
                ],
                model_names=['']),
        ],
    )
    sliced_metrics = (
        slice_key,
        {
            # This is the mean of the diff.
            metric_types.MetricKey(name='auc', model_name='baseline'):
            types.ValueWithTDistribution(sample_mean=0.91,
                                         unsampled_value=0.6),
            metric_types.MetricKey(name='auc', is_diff=True):
            types.ValueWithTDistribution(sample_mean=0.1,
                                         unsampled_value=0.1),
        })
    result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
    self.assertFalse(result.validation_ok)

    expected = text_format.Parse(
        """
        metric_validations_per_slice {
          failures {
            metric_key {
              name: "auc"
              is_diff: true
            }
            metric_value {
              double_value {
                value: 0.1
              }
            }
          }
        }""", validation_result_pb2.ValidationResult())
    expected.metric_validations_per_slice[0].failures[
        0].metric_threshold.CopyFrom(threshold)
    expected.metric_validations_per_slice[0].slice_key.CopyFrom(
        slicer.serialize_slice_key(slice_key))
    # One slicing-details entry is expected per spec that applies to the
    # slice; with no specs a single default SlicingSpec entry is expected.
    for spec in slicing_specs or [None]:
        if (spec is None or slicer.SingleSliceSpec(
                spec=spec).is_slice_applicable(slice_key)):
            slicing_details = (
                expected.validation_details.slicing_details.add())
            if spec is not None:
                slicing_details.slicing_spec.CopyFrom(spec)
            else:
                slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
            slicing_details.num_matching_slices = 1
    # Protos are compared for equality. assertAlmostEqual would try to
    # subtract the two messages on a mismatch and raise TypeError instead of
    # reporting a readable assertion failure.
    self.assertEqual(result, expected)
def testEvaluateWithMultiOutputModel(self):
    """Checks per-output MeanLabel metrics for a three-headed model.

    Each head reads its own label key and uses `age` as its example weight,
    so every head must report a weighted mean label and a weighted example
    count, next to one example count that carries no output name.
    """
    head_names = ('chinese_head', 'english_head', 'other_head')

    scratch_dir = self._getExportDir()
    _, export_dir = multi_head.simple_multi_head(None, scratch_dir)

    model_spec = config.ModelSpec(
        location=export_dir,
        label_keys={
            'chinese_head': 'chinese_label',
            'english_head': 'english_label',
            'other_head': 'other_label',
        },
        example_weight_keys={head: 'age' for head in head_names})
    metrics_by_head = {
        head: [calibration.MeanLabel('mean_label')] for head in head_names
    }
    eval_config = config.EvalConfig(
        model_specs=[model_spec],
        slicing_specs=[config.SlicingSpec()],
        metrics_specs=metric_specs.specs_from_metrics(metrics_by_head))
    shared_model = self.createTestEvalSharedModel(
        eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

    single_slice_specs = [
        slicer.SingleSliceSpec(spec=spec)
        for spec in eval_config.slicing_specs
    ]
    extractors = [
        input_extractor.InputExtractor(eval_config=eval_config),
        predict_extractor_v2.PredictExtractor(
            eval_config=eval_config, eval_shared_models=[shared_model]),
        slice_key_extractor.SliceKeyExtractor(slice_spec=single_slice_specs),
    ]
    evaluators = [
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=eval_config, eval_shared_models=[shared_model]),
    ]

    examples = [
        self._makeExample(age=1.0,
                          language='english',
                          english_label=1.0,
                          chinese_label=0.0,
                          other_label=0.0),
        self._makeExample(age=1.0,
                          language='chinese',
                          english_label=0.0,
                          chinese_label=1.0,
                          other_label=0.0),
        self._makeExample(age=2.0,
                          language='english',
                          english_label=1.0,
                          chinese_label=0.0,
                          other_label=0.0),
        self._makeExample(age=2.0,
                          language='other',
                          english_label=0.0,
                          chinese_label=1.0,
                          other_label=1.0),
    ]

    with beam.Pipeline() as pipeline:
        # pylint: disable=no-value-for-parameter
        serialized = [example.SerializeToString() for example in examples]
        metrics = (
            pipeline
            | 'Create' >> beam.Create(serialized)
            | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
            | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                extractors=extractors, evaluators=evaluators))

        # pylint: enable=no-value-for-parameter

        def check_metrics(got):
            """Compares the overall slice with hand-computed expectations."""
            try:
                self.assertLen(got, 1)
                got_slice_key, got_metrics = got[0]
                self.assertEqual(got_slice_key, ())

                # Example weights (`age`) and each head's labels, listed in
                # the same order as the input examples.
                weights = [1.0, 1.0, 2.0, 2.0]
                labels_per_head = [
                    ('chinese_head', [0.0, 1.0, 0.0, 1.0]),
                    ('english_head', [1.0, 0.0, 1.0, 0.0]),
                    ('other_head', [0.0, 0.0, 0.0, 1.0]),
                ]
                weight_total = sum(weights)

                expected = {metric_types.MetricKey(name='example_count'): 4}
                for head, labels in labels_per_head:
                    weighted_label_sum = sum(
                        weight * label
                        for weight, label in zip(weights, labels))
                    mean_label_key = metric_types.MetricKey(
                        name='mean_label', output_name=head)
                    weighted_count_key = metric_types.MetricKey(
                        name='weighted_example_count', output_name=head)
                    expected[mean_label_key] = (weighted_label_sum /
                                                weight_total)
                    expected[weighted_count_key] = weight_total

                self.assertDictElementsAlmostEqual(got_metrics, expected)
            except AssertionError as err:
                raise util.BeamAssertException(err)

        util.assert_that(metrics[constants.METRICS_KEY],
                         check_metrics,
                         label='metrics')
def testEvaluateWithSlicing(self):
    """Checks MeanLabel/MeanPrediction for the overall and per-value slices.

    Slices are computed for the whole dataset and for each `fixed_string`
    value; `fixed_float` is used as the example weight.
    """
    temp_export_dir = self._getExportDir()
    _, export_dir = (fixed_prediction_estimator_extra_fields.
                     simple_fixed_prediction_estimator_extra_fields(
                         None, temp_export_dir))
    eval_config = config.EvalConfig(
        model_specs=[
            config.ModelSpec(location=export_dir,
                             label_key='label',
                             example_weight_key='fixed_float')
        ],
        slicing_specs=[
            config.SlicingSpec(),
            config.SlicingSpec(feature_keys=['fixed_string']),
        ],
        metrics_specs=metric_specs.specs_from_metrics([
            calibration.MeanLabel('mean_label'),
            calibration.MeanPrediction('mean_prediction')
        ]))
    eval_shared_model = self.createTestEvalSharedModel(
        eval_saved_model_path=export_dir)

    slice_spec = [
        slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
    ]
    extractors = [
        predict_extractor.PredictExtractor(
            eval_shared_model=eval_shared_model),
        slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
    ]
    evaluators = [
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=eval_config, eval_shared_models=[eval_shared_model])
    ]

    # fixed_float used as example_weight key
    examples = [
        self._makeExample(prediction=0.2,
                          label=1.0,
                          fixed_int=1,
                          fixed_float=1.0,
                          fixed_string='fixed_string1'),
        self._makeExample(prediction=0.8,
                          label=0.0,
                          fixed_int=1,
                          fixed_float=1.0,
                          fixed_string='fixed_string1'),
        self._makeExample(prediction=0.5,
                          label=0.0,
                          fixed_int=2,
                          fixed_float=2.0,
                          fixed_string='fixed_string2')
    ]

    with beam.Pipeline() as pipeline:
        # pylint: disable=no-value-for-parameter
        metrics = (
            pipeline
            | 'Create' >> beam.Create(
                [e.SerializeToString() for e in examples])
            | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
            | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                extractors=extractors, evaluators=evaluators))

        # pylint: enable=no-value-for-parameter

        def check_metrics(got):
            """Asserts the three slices and their weighted metric values."""
            try:
                self.assertLen(got, 3)
                slices = {}
                for slice_key, value in got:
                    slices[slice_key] = value
                overall_slice = ()
                fixed_string1_slice = (('fixed_string', b'fixed_string1'), )
                fixed_string2_slice = (('fixed_string', b'fixed_string2'), )
                # Was misspelled `asssertCountEqual`, which raised an
                # AttributeError that the handler below does not catch.
                self.assertCountEqual(list(slices.keys()), [
                    overall_slice, fixed_string1_slice, fixed_string2_slice
                ])
                example_count_key = metric_types.MetricKey(
                    name='example_count')
                weighted_example_count_key = metric_types.MetricKey(
                    name='weighted_example_count')
                label_key = metric_types.MetricKey(name='mean_label')
                pred_key = metric_types.MetricKey(name='mean_prediction')
                self.assertDictElementsAlmostEqual(
                    slices[overall_slice], {
                        example_count_key:
                        3,
                        weighted_example_count_key:
                        4.0,
                        label_key:
                        (1.0 + 0.0 + 2 * 0.0) / (1.0 + 1.0 + 2.0),
                        pred_key:
                        (0.2 + 0.8 + 2 * 0.5) / (1.0 + 1.0 + 2.0),
                    })
                self.assertDictElementsAlmostEqual(
                    slices[fixed_string1_slice], {
                        example_count_key: 2,
                        weighted_example_count_key: 2.0,
                        label_key: (1.0 + 0.0) / (1.0 + 1.0),
                        pred_key: (0.2 + 0.8) / (1.0 + 1.0),
                    })
                self.assertDictElementsAlmostEqual(
                    slices[fixed_string2_slice], {
                        example_count_key: 1,
                        weighted_example_count_key: 2.0,
                        label_key: (2 * 0.0) / 2.0,
                        pred_key: (2 * 0.5) / 2.0,
                    })
            except AssertionError as err:
                raise util.BeamAssertException(err)

        util.assert_that(metrics[constants.METRICS_KEY],
                         check_metrics,
                         label='metrics')
def testEvaluateWithMultiClassModel(self):
    """Checks binarized per-class MeanLabel values for a 3-class model.

    MeanLabel is binarized over class ids 0..2 with `age` as the example
    weight, so each class reports the weighted share of examples labelled
    with that class.
    """
    n_classes = 3
    scratch_dir = self._getExportDir()
    _, export_dir = dnn_classifier.simple_dnn_classifier(
        None, scratch_dir, n_classes=n_classes)

    # example_count and weighted_example_count are expected in the output
    # even though only MeanLabel is requested explicitly.
    binarize = config.BinarizationOptions(class_ids=range(n_classes))
    model_spec = config.ModelSpec(location=export_dir,
                                  label_key='label',
                                  example_weight_key='age')
    eval_config = config.EvalConfig(
        model_specs=[model_spec],
        slicing_specs=[config.SlicingSpec()],
        metrics_specs=metric_specs.specs_from_metrics(
            [calibration.MeanLabel('mean_label')], binarize=binarize))
    shared_model = self.createTestEvalSharedModel(
        eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

    single_slice_specs = [
        slicer.SingleSliceSpec(spec=spec)
        for spec in eval_config.slicing_specs
    ]
    extractors = [
        input_extractor.InputExtractor(eval_config=eval_config),
        predict_extractor_v2.PredictExtractor(
            eval_config=eval_config, eval_shared_models=[shared_model]),
        slice_key_extractor.SliceKeyExtractor(slice_spec=single_slice_specs),
    ]
    evaluators = [
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=eval_config, eval_shared_models=[shared_model]),
    ]

    example_rows = [
        (1.0, 'english', 0),
        (2.0, 'chinese', 1),
        (3.0, 'english', 2),
        (4.0, 'chinese', 1),
    ]
    examples = [
        self._makeExample(age=age, language=language, label=label)
        for age, language, label in example_rows
    ]

    with beam.Pipeline() as pipeline:
        # pylint: disable=no-value-for-parameter
        serialized = [example.SerializeToString() for example in examples]
        metrics = (
            pipeline
            | 'Create' >> beam.Create(serialized)
            | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
            | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                extractors=extractors, evaluators=evaluators))

        # pylint: enable=no-value-for-parameter

        def check_metrics(got):
            """Compares the overall slice with hand-computed expectations."""
            try:
                self.assertLen(got, 1)
                got_slice_key, got_metrics = got[0]

                # Example weights (`age`) and labels in input order.
                weights = [1.0, 2.0, 3.0, 4.0]
                labels = [0, 1, 2, 1]
                weight_total = sum(weights)

                expected = {
                    metric_types.MetricKey(name='example_count'):
                    4,
                    metric_types.MetricKey(name='weighted_example_count'):
                    weight_total,
                }
                for class_id in (0, 1, 2):
                    class_weight = sum(
                        weight for weight, label in zip(weights, labels)
                        if label == class_id)
                    class_key = metric_types.MetricKey(
                        name='mean_label',
                        sub_key=metric_types.SubKey(class_id=class_id))
                    expected[class_key] = class_weight / weight_total

                self.assertEqual(got_slice_key, ())
                self.assertDictElementsAlmostEqual(got_metrics, expected)
            except AssertionError as err:
                raise util.BeamAssertException(err)

        util.assert_that(metrics[constants.METRICS_KEY],
                         check_metrics,
                         label='metrics')
def to_proto(self) -> config.SlicingSpec:
    """Builds a `config.SlicingSpec` from this object's columns and features.

    Returns:
      A SlicingSpec whose `feature_keys` are `self._columns` and whose
      `feature_values` map each key in `self._features` to `str(value)`.
    """
    stringified_values = {}
    for feature_key, feature_value in self._features:
        stringified_values[feature_key] = str(feature_value)
    return config.SlicingSpec(feature_keys=self._columns,
                              feature_values=stringified_values)
def testWriteValidationResults(self, output_file_format):
    """Writes validation results via the batched pipeline and reloads them.

    A candidate model (built with `mul=0`) is validated against a baseline
    (built with `mul=1`) under four thresholds. Three of them are expected
    to fail, and exactly those failures must be present in the validation
    result loaded back from disk.

    Args:
      output_file_format: File format passed to the validations writer.
    """
    candidate_dir, baseline_dir = (self._getExportDir(),
                                   self._getBaselineDir())
    candidate_model = self._build_keras_model(candidate_dir, mul=0)
    baseline_model = self._build_keras_model(baseline_dir, mul=1)
    validations_file = os.path.join(self._getTempDir(),
                                    constants.VALIDATIONS_KEY)

    schema = text_format.Parse(
        """
        tensor_representation_group {
          key: ""
          value {
            tensor_representation {
              key: "input"
              value {
                dense_tensor {
                  column_name: "input"
                  shape { dim { size: 1 } }
                }
              }
            }
          }
        }
        feature {
          name: "input"
          type: FLOAT
        }
        feature {
          name: "label"
          type: FLOAT
        }
        feature {
          name: "example_weight"
          type: FLOAT
        }
        feature {
          name: "extra_feature"
          type: BYTES
        }
        """, schema_pb2.Schema())
    tfx_io = test_util.InMemoryTFExampleRecord(
        schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN)
    adapter_config = tensor_adapter.TensorAdapterConfig(
        arrow_schema=tfx_io.ArrowSchema(),
        tensor_representations=tfx_io.TensorRepresentations())

    examples = [
        self._makeExample(input=0.0,
                          label=1.0,
                          example_weight=1.0,
                          extra_feature='non_model_feature'),
        self._makeExample(input=1.0,
                          label=0.0,
                          example_weight=0.5,
                          extra_feature='non_model_feature'),
    ]

    # Needs 1.5 < 1, so this threshold fails.
    weighted_count_config = config.MetricConfig(
        class_name='WeightedExampleCount',
        threshold=config.MetricThreshold(
            value_threshold=config.GenericValueThreshold(
                upper_bound={'value': 1})))
    # Needs 2 > 10, so this threshold fails.
    example_count_config = config.MetricConfig(
        class_name='ExampleCount',
        threshold=config.MetricThreshold(
            value_threshold=config.GenericValueThreshold(
                lower_bound={'value': 10})))
    # Needs 0 > 0 (absolute) and 0 > 0% (relative), so this one fails too.
    mean_label_config = config.MetricConfig(
        class_name='MeanLabel',
        threshold=config.MetricThreshold(
            change_threshold=config.GenericChangeThreshold(
                direction=config.MetricDirection.HIGHER_IS_BETTER,
                relative={'value': 0},
                absolute={'value': 0})))
    # MeanPrediction = (0+0)/(1+0.5) = 0, which passes every bound:
    #   value:    -.01 < 0 < .01
    #   relative: -.333/.333 = -100% < -99%
    #   absolute: 0 - .333 = -.333 < 0
    mean_prediction_config = config.MetricConfig(
        class_name='MeanPrediction',
        threshold=config.MetricThreshold(
            value_threshold=config.GenericValueThreshold(
                upper_bound={'value': .01}, lower_bound={'value': -.01}),
            change_threshold=config.GenericChangeThreshold(
                direction=config.MetricDirection.LOWER_IS_BETTER,
                relative={'value': -.99},
                absolute={'value': 0})))

    eval_config = config.EvalConfig(
        model_specs=[
            config.ModelSpec(name='candidate',
                             label_key='label',
                             example_weight_key='example_weight'),
            config.ModelSpec(name='baseline',
                             label_key='label',
                             example_weight_key='example_weight',
                             is_baseline=True),
        ],
        slicing_specs=[config.SlicingSpec()],
        metrics_specs=[
            config.MetricsSpec(
                metrics=[
                    weighted_count_config,
                    example_count_config,
                    mean_label_config,
                    mean_prediction_config,
                ],
                model_names=['candidate', 'baseline']),
        ],
        options=config.Options(
            disabled_outputs={'values': ['eval_config.json']}),
    )

    single_slice_specs = [
        slicer.SingleSliceSpec(spec=spec)
        for spec in eval_config.slicing_specs
    ]
    models_by_name = {
        'candidate': candidate_model,
        'baseline': baseline_model,
    }
    extractors = [
        batched_input_extractor.BatchedInputExtractor(eval_config),
        batched_predict_extractor_v2.BatchedPredictExtractor(
            eval_shared_model=models_by_name,
            eval_config=eval_config,
            tensor_adapter_config=adapter_config),
        unbatch_extractor.UnbatchExtractor(),
        slice_key_extractor.SliceKeyExtractor(slice_spec=single_slice_specs),
    ]
    evaluators = [
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=eval_config, eval_shared_model=models_by_name),
    ]
    writers = [
        metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
            {constants.VALIDATIONS_KEY: validations_file},
            add_metrics_callbacks=[],
            output_file_format=output_file_format),
    ]

    with beam.Pipeline() as pipeline:
        # pylint: disable=no-value-for-parameter
        serialized = [example.SerializeToString() for example in examples]
        _ = (
            pipeline
            | 'Create' >> beam.Create(serialized)
            | 'BatchExamples' >> tfx_io.BeamSource()
            | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
            | 'ExtractEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                extractors=extractors, evaluators=evaluators)
            | 'WriteResults' >> model_eval_lib.WriteResults(writers=writers))
        # pylint: enable=no-value-for-parameter

    # The pipeline runs when the `with` block exits, so the output is only
    # read back afterwards.
    output_dir = os.path.dirname(validations_file)
    validation_result = (
        metrics_plots_and_validations_writer.
        load_and_deserialize_validation_result(output_dir))

    weighted_count_failure = text_format.Parse(
        """
        metric_key {
          name: "weighted_example_count"
          model_name: "candidate"
        }
        metric_threshold {
          value_threshold {
            upper_bound {
              value: 1.0
            }
          }
        }
        metric_value {
          double_value {
            value: 1.5
          }
        }
        """, validation_result_pb2.ValidationFailure())
    example_count_failure = text_format.Parse(
        """
        metric_key {
          name: "example_count"
          model_name: "candidate"
        }
        metric_threshold {
          value_threshold {
            lower_bound {
              value: 10.0
            }
          }
        }
        metric_value {
          double_value {
            value: 2.0
          }
        }
        """, validation_result_pb2.ValidationFailure())
    mean_label_failure = text_format.Parse(
        """
        metric_key {
          name: "mean_label"
          model_name: "candidate"
          is_diff: true
        }
        metric_threshold {
          change_threshold {
            absolute {
              value: 0.0
            }
            relative {
              value: 0.0
            }
            direction: HIGHER_IS_BETTER
          }
        }
        metric_value {
          double_value {
            value: 0.0
          }
        }
        """, validation_result_pb2.ValidationFailure())
    expected_validations = [
        weighted_count_failure,
        example_count_failure,
        mean_label_failure,
    ]

    self.assertFalse(validation_result.validation_ok)
    self.assertLen(validation_result.metric_validations_per_slice, 1)
    self.assertCountEqual(
        expected_validations,
        validation_result.metric_validations_per_slice[0].failures)
def testIsCrossSliceApplicable(self):
    """Table-driven check of `slicer.is_cross_slice_applicable`.

    Each case pairs a cross-slice key (baseline key, comparison key) with a
    CrossSlicingSpec and states whether the spec should apply to that key.
    """
    overall = config.SlicingSpec()
    a_equals_1 = config.SlicingSpec(feature_values={'a': '1'})
    b_equals_2 = config.SlicingSpec(feature_values={'b': '2'})
    any_a = config.SlicingSpec(feature_keys=['a'])
    any_b = config.SlicingSpec(feature_keys=['b'])
    any_c = config.SlicingSpec(feature_keys=['c'])

    def cross_spec(baseline, comparisons):
        """Builds a CrossSlicingSpec from a baseline and comparison specs."""
        return config.CrossSlicingSpec(baseline_spec=baseline,
                                       slicing_specs=comparisons)

    # (expected, case name, cross-slice key, cross-slicing spec)
    test_cases = [
        (True, 'overall pass', ((), (('b', 2), )),
         cross_spec(overall, [b_equals_2])),
        (True, 'value pass', ((('a', 1), ), (('b', 2), )),
         cross_spec(a_equals_1, [b_equals_2])),
        (True, 'baseline key pass', ((('a', 1), ), (('b', 2), )),
         cross_spec(any_a, [b_equals_2])),
        (True, 'comparison key pass', ((('a', 1), ), (('b', 2), )),
         cross_spec(a_equals_1, [any_b])),
        (True, 'comparison multiple key pass', ((('a', 1), ),
                                                (('c', 3), )),
         cross_spec(a_equals_1, [any_b, any_c])),
        (False, 'overall fail', ((('a', 1), ), (('b', 2), )),
         cross_spec(overall, [b_equals_2])),
        (False, 'value fail', ((('a', 1), ), (('b', 3), )),
         cross_spec(a_equals_1, [b_equals_2])),
        (False, 'baseline key fail', ((('c', 1), ), (('b', 2), )),
         cross_spec(any_a, [b_equals_2])),
        (False, 'comparison key fail', ((('a', 1), ), (('c', 3), )),
         cross_spec(a_equals_1, [any_b])),
        (False, 'comparison multiple key fail', ((('a', 1), ),
                                                 (('d', 3), )),
         cross_spec(a_equals_1, [any_b, any_c])),
    ]  # pyformat: disable

    for want, case_name, cross_key, spec in test_cases:
        got = slicer.is_cross_slice_applicable(cross_slice_key=cross_key,
                                               cross_slicing_spec=spec)
        self.assertEqual(want, got, msg=case_name)
def testRunModelAnalysisWithModelAgnosticPredictions(self):
    """Runs analysis on examples that already carry their predictions.

    The model spec names only a prediction key, a label key and an example
    weight key (`age`). Metrics are sliced by `language` and compared with
    hand-computed values.
    """
    # (age, language, label, prediction)
    rows = [
        (3.0, 'english', 1.0, 0.9),
        (3.0, 'chinese', 0.0, 0.4),
        (4.0, 'english', 1.0, 0.7),
        (5.0, 'chinese', 1.0, 0.2),
    ]
    examples = [
        self._makeExample(age=age,
                          language=language,
                          label=label,
                          prediction=prediction)
        for age, language, label, prediction in rows
    ]
    data_location = self._writeTFExamplesToTFRecords(examples)

    model_spec = config.ModelSpec(prediction_key='prediction',
                                  label_key='label',
                                  example_weight_key='age')
    metric_configs = [
        config.MetricConfig(class_name=class_name) for class_name in
        ('ExampleCount', 'WeightedExampleCount', 'BinaryAccuracy')
    ]
    language_slicing = config.SlicingSpec(feature_keys=['language'])
    eval_config = config.EvalConfig(
        model_specs=[model_spec],
        metrics_specs=[config.MetricsSpec(metrics=metric_configs)],
        slicing_specs=[language_slicing])

    eval_result = model_eval_lib.run_model_analysis(
        eval_config=eval_config,
        data_location=data_location,
        output_path=self._getTempDir())

    def slice_metrics(accuracy, weighted_count, count):
        """Returns the expected metrics dict for one language slice."""
        return {
            'binary_accuracy': {
                'doubleValue': accuracy
            },
            'weighted_example_count': {
                'doubleValue': weighted_count
            },
            'example_count': {
                'doubleValue': count
            },
        }

    expected = {
        (('language', 'chinese'), ): slice_metrics(0.375, 8.0, 2.0),
        (('language', 'english'), ): slice_metrics(1.0, 7.0, 2.0),
    }

    self.assertEqual(eval_result.data_location, data_location)
    self.assertEqual(eval_result.config.slicing_specs[0],
                     config.SlicingSpec(feature_keys=['language']))
    self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
def testEvaluateWithQueryBasedMetrics(self):
    """Checks NDCG@1 and NDCG@2 overall and for each query slice.

    Examples are grouped into queries by `fixed_string`, `fixed_float` is
    the NDCG gain and `fixed_int` is the example weight.
    """
    scratch_dir = self._getExportDir()
    _, export_dir = (fixed_prediction_estimator_extra_fields.
                     simple_fixed_prediction_estimator_extra_fields(
                         None, scratch_dir))

    model_spec = config.ModelSpec(location=export_dir,
                                  label_key='label',
                                  example_weight_key='fixed_int')
    ndcg_specs = metric_specs.specs_from_metrics(
        [ndcg.NDCG(gain_key='fixed_float', name='ndcg')],
        binarize=config.BinarizationOptions(top_k_list=[1, 2]),
        query_key='fixed_string')
    eval_config = config.EvalConfig(
        model_specs=[model_spec],
        slicing_specs=[
            config.SlicingSpec(),
            config.SlicingSpec(feature_keys=['fixed_string']),
        ],
        metrics_specs=ndcg_specs)
    shared_model = self.createTestEvalSharedModel(
        eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

    single_slice_specs = [
        slicer.SingleSliceSpec(spec=spec)
        for spec in eval_config.slicing_specs
    ]
    extractors = [
        input_extractor.InputExtractor(eval_config=eval_config),
        predict_extractor_v2.PredictExtractor(
            eval_config=eval_config, eval_shared_models=[shared_model]),
        slice_key_extractor.SliceKeyExtractor(slice_spec=single_slice_specs),
    ]
    evaluators = [
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=eval_config, eval_shared_models=[shared_model]),
    ]

    # Columns: prediction, label, fixed_float (NDCG gain),
    # fixed_string (query key), fixed_int (example weight).
    rows = [
        (0.2, 1.0, 1.0, 'query1', 1),
        (0.8, 0.0, 0.5, 'query1', 1),
        (0.5, 0.0, 0.5, 'query2', 2),
        (0.9, 1.0, 1.0, 'query2', 2),
        (0.1, 0.0, 0.1, 'query2', 2),
        (0.9, 1.0, 1.0, 'query3', 3),
    ]
    examples = [
        self._makeExample(prediction=prediction,
                          label=label,
                          fixed_float=gain,
                          fixed_string=query,
                          fixed_int=weight)
        for prediction, label, gain, query, weight in rows
    ]

    with beam.Pipeline() as pipeline:
        # pylint: disable=no-value-for-parameter
        serialized = [example.SerializeToString() for example in examples]
        metrics = (
            pipeline
            | 'Create' >> beam.Create(serialized)
            | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
            | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                extractors=extractors, evaluators=evaluators))

        # pylint: enable=no-value-for-parameter

        def check_metrics(got):
            """Compares every slice with hand-computed NDCG values."""
            try:
                self.assertLen(got, 4)
                slices = dict(got)
                overall_slice = ()
                query1_slice = (('fixed_string', b'query1'), )
                query2_slice = (('fixed_string', b'query2'), )
                query3_slice = (('fixed_string', b'query3'), )
                self.assertCountEqual(list(slices), [
                    overall_slice, query1_slice, query2_slice, query3_slice
                ])

                count_key = metric_types.MetricKey(name='example_count')
                weighted_count_key = metric_types.MetricKey(
                    name='weighted_example_count')
                ndcg1_key = metric_types.MetricKey(
                    name='ndcg', sub_key=metric_types.SubKey(top_k=1))
                ndcg2_key = metric_types.MetricKey(
                    name='ndcg', sub_key=metric_types.SubKey(top_k=2))

                def expected_metrics(count, weighted_count, ndcg1, ndcg2):
                    return {
                        count_key: count,
                        weighted_count_key: weighted_count,
                        ndcg1_key: ndcg1,
                        ndcg2_key: ndcg2,
                    }

                # Rows ranked by prediction within each query (p=prediction,
                # g=gain); log is base 2:
                #   Query1 (weight=1): (p=0.8, g=0.5) (p=0.2, g=1.0)
                #   Query2 (weight=2): (p=0.9, g=1.0) (p=0.5, g=0.5)
                #                      (p=0.1, g=0.1)
                #   Query3 (weight=3): (p=0.9, g=1.0)
                #
                # DCG@1:  0.5, 1.0, 1.0
                # NDCG@1: 0.5, 1.0, 1.0
                # Average NDCG@1:
                #   (1 * 0.5 + 2 * 1.0 + 3 * 1.0) / (1 + 2 + 3) ~ 0.92
                #
                # DCG@2:  0.5 + 1.0/log(3) ~ 1.130930
                #         1.0 + 0.5/log(3) ~ 1.315465
                #         1.0
                # NDCG@2: (0.5 + 1.0/log(3)) / (1.0 + 0.5/log(3)) ~ 0.85972
                #         (1.0 + 0.5/log(3)) / (1.0 + 0.5/log(3)) = 1.0
                #         1.0
                # Average NDCG@2:
                #   (1 * 0.860 + 2 * 1.0 + 3 * 1.0) / (1 + 2 + 3) ~ 0.97
                expectations = [
                    (overall_slice,
                     expected_metrics(6, 11.0, 0.9166667, 0.9766198)),
                    (query1_slice, expected_metrics(2, 2.0, 0.5, 0.85972)),
                    (query2_slice, expected_metrics(3, 6.0, 1.0, 1.0)),
                    (query3_slice, expected_metrics(1, 3.0, 1.0, 1.0)),
                ]
                for slice_key, expected in expectations:
                    self.assertDictElementsAlmostEqual(
                        slices[slice_key], expected)
            except AssertionError as err:
                raise util.BeamAssertException(err)

        util.assert_that(metrics[constants.METRICS_KEY],
                         check_metrics,
                         label='metrics')
def validate_metrics(
        sliced_metrics: Tuple[Union[slicer.SliceKeyType,
                                    slicer.CrossSliceKeyType],
                              Dict['metric_types.MetricKey', Any]],
        eval_config: config.EvalConfig
) -> validation_result_pb2.ValidationResult:
    """Validates one slice's metrics against the configured thresholds.

    Every threshold whose slicing spec applies to the slice is checked
    against the matching metric value. A failure is recorded when the check
    does not pass, when the metric value cannot be converted to a float, or
    when a thresholded metric is missing from `sliced_metrics` altogether.
    Metrics that belong to the baseline model are never checked.

    Args:
      sliced_metrics: Pair of (slice key or cross-slice key, dict from metric
        key to metric value).
      eval_config: Eval config whose metrics specs supply the thresholds.

    Returns:
      A ValidationResult. `validation_ok` stays True when no failure was
      recorded; otherwise it is False and `metric_validations_per_slice`
      holds one entry with the failures for this slice. A slicing-details
      entry is added for each applicable (metric, slice spec) check, except
      for failures skipped as duplicates of an already recorded threshold.

    Raises:
      ValueError: If a change threshold has a direction other than
        LOWER_IS_BETTER or HIGHER_IS_BETTER.
    """
    # Find out which model is baseline.
    baseline_spec = model_util.get_baseline_model_spec(eval_config)
    baseline_model_name = baseline_spec.name if baseline_spec else None

    sliced_key, metrics = sliced_metrics
    thresholds = metric_specs.metric_thresholds_from_metrics_specs(
        eval_config.metrics_specs)  # pytype: disable=wrong-arg-types
    is_cross_slice = slicer.is_cross_slice_key(sliced_key)

    def _check_threshold(key: metric_types.MetricKey,
                         threshold: _ThresholdType, metric: Any) -> bool:
        """Verify a metric given its metric key and metric value."""
        if isinstance(threshold, config.GenericValueThreshold):
            # Bounds that are not set leave that side unconstrained.
            lower_bound, upper_bound = -np.inf, np.inf
            if threshold.HasField('lower_bound'):
                lower_bound = threshold.lower_bound.value
            if threshold.HasField('upper_bound'):
                upper_bound = threshold.upper_bound.value
            return metric > lower_bound and metric < upper_bound
        elif isinstance(threshold, config.GenericChangeThreshold):
            # For change thresholds the metric value is the diff itself; the
            # ratio is taken against the baseline model's value.
            diff = metric
            ratio = diff / metrics[key.make_baseline_key(baseline_model_name)]
            # Defaults make an unset absolute/relative bound always pass.
            if threshold.direction == config.MetricDirection.LOWER_IS_BETTER:
                absolute, relative = np.inf, np.inf
            elif (threshold.direction ==
                  config.MetricDirection.HIGHER_IS_BETTER):
                absolute, relative = -np.inf, -np.inf
            else:
                raise ValueError('"UNKNOWN" direction for change threshold.')
            if threshold.HasField('absolute'):
                absolute = threshold.absolute.value
            if threshold.HasField('relative'):
                relative = threshold.relative.value
            if threshold.direction == config.MetricDirection.LOWER_IS_BETTER:
                return diff < absolute and ratio < relative
            elif (threshold.direction ==
                  config.MetricDirection.HIGHER_IS_BETTER):
                return diff > absolute and ratio > relative

    def _copy_metric(metric, to):
        # Will add more types when more MetricValue are supported.
        to.double_value.value = float(metric)

    def _copy_threshold(threshold, to):
        if isinstance(threshold, config.GenericValueThreshold):
            to.value_threshold.CopyFrom(threshold)
        if isinstance(threshold, config.GenericChangeThreshold):
            to.change_threshold.CopyFrom(threshold)

    def _add_to_set(s, v):
        """Adds value to set. Returns true if didn't exist."""
        if v in s:
            return False
        else:
            s.add(v)
            return True

    # Empty metrics per slice is considered validated.
    result = validation_result_pb2.ValidationResult(validation_ok=True)
    validation_for_slice = validation_result_pb2.MetricsValidationForSlice()
    unchecked_thresholds = dict(thresholds)
    for metric_key, metric in metrics.items():
        if metric_key not in thresholds:
            continue
        del unchecked_thresholds[metric_key]
        # Not meaningful to check threshold for baseline model, so thresholds
        # configured for it are skipped.
        if metric_key.model_name == baseline_model_name:
            continue
        msg = ''
        # We try to convert to float values. A value that cannot be
        # converted is reported as a failure carrying `msg`; previously the
        # comparison and the metric copy still ran on it and raised.
        is_comparable = True
        try:
            metric = float(metric)
        except (TypeError, ValueError):
            is_comparable = False
            # NOTE(review): the first placeholder receives the metric's type
            # although the text says "threshold"; the wording is kept as is.
            msg = (' Invalid threshold config: This metric is not comparable '
                   'to the threshold. The type of the threshold is: {}, and '
                   'the metric value is: \n{}').format(type(metric), metric)
        existing_failures = set()
        for slice_spec, threshold in thresholds[metric_key]:
            # Skip single-slice specs that do not apply to this slice.
            if (slice_spec is not None
                    and isinstance(slice_spec, config.SlicingSpec)
                    and (is_cross_slice or not slicer.SingleSliceSpec(
                        spec=slice_spec).is_slice_applicable(sliced_key))):
                continue
            # Skip cross-slice specs that do not apply to this slice.
            if (slice_spec is not None
                    and isinstance(slice_spec, config.CrossSlicingSpec)
                    and (not is_cross_slice
                         or not slicer.is_cross_slice_applicable(
                             cross_slice_key=sliced_key,
                             cross_slicing_spec=slice_spec))):
                continue
            if not is_comparable or not _check_threshold(
                    metric_key, threshold, metric):
                # The same threshold values could be set for multiple matching
                # slice specs. Only store the first match.
                #
                # Note that hashing by SerializeToString() is only safe if
                # used within the same process.
                if not _add_to_set(existing_failures,
                                   threshold.SerializeToString()):
                    continue
                failure = validation_for_slice.failures.add()
                failure.metric_key.CopyFrom(metric_key.to_proto())
                if is_comparable:
                    # Only float-convertible values fit in metric_value.
                    _copy_metric(metric, failure.metric_value)
                _copy_threshold(threshold, failure.metric_threshold)
                failure.message = msg
            # Track we have completed a validation check for slice spec and
            # metric
            slicing_details = result.validation_details.slicing_details.add()
            if slice_spec is not None:
                if isinstance(slice_spec, config.SlicingSpec):
                    slicing_details.slicing_spec.CopyFrom(slice_spec)
                else:
                    slicing_details.cross_slicing_spec.CopyFrom(slice_spec)
            else:
                slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
            slicing_details.num_matching_slices = 1

    # All unchecked thresholds are considered failures.
    for metric_key, missing_thresholds in unchecked_thresholds.items():
        if metric_key.model_name == baseline_model_name:
            continue
        existing_failures = set()
        for _, threshold in missing_thresholds:
            # The same threshold values could be set for multiple matching
            # slice specs. Only store the first match.
            #
            # Note that hashing by SerializeToString() is only safe if used
            # within the same process.
            if not _add_to_set(existing_failures,
                               threshold.SerializeToString()):
                continue
            failure = validation_for_slice.failures.add()
            failure.metric_key.CopyFrom(metric_key.to_proto())
            _copy_threshold(threshold, failure.metric_threshold)
            failure.message = 'Metric not found.'

    # Any failure leads to overall failure.
    if validation_for_slice.failures:
        if not is_cross_slice:
            validation_for_slice.slice_key.CopyFrom(
                slicer.serialize_slice_key(sliced_key))
        else:
            validation_for_slice.cross_slice_key.CopyFrom(
                slicer.serialize_cross_slice_key(sliced_key))
        result.validation_ok = False
        result.metric_validations_per_slice.append(validation_for_slice)
    return result
def testWriteValidationResults(self):
  """Checks that threshold failures are written out and can be loaded back.

  Evaluates a candidate and a baseline model (built by `_build_keras_model`)
  on two weighted examples with four thresholded metrics, writes only the
  validations output, and asserts that the reloaded result is not OK and
  reports exactly the three expected failures in a single per-slice entry.
  """
  candidate_dir = self._getExportDir()
  baseline_dir = self._getBaselineDir()
  eval_shared_model = self._build_keras_model(candidate_dir, mul=0)
  baseline_eval_shared_model = self._build_keras_model(baseline_dir, mul=1)
  validations_file = os.path.join(self._getTempDir(),
                                  constants.VALIDATIONS_KEY)

  # (input, label, example_weight) for each example fed to the pipeline.
  examples = [
      self._makeExample(
          input=model_input,
          label=label,
          example_weight=weight,
          extra_feature='non_model_feature')
      for model_input, label, weight in ((0.0, 1.0, 1.0), (1.0, 0.0, 0.5))
  ]

  candidate_spec = config.ModelSpec(
      name='candidate',
      label_key='label',
      example_weight_key='example_weight')
  baseline_spec = config.ModelSpec(
      name='baseline',
      label_key='label',
      example_weight_key='example_weight',
      is_baseline=True)

  # 1.5 < 1, NOT OK.
  weighted_example_count = config.MetricConfig(
      class_name='WeightedExampleCount',
      threshold=config.MetricThreshold(
          value_threshold=config.GenericValueThreshold(
              upper_bound={'value': 1})))
  # 2 > 10, NOT OK.
  example_count = config.MetricConfig(
      class_name='ExampleCount',
      threshold=config.MetricThreshold(
          value_threshold=config.GenericValueThreshold(
              lower_bound={'value': 10})))
  # 0 > 0 and 0 > 0%?: NOT OK.
  mean_label = config.MetricConfig(
      class_name='MeanLabel',
      threshold=config.MetricThreshold(
          change_threshold=config.GenericChangeThreshold(
              direction=config.MetricDirection.HIGHER_IS_BETTER,
              relative={'value': 0},
              absolute={'value': 0})))
  # MeanPrediction = (0+0)/(1+0.5) = 0
  # -.01 < 0 < .01, OK.
  # Diff% = -.333/.333 = -100% < -99%, OK.
  # Diff = 0 - .333 = -.333 < 0, OK.
  mean_prediction = config.MetricConfig(
      class_name='MeanPrediction',
      threshold=config.MetricThreshold(
          value_threshold=config.GenericValueThreshold(
              upper_bound={'value': .01}, lower_bound={'value': -.01}),
          change_threshold=config.GenericChangeThreshold(
              direction=config.MetricDirection.LOWER_IS_BETTER,
              relative={'value': -.99},
              absolute={'value': 0})))

  eval_config = config.EvalConfig(
      model_specs=[candidate_spec, baseline_spec],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  weighted_example_count,
                  example_count,
                  mean_label,
                  mean_prediction,
              ],
              model_names=['candidate', 'baseline']),
      ],
      options=config.Options(
          disabled_outputs={'values': ['eval_config.json']}),
  )

  slice_spec = [
      slicer.SingleSliceSpec(spec=spec) for spec in eval_config.slicing_specs
  ]
  eval_shared_models = {
      'candidate': eval_shared_model,
      'baseline': baseline_eval_shared_model
  }
  extractors = [
      input_extractor.InputExtractor(eval_config),
      predict_extractor_v2.PredictExtractor(
          eval_shared_model=eval_shared_models, eval_config=eval_config),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_model=eval_shared_models)
  ]
  # Only the validations output is requested from the writer.
  output_paths = {constants.VALIDATIONS_KEY: validations_file}
  writers = [
      metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
          output_paths, add_metrics_callbacks=[])
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    serialized_examples = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples]))
    _ = (
        serialized_examples
        | 'ExtractEvaluateAndWriteResults' >>
        model_eval_lib.ExtractEvaluateAndWriteResults(
            eval_config=eval_config,
            eval_shared_model=eval_shared_model,
            extractors=extractors,
            evaluators=evaluators,
            writers=writers))
    # pylint: enable=no-value-for-parameter

  validation_result = model_eval_lib.load_validation_result(
      os.path.dirname(validations_file))

  def _parse_failure(text_proto):
    """Parses a text-format ValidationFailure proto."""
    return text_format.Parse(text_proto,
                             validation_result_pb2.ValidationFailure())

  expected_validations = [
      _parse_failure("""
          metric_key {
            name: "weighted_example_count"
            model_name: "candidate"
          }
          metric_threshold {
            value_threshold {
              upper_bound {
                value: 1.0
              }
            }
          }
          metric_value {
            double_value {
              value: 1.5
            }
          }
          """),
      _parse_failure("""
          metric_key {
            name: "example_count"
          }
          metric_threshold {
            value_threshold {
              lower_bound {
                value: 10.0
              }
            }
          }
          metric_value {
            double_value {
              value: 2.0
            }
          }
          """),
      _parse_failure("""
          metric_key {
            name: "mean_label"
            model_name: "candidate"
            is_diff: true
          }
          metric_threshold {
            change_threshold {
              absolute {
                value: 0.0
              }
              relative {
                value: 0.0
              }
              direction: HIGHER_IS_BETTER
            }
          }
          metric_value {
            double_value {
              value: 0.0
            }
          }
          """),
  ]

  self.assertFalse(validation_result.validation_ok)
  self.assertLen(validation_result.metric_validations_per_slice, 1)
  # Failure order is not part of the contract, so compare as multisets.
  self.assertCountEqual(
      expected_validations,
      validation_result.metric_validations_per_slice[0].failures)
def testMetricThresholdsFromMetricsSpecs(self):
  """Checks keys and threshold counts derived from a mix of MetricsSpecs.

  Builds specs that attach thresholds by metric name (plain, per-slice and
  cross-slice) and on individual MetricConfigs (including a spec with
  binarization and macro averaging), then asserts that
  `metric_thresholds_from_metrics_specs` returns exactly the expected
  MetricKeys, each with the expected number of thresholds.
  """

  def _value_only():
    """Returns a fresh threshold with only an (empty) value threshold."""
    return config.MetricThreshold(
        value_threshold=config.GenericValueThreshold())

  def _change_only():
    """Returns a fresh threshold with only an (empty) change threshold."""
    return config.MetricThreshold(
        change_threshold=config.GenericChangeThreshold())

  def _value_and_change():
    """Returns a fresh threshold with both value and change thresholds."""
    return config.MetricThreshold(
        value_threshold=config.GenericValueThreshold(),
        change_threshold=config.GenericChangeThreshold())

  per_slice_specs = [
      config.SlicingSpec(feature_keys=['feature1']),
      config.SlicingSpec(feature_values={'feature2': 'value1'})
  ]
  # For cross slice tests.
  baseline_slice_spec = config.SlicingSpec(feature_keys=['feature3'])

  def _cross_spec():
    """Returns a fresh cross slicing spec against the baseline slice."""
    return config.CrossSlicingSpec(
        baseline_spec=baseline_slice_spec, slicing_specs=per_slice_specs)

  def _per_slice(threshold):
    """Wraps `threshold` in a per-slice threshold over `per_slice_specs`."""
    return config.PerSliceMetricThreshold(
        slicing_specs=per_slice_specs, threshold=threshold)

  def _cross_slice(threshold):
    """Wraps `threshold` in a cross-slice threshold over `_cross_spec()`."""
    return config.CrossSliceMetricThreshold(
        cross_slicing_specs=[_cross_spec()], threshold=threshold)

  def _count_metrics_spec(class_name, metric_name):
    """Returns a two-model, two-output spec with one value threshold."""
    return config.MetricsSpec(
        metrics=[
            config.MetricConfig(
                class_name=class_name,
                config=json.dumps({'name': metric_name}),
                threshold=_value_only())
        ],
        model_names=['model_name1', 'model_name2'],
        output_names=['output_name1', 'output_name2'])

  # Thresholds keyed by metric name rather than attached to a MetricConfig.
  name_keyed_spec = config.MetricsSpec(
      thresholds={
          'auc': _value_only(),
          'mean/label': _value_and_change(),
          'mse': _change_only()
      },
      per_slice_thresholds={
          'auc':
              config.PerSliceMetricThresholds(
                  thresholds=[_per_slice(_value_only())]),
          'mean/label':
              config.PerSliceMetricThresholds(
                  thresholds=[_per_slice(_value_and_change())])
      },
      cross_slice_thresholds={
          'auc':
              config.CrossSliceMetricThresholds(
                  thresholds=[_cross_slice(_value_and_change())]),
          'mse':
              config.CrossSliceMetricThresholds(thresholds=[
                  _cross_slice(_change_only()),
                  # Test for duplicate cross_slicing_spec.
                  _cross_slice(_value_only())
              ])
      },
      model_names=['model_name'],
      output_names=['output_name'])

  # Thresholds attached to MetricConfigs, with binarization and aggregation.
  binarized_spec = config.MetricsSpec(
      metrics=[
          config.MetricConfig(
              class_name='MeanSquaredError',
              config=json.dumps({'name': 'mse'}),
              threshold=_change_only()),
          config.MetricConfig(
              class_name='MeanLabel',
              config=json.dumps({'name': 'mean_label'}),
              threshold=_change_only(),
              per_slice_thresholds=[_per_slice(_change_only())],
              cross_slice_thresholds=[_cross_slice(_change_only())]),
      ],
      model_names=['model_name'],
      output_names=['output_name'],
      binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
      aggregate=config.AggregationOptions(
          macro_average=True, class_weights={
              0: 1.0,
              1: 1.0
          }))

  metrics_specs = [
      name_keyed_spec,
      _count_metrics_spec('ExampleCount', 'example_count'),
      _count_metrics_spec('WeightedExampleCount', 'weighted_example_count'),
      binarized_spec,
  ]

  thresholds = metric_specs.metric_thresholds_from_metrics_specs(
      metrics_specs)

  def _key(name, **key_kwargs):
    """Returns a MetricKey for the single-model, single-output specs."""
    return metric_types.MetricKey(
        name=name,
        model_name='model_name',
        output_name='output_name',
        **key_kwargs)

  expected_keys_and_threshold_counts = {
      _key('auc', is_diff=False): 4,
      _key('auc', is_diff=True): 1,
      _key('mean/label', is_diff=True): 3,
      _key('mean/label', is_diff=False): 3,
  }
  # One threshold for every (metric, model, output) combination.
  for metric_name in ('example_count', 'weighted_example_count'):
    for model_name in ('model_name1', 'model_name2'):
      for output_name in ('output_name1', 'output_name2'):
        count_key = metric_types.MetricKey(
            name=metric_name, model_name=model_name, output_name=output_name)
        expected_keys_and_threshold_counts[count_key] = 1
  expected_keys_and_threshold_counts.update({
      _key('mse', sub_key=metric_types.SubKey(class_id=0), is_diff=True): 1,
      _key('mse', sub_key=metric_types.SubKey(class_id=1), is_diff=True): 1,
      _key('mse', is_diff=True): 2,
      _key('mse', is_diff=False): 1,
      _key(
          'mse',
          aggregation_type=metric_types.AggregationType(macro_average=True),
          is_diff=True): 1,
      _key(
          'mean_label',
          sub_key=metric_types.SubKey(class_id=0),
          is_diff=True): 4,
      _key(
          'mean_label',
          sub_key=metric_types.SubKey(class_id=1),
          is_diff=True): 4,
      _key(
          'mean_label',
          aggregation_type=metric_types.AggregationType(macro_average=True),
          is_diff=True): 4,
  })

  self.assertLen(thresholds, len(expected_keys_and_threshold_counts))
  for key, count in expected_keys_and_threshold_counts.items():
    self.assertIn(key, thresholds)
    self.assertLen(thresholds[key], count, 'failed for key {}'.format(key))