def testMetricKeysToSkipForConfidenceIntervals(self):
  metrics_specs = [
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='ExampleCount',
                  config=json.dumps({'name': 'example_count'}),
                  threshold=config.MetricThreshold(
                      value_threshold=config.GenericValueThreshold())),
              config.MetricConfig(
                  class_name='MeanLabel',
                  config=json.dumps({'name': 'mean_label'}),
                  threshold=config.MetricThreshold(
                      change_threshold=config.GenericChangeThreshold())),
              config.MetricConfig(
                  class_name='MeanSquaredError',
                  config=json.dumps({'name': 'mse'}),
                  threshold=config.MetricThreshold(
                      change_threshold=config.GenericChangeThreshold()))
          ],
          # Model names and output_names should be ignored because
          # ExampleCount is model independent.
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1', 'output_name2']),
  ]
  metrics_specs += metric_specs.specs_from_metrics(
      [tf.keras.metrics.MeanSquaredError('mse')])

  keys = metric_specs.metric_keys_to_skip_for_confidence_intervals(
      metrics_specs)
  self.assertLen(keys, 1)
  self.assertIn(metric_types.MetricKey(name='example_count'), keys)
def testMetricSpecsFromKerasSequential(self):
  export_dir = os.path.join(self._getTempDir(), 'export_dir')
  model = tf.keras.models.Sequential([
      tf.keras.layers.InputLayer(input_shape=(1,), name='test'),
      tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
  ])
  model.compile(
      loss=tf.keras.losses.BinaryCrossentropy(),
      metrics=[tf.keras.metrics.MeanSquaredError(name='mse')])
  features = [[0.0], [1.0]]
  labels = [[1], [0]]
  dataset = tf.data.Dataset.from_tensor_slices((features, labels))
  dataset = dataset.shuffle(buffer_size=1).repeat().batch(2)
  model.fit(dataset, steps_per_epoch=1)
  model.save(export_dir, save_format='tf')

  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])
  metrics_specs = (
      keras_util.metrics_specs_from_keras('', eval_shared_model.model_loader))

  # TODO(b/149995449): Keras does not support re-loading metrics with the new
  # API. Re-enable after this is fixed.
  model = eval_shared_model.model_loader.construct_fn(lambda x: None)()
  if not hasattr(model, 'loss_functions'):
    return

  self.assertLen(metrics_specs, 1)
  self.assertProtoEquals(
      self._comparable_spec(metrics_specs[0]),
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='BinaryCrossentropy',
                  config=json.dumps(
                      {
                          'from_logits': False,
                          'label_smoothing': 0,
                          'reduction': 'auto',
                          'name': 'binary_crossentropy'
                      },
                      sort_keys=True)),
              config.MetricConfig(
                  class_name='MeanSquaredError',
                  config=json.dumps({
                      'name': 'mse',
                      'dtype': 'float32'
                  }, sort_keys=True))
          ],
          model_names=['']))
def testToComputationsWithMixedAggregationAndNonAggregationMetrics(self):
  computations = metric_specs.to_computations(
      [
          config.MetricsSpec(
              metrics=[config.MetricConfig(class_name='CategoricalAccuracy')]),
          config.MetricsSpec(
              metrics=[config.MetricConfig(class_name='BinaryCrossentropy')],
              binarize=config.BinarizationOptions(class_ids={'values': [1]}),
              aggregate=config.AggregationOptions(micro_average=True))
      ], config.EvalConfig())

  # 3 separate computations should be used (one for aggregated metrics, one
  # for non-aggregated metrics, and one for metrics associated with class 1).
  self.assertLen(computations, 3)
def testValidateMetricsMetricValueAndThresholdIgnoreUnmatchedSlice(
    self, slicing_specs, slice_key):
  threshold = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(upper_bound={'value': 1}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 1.5 < 1, NOT OK.
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ]),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (slice_key, {
      metric_types.MetricKey(name='weighted_example_count'): 1.5,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertTrue(result.validation_ok)
def testValidateMetricsValueThresholdLowerBoundPass(
    self, slicing_specs, slice_key):
  threshold = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(lower_bound={'value': 1}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 2 > 1, OK.
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ]),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (slice_key, {
      metric_types.MetricKey(name='weighted_example_count'): 2,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertTrue(result.validation_ok)
def testValidateMetricsCrossSliceThresholdFail(self, cross_slicing_specs,
                                               slice_key):
  threshold = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(upper_bound={'value': 1}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      cross_slicing_specs=cross_slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 1.5 < 1, NOT OK.
                      threshold=(threshold
                                 if cross_slicing_specs is None else None),
                      cross_slice_thresholds=[
                          config.CrossSliceMetricThreshold(
                              cross_slicing_specs=cross_slicing_specs,
                              threshold=threshold)
                      ]),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (slice_key, {
      metric_types.MetricKey(name='weighted_example_count'): 1.5,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
def _serialize_tf_metric(
    metric: tf.keras.metrics.Metric) -> config.MetricConfig:
  """Serializes TF metric."""
  cfg = metric_util.serialize_metric(metric)
  return config.MetricConfig(
      class_name=cfg['class_name'],
      config=json.dumps(cfg['config'], sort_keys=True))
def testValidateMetricsChangeThresholdHigherIsBetterFail(self):
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
          config.ModelSpec(name='baseline', is_baseline=True)
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='MeanPrediction',
                      # Diff = -.333 > 0, NOT OK.
                      threshold=config.MetricThreshold(
                          change_threshold=config.GenericChangeThreshold(
                              direction=config.MetricDirection
                              .HIGHER_IS_BETTER,
                              absolute={'value': 0}))),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = ((()), {
      metric_types.MetricKey(name='mean_prediction', model_name='baseline'):
          0.333,
      metric_types.MetricKey(name='mean_prediction', is_diff=True):
          -0.333,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
def testValidateMetricsChangeThresholdRelativePass(self):
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
          config.ModelSpec(name='baseline', is_baseline=True)
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='MeanPrediction',
                      # Diff = -.333
                      # Diff% = -.333/.333 = -100% < 0%, OK.
                      threshold=config.MetricThreshold(
                          change_threshold=config.GenericChangeThreshold(
                              direction=config.MetricDirection
                              .LOWER_IS_BETTER,
                              relative={'value': 0}))),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = ((()), {
      metric_types.MetricKey(name='mean_prediction', model_name='baseline'):
          0.333,
      metric_types.MetricKey(name='mean_prediction', is_diff=True):
          -0.333,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertTrue(result.validation_ok)
def testValidateMetricsValueThresholdLowerBoundPass(self):
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 2 > 1, OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              lower_bound={'value': 1}))),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = ((()), {
      metric_types.MetricKey(name='weighted_example_count'): 2,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertTrue(result.validation_ok)
def testValidateMetricsMetricTDistributionValueAndThreshold(
    self, slicing_specs, slice_key):
  threshold = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(lower_bound={'value': 0.9}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='AUC',
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ]),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (slice_key, {
      metric_types.MetricKey(name='auc'):
          types.ValueWithTDistribution(sample_mean=0.91, unsampled_value=0.8)
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
  expected = text_format.Parse(
      """
      metric_validations_per_slice {
        failures {
          metric_key {
            name: "auc"
          }
          metric_value {
            double_value {
              value: 0.8
            }
          }
        }
      }""", validation_result_pb2.ValidationResult())
  expected.metric_validations_per_slice[0].failures[
      0].metric_threshold.CopyFrom(threshold)
  expected.metric_validations_per_slice[0].slice_key.CopyFrom(
      slicer.serialize_slice_key(slice_key))
  for spec in slicing_specs or [None]:
    if (spec is None or
        slicer.SingleSliceSpec(spec=spec).is_slice_applicable(slice_key)):
      slicing_details = expected.validation_details.slicing_details.add()
      if spec is not None:
        slicing_details.slicing_spec.CopyFrom(spec)
      else:
        slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
      slicing_details.num_matching_slices = 1
  self.assertEqual(result, expected)
def _serialize_tf_loss(loss: tf.keras.losses.Loss) -> config.MetricConfig:
  """Serializes TF loss."""
  cfg = metric_util.serialize_loss(loss)
  return config.MetricConfig(
      class_name=cfg['class_name'],
      module=loss.__class__.__module__,
      config=json.dumps(cfg['config'], sort_keys=True))
def _serialize_tfma_metric(metric: metric_types.Metric) -> config.MetricConfig:
  """Serializes TFMA metric."""
  # This implementation is identical to _serialize_tf_metric, but we keep two
  # implementations for symmetry with deserialization, where separate
  # implementations are required (and to be consistent with the keras
  # implementation).
  cfg = tf.keras.utils.serialize_keras_object(metric)
  return config.MetricConfig(
      class_name=cfg['class_name'], config=json.dumps(cfg['config']))
def testGetMissingSlices(self):
  slicing_specs = [
      config.SlicingSpec(),
      config.SlicingSpec(feature_values={'feature1': 'value1'}),
      config.SlicingSpec(feature_values={'feature2': 'value2'})
  ]
  threshold = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(upper_bound={'value': 1}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 1.5 < 1, NOT OK.
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ]),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = ((('feature1', 'value1'),), {
      metric_types.MetricKey(name='weighted_example_count'): 0,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  expected_checks = text_format.Parse(
      """
      validation_ok: true
      validation_details {
        slicing_details {
          slicing_spec {
            feature_values {
              key: "feature1"
              value: "value1"
            }
          }
          num_matching_slices: 1
        }
      }""", validation_result_pb2.ValidationResult())
  self.assertProtoEquals(expected_checks, result)

  missing = metrics_validator.get_missing_slices(
      result.validation_details.slicing_details, eval_config)
  self.assertLen(missing, 2)
  self.assertProtoEquals(missing[0], slicing_specs[0])
  self.assertProtoEquals(missing[1], slicing_specs[2])
def testValidateMetricsMetricValueAndThreshold(self):
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 1.5 < 1, NOT OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              upper_bound={'value': 1}))),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = ((()), {
      metric_types.MetricKey(name='weighted_example_count'): 1.5,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
  expected = text_format.Parse(
      """
      metric_validations_per_slice {
        slice_key {
        }
        failures {
          metric_key {
            name: "weighted_example_count"
          }
          metric_threshold {
            value_threshold {
              upper_bound {
                value: 1.0
              }
            }
          }
          metric_value {
            double_value {
              value: 1.5
            }
          }
        }
      }""", validation_result_pb2.ValidationResult())
  self.assertEqual(result, expected)
def testMetricsSpecBeamCounter(self):
  with beam.Pipeline() as pipeline:
    metrics_spec = config.MetricsSpec(
        metrics=[config.MetricConfig(class_name='FairnessIndicators')])
    _ = pipeline | counter_util.IncrementMetricsSpecsCounters([metrics_spec])

  result = pipeline.run()
  metric_filter = beam.metrics.metric.MetricsFilter().with_namespace(
      constants.METRICS_NAMESPACE).with_name(
          'metric_computed_FairnessIndicators_v2')
  actual_metrics_count = result.metrics().query(
      filter=metric_filter)['counters'][0].committed

  self.assertEqual(actual_metrics_count, 1)
def testValidateMetricsMetricValueAndThreshold(self, slicing_specs, slice_key):
  threshold = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(upper_bound={'value': 1}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 1.5 < 1, NOT OK.
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ]),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (slice_key, {
      metric_types.MetricKey(name='weighted_example_count'): 1.5,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
  expected = text_format.Parse(
      """
      metric_validations_per_slice {
        failures {
          metric_key {
            name: "weighted_example_count"
          }
          metric_value {
            double_value {
              value: 1.5
            }
          }
        }
      }""", validation_result_pb2.ValidationResult())
  expected.metric_validations_per_slice[0].failures[
      0].metric_threshold.CopyFrom(threshold)
  expected.metric_validations_per_slice[0].slice_key.CopyFrom(
      slicer.serialize_slice_key(slice_key))
  self.assertEqual(result, expected)
def testValidateMetricsChangeThresholdRelativeFail(self, slicing_specs,
                                                   slice_key):
  threshold = config.MetricThreshold(
      change_threshold=config.GenericChangeThreshold(
          direction=config.MetricDirection.LOWER_IS_BETTER,
          relative={'value': -2}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
          config.ModelSpec(name='baseline', is_baseline=True)
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='MeanPrediction',
                      # Diff = -.333
                      # Diff% = -.333/.333 = -100% < -200%, NOT OK.
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ])
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (slice_key, {
      metric_types.MetricKey(name='mean_prediction', model_name='baseline'):
          0.333,
      metric_types.MetricKey(name='mean_prediction', is_diff=True):
          -0.333,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
def testValidateMetricsChangeThresholdHigherIsBetterPass(
    self, slicing_specs, slice_key):
  threshold = config.MetricThreshold(
      change_threshold=config.GenericChangeThreshold(
          direction=config.MetricDirection.HIGHER_IS_BETTER,
          absolute={'value': -1}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
          config.ModelSpec(name='baseline', is_baseline=True)
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='MeanPrediction',
                      # Diff = -.333 > -1, OK.
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ])
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (slice_key, {
      metric_types.MetricKey(name='mean_prediction', model_name='baseline'):
          0.333,
      metric_types.MetricKey(name='mean_prediction', is_diff=True):
          -0.333,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertTrue(result.validation_ok)
def testValidateMetricsDivByZero(self):
  threshold = config.MetricThreshold(
      change_threshold=config.GenericChangeThreshold(
          direction=config.MetricDirection.HIGHER_IS_BETTER,
          relative={'value': 0.1}))
  slicing_specs = [config.SlicingSpec()]
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(name='candidate'),
          config.ModelSpec(name='baseline', is_baseline=True)
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='MeanPrediction',
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ])
              ],
              model_names=['baseline', 'candidate']),
      ],
  )
  sliced_metrics = ((()), {
      metric_types.MetricKey(name='mean_prediction', model_name='baseline'):
          0.0,
      metric_types.MetricKey(
          name='mean_prediction', model_name='candidate', is_diff=True):
          0.1,
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
def testSpecsFromMetrics(self):
  metrics_specs = metric_specs.specs_from_metrics(
      {
          'output_name1': [
              tf.keras.metrics.MeanSquaredError('mse'),
              calibration.MeanLabel('mean_label')
          ],
          'output_name2': [
              tf.keras.metrics.RootMeanSquaredError('rmse'),
              calibration.MeanPrediction('mean_prediction')
          ]
      },
      model_names=['model_name1', 'model_name2'],
      binarize=config.BinarizationOptions(class_ids=[0, 1]),
      aggregate=config.AggregationOptions(macro_average=True))

  self.assertLen(metrics_specs, 5)
  self.assertProtoEquals(
      metrics_specs[0],
      config.MetricsSpec(metrics=[
          config.MetricConfig(
              class_name='ExampleCount',
              config=json.dumps({'name': 'example_count'})),
      ]))
  self.assertProtoEquals(
      metrics_specs[1],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='WeightedExampleCount',
                  config=json.dumps({'name': 'weighted_example_count'})),
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1']))
  self.assertProtoEquals(
      metrics_specs[2],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='MeanSquaredError',
                  config=json.dumps({
                      'name': 'mse',
                      'dtype': 'float32'
                  })),
              config.MetricConfig(
                  class_name='MeanLabel',
                  config=json.dumps({'name': 'mean_label'}))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1'],
          binarize=config.BinarizationOptions(class_ids=[0, 1]),
          aggregate=config.AggregationOptions(macro_average=True)))
  self.assertProtoEquals(
      metrics_specs[3],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='WeightedExampleCount',
                  config=json.dumps({'name': 'weighted_example_count'})),
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name2']))
  self.assertProtoEquals(
      metrics_specs[4],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='RootMeanSquaredError',
                  config=json.dumps({
                      'name': 'rmse',
                      'dtype': 'float32'
                  })),
              config.MetricConfig(
                  class_name='MeanPrediction',
                  config=json.dumps({'name': 'mean_prediction'}))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name2'],
          binarize=config.BinarizationOptions(class_ids=[0, 1]),
          aggregate=config.AggregationOptions(macro_average=True)))
def _serialize_tf_metric(
    metric: tf.keras.metrics.Metric) -> config.MetricConfig:
  """Serializes TF metric."""
  cfg = tf.keras.metrics.serialize(metric)
  return config.MetricConfig(
      class_name=cfg['class_name'], config=json.dumps(cfg['config']))
def testSpecsFromMetrics(self):
  metrics_specs = metric_specs.specs_from_metrics(
      {
          'output_name1': [
              tf.keras.metrics.MeanSquaredError('mse'),
              tf.keras.losses.MeanAbsoluteError(name='mae'),
              calibration.MeanLabel('mean_label')
          ],
          'output_name2': [
              tf.keras.metrics.RootMeanSquaredError('rmse'),
              tf.keras.losses.MeanAbsolutePercentageError(name='mape'),
              calibration.MeanPrediction('mean_prediction')
          ]
      },
      model_names=['model_name1', 'model_name2'],
      binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
      aggregate=config.AggregationOptions(macro_average=True))

  self.assertLen(metrics_specs, 5)
  self.assertProtoEquals(
      metrics_specs[0],
      config.MetricsSpec(metrics=[
          config.MetricConfig(
              class_name='ExampleCount',
              config=json.dumps({'name': 'example_count'})),
      ]))
  self.assertProtoEquals(
      metrics_specs[1],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='WeightedExampleCount',
                  config=json.dumps({'name': 'weighted_example_count'})),
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1']))
  self.assertProtoEquals(
      metrics_specs[2],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='MeanSquaredError',
                  config=json.dumps({
                      'name': 'mse',
                      'dtype': 'float32'
                  }, sort_keys=True)),
              config.MetricConfig(
                  class_name='MeanAbsoluteError',
                  module=metric_specs._TF_LOSSES_MODULE,
                  config=json.dumps({
                      'reduction': 'auto',
                      'name': 'mae'
                  }, sort_keys=True)),
              config.MetricConfig(
                  class_name='MeanLabel',
                  config=json.dumps({'name': 'mean_label'}))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1'],
          binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
          aggregate=config.AggregationOptions(macro_average=True)))
  self.assertProtoEquals(
      metrics_specs[3],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='WeightedExampleCount',
                  config=json.dumps({'name': 'weighted_example_count'})),
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name2']))
  self.assertProtoEquals(
      metrics_specs[4],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='RootMeanSquaredError',
                  config=json.dumps({
                      'name': 'rmse',
                      'dtype': 'float32'
                  }, sort_keys=True)),
              config.MetricConfig(
                  class_name='MeanAbsolutePercentageError',
                  module=metric_specs._TF_LOSSES_MODULE,
                  config=json.dumps({
                      'reduction': 'auto',
                      'name': 'mape'
                  }, sort_keys=True)),
              config.MetricConfig(
                  class_name='MeanPrediction',
                  config=json.dumps({'name': 'mean_prediction'}))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name2'],
          binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
          aggregate=config.AggregationOptions(macro_average=True)))
def testMetricThresholdsFromMetricsSpecs(self):
  metrics_specs = [
      config.MetricsSpec(
          thresholds={
              'auc':
                  config.MetricThreshold(
                      value_threshold=config.GenericValueThreshold()),
              'mean/label':
                  config.MetricThreshold(
                      value_threshold=config.GenericValueThreshold(),
                      change_threshold=config.GenericChangeThreshold()),
              # The mse metric will be overridden by MetricConfig below.
              'mse':
                  config.MetricThreshold(
                      change_threshold=config.GenericChangeThreshold())
          },
          model_names=['model_name'],
          output_names=['output_name']),
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='ExampleCount',
                  config=json.dumps({'name': 'example_count'}),
                  threshold=config.MetricThreshold(
                      value_threshold=config.GenericValueThreshold()))
          ],
          # Model names and output_names should be ignored because
          # ExampleCount is model independent.
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1', 'output_name2']),
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='WeightedExampleCount',
                  config=json.dumps({'name': 'weighted_example_count'}),
                  threshold=config.MetricThreshold(
                      value_threshold=config.GenericValueThreshold()))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1', 'output_name2']),
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='MeanSquaredError',
                  config=json.dumps({'name': 'mse'}),
                  threshold=config.MetricThreshold(
                      change_threshold=config.GenericChangeThreshold())),
              config.MetricConfig(
                  class_name='MeanLabel',
                  config=json.dumps({'name': 'mean_label'}),
                  threshold=config.MetricThreshold(
                      change_threshold=config.GenericChangeThreshold()))
          ],
          model_names=['model_name'],
          output_names=['output_name'],
          binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
          aggregate=config.AggregationOptions(macro_average=True))
  ]

  thresholds = metric_specs.metric_thresholds_from_metrics_specs(metrics_specs)

  self.assertLen(thresholds, 14)
  self.assertIn(
      metric_types.MetricKey(
          name='auc', model_name='model_name', output_name='output_name'),
      thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='mean/label',
          model_name='model_name',
          output_name='output_name',
          is_diff=True), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='mean/label',
          model_name='model_name',
          output_name='output_name',
          is_diff=False), thresholds)
  self.assertIn(metric_types.MetricKey(name='example_count'), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name1',
          output_name='output_name1'), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name1',
          output_name='output_name2'), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name2',
          output_name='output_name1'), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name2',
          output_name='output_name2'), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0),
          is_diff=True), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1),
          is_diff=True), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          is_diff=True), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0),
          is_diff=True), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1),
          is_diff=True), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          is_diff=True), thresholds)
def testWriteValidationResults(self):
  model_dir, baseline_dir = self._getExportDir(), self._getBaselineDir()
  eval_shared_model = self._build_keras_model(model_dir, mul=0)
  baseline_eval_shared_model = self._build_keras_model(baseline_dir, mul=1)
  validations_file = os.path.join(self._getTempDir(),
                                  constants.VALIDATIONS_KEY)
  examples = [
      self._makeExample(
          input=0.0,
          label=1.0,
          example_weight=1.0,
          extra_feature='non_model_feature'),
      self._makeExample(
          input=1.0,
          label=0.0,
          example_weight=0.5,
          extra_feature='non_model_feature'),
  ]

  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(
              name='candidate',
              label_key='label',
              example_weight_key='example_weight'),
          config.ModelSpec(
              name='baseline',
              label_key='label',
              example_weight_key='example_weight',
              is_baseline=True)
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 1.5 < 1, NOT OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              upper_bound={'value': 1}))),
                  config.MetricConfig(
                      class_name='ExampleCount',
                      # 2 > 10, NOT OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              lower_bound={'value': 10}))),
                  config.MetricConfig(
                      class_name='MeanLabel',
                      # 0 > 0 and 0 > 0%?: NOT OK.
                      threshold=config.MetricThreshold(
                          change_threshold=config.GenericChangeThreshold(
                              direction=config.MetricDirection
                              .HIGHER_IS_BETTER,
                              relative={'value': 0},
                              absolute={'value': 0}))),
                  config.MetricConfig(
                      # MeanPrediction = (0+0)/(1+0.5) = 0
                      class_name='MeanPrediction',
                      # -.01 < 0 < .01, OK.
                      # Diff% = -.333/.333 = -100% < -99%, OK.
                      # Diff = 0 - .333 = -.333 < 0, OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              upper_bound={'value': .01},
                              lower_bound={'value': -.01}),
                          change_threshold=config.GenericChangeThreshold(
                              direction=config.MetricDirection
                              .LOWER_IS_BETTER,
                              relative={'value': -.99},
                              absolute={'value': 0})))
              ],
              model_names=['candidate', 'baseline']),
      ],
      options=config.Options(
          disabled_outputs={'values': ['eval_config.json']}),
  )
  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  eval_shared_models = {
      'candidate': eval_shared_model,
      'baseline': baseline_eval_shared_model
  }
  extractors = [
      input_extractor.InputExtractor(eval_config),
      predict_extractor_v2.PredictExtractor(
          eval_shared_model=eval_shared_models, eval_config=eval_config),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_model=eval_shared_models)
  ]
  output_paths = {
      constants.VALIDATIONS_KEY: validations_file,
  }
  writers = [
      metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
          output_paths, add_metrics_callbacks=[])
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    _ = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'ExtractEvaluateAndWriteResults' >>
        model_eval_lib.ExtractEvaluateAndWriteResults(
            eval_config=eval_config,
            eval_shared_model=eval_shared_model,
            extractors=extractors,
            evaluators=evaluators,
            writers=writers))
    # pylint: enable=no-value-for-parameter

  validation_result = model_eval_lib.load_validation_result(
      os.path.dirname(validations_file))

  expected_validations = [
      text_format.Parse(
          """
          metric_key {
            name: "weighted_example_count"
            model_name: "candidate"
          }
          metric_threshold {
            value_threshold {
              upper_bound {
                value: 1.0
              }
            }
          }
          metric_value {
            double_value {
              value: 1.5
            }
          }
          """, validation_result_pb2.ValidationFailure()),
      text_format.Parse(
          """
          metric_key {
            name: "example_count"
          }
          metric_threshold {
            value_threshold {
              lower_bound {
                value: 10.0
              }
            }
          }
          metric_value {
            double_value {
              value: 2.0
            }
          }
          """, validation_result_pb2.ValidationFailure()),
      text_format.Parse(
          """
          metric_key {
            name: "mean_label"
            model_name: "candidate"
            is_diff: true
          }
          metric_threshold {
            change_threshold {
              absolute {
                value: 0.0
              }
              relative {
                value: 0.0
              }
              direction: HIGHER_IS_BETTER
            }
          }
          metric_value {
            double_value {
              value: 0.0
            }
          }
          """, validation_result_pb2.ValidationFailure()),
  ]
  self.assertFalse(validation_result.validation_ok)
  self.assertLen(validation_result.metric_validations_per_slice, 1)
  self.assertCountEqual(
      expected_validations,
      validation_result.metric_validations_per_slice[0].failures)
def testMetricThresholdsFromMetricsSpecs(self):
  slice_specs = [
      config.SlicingSpec(feature_keys=['feature1']),
      config.SlicingSpec(feature_values={'feature2': 'value1'})
  ]
  # For cross slice tests.
  baseline_slice_spec = config.SlicingSpec(feature_keys=['feature3'])
  metrics_specs = [
      config.MetricsSpec(
          thresholds={
              'auc':
                  config.MetricThreshold(
                      value_threshold=config.GenericValueThreshold()),
              'mean/label':
                  config.MetricThreshold(
                      value_threshold=config.GenericValueThreshold(),
                      change_threshold=config.GenericChangeThreshold()),
              'mse':
                  config.MetricThreshold(
                      change_threshold=config.GenericChangeThreshold())
          },
          per_slice_thresholds={
              'auc':
                  config.PerSliceMetricThresholds(thresholds=[
                      config.PerSliceMetricThreshold(
                          slicing_specs=slice_specs,
                          threshold=config.MetricThreshold(
                              value_threshold=config.GenericValueThreshold()))
                  ]),
              'mean/label':
                  config.PerSliceMetricThresholds(thresholds=[
                      config.PerSliceMetricThreshold(
                          slicing_specs=slice_specs,
                          threshold=config.MetricThreshold(
                              value_threshold=config.GenericValueThreshold(),
                              change_threshold=config.GenericChangeThreshold()))
                  ])
          },
          cross_slice_thresholds={
              'auc':
                  config.CrossSliceMetricThresholds(thresholds=[
                      config.CrossSliceMetricThreshold(
                          cross_slicing_specs=[
                              config.CrossSlicingSpec(
                                  baseline_spec=baseline_slice_spec,
                                  slicing_specs=slice_specs)
                          ],
                          threshold=config.MetricThreshold(
                              value_threshold=config.GenericValueThreshold(),
                              change_threshold=config.GenericChangeThreshold()))
                  ]),
              'mse':
                  config.CrossSliceMetricThresholds(thresholds=[
                      config.CrossSliceMetricThreshold(
                          cross_slicing_specs=[
                              config.CrossSlicingSpec(
                                  baseline_spec=baseline_slice_spec,
                                  slicing_specs=slice_specs)
                          ],
                          threshold=config.MetricThreshold(
                              change_threshold=config.GenericChangeThreshold())),
                      # Test for duplicate cross_slicing_spec.
                      config.CrossSliceMetricThreshold(
                          cross_slicing_specs=[
                              config.CrossSlicingSpec(
                                  baseline_spec=baseline_slice_spec,
                                  slicing_specs=slice_specs)
                          ],
                          threshold=config.MetricThreshold(
                              value_threshold=config.GenericValueThreshold()))
                  ])
          },
          model_names=['model_name'],
          output_names=['output_name']),
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='ExampleCount',
                  config=json.dumps({'name': 'example_count'}),
                  threshold=config.MetricThreshold(
                      value_threshold=config.GenericValueThreshold()))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1', 'output_name2']),
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='WeightedExampleCount',
                  config=json.dumps({'name': 'weighted_example_count'}),
                  threshold=config.MetricThreshold(
                      value_threshold=config.GenericValueThreshold()))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1', 'output_name2']),
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='MeanSquaredError',
                  config=json.dumps({'name': 'mse'}),
                  threshold=config.MetricThreshold(
                      change_threshold=config.GenericChangeThreshold())),
              config.MetricConfig(
                  class_name='MeanLabel',
                  config=json.dumps({'name': 'mean_label'}),
                  threshold=config.MetricThreshold(
                      change_threshold=config.GenericChangeThreshold()),
                  per_slice_thresholds=[
                      config.PerSliceMetricThreshold(
                          slicing_specs=slice_specs,
                          threshold=config.MetricThreshold(
                              change_threshold=config.GenericChangeThreshold())),
                  ],
                  cross_slice_thresholds=[
                      config.CrossSliceMetricThreshold(
                          cross_slicing_specs=[
                              config.CrossSlicingSpec(
                                  baseline_spec=baseline_slice_spec,
                                  slicing_specs=slice_specs)
                          ],
                          threshold=config.MetricThreshold(
                              change_threshold=config.GenericChangeThreshold()))
                  ]),
          ],
          model_names=['model_name'],
          output_names=['output_name'],
          binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
          aggregate=config.AggregationOptions(
              macro_average=True, class_weights={
                  0: 1.0,
                  1: 1.0
              }))
  ]

  thresholds = metric_specs.metric_thresholds_from_metrics_specs(metrics_specs)

  expected_keys_and_threshold_counts = {
      metric_types.MetricKey(
          name='auc',
          model_name='model_name',
          output_name='output_name',
          is_diff=False):
          4,
      metric_types.MetricKey(
          name='auc',
          model_name='model_name',
          output_name='output_name',
          is_diff=True):
          1,
      metric_types.MetricKey(
          name='mean/label',
          model_name='model_name',
          output_name='output_name',
          is_diff=True):
          3,
      metric_types.MetricKey(
          name='mean/label',
          model_name='model_name',
          output_name='output_name',
          is_diff=False):
          3,
      metric_types.MetricKey(
          name='example_count',
          model_name='model_name1',
          output_name='output_name1'):
          1,
      metric_types.MetricKey(
          name='example_count',
          model_name='model_name1',
          output_name='output_name2'):
          1,
      metric_types.MetricKey(
          name='example_count',
          model_name='model_name2',
          output_name='output_name1'):
          1,
      metric_types.MetricKey(
          name='example_count',
          model_name='model_name2',
          output_name='output_name2'):
          1,
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name1',
          output_name='output_name1'):
          1,
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name1',
          output_name='output_name2'):
          1,
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name2',
          output_name='output_name1'):
          1,
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name2',
          output_name='output_name2'):
          1,
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0),
          is_diff=True):
          1,
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1),
          is_diff=True):
          1,
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          is_diff=True):
          2,
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          is_diff=False):
          1,
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          aggregation_type=metric_types.AggregationType(macro_average=True),
          is_diff=True):
          1,
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0),
          is_diff=True):
          4,
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1),
          is_diff=True):
          4,
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          aggregation_type=metric_types.AggregationType(macro_average=True),
          is_diff=True):
          4
  }
  self.assertLen(thresholds, len(expected_keys_and_threshold_counts))
  for key, count in expected_keys_and_threshold_counts.items():
    self.assertIn(key, thresholds)
    self.assertLen(thresholds[key], count, 'failed for key {}'.format(key))
def testWriteValidationResults(self, output_file_format):
  model_dir, baseline_dir = self._getExportDir(), self._getBaselineDir()
  eval_shared_model = self._build_keras_model(model_dir, mul=0)
  baseline_eval_shared_model = self._build_keras_model(baseline_dir, mul=1)
  validations_file = os.path.join(self._getTempDir(),
                                  constants.VALIDATIONS_KEY)
  schema = text_format.Parse(
      """
      tensor_representation_group {
        key: ""
        value {
          tensor_representation {
            key: "input"
            value {
              dense_tensor {
                column_name: "input"
                shape { dim { size: 1 } }
              }
            }
          }
        }
      }
      feature {
        name: "input"
        type: FLOAT
      }
      feature {
        name: "label"
        type: FLOAT
      }
      feature {
        name: "example_weight"
        type: FLOAT
      }
      feature {
        name: "extra_feature"
        type: BYTES
      }
      """, schema_pb2.Schema())
  tfx_io = test_util.InMemoryTFExampleRecord(
      schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN)
  tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
      arrow_schema=tfx_io.ArrowSchema(),
      tensor_representations=tfx_io.TensorRepresentations())
  examples = [
      self._makeExample(
          input=0.0,
          label=1.0,
          example_weight=1.0,
          extra_feature='non_model_feature'),
      self._makeExample(
          input=1.0,
          label=0.0,
          example_weight=0.5,
          extra_feature='non_model_feature'),
  ]

  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(
              name='candidate',
              label_key='label',
              example_weight_key='example_weight'),
          config.ModelSpec(
              name='baseline',
              label_key='label',
              example_weight_key='example_weight',
              is_baseline=True)
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 1.5 < 1, NOT OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              upper_bound={'value': 1}))),
                  config.MetricConfig(
                      class_name='ExampleCount',
                      # 2 > 10, NOT OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              lower_bound={'value': 10}))),
                  config.MetricConfig(
                      class_name='MeanLabel',
                      # 0 > 0 and 0 > 0%?: NOT OK.
                      threshold=config.MetricThreshold(
                          change_threshold=config.GenericChangeThreshold(
                              direction=config.MetricDirection
                              .HIGHER_IS_BETTER,
                              relative={'value': 0},
                              absolute={'value': 0}))),
                  config.MetricConfig(
                      # MeanPrediction = (0+0)/(1+0.5) = 0
                      class_name='MeanPrediction',
                      # -.01 < 0 < .01, OK.
                      # Diff% = -.333/.333 = -100% < -99%, OK.
                      # Diff = 0 - .333 = -.333 < 0, OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              upper_bound={'value': .01},
                              lower_bound={'value': -.01}),
                          change_threshold=config.GenericChangeThreshold(
                              direction=config.MetricDirection
                              .LOWER_IS_BETTER,
                              relative={'value': -.99},
                              absolute={'value': 0})))
              ],
              model_names=['candidate', 'baseline']),
      ],
      options=config.Options(
          disabled_outputs={'values': ['eval_config.json']}),
  )
  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  eval_shared_models = {
      'candidate': eval_shared_model,
      'baseline': baseline_eval_shared_model
  }
  extractors = [
      batched_input_extractor.BatchedInputExtractor(eval_config),
      batched_predict_extractor_v2.BatchedPredictExtractor(
          eval_shared_model=eval_shared_models,
          eval_config=eval_config,
          tensor_adapter_config=tensor_adapter_config),
      unbatch_extractor.UnbatchExtractor(),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_model=eval_shared_models)
  ]
  output_paths = {
      constants.VALIDATIONS_KEY: validations_file,
  }
  writers = [
      metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
          output_paths,
          add_metrics_callbacks=[],
          output_file_format=output_file_format)
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    _ = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'BatchExamples' >> tfx_io.BeamSource()
        | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
        | 'ExtractEvaluate' >> model_eval_lib.ExtractAndEvaluate(
            extractors=extractors, evaluators=evaluators)
        | 'WriteResults' >> model_eval_lib.WriteResults(writers=writers))
    # pylint: enable=no-value-for-parameter

  validation_result = (
      metrics_plots_and_validations_writer
      .load_and_deserialize_validation_result(
          os.path.dirname(validations_file)))

  expected_validations = [
      text_format.Parse(
          """
          metric_key {
            name: "weighted_example_count"
            model_name: "candidate"
          }
          metric_threshold {
            value_threshold {
              upper_bound {
                value: 1.0
              }
            }
          }
          metric_value {
            double_value {
              value: 1.5
            }
          }
          """, validation_result_pb2.ValidationFailure()),
      text_format.Parse(
          """
          metric_key {
            name: "example_count"
            model_name: "candidate"
          }
          metric_threshold {
            value_threshold {
              lower_bound {
                value: 10.0
              }
            }
          }
          metric_value {
            double_value {
              value: 2.0
            }
          }
          """, validation_result_pb2.ValidationFailure()),
      text_format.Parse(
          """
          metric_key {
            name: "mean_label"
            model_name: "candidate"
            is_diff: true
          }
          metric_threshold {
            change_threshold {
              absolute {
                value: 0.0
              }
              relative {
                value: 0.0
              }
              direction: HIGHER_IS_BETTER
            }
          }
          metric_value {
            double_value {
              value: 0.0
            }
          }
          """, validation_result_pb2.ValidationFailure()),
  ]
  self.assertFalse(validation_result.validation_ok)
  self.assertLen(validation_result.metric_validations_per_slice, 1)
  self.assertCountEqual(
      expected_validations,
      validation_result.metric_validations_per_slice[0].failures)
def testRunModelAnalysisWithModelAgnosticPredictions(self):
  examples = [
      self._makeExample(
          age=3.0, language='english', label=1.0, prediction=0.9),
      self._makeExample(
          age=3.0, language='chinese', label=0.0, prediction=0.4),
      self._makeExample(
          age=4.0, language='english', label=1.0, prediction=0.7),
      self._makeExample(
          age=5.0, language='chinese', label=1.0, prediction=0.2)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  model_specs = [
      config.ModelSpec(
          prediction_key='prediction',
          label_key='label',
          example_weight_key='age')
  ]
  metrics = [
      config.MetricConfig(class_name='ExampleCount'),
      config.MetricConfig(class_name='WeightedExampleCount'),
      config.MetricConfig(class_name='BinaryAccuracy')
  ]
  slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
  eval_config = config.EvalConfig(
      model_specs=model_specs,
      metrics_specs=[config.MetricsSpec(metrics=metrics)],
      slicing_specs=slicing_specs)
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      data_location=data_location,
      output_path=self._getTempDir())

  expected = {
      (('language', 'chinese'),): {
          'binary_accuracy': {
              'doubleValue': 0.375
          },
          'weighted_example_count': {
              'doubleValue': 8.0
          },
          'example_count': {
              'doubleValue': 2.0
          },
      },
      (('language', 'english'),): {
          'binary_accuracy': {
              'doubleValue': 1.0
          },
          'weighted_example_count': {
              'doubleValue': 7.0
          },
          'example_count': {
              'doubleValue': 2.0
          },
      }
  }
  self.assertEqual(eval_result.data_location, data_location)
  self.assertEqual(eval_result.config.slicing_specs[0],
                   config.SlicingSpec(feature_keys=['language']))
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
def testValidateMetricsMetricTDistributionChangeAndThreshold(
    self, slicing_specs, slice_key):
  threshold = config.MetricThreshold(
      change_threshold=config.GenericChangeThreshold(
          direction=config.MetricDirection.LOWER_IS_BETTER,
          absolute={'value': -1}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
          config.ModelSpec(name='baseline', is_baseline=True)
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='AUC',
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ]),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (
      slice_key,
      {
          metric_types.MetricKey(name='auc', model_name='baseline'):
              types.ValueWithTDistribution(
                  sample_mean=0.91, unsampled_value=0.6),
          # This is the mean of the diff.
          metric_types.MetricKey(name='auc', is_diff=True):
              types.ValueWithTDistribution(
                  sample_mean=0.1, unsampled_value=0.1),
      })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
  expected = text_format.Parse(
      """
      metric_validations_per_slice {
        failures {
          metric_key {
            name: "auc"
            is_diff: true
          }
          metric_value {
            double_value {
              value: 0.1
            }
          }
        }
      }""", validation_result_pb2.ValidationResult())
  expected.metric_validations_per_slice[0].failures[
      0].metric_threshold.CopyFrom(threshold)
  expected.metric_validations_per_slice[0].slice_key.CopyFrom(
      slicer.serialize_slice_key(slice_key))
  for spec in slicing_specs or [None]:
    if (spec is None or
        slicer.SingleSliceSpec(spec=spec).is_slice_applicable(slice_key)):
      slicing_details = expected.validation_details.slicing_details.add()
      if spec is not None:
        slicing_details.slicing_spec.CopyFrom(spec)
      else:
        slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
      slicing_details.num_matching_slices = 1
  self.assertAlmostEqual(result, expected)
def testRunModelAnalysisWithKerasModel(self):
  input_layer = tf.keras.layers.Input(shape=(28 * 28,), name='data')
  output_layer = tf.keras.layers.Dense(
      10, activation=tf.nn.softmax)(
          input_layer)
  model = tf.keras.models.Model(input_layer, output_layer)
  model.compile(
      optimizer=tf.keras.optimizers.Adam(lr=.001),
      loss=tf.keras.losses.categorical_crossentropy)

  features = {'data': [[0.0] * 28 * 28]}
  labels = [[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]]
  example_weights = [1.0]
  dataset = tf.data.Dataset.from_tensor_slices(
      (features, labels, example_weights))
  dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
  model.fit(dataset, steps_per_epoch=1)

  model_location = os.path.join(self._getTempDir(), 'export_dir')
  model.save(model_location, save_format='tf')

  examples = [
      self._makeExample(data=[0.0] * 28 * 28, label=1.0),
      self._makeExample(data=[1.0] * 28 * 28, label=5.0),
      self._makeExample(data=[1.0] * 28 * 28, label=9.0),
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  metrics_spec = config.MetricsSpec()
  for metric in (tf.keras.metrics.AUC(),):
    cfg = tf.keras.utils.serialize_keras_object(metric)
    metrics_spec.metrics.append(
        config.MetricConfig(
            class_name=cfg['class_name'], config=json.dumps(cfg['config'])))
  for class_id in (0, 5, 9):
    metrics_spec.binarize.class_ids.append(class_id)
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[
          config.ModelSpec(location=model_location, label_key='label')
      ],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      metrics_specs=[metrics_spec])
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[
          model_eval_lib.default_eval_shared_model(
              eval_saved_model_path=model_location,
              tags=[tf.saved_model.SERVING])
      ])

  self.assertEqual(eval_result.config.model_specs[0].location, model_location)
  self.assertEqual(eval_result.config.input_data_specs[0].location,
                   data_location)
  self.assertLen(eval_result.slicing_metrics, 1)
  got_slice_key, got_metrics = eval_result.slicing_metrics[0]
  self.assertEqual(got_slice_key, ())
  self.assertIn('', got_metrics)  # output_name
  got_metrics = got_metrics['']
  expected_metrics = {
      'classId:0': {
          'auc': True,
      },
      'classId:5': {
          'auc': True,
      },
      'classId:9': {
          'auc': True,
      },
  }
  for class_id in expected_metrics:
    self.assertIn(class_id, got_metrics)
    for k in expected_metrics[class_id]:
      self.assertIn(k, got_metrics[class_id])