def testToComputations(self): computations = metric_specs.to_computations( metric_specs.specs_from_metrics( { 'output_name': [ tf.keras.metrics.MeanSquaredError('mse'), calibration.MeanLabel('mean_label') ] }, model_names=['model_name'], binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}), aggregate=config.AggregationOptions(macro_average=True)), config.EvalConfig()) keys = [] for m in computations: for k in m.keys: if not k.name.startswith('_'): keys.append(k) self.assertLen(keys, 8) self.assertIn(metric_types.MetricKey(name='example_count'), keys) self.assertIn( metric_types.MetricKey( name='weighted_example_count', model_name='model_name', output_name='output_name'), keys) self.assertIn( metric_types.MetricKey( name='mse', model_name='model_name', output_name='output_name', sub_key=metric_types.SubKey(class_id=0)), keys) self.assertIn( metric_types.MetricKey( name='mse', model_name='model_name', output_name='output_name', sub_key=metric_types.SubKey(class_id=1)), keys) self.assertIn( metric_types.MetricKey( name='mse', model_name='model_name', output_name='output_name'), keys) self.assertIn( metric_types.MetricKey( name='mean_label', model_name='model_name', output_name='output_name', sub_key=metric_types.SubKey(class_id=0)), keys) self.assertIn( metric_types.MetricKey( name='mean_label', model_name='model_name', output_name='output_name', sub_key=metric_types.SubKey(class_id=1)), keys) self.assertIn( metric_types.MetricKey( name='mean_label', model_name='model_name', output_name='output_name'), keys)
def default_multi_class_classification_specs(
    model_names: Optional[List[Text]] = None,
    output_names: Optional[List[Text]] = None,
    output_weights: Optional[Dict[Text, float]] = None,
    binarize: Optional[config.BinarizationOptions] = None,
    aggregate: Optional[config.AggregationOptions] = None,
    sparse: bool = True) -> List[config.MetricsSpec]:
  """Returns default metric specs for multi-class classification problems.

  Args:
    model_names: Optional model names if multi-model evaluation.
    output_names: Optional list of output names (if multi-output model).
    output_weights: Optional output weights for creating overall metric
      aggregated across outputs (if multi-output model). If a weight is not
      provided for an output, its weight defaults to 0.0 (i.e. output ignored).
    binarize: Optional settings for binarizing multi-class/multi-label metrics.
    aggregate: Optional settings for aggregating multi-class/multi-label
      metrics.
    sparse: True if the labels are sparse.
  """
  if sparse:
    metrics = [
        tf.keras.metrics.SparseCategoricalCrossentropy(name='loss'),
        tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
    ]
  else:
    metrics = [
        tf.keras.metrics.CategoricalCrossentropy(name='loss'),
        tf.keras.metrics.CategoricalAccuracy(name='accuracy')
    ]
  metrics.append(
      multi_class_confusion_matrix_plot.MultiClassConfusionMatrixPlot())
  if binarize is not None:
    for top_k in binarize.top_k_list.values:
      metrics.extend([
          tf.keras.metrics.Precision(name='precision', top_k=top_k),
          tf.keras.metrics.Recall(name='recall', top_k=top_k)
      ])
    binarize_without_top_k = config.BinarizationOptions()
    binarize_without_top_k.CopyFrom(binarize)
    binarize_without_top_k.ClearField('top_k_list')
    binarize = binarize_without_top_k
  multi_class_metrics = specs_from_metrics(
      metrics,
      model_names=model_names,
      output_names=output_names,
      output_weights=output_weights)
  if aggregate is None:
    aggregate = config.AggregationOptions(micro_average=True)
  multi_class_metrics.extend(
      default_binary_classification_specs(
          model_names=model_names,
          output_names=output_names,
          output_weights=output_weights,
          binarize=binarize,
          aggregate=aggregate))
  return multi_class_metrics
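# A minimal usage sketch for default_multi_class_classification_specs above.
# The helper name and argument values are illustrative assumptions, not part
# of the original code; it assumes the BinarizationOptions proto in which
# top_k_list is a wrapper message with a repeated `values` field.
def _example_default_multi_class_specs():
  # Precision/Recall are added for each requested top_k, after which
  # top_k_list is cleared before the binarization options are forwarded to
  # the binary classification specs (aggregated with micro_average=True by
  # default).
  return default_multi_class_classification_specs(
      model_names=['candidate'],
      binarize=config.BinarizationOptions(top_k_list={'values': [1, 3]}),
      sparse=True)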
def testToComputationsWithMixedAggregationAndNonAggregationMetrics(self): computations = metric_specs.to_computations([ config.MetricsSpec( metrics=[config.MetricConfig(class_name='CategoricalAccuracy')]), config.MetricsSpec( metrics=[config.MetricConfig(class_name='BinaryCrossentropy')], binarize=config.BinarizationOptions(class_ids={'values': [1]}), aggregate=config.AggregationOptions(micro_average=True)) ], config.EvalConfig()) # 3 separate computations should be used (one for aggregated metrics, one # for non-aggregated metrics, and one for metrics associated with class 1) self.assertLen(computations, 3)
def default_multi_class_classification_specs(
    model_names: Optional[List[Text]] = None,
    output_names: Optional[List[Text]] = None,
    binarize: Optional[config.BinarizationOptions] = None,
    aggregate: Optional[config.AggregationOptions] = None,
    sparse: bool = True) -> List[config.MetricsSpec]:
  """Returns default metric specs for multi-class classification problems.

  Args:
    model_names: Optional model names if multi-model evaluation.
    output_names: Optional list of output names (if multi-output model).
    binarize: Optional settings for binarizing multi-class/multi-label metrics.
    aggregate: Optional settings for aggregating multi-class/multi-label
      metrics.
    sparse: True if the labels are sparse.
  """
  if sparse:
    metrics = [
        tf.keras.metrics.SparseCategoricalCrossentropy(name='loss'),
        tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
    ]
  else:
    metrics = [
        tf.keras.metrics.CategoricalCrossentropy(name='loss'),
        tf.keras.metrics.CategoricalAccuracy(name='accuracy')
    ]
  metrics.append(
      multi_class_confusion_matrix_at_thresholds
      .MultiClassConfusionMatrixAtThresholds(
          name='multi_class_confusion_matrix_at_thresholds'))
  if binarize is not None:
    for top_k in binarize.top_k_list:
      metrics.extend([
          tf.keras.metrics.Precision(name='precision', top_k=top_k),
          tf.keras.metrics.Recall(name='recall', top_k=top_k)
      ])
    # Copy the options before clearing top_k_list so the caller's proto is not
    # mutated (CopyFrom returns None, so it cannot be chained).
    binarize_without_top_k = config.BinarizationOptions()
    binarize_without_top_k.CopyFrom(binarize)
    binarize_without_top_k.ClearField('top_k_list')
    binarize = binarize_without_top_k
  multi_class_metrics = specs_from_metrics(
      metrics, model_names=model_names, output_names=output_names)
  if aggregate is None:
    aggregate = config.AggregationOptions(micro_average=True)
  multi_class_metrics.extend(
      default_binary_classification_specs(
          model_names=model_names,
          output_names=output_names,
          binarize=binarize,
          aggregate=aggregate))
  return multi_class_metrics
def testSpecsFromMetrics(self): metrics_specs = metric_specs.specs_from_metrics( { 'output_name1': [ tf.keras.metrics.MeanSquaredError('mse'), tf.keras.losses.MeanAbsoluteError(name='mae'), calibration.MeanLabel('mean_label') ], 'output_name2': [ tf.keras.metrics.RootMeanSquaredError('rmse'), tf.keras.losses.MeanAbsolutePercentageError(name='mape'), calibration.MeanPrediction('mean_prediction') ] }, model_names=['model_name1', 'model_name2'], binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}), aggregate=config.AggregationOptions(macro_average=True)) self.assertLen(metrics_specs, 5) self.assertProtoEquals( metrics_specs[0], config.MetricsSpec(metrics=[ config.MetricConfig(class_name='ExampleCount', config=json.dumps( {'name': 'example_count'})), ])) self.assertProtoEquals( metrics_specs[1], config.MetricsSpec(metrics=[ config.MetricConfig(class_name='WeightedExampleCount', config=json.dumps( {'name': 'weighted_example_count'})), ], model_names=['model_name1', 'model_name2'], output_names=['output_name1'])) self.assertProtoEquals( metrics_specs[2], config.MetricsSpec(metrics=[ config.MetricConfig(class_name='MeanSquaredError', config=json.dumps( { 'name': 'mse', 'dtype': 'float32' }, sort_keys=True)), config.MetricConfig(class_name='MeanAbsoluteError', module=metric_specs._TF_LOSSES_MODULE, config=json.dumps( { 'reduction': 'auto', 'name': 'mae' }, sort_keys=True)), config.MetricConfig(class_name='MeanLabel', config=json.dumps({'name': 'mean_label'})) ], model_names=['model_name1', 'model_name2'], output_names=['output_name1'], binarize=config.BinarizationOptions( class_ids={'values': [0, 1]}), aggregate=config.AggregationOptions( macro_average=True))) self.assertProtoEquals( metrics_specs[3], config.MetricsSpec(metrics=[ config.MetricConfig(class_name='WeightedExampleCount', config=json.dumps( {'name': 'weighted_example_count'})), ], model_names=['model_name1', 'model_name2'], output_names=['output_name2'])) self.assertProtoEquals( metrics_specs[4], config.MetricsSpec( metrics=[ config.MetricConfig(class_name='RootMeanSquaredError', config=json.dumps( { 'name': 'rmse', 'dtype': 'float32' }, sort_keys=True)), config.MetricConfig( class_name='MeanAbsolutePercentageError', module=metric_specs._TF_LOSSES_MODULE, config=json.dumps({ 'reduction': 'auto', 'name': 'mape' }, sort_keys=True)), config.MetricConfig(class_name='MeanPrediction', config=json.dumps( {'name': 'mean_prediction'})) ], model_names=['model_name1', 'model_name2'], output_names=['output_name2'], binarize=config.BinarizationOptions( class_ids={'values': [0, 1]}), aggregate=config.AggregationOptions(macro_average=True)))
def testMetricThresholdsFromMetricsSpecs(self): metrics_specs = [ config.MetricsSpec( thresholds={ 'auc': config.MetricThreshold( value_threshold=config.GenericValueThreshold()), 'mean/label': config.MetricThreshold( value_threshold=config.GenericValueThreshold(), change_threshold=config.GenericChangeThreshold()), # The mse metric will be overridden by MetricConfig below. 'mse': config.MetricThreshold( change_threshold=config.GenericChangeThreshold()) }, # Model names and output_names should be ignored because # ExampleCount is model independent. model_names=['model_name'], output_names=['output_name']), config.MetricsSpec( metrics=[ config.MetricConfig( class_name='ExampleCount', config=json.dumps({'name': 'example_count'}), threshold=config.MetricThreshold( value_threshold=config.GenericValueThreshold())) ], # Model names and output_names should be ignored because # ExampleCount is model independent. model_names=['model_name1', 'model_name2'], output_names=['output_name1', 'output_name2']), config.MetricsSpec(metrics=[ config.MetricConfig( class_name='WeightedExampleCount', config=json.dumps({'name': 'weighted_example_count'}), threshold=config.MetricThreshold( value_threshold=config.GenericValueThreshold())) ], model_names=['model_name1', 'model_name2'], output_names=['output_name1', 'output_name2']), config.MetricsSpec( metrics=[ config.MetricConfig( class_name='MeanSquaredError', config=json.dumps({'name': 'mse'}), threshold=config.MetricThreshold( change_threshold=config.GenericChangeThreshold())), config.MetricConfig( class_name='MeanLabel', config=json.dumps({'name': 'mean_label'}), threshold=config.MetricThreshold( change_threshold=config.GenericChangeThreshold())) ], model_names=['model_name'], output_names=['output_name'], binarize=config.BinarizationOptions( class_ids={'values': [0, 1]}), aggregate=config.AggregationOptions(macro_average=True)) ] thresholds = metric_specs.metric_thresholds_from_metrics_specs( metrics_specs) self.assertLen(thresholds, 14) self.assertIn( metric_types.MetricKey(name='auc', model_name='model_name', output_name='output_name'), thresholds) self.assertIn( metric_types.MetricKey(name='mean/label', model_name='model_name', output_name='output_name', is_diff=True), thresholds) self.assertIn( metric_types.MetricKey(name='mean/label', model_name='model_name', output_name='output_name', is_diff=False), thresholds) self.assertIn(metric_types.MetricKey(name='example_count'), thresholds) self.assertIn( metric_types.MetricKey(name='weighted_example_count', model_name='model_name1', output_name='output_name1'), thresholds) self.assertIn( metric_types.MetricKey(name='weighted_example_count', model_name='model_name1', output_name='output_name2'), thresholds) self.assertIn( metric_types.MetricKey(name='weighted_example_count', model_name='model_name2', output_name='output_name1'), thresholds) self.assertIn( metric_types.MetricKey(name='weighted_example_count', model_name='model_name2', output_name='output_name2'), thresholds) self.assertIn( metric_types.MetricKey(name='mse', model_name='model_name', output_name='output_name', sub_key=metric_types.SubKey(class_id=0), is_diff=True), thresholds) self.assertIn( metric_types.MetricKey(name='mse', model_name='model_name', output_name='output_name', sub_key=metric_types.SubKey(class_id=1), is_diff=True), thresholds) self.assertIn( metric_types.MetricKey(name='mse', model_name='model_name', output_name='output_name', is_diff=True), thresholds) self.assertIn( metric_types.MetricKey(name='mean_label', 
model_name='model_name', output_name='output_name', sub_key=metric_types.SubKey(class_id=0), is_diff=True), thresholds) self.assertIn( metric_types.MetricKey(name='mean_label', model_name='model_name', output_name='output_name', sub_key=metric_types.SubKey(class_id=1), is_diff=True), thresholds) self.assertIn( metric_types.MetricKey(name='mean_label', model_name='model_name', output_name='output_name', is_diff=True), thresholds)
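# A small hypothetical sketch (helper name chosen for illustration, not part
# of the original tests) of consuming the mapping returned by
# metric_thresholds_from_metrics_specs: it is keyed by metric_types.MetricKey,
# so a threshold can be looked up for an exact model/output/sub_key/is_diff
# combination.
def _example_lookup_threshold(metrics_specs):
  thresholds = metric_specs.metric_thresholds_from_metrics_specs(metrics_specs)
  key = metric_types.MetricKey(
      name='mse',
      model_name='model_name',
      output_name='output_name',
      is_diff=True)
  # Returns None when no threshold was configured for that exact key.
  return thresholds.get(key)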
def testRunModelAnalysisWithQueryBasedMetrics(self): input_layer = tf.keras.layers.Input(shape=(1, ), name='age') output_layer = tf.keras.layers.Dense( 1, activation=tf.nn.sigmoid)(input_layer) model = tf.keras.models.Model(input_layer, output_layer) model.compile(optimizer=tf.keras.optimizers.Adam(lr=.001), loss=tf.keras.losses.binary_crossentropy) features = {'age': [[20.0]]} labels = [[1]] example_weights = [1.0] dataset = tf.data.Dataset.from_tensor_slices( (features, labels, example_weights)) dataset = dataset.shuffle(buffer_size=1).repeat().batch(1) model.fit(dataset, steps_per_epoch=1) model_location = os.path.join(self._getTempDir(), 'export_dir') model.save(model_location, save_format='tf') examples = [ self._makeExample(age=3.0, language='english', label=1.0), self._makeExample(age=5.0, language='chinese', label=0.0), self._makeExample(age=3.0, language='english', label=0.0), self._makeExample(age=5.0, language='chinese', label=1.0) ] data_location = self._writeTFExamplesToTFRecords(examples) slicing_specs = [config.SlicingSpec()] eval_config = config.EvalConfig( input_data_specs=[config.InputDataSpec(location=data_location)], model_specs=[ config.ModelSpec(location=model_location, label_key='label') ], output_data_specs=[ config.OutputDataSpec(default_location=self._getTempDir()) ], slicing_specs=slicing_specs, metrics_specs=metric_specs.specs_from_metrics( [ndcg.NDCG(gain_key='age', name='ndcg')], binarize=config.BinarizationOptions(top_k_list=[1]), query_key='language')) eval_shared_model = model_eval_lib.default_eval_shared_model( eval_saved_model_path=model_location, tags=[tf.saved_model.SERVING]) eval_result = model_eval_lib.run_model_analysis( eval_config=eval_config, eval_shared_models=[eval_shared_model], evaluators=[ metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator( eval_config=eval_config, eval_shared_models=[eval_shared_model]) ]) self.assertEqual(eval_result.config.model_specs[0].location, model_location) self.assertEqual(eval_result.config.input_data_specs[0].location, data_location) self.assertLen(eval_result.slicing_metrics, 1) got_slice_key, got_metrics = eval_result.slicing_metrics[0] self.assertEqual(got_slice_key, ()) self.assertIn('', got_metrics) # output_name got_metrics = got_metrics[''] expected_metrics = { '': { 'example_count': True, 'weighted_example_count': True, }, 'topK:1': { 'ndcg': True, }, } for group in expected_metrics: self.assertIn(group, got_metrics) for k in expected_metrics[group]: self.assertIn(k, got_metrics[group])
def testSpecsFromMetrics(self): metrics_specs = metric_specs.specs_from_metrics( { 'output_name1': [ tf.keras.metrics.MeanSquaredError('mse'), calibration.MeanLabel('mean_label') ], 'output_name2': [ tf.keras.metrics.RootMeanSquaredError('rmse'), calibration.MeanPrediction('mean_prediction') ] }, model_names=['model_name1', 'model_name2'], binarize=config.BinarizationOptions(class_ids=[0, 1]), aggregate=config.AggregationOptions(macro_average=True)) self.assertLen(metrics_specs, 5) self.assertProtoEquals( metrics_specs[0], config.MetricsSpec(metrics=[ config.MetricConfig(class_name='ExampleCount', config=json.dumps( {'name': 'example_count'})), ])) self.assertProtoEquals( metrics_specs[1], config.MetricsSpec(metrics=[ config.MetricConfig(class_name='WeightedExampleCount', config=json.dumps( {'name': 'weighted_example_count'})), ], model_names=['model_name1', 'model_name2'], output_names=['output_name1'])) self.assertProtoEquals( metrics_specs[2], config.MetricsSpec( metrics=[ config.MetricConfig(class_name='MeanSquaredError', config=json.dumps({ 'name': 'mse', 'dtype': 'float32' })), config.MetricConfig(class_name='MeanLabel', config=json.dumps( {'name': 'mean_label'})) ], model_names=['model_name1', 'model_name2'], output_names=['output_name1'], binarize=config.BinarizationOptions(class_ids=[0, 1]), aggregate=config.AggregationOptions(macro_average=True))) self.assertProtoEquals( metrics_specs[3], config.MetricsSpec(metrics=[ config.MetricConfig(class_name='WeightedExampleCount', config=json.dumps( {'name': 'weighted_example_count'})), ], model_names=['model_name1', 'model_name2'], output_names=['output_name2'])) self.assertProtoEquals( metrics_specs[4], config.MetricsSpec( metrics=[ config.MetricConfig(class_name='RootMeanSquaredError', config=json.dumps({ 'name': 'rmse', 'dtype': 'float32' })), config.MetricConfig(class_name='MeanPrediction', config=json.dumps( {'name': 'mean_prediction'})) ], model_names=['model_name1', 'model_name2'], output_names=['output_name2'], binarize=config.BinarizationOptions(class_ids=[0, 1]), aggregate=config.AggregationOptions(macro_average=True)))
def testToComputations(self): computations = metric_specs.to_computations( metric_specs.specs_from_metrics( { 'output_name': [ tf.keras.metrics.MeanSquaredError('mse'), # Add a loss exactly same as metric # (https://github.com/tensorflow/tfx/issues/1550) tf.keras.losses.MeanSquaredError(name='loss'), calibration.MeanLabel('mean_label') ] }, model_names=['model_name'], binarize=config.BinarizationOptions( class_ids={'values': [0, 1]}), aggregate=config.AggregationOptions(macro_average=True, class_weights={ 0: 1.0, 1: 1.0 })), config.EvalConfig()) keys = [] for m in computations: for k in m.keys: if not k.name.startswith('_'): keys.append(k) self.assertLen(keys, 11) self.assertIn( metric_types.MetricKey(name='example_count', model_name='model_name'), keys) self.assertIn( metric_types.MetricKey(name='weighted_example_count', model_name='model_name', output_name='output_name'), keys) self.assertIn( metric_types.MetricKey(name='mse', model_name='model_name', output_name='output_name', sub_key=metric_types.SubKey(class_id=0)), keys) self.assertIn( metric_types.MetricKey(name='mse', model_name='model_name', output_name='output_name', sub_key=metric_types.SubKey(class_id=1)), keys) self.assertIn( metric_types.MetricKey(name='mse', model_name='model_name', output_name='output_name'), keys) self.assertIn( metric_types.MetricKey(name='loss', model_name='model_name', output_name='output_name', sub_key=metric_types.SubKey(class_id=0)), keys) self.assertIn( metric_types.MetricKey(name='loss', model_name='model_name', output_name='output_name', sub_key=metric_types.SubKey(class_id=1)), keys) self.assertIn( metric_types.MetricKey(name='loss', model_name='model_name', output_name='output_name'), keys) self.assertIn( metric_types.MetricKey(name='mean_label', model_name='model_name', output_name='output_name', sub_key=metric_types.SubKey(class_id=0)), keys) self.assertIn( metric_types.MetricKey(name='mean_label', model_name='model_name', output_name='output_name', sub_key=metric_types.SubKey(class_id=1)), keys) self.assertIn( metric_types.MetricKey(name='mean_label', model_name='model_name', output_name='output_name'), keys)
def testMetricThresholdsFromMetricsSpecs(self): slice_specs = [ config.SlicingSpec(feature_keys=['feature1']), config.SlicingSpec(feature_values={'feature2': 'value1'}) ] # For cross slice tests. baseline_slice_spec = config.SlicingSpec(feature_keys=['feature3']) metrics_specs = [ config.MetricsSpec( thresholds={ 'auc': config.MetricThreshold( value_threshold=config.GenericValueThreshold()), 'mean/label': config.MetricThreshold( value_threshold=config.GenericValueThreshold(), change_threshold=config.GenericChangeThreshold()), 'mse': config.MetricThreshold( change_threshold=config.GenericChangeThreshold()) }, per_slice_thresholds={ 'auc': config.PerSliceMetricThresholds(thresholds=[ config.PerSliceMetricThreshold( slicing_specs=slice_specs, threshold=config.MetricThreshold( value_threshold=config.GenericValueThreshold( ))) ]), 'mean/label': config.PerSliceMetricThresholds(thresholds=[ config.PerSliceMetricThreshold( slicing_specs=slice_specs, threshold=config.MetricThreshold( value_threshold=config.GenericValueThreshold(), change_threshold=config.GenericChangeThreshold( ))) ]) }, cross_slice_thresholds={ 'auc': config.CrossSliceMetricThresholds(thresholds=[ config.CrossSliceMetricThreshold( cross_slicing_specs=[ config.CrossSlicingSpec( baseline_spec=baseline_slice_spec, slicing_specs=slice_specs) ], threshold=config.MetricThreshold( value_threshold=config.GenericValueThreshold(), change_threshold=config.GenericChangeThreshold( ))) ]), 'mse': config.CrossSliceMetricThresholds(thresholds=[ config.CrossSliceMetricThreshold( cross_slicing_specs=[ config.CrossSlicingSpec( baseline_spec=baseline_slice_spec, slicing_specs=slice_specs) ], threshold=config.MetricThreshold( change_threshold=config.GenericChangeThreshold( ))), # Test for duplicate cross_slicing_spec. 
config.CrossSliceMetricThreshold( cross_slicing_specs=[ config.CrossSlicingSpec( baseline_spec=baseline_slice_spec, slicing_specs=slice_specs) ], threshold=config.MetricThreshold( value_threshold=config.GenericValueThreshold()) ) ]) }, model_names=['model_name'], output_names=['output_name']), config.MetricsSpec(metrics=[ config.MetricConfig( class_name='ExampleCount', config=json.dumps({'name': 'example_count'}), threshold=config.MetricThreshold( value_threshold=config.GenericValueThreshold())) ], model_names=['model_name1', 'model_name2'], output_names=['output_name1', 'output_name2']), config.MetricsSpec(metrics=[ config.MetricConfig( class_name='WeightedExampleCount', config=json.dumps({'name': 'weighted_example_count'}), threshold=config.MetricThreshold( value_threshold=config.GenericValueThreshold())) ], model_names=['model_name1', 'model_name2'], output_names=['output_name1', 'output_name2']), config.MetricsSpec(metrics=[ config.MetricConfig( class_name='MeanSquaredError', config=json.dumps({'name': 'mse'}), threshold=config.MetricThreshold( change_threshold=config.GenericChangeThreshold())), config.MetricConfig( class_name='MeanLabel', config=json.dumps({'name': 'mean_label'}), threshold=config.MetricThreshold( change_threshold=config.GenericChangeThreshold()), per_slice_thresholds=[ config.PerSliceMetricThreshold( slicing_specs=slice_specs, threshold=config.MetricThreshold( change_threshold=config.GenericChangeThreshold( ))), ], cross_slice_thresholds=[ config.CrossSliceMetricThreshold( cross_slicing_specs=[ config.CrossSlicingSpec( baseline_spec=baseline_slice_spec, slicing_specs=slice_specs) ], threshold=config.MetricThreshold( change_threshold=config.GenericChangeThreshold( ))) ]), ], model_names=['model_name'], output_names=['output_name'], binarize=config.BinarizationOptions( class_ids={'values': [0, 1]}), aggregate=config.AggregationOptions( macro_average=True, class_weights={ 0: 1.0, 1: 1.0 })) ] thresholds = metric_specs.metric_thresholds_from_metrics_specs( metrics_specs) expected_keys_and_threshold_counts = { metric_types.MetricKey(name='auc', model_name='model_name', output_name='output_name', is_diff=False): 4, metric_types.MetricKey(name='auc', model_name='model_name', output_name='output_name', is_diff=True): 1, metric_types.MetricKey(name='mean/label', model_name='model_name', output_name='output_name', is_diff=True): 3, metric_types.MetricKey(name='mean/label', model_name='model_name', output_name='output_name', is_diff=False): 3, metric_types.MetricKey(name='example_count', model_name='model_name1', output_name='output_name1'): 1, metric_types.MetricKey(name='example_count', model_name='model_name1', output_name='output_name2'): 1, metric_types.MetricKey(name='example_count', model_name='model_name2', output_name='output_name1'): 1, metric_types.MetricKey(name='example_count', model_name='model_name2', output_name='output_name2'): 1, metric_types.MetricKey(name='weighted_example_count', model_name='model_name1', output_name='output_name1'): 1, metric_types.MetricKey(name='weighted_example_count', model_name='model_name1', output_name='output_name2'): 1, metric_types.MetricKey(name='weighted_example_count', model_name='model_name2', output_name='output_name1'): 1, metric_types.MetricKey(name='weighted_example_count', model_name='model_name2', output_name='output_name2'): 1, metric_types.MetricKey(name='mse', model_name='model_name', output_name='output_name', sub_key=metric_types.SubKey(class_id=0), is_diff=True): 1, metric_types.MetricKey(name='mse', 
model_name='model_name', output_name='output_name', sub_key=metric_types.SubKey(class_id=1), is_diff=True): 1, metric_types.MetricKey(name='mse', model_name='model_name', output_name='output_name', is_diff=True): 2, metric_types.MetricKey(name='mse', model_name='model_name', output_name='output_name', is_diff=False): 1, metric_types.MetricKey(name='mse', model_name='model_name', output_name='output_name', aggregation_type=metric_types.AggregationType(macro_average=True), is_diff=True): 1, metric_types.MetricKey(name='mean_label', model_name='model_name', output_name='output_name', sub_key=metric_types.SubKey(class_id=0), is_diff=True): 4, metric_types.MetricKey(name='mean_label', model_name='model_name', output_name='output_name', sub_key=metric_types.SubKey(class_id=1), is_diff=True): 4, metric_types.MetricKey(name='mean_label', model_name='model_name', output_name='output_name', aggregation_type=metric_types.AggregationType(macro_average=True), is_diff=True): 4 } self.assertLen(thresholds, len(expected_keys_and_threshold_counts)) for key, count in expected_keys_and_threshold_counts.items(): self.assertIn(key, thresholds) self.assertLen(thresholds[key], count, 'failed for key {}'.format(key))
def specs_from_metrics(
    metrics: Union[List[_TFOrTFMAMetric], Dict[Text, List[_TFOrTFMAMetric]]],
    model_names: Optional[List[Text]] = None,
    output_names: Optional[List[Text]] = None,
    class_ids: Optional[List[int]] = None,
    k_list: Optional[List[int]] = None,
    top_k_list: Optional[List[int]] = None,
    query_key: Optional[Text] = None,
    include_example_count: Optional[bool] = None,
    include_weighted_example_count: Optional[bool] = None
) -> List[config.MetricsSpec]:
  """Returns specs from tf.keras.metrics.Metric or tfma.metrics.Metric classes.

  Examples:

    metrics_specs = specs_from_metrics([
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.AUC(),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tfma.metrics.MeanLabel(),
        tfma.metrics.MeanPrediction()
        ...
    ])

    metrics_specs = specs_from_metrics({
      'output1': [
          tf.keras.metrics.BinaryAccuracy(),
          tf.keras.metrics.AUC(),
          tfma.metrics.MeanLabel(),
          tfma.metrics.MeanPrediction()
          ...
      ],
      'output2': [
          tf.keras.metrics.Precision(),
          tf.keras.metrics.Recall(),
      ]
    })

  Args:
    metrics: List of tf.keras.metrics.Metric or tfma.metrics.Metric. For
      multi-output models a dict of metric lists keyed by output_name may be
      passed instead.
    model_names: Optional model names (if multi-model evaluation).
    output_names: Optional output names (if multi-output models). If the
      metrics are a dict this should not be set.
    class_ids: Optional class IDs to compute metrics for particular classes of
      a multi-class model. If output_names are provided, all outputs are
      assumed to use the same class IDs.
    k_list: Optional list of k values to compute metrics for the kth predicted
      values in a multi-class model prediction. If output_names are provided,
      all outputs are assumed to use the same k value.
    top_k_list: Optional list of top_k values to compute metrics for the top k
      predicted values in a multi-class model prediction. If output_names are
      provided, all outputs are assumed to use the same top_k value. Metrics
      and plots will be based on treating each predicted value in the top_k as
      though they were separate predictions.
    query_key: Optional query key for query/ranking based metrics.
    include_example_count: True to add example_count metric. Default is True.
    include_weighted_example_count: True to add weighted_example_count metric.
      Default is True. A weighted example count will be added per output for
      multi-output models.
  """
  if isinstance(metrics, dict) and output_names:
    raise ValueError('metrics cannot be a dict when output_names is used: '
                     'metrics={}, output_names={}'.format(
                         metrics, output_names))
  if isinstance(metrics, dict):
    specs = []
    for output_name in sorted(metrics.keys()):
      specs.extend(
          specs_from_metrics(
              metrics[output_name],
              model_names=model_names,
              output_names=[output_name],
              class_ids=class_ids,
              k_list=k_list,
              top_k_list=top_k_list,
              include_example_count=include_example_count,
              include_weighted_example_count=include_weighted_example_count))
      # Only add the model-independent example_count once (for the first
      # output); weighted_example_count is still added per output.
      include_example_count = False
    return specs

  if include_example_count is None:
    include_example_count = True
  if include_weighted_example_count is None:
    include_weighted_example_count = True

  # Add the computations for the example counts and weights since they are
  # independent of the model and class ID.
  specs = example_count_specs(
      model_names=model_names,
      output_names=output_names,
      include_example_count=include_example_count,
      include_weighted_example_count=include_weighted_example_count)

  metric_configs = []
  for metric in metrics:
    if isinstance(metric, tf.keras.metrics.Metric):
      metric_configs.append(_serialize_tf_metric(metric))
    else:
      metric_configs.append(_serialize_tfma_metric(metric))
  if class_ids:
    specs.append(
        config.MetricsSpec(
            metrics=metric_configs,
            model_names=model_names,
            output_names=output_names,
            binarize=config.BinarizationOptions(class_ids=class_ids),
            query_key=query_key))
  if k_list:
    specs.append(
        config.MetricsSpec(
            metrics=metric_configs,
            model_names=model_names,
            output_names=output_names,
            binarize=config.BinarizationOptions(k_list=k_list),
            query_key=query_key))
  if top_k_list:
    specs.append(
        config.MetricsSpec(
            metrics=metric_configs,
            model_names=model_names,
            output_names=output_names,
            binarize=config.BinarizationOptions(top_k_list=top_k_list),
            query_key=query_key))
  if not class_ids and not k_list and not top_k_list:
    specs.append(
        config.MetricsSpec(
            metrics=metric_configs,
            model_names=model_names,
            output_names=output_names,
            query_key=query_key))

  return specs
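# A hypothetical usage sketch for specs_from_metrics above (helper name and
# argument values are assumptions for illustration). With class_ids set, the
# result contains the example-count/weighted-example-count specs plus a
# MetricsSpec whose BinarizationOptions binarizes the metrics for classes 0
# and 1.
def _example_specs_from_metrics():
  return specs_from_metrics(
      [tf.keras.metrics.AUC(name='auc'),
       tf.keras.metrics.Precision(name='precision')],
      model_names=['candidate'],
      class_ids=[0, 1])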
def testEvaluateWithQueryBasedMetrics(self): temp_export_dir = self._getExportDir() _, export_dir = (fixed_prediction_estimator_extra_fields. simple_fixed_prediction_estimator_extra_fields( None, temp_export_dir)) eval_config = config.EvalConfig( model_specs=[ config.ModelSpec(location=export_dir, label_key='label', example_weight_key='fixed_int') ], slicing_specs=[ config.SlicingSpec(), config.SlicingSpec(feature_keys=['fixed_string']), ], metrics_specs=metric_specs.specs_from_metrics( [ndcg.NDCG(gain_key='fixed_float', name='ndcg')], binarize=config.BinarizationOptions(top_k_list=[1, 2]), query_key='fixed_string')) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING]) slice_spec = [ slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs ] extractors = [ input_extractor.InputExtractor(eval_config=eval_config), predict_extractor_v2.PredictExtractor( eval_config=eval_config, eval_shared_models=[eval_shared_model]), slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec) ] evaluators = [ metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator( eval_config=eval_config, eval_shared_models=[eval_shared_model]) ] # fixed_string used as query_key # fixed_float used as gain_key for NDCG # fixed_int used as example_weight_key for NDCG examples = [ self._makeExample(prediction=0.2, label=1.0, fixed_float=1.0, fixed_string='query1', fixed_int=1), self._makeExample(prediction=0.8, label=0.0, fixed_float=0.5, fixed_string='query1', fixed_int=1), self._makeExample(prediction=0.5, label=0.0, fixed_float=0.5, fixed_string='query2', fixed_int=2), self._makeExample(prediction=0.9, label=1.0, fixed_float=1.0, fixed_string='query2', fixed_int=2), self._makeExample(prediction=0.1, label=0.0, fixed_float=0.1, fixed_string='query2', fixed_int=2), self._makeExample(prediction=0.9, label=1.0, fixed_float=1.0, fixed_string='query3', fixed_int=3) ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter metrics = ( pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples]) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate( extractors=extractors, evaluators=evaluators)) # pylint: enable=no-value-for-parameter def check_metrics(got): try: self.assertLen(got, 4) slices = {} for slice_key, value in got: slices[slice_key] = value overall_slice = () query1_slice = (('fixed_string', b'query1'), ) query2_slice = (('fixed_string', b'query2'), ) query3_slice = (('fixed_string', b'query3'), ) self.assertCountEqual(list(slices.keys()), [ overall_slice, query1_slice, query2_slice, query3_slice ]) example_count_key = metric_types.MetricKey( name='example_count') weighted_example_count_key = metric_types.MetricKey( name='weighted_example_count') ndcg1_key = metric_types.MetricKey( name='ndcg', sub_key=metric_types.SubKey(top_k=1)) ndcg2_key = metric_types.MetricKey( name='ndcg', sub_key=metric_types.SubKey(top_k=2)) # Query1 (weight=1): (p=0.8, g=0.5) (p=0.2, g=1.0) # Query2 (weight=2): (p=0.9, g=1.0) (p=0.5, g=0.5) (p=0.1, g=0.1) # Query3 (weight=3): (p=0.9, g=1.0) # # DCG@1: 0.5, 1.0, 1.0 # NDCG@1: 0.5, 1.0, 1.0 # Average NDCG@1: (1 * 0.5 + 2 * 1.0 + 3 * 1.0) / (1 + 2 + 3) ~ 0.92 # # DCG@2: (0.5 + 1.0/log(3) ~ 0.630930 # (1.0 + 0.5/log(3) ~ 1.315465 # 1.0 # NDCG@2: (0.5 + 1.0/log(3)) / (1.0 + 0.5/log(3)) ~ 0.85972 # (1.0 + 0.5/log(3)) / (1.0 + 0.5/log(3)) = 1.0 # 1.0 # Average NDCG@2: (1 * 0.860 + 2 * 1.0 + 3 * 1.0) / (1 + 2 + 3) ~ 0.97 
self.assertDictElementsAlmostEqual( slices[overall_slice], { example_count_key: 6, weighted_example_count_key: 11.0, ndcg1_key: 0.9166667, ndcg2_key: 0.9766198 }) self.assertDictElementsAlmostEqual( slices[query1_slice], { example_count_key: 2, weighted_example_count_key: 2.0, ndcg1_key: 0.5, ndcg2_key: 0.85972 }) self.assertDictElementsAlmostEqual( slices[query2_slice], { example_count_key: 3, weighted_example_count_key: 6.0, ndcg1_key: 1.0, ndcg2_key: 1.0 }) self.assertDictElementsAlmostEqual( slices[query3_slice], { example_count_key: 1, weighted_example_count_key: 3.0, ndcg1_key: 1.0, ndcg2_key: 1.0 }) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(metrics[constants.METRICS_KEY], check_metrics, label='metrics')
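# A standalone sketch (not part of the original test) that reproduces the
# NDCG arithmetic from the comment above, assuming the standard formulation
# DCG@k = sum_i gain_i / log2(i + 1) over the top-k ranked predictions and
# NDCG@k = DCG@k / ideal-DCG@k, with each query weighted by its example
# weight (fixed_int).
def _expected_ndcg(k):
  import math
  # (prediction, gain) pairs per query, with the query's example weight,
  # mirroring the examples created in the test above.
  queries = {
      'query1': ([(0.2, 1.0), (0.8, 0.5)], 1.0),
      'query2': ([(0.5, 0.5), (0.9, 1.0), (0.1, 0.1)], 2.0),
      'query3': ([(0.9, 1.0)], 3.0),
  }

  def dcg(gains, k):
    return sum(g / math.log2(i + 2) for i, g in enumerate(gains[:k]))

  total, total_weight = 0.0, 0.0
  for docs, weight in queries.values():
    ranked = [g for _, g in sorted(docs, key=lambda d: d[0], reverse=True)]
    ideal = sorted(ranked, reverse=True)
    total += weight * dcg(ranked, k) / dcg(ideal, k)
    total_weight += weight
  return total / total_weight

# _expected_ndcg(1) ~ 0.9166667 and _expected_ndcg(2) ~ 0.9766198, matching
# the overall-slice values asserted above (per-query DCG@2 values are
# ~1.13093, ~1.31546 and 1.0).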
def testEvaluateWithMultiClassModel(self): n_classes = 3 temp_export_dir = self._getExportDir() _, export_dir = dnn_classifier.simple_dnn_classifier( None, temp_export_dir, n_classes=n_classes) # Add example_count and weighted_example_count eval_config = config.EvalConfig( model_specs=[ config.ModelSpec(location=export_dir, label_key='label', example_weight_key='age') ], slicing_specs=[config.SlicingSpec()], metrics_specs=metric_specs.specs_from_metrics( [calibration.MeanLabel('mean_label')], binarize=config.BinarizationOptions( class_ids=range(n_classes)))) eval_shared_model = self.createTestEvalSharedModel( eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING]) slice_spec = [ slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs ] extractors = [ input_extractor.InputExtractor(eval_config=eval_config), predict_extractor_v2.PredictExtractor( eval_config=eval_config, eval_shared_models=[eval_shared_model]), slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec) ] evaluators = [ metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator( eval_config=eval_config, eval_shared_models=[eval_shared_model]) ] examples = [ self._makeExample(age=1.0, language='english', label=0), self._makeExample(age=2.0, language='chinese', label=1), self._makeExample(age=3.0, language='english', label=2), self._makeExample(age=4.0, language='chinese', label=1), ] with beam.Pipeline() as pipeline: # pylint: disable=no-value-for-parameter metrics = ( pipeline | 'Create' >> beam.Create( [e.SerializeToString() for e in examples]) | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts() | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate( extractors=extractors, evaluators=evaluators)) # pylint: enable=no-value-for-parameter def check_metrics(got): try: self.assertLen(got, 1) got_slice_key, got_metrics = got[0] example_count_key = metric_types.MetricKey( name='example_count') weighted_example_count_key = metric_types.MetricKey( name='weighted_example_count') label_key_class_0 = metric_types.MetricKey( name='mean_label', sub_key=metric_types.SubKey(class_id=0)) label_key_class_1 = metric_types.MetricKey( name='mean_label', sub_key=metric_types.SubKey(class_id=1)) label_key_class_2 = metric_types.MetricKey( name='mean_label', sub_key=metric_types.SubKey(class_id=2)) self.assertEqual(got_slice_key, ()) self.assertDictElementsAlmostEqual( got_metrics, { example_count_key: 4, weighted_example_count_key: (1.0 + 2.0 + 3.0 + 4.0), label_key_class_0: (1 * 1.0 + 0 * 2.0 + 0 * 3.0 + 0 * 4.0) / (1.0 + 2.0 + 3.0 + 4.0), label_key_class_1: (0 * 1.0 + 1 * 2.0 + 0 * 3.0 + 1 * 4.0) / (1.0 + 2.0 + 3.0 + 4.0), label_key_class_2: (0 * 1.0 + 0 * 2.0 + 1 * 3.0 + 0 * 4.0) / (1.0 + 2.0 + 3.0 + 4.0) }) except AssertionError as err: raise util.BeamAssertException(err) util.assert_that(metrics[constants.METRICS_KEY], check_metrics, label='metrics')
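# A small sketch (helper name assumed, not part of the original test) checking
# the weighted mean_label values asserted above: with example weights taken
# from 'age', mean_label for class c is sum(weight * 1[label == c]) /
# sum(weight).
def _expected_mean_labels():
  ages = [1.0, 2.0, 3.0, 4.0]
  labels = [0, 1, 2, 1]
  total_weight = sum(ages)  # 10.0, the weighted_example_count
  return {
      c: sum(w for w, l in zip(ages, labels) if l == c) / total_weight
      for c in range(3)
  }

# i.e. mean_label is 0.1, 0.6 and 0.3 for classes 0, 1 and 2 respectively.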