def testToComputations(self):
  computations = metric_specs.to_computations(
      metric_specs.specs_from_metrics(
          {
              'output_name': [
                  tf.keras.metrics.MeanSquaredError('mse'),
                  calibration.MeanLabel('mean_label')
              ]
          },
          model_names=['model_name'],
          binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
          aggregate=config.AggregationOptions(macro_average=True)),
      config.EvalConfig())

  keys = []
  for m in computations:
    for k in m.keys:
      if not k.name.startswith('_'):
        keys.append(k)
  self.assertLen(keys, 8)
  self.assertIn(metric_types.MetricKey(name='example_count'), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name',
          output_name='output_name'), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name'), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name'), keys)
def default_multi_class_classification_specs(
    model_names: Optional[List[Text]] = None,
    output_names: Optional[List[Text]] = None,
    output_weights: Optional[Dict[Text, float]] = None,
    binarize: Optional[config.BinarizationOptions] = None,
    aggregate: Optional[config.AggregationOptions] = None,
    sparse: bool = True) -> List[config.MetricsSpec]:
  """Returns default metric specs for multi-class classification problems.

  Args:
    model_names: Optional model names if multi-model evaluation.
    output_names: Optional list of output names (if multi-output model).
    output_weights: Optional output weights for creating overall metric
      aggregated across outputs (if multi-output model). If a weight is not
      provided for an output, its weight defaults to 0.0 (i.e. output
      ignored).
    binarize: Optional settings for binarizing multi-class/multi-label
      metrics.
    aggregate: Optional settings for aggregating multi-class/multi-label
      metrics.
    sparse: True if the labels are sparse.
  """
  if sparse:
    metrics = [
        tf.keras.metrics.SparseCategoricalCrossentropy(name='loss'),
        tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
    ]
  else:
    metrics = [
        tf.keras.metrics.CategoricalCrossentropy(name='loss'),
        tf.keras.metrics.CategoricalAccuracy(name='accuracy')
    ]
  metrics.append(
      multi_class_confusion_matrix_plot.MultiClassConfusionMatrixPlot())
  if binarize is not None:
    for top_k in binarize.top_k_list.values:
      metrics.extend([
          tf.keras.metrics.Precision(name='precision', top_k=top_k),
          tf.keras.metrics.Recall(name='recall', top_k=top_k)
      ])
    binarize_without_top_k = config.BinarizationOptions()
    binarize_without_top_k.CopyFrom(binarize)
    binarize_without_top_k.ClearField('top_k_list')
    binarize = binarize_without_top_k
  multi_class_metrics = specs_from_metrics(
      metrics,
      model_names=model_names,
      output_names=output_names,
      output_weights=output_weights)
  if aggregate is None:
    aggregate = config.AggregationOptions(micro_average=True)
  multi_class_metrics.extend(
      default_binary_classification_specs(
          model_names=model_names,
          output_names=output_names,
          output_weights=output_weights,
          binarize=binarize,
          aggregate=aggregate))
  return multi_class_metrics
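# Illustrative usage sketch (not part of the library code): one way the helper
# above might be called for a single-model evaluation. The model name, class
# ids, and aggregation settings below are made-up example values, not required
# defaults.
example_specs = default_multi_class_classification_specs(
    model_names=['candidate_model'],
    binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
    aggregate=config.AggregationOptions(micro_average=True),
    sparse=True)
# The result is a list of config.MetricsSpec protos (loss, accuracy, the
# multi-class confusion matrix plot, plus binarized binary-classification
# metrics) that can be placed in an EvalConfig's metrics_specs field.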
def testToComputationsWithMixedAggregationAndNonAggregationMetrics(self):
  computations = metric_specs.to_computations([
      config.MetricsSpec(
          metrics=[config.MetricConfig(class_name='CategoricalAccuracy')]),
      config.MetricsSpec(
          metrics=[config.MetricConfig(class_name='BinaryCrossentropy')],
          binarize=config.BinarizationOptions(class_ids={'values': [1]}),
          aggregate=config.AggregationOptions(micro_average=True))
  ], config.EvalConfig())

  # 3 separate computations should be used (one for aggregated metrics, one
  # for non-aggregated metrics, and one for metrics associated with class 1).
  self.assertLen(computations, 3)
def default_multi_class_classification_specs(
    model_names: Optional[List[Text]] = None,
    output_names: Optional[List[Text]] = None,
    binarize: Optional[config.BinarizationOptions] = None,
    aggregate: Optional[config.AggregationOptions] = None,
    sparse: bool = True) -> List[config.MetricsSpec]:
  """Returns default metric specs for multi-class classification problems.

  Args:
    model_names: Optional model names if multi-model evaluation.
    output_names: Optional list of output names (if multi-output model).
    binarize: Optional settings for binarizing multi-class/multi-label
      metrics.
    aggregate: Optional settings for aggregating multi-class/multi-label
      metrics.
    sparse: True if the labels are sparse.
  """
  if sparse:
    metrics = [
        tf.keras.metrics.SparseCategoricalCrossentropy(name='loss'),
        tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
    ]
  else:
    metrics = [
        tf.keras.metrics.CategoricalCrossentropy(name='loss'),
        tf.keras.metrics.CategoricalAccuracy(name='accuracy')
    ]
  metrics.append(
      multi_class_confusion_matrix_at_thresholds
      .MultiClassConfusionMatrixAtThresholds(
          name='multi_class_confusion_matrix_at_thresholds'))
  if binarize is not None:
    for top_k in binarize.top_k_list:
      metrics.extend([
          tf.keras.metrics.Precision(name='precision', top_k=top_k),
          tf.keras.metrics.Recall(name='recall', top_k=top_k)
      ])
    # Copy the binarization options without the top_k settings (CopyFrom
    # returns None, so the copy must be made on a separate instance).
    binarize_without_top_k = config.BinarizationOptions()
    binarize_without_top_k.CopyFrom(binarize)
    binarize_without_top_k.ClearField('top_k_list')
    binarize = binarize_without_top_k
  multi_class_metrics = specs_from_metrics(
      metrics, model_names=model_names, output_names=output_names)
  if aggregate is None:
    aggregate = config.AggregationOptions(micro_average=True)
  multi_class_metrics.extend(
      default_binary_classification_specs(
          model_names=model_names,
          output_names=output_names,
          binarize=binarize,
          aggregate=aggregate))
  return multi_class_metrics
def testSpecsFromMetrics(self):
  metrics_specs = metric_specs.specs_from_metrics(
      {
          'output_name1': [
              tf.keras.metrics.MeanSquaredError('mse'),
              tf.keras.losses.MeanAbsoluteError(name='mae'),
              calibration.MeanLabel('mean_label')
          ],
          'output_name2': [
              tf.keras.metrics.RootMeanSquaredError('rmse'),
              tf.keras.losses.MeanAbsolutePercentageError(name='mape'),
              calibration.MeanPrediction('mean_prediction')
          ]
      },
      model_names=['model_name1', 'model_name2'],
      binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
      aggregate=config.AggregationOptions(macro_average=True))

  self.assertLen(metrics_specs, 5)
  self.assertProtoEquals(
      metrics_specs[0],
      config.MetricsSpec(metrics=[
          config.MetricConfig(
              class_name='ExampleCount',
              config=json.dumps({'name': 'example_count'})),
      ]))
  self.assertProtoEquals(
      metrics_specs[1],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='WeightedExampleCount',
                  config=json.dumps({'name': 'weighted_example_count'})),
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1']))
  self.assertProtoEquals(
      metrics_specs[2],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='MeanSquaredError',
                  config=json.dumps({
                      'name': 'mse',
                      'dtype': 'float32'
                  }, sort_keys=True)),
              config.MetricConfig(
                  class_name='MeanAbsoluteError',
                  module=metric_specs._TF_LOSSES_MODULE,
                  config=json.dumps({
                      'reduction': 'auto',
                      'name': 'mae'
                  }, sort_keys=True)),
              config.MetricConfig(
                  class_name='MeanLabel',
                  config=json.dumps({'name': 'mean_label'}))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1'],
          binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
          aggregate=config.AggregationOptions(macro_average=True)))
  self.assertProtoEquals(
      metrics_specs[3],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='WeightedExampleCount',
                  config=json.dumps({'name': 'weighted_example_count'})),
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name2']))
  self.assertProtoEquals(
      metrics_specs[4],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='RootMeanSquaredError',
                  config=json.dumps({
                      'name': 'rmse',
                      'dtype': 'float32'
                  }, sort_keys=True)),
              config.MetricConfig(
                  class_name='MeanAbsolutePercentageError',
                  module=metric_specs._TF_LOSSES_MODULE,
                  config=json.dumps({
                      'reduction': 'auto',
                      'name': 'mape'
                  }, sort_keys=True)),
              config.MetricConfig(
                  class_name='MeanPrediction',
                  config=json.dumps({'name': 'mean_prediction'}))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name2'],
          binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
          aggregate=config.AggregationOptions(macro_average=True)))
def testMetricThresholdsFromMetricsSpecs(self):
  metrics_specs = [
      config.MetricsSpec(
          thresholds={
              'auc':
                  config.MetricThreshold(
                      value_threshold=config.GenericValueThreshold()),
              'mean/label':
                  config.MetricThreshold(
                      value_threshold=config.GenericValueThreshold(),
                      change_threshold=config.GenericChangeThreshold()),
              # The mse threshold will be overridden by the MetricConfig below.
              'mse':
                  config.MetricThreshold(
                      change_threshold=config.GenericChangeThreshold())
          },
          model_names=['model_name'],
          output_names=['output_name']),
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='ExampleCount',
                  config=json.dumps({'name': 'example_count'}),
                  threshold=config.MetricThreshold(
                      value_threshold=config.GenericValueThreshold()))
          ],
          # Model names and output_names should be ignored because
          # ExampleCount is model independent.
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1', 'output_name2']),
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='WeightedExampleCount',
                  config=json.dumps({'name': 'weighted_example_count'}),
                  threshold=config.MetricThreshold(
                      value_threshold=config.GenericValueThreshold()))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1', 'output_name2']),
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='MeanSquaredError',
                  config=json.dumps({'name': 'mse'}),
                  threshold=config.MetricThreshold(
                      change_threshold=config.GenericChangeThreshold())),
              config.MetricConfig(
                  class_name='MeanLabel',
                  config=json.dumps({'name': 'mean_label'}),
                  threshold=config.MetricThreshold(
                      change_threshold=config.GenericChangeThreshold()))
          ],
          model_names=['model_name'],
          output_names=['output_name'],
          binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
          aggregate=config.AggregationOptions(macro_average=True))
  ]

  thresholds = metric_specs.metric_thresholds_from_metrics_specs(
      metrics_specs)

  self.assertLen(thresholds, 14)
  self.assertIn(
      metric_types.MetricKey(
          name='auc', model_name='model_name', output_name='output_name'),
      thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='mean/label',
          model_name='model_name',
          output_name='output_name',
          is_diff=True), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='mean/label',
          model_name='model_name',
          output_name='output_name',
          is_diff=False), thresholds)
  self.assertIn(metric_types.MetricKey(name='example_count'), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name1',
          output_name='output_name1'), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name1',
          output_name='output_name2'), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name2',
          output_name='output_name1'), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name2',
          output_name='output_name2'), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0),
          is_diff=True), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1),
          is_diff=True), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          is_diff=True), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0),
          is_diff=True), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1),
          is_diff=True), thresholds)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          is_diff=True), thresholds)
def testSpecsFromMetrics(self):
  metrics_specs = metric_specs.specs_from_metrics(
      {
          'output_name1': [
              tf.keras.metrics.MeanSquaredError('mse'),
              calibration.MeanLabel('mean_label')
          ],
          'output_name2': [
              tf.keras.metrics.RootMeanSquaredError('rmse'),
              calibration.MeanPrediction('mean_prediction')
          ]
      },
      model_names=['model_name1', 'model_name2'],
      binarize=config.BinarizationOptions(class_ids=[0, 1]),
      aggregate=config.AggregationOptions(macro_average=True))

  self.assertLen(metrics_specs, 5)
  self.assertProtoEquals(
      metrics_specs[0],
      config.MetricsSpec(metrics=[
          config.MetricConfig(
              class_name='ExampleCount',
              config=json.dumps({'name': 'example_count'})),
      ]))
  self.assertProtoEquals(
      metrics_specs[1],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='WeightedExampleCount',
                  config=json.dumps({'name': 'weighted_example_count'})),
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1']))
  self.assertProtoEquals(
      metrics_specs[2],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='MeanSquaredError',
                  config=json.dumps({
                      'name': 'mse',
                      'dtype': 'float32'
                  })),
              config.MetricConfig(
                  class_name='MeanLabel',
                  config=json.dumps({'name': 'mean_label'}))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1'],
          binarize=config.BinarizationOptions(class_ids=[0, 1]),
          aggregate=config.AggregationOptions(macro_average=True)))
  self.assertProtoEquals(
      metrics_specs[3],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='WeightedExampleCount',
                  config=json.dumps({'name': 'weighted_example_count'})),
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name2']))
  self.assertProtoEquals(
      metrics_specs[4],
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='RootMeanSquaredError',
                  config=json.dumps({
                      'name': 'rmse',
                      'dtype': 'float32'
                  })),
              config.MetricConfig(
                  class_name='MeanPrediction',
                  config=json.dumps({'name': 'mean_prediction'}))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name2'],
          binarize=config.BinarizationOptions(class_ids=[0, 1]),
          aggregate=config.AggregationOptions(macro_average=True)))
def testToComputations(self):
  computations = metric_specs.to_computations(
      metric_specs.specs_from_metrics(
          {
              'output_name': [
                  tf.keras.metrics.MeanSquaredError('mse'),
                  # Add a loss that is exactly the same as the metric
                  # (https://github.com/tensorflow/tfx/issues/1550).
                  tf.keras.losses.MeanSquaredError(name='loss'),
                  calibration.MeanLabel('mean_label')
              ]
          },
          model_names=['model_name'],
          binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
          aggregate=config.AggregationOptions(
              macro_average=True, class_weights={
                  0: 1.0,
                  1: 1.0
              })),
      config.EvalConfig())

  keys = []
  for m in computations:
    for k in m.keys:
      if not k.name.startswith('_'):
        keys.append(k)
  self.assertLen(keys, 11)
  self.assertIn(
      metric_types.MetricKey(name='example_count', model_name='model_name'),
      keys)
  self.assertIn(
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name',
          output_name='output_name'), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name'), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='loss',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='loss',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='loss',
          model_name='model_name',
          output_name='output_name'), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1)), keys)
  self.assertIn(
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name'), keys)
def testMetricThresholdsFromMetricsSpecs(self):
  slice_specs = [
      config.SlicingSpec(feature_keys=['feature1']),
      config.SlicingSpec(feature_values={'feature2': 'value1'})
  ]

  # For cross slice tests.
  baseline_slice_spec = config.SlicingSpec(feature_keys=['feature3'])

  metrics_specs = [
      config.MetricsSpec(
          thresholds={
              'auc':
                  config.MetricThreshold(
                      value_threshold=config.GenericValueThreshold()),
              'mean/label':
                  config.MetricThreshold(
                      value_threshold=config.GenericValueThreshold(),
                      change_threshold=config.GenericChangeThreshold()),
              'mse':
                  config.MetricThreshold(
                      change_threshold=config.GenericChangeThreshold())
          },
          per_slice_thresholds={
              'auc':
                  config.PerSliceMetricThresholds(thresholds=[
                      config.PerSliceMetricThreshold(
                          slicing_specs=slice_specs,
                          threshold=config.MetricThreshold(
                              value_threshold=config.GenericValueThreshold()))
                  ]),
              'mean/label':
                  config.PerSliceMetricThresholds(thresholds=[
                      config.PerSliceMetricThreshold(
                          slicing_specs=slice_specs,
                          threshold=config.MetricThreshold(
                              value_threshold=config.GenericValueThreshold(),
                              change_threshold=config.GenericChangeThreshold()))
                  ])
          },
          cross_slice_thresholds={
              'auc':
                  config.CrossSliceMetricThresholds(thresholds=[
                      config.CrossSliceMetricThreshold(
                          cross_slicing_specs=[
                              config.CrossSlicingSpec(
                                  baseline_spec=baseline_slice_spec,
                                  slicing_specs=slice_specs)
                          ],
                          threshold=config.MetricThreshold(
                              value_threshold=config.GenericValueThreshold(),
                              change_threshold=config.GenericChangeThreshold()))
                  ]),
              'mse':
                  config.CrossSliceMetricThresholds(thresholds=[
                      config.CrossSliceMetricThreshold(
                          cross_slicing_specs=[
                              config.CrossSlicingSpec(
                                  baseline_spec=baseline_slice_spec,
                                  slicing_specs=slice_specs)
                          ],
                          threshold=config.MetricThreshold(
                              change_threshold=config.GenericChangeThreshold())),
                      # Test for duplicate cross_slicing_spec.
                      config.CrossSliceMetricThreshold(
                          cross_slicing_specs=[
                              config.CrossSlicingSpec(
                                  baseline_spec=baseline_slice_spec,
                                  slicing_specs=slice_specs)
                          ],
                          threshold=config.MetricThreshold(
                              value_threshold=config.GenericValueThreshold()))
                  ])
          },
          model_names=['model_name'],
          output_names=['output_name']),
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='ExampleCount',
                  config=json.dumps({'name': 'example_count'}),
                  threshold=config.MetricThreshold(
                      value_threshold=config.GenericValueThreshold()))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1', 'output_name2']),
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='WeightedExampleCount',
                  config=json.dumps({'name': 'weighted_example_count'}),
                  threshold=config.MetricThreshold(
                      value_threshold=config.GenericValueThreshold()))
          ],
          model_names=['model_name1', 'model_name2'],
          output_names=['output_name1', 'output_name2']),
      config.MetricsSpec(
          metrics=[
              config.MetricConfig(
                  class_name='MeanSquaredError',
                  config=json.dumps({'name': 'mse'}),
                  threshold=config.MetricThreshold(
                      change_threshold=config.GenericChangeThreshold())),
              config.MetricConfig(
                  class_name='MeanLabel',
                  config=json.dumps({'name': 'mean_label'}),
                  threshold=config.MetricThreshold(
                      change_threshold=config.GenericChangeThreshold()),
                  per_slice_thresholds=[
                      config.PerSliceMetricThreshold(
                          slicing_specs=slice_specs,
                          threshold=config.MetricThreshold(
                              change_threshold=config.GenericChangeThreshold())),
                  ],
                  cross_slice_thresholds=[
                      config.CrossSliceMetricThreshold(
                          cross_slicing_specs=[
                              config.CrossSlicingSpec(
                                  baseline_spec=baseline_slice_spec,
                                  slicing_specs=slice_specs)
                          ],
                          threshold=config.MetricThreshold(
                              change_threshold=config.GenericChangeThreshold()))
                  ]),
          ],
          model_names=['model_name'],
          output_names=['output_name'],
          binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
          aggregate=config.AggregationOptions(
              macro_average=True, class_weights={
                  0: 1.0,
                  1: 1.0
              }))
  ]

  thresholds = metric_specs.metric_thresholds_from_metrics_specs(
      metrics_specs)

  expected_keys_and_threshold_counts = {
      metric_types.MetricKey(
          name='auc',
          model_name='model_name',
          output_name='output_name',
          is_diff=False):
          4,
      metric_types.MetricKey(
          name='auc',
          model_name='model_name',
          output_name='output_name',
          is_diff=True):
          1,
      metric_types.MetricKey(
          name='mean/label',
          model_name='model_name',
          output_name='output_name',
          is_diff=True):
          3,
      metric_types.MetricKey(
          name='mean/label',
          model_name='model_name',
          output_name='output_name',
          is_diff=False):
          3,
      metric_types.MetricKey(
          name='example_count',
          model_name='model_name1',
          output_name='output_name1'):
          1,
      metric_types.MetricKey(
          name='example_count',
          model_name='model_name1',
          output_name='output_name2'):
          1,
      metric_types.MetricKey(
          name='example_count',
          model_name='model_name2',
          output_name='output_name1'):
          1,
      metric_types.MetricKey(
          name='example_count',
          model_name='model_name2',
          output_name='output_name2'):
          1,
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name1',
          output_name='output_name1'):
          1,
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name1',
          output_name='output_name2'):
          1,
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name2',
          output_name='output_name1'):
          1,
      metric_types.MetricKey(
          name='weighted_example_count',
          model_name='model_name2',
          output_name='output_name2'):
          1,
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0),
          is_diff=True):
          1,
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1),
          is_diff=True):
          1,
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          is_diff=True):
          2,
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          is_diff=False):
          1,
      metric_types.MetricKey(
          name='mse',
          model_name='model_name',
          output_name='output_name',
          aggregation_type=metric_types.AggregationType(macro_average=True),
          is_diff=True):
          1,
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=0),
          is_diff=True):
          4,
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          sub_key=metric_types.SubKey(class_id=1),
          is_diff=True):
          4,
      metric_types.MetricKey(
          name='mean_label',
          model_name='model_name',
          output_name='output_name',
          aggregation_type=metric_types.AggregationType(macro_average=True),
          is_diff=True):
          4
  }

  self.assertLen(thresholds, len(expected_keys_and_threshold_counts))
  for key, count in expected_keys_and_threshold_counts.items():
    self.assertIn(key, thresholds)
    self.assertLen(thresholds[key], count, 'failed for key {}'.format(key))
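# Condensed illustrative sketch (not part of the test suite; all values are
# made-up examples) of how a single MetricConfig can carry per-slice and
# cross-slice thresholds, distilled from the larger test above: the per-slice
# threshold applies on the listed slices, and the cross-slice threshold
# compares a baseline slice against those same slices.
sketch_slice_specs = [config.SlicingSpec(feature_keys=['feature1'])]
sketch_spec = config.MetricsSpec(
    metrics=[
        config.MetricConfig(
            class_name='MeanLabel',
            config=json.dumps({'name': 'mean_label'}),
            per_slice_thresholds=[
                config.PerSliceMetricThreshold(
                    slicing_specs=sketch_slice_specs,
                    threshold=config.MetricThreshold(
                        change_threshold=config.GenericChangeThreshold()))
            ],
            cross_slice_thresholds=[
                config.CrossSliceMetricThreshold(
                    cross_slicing_specs=[
                        config.CrossSlicingSpec(
                            baseline_spec=config.SlicingSpec(
                                feature_keys=['feature3']),
                            slicing_specs=sketch_slice_specs)
                    ],
                    threshold=config.MetricThreshold(
                        change_threshold=config.GenericChangeThreshold()))
            ])
    ],
    model_names=['model_name'],
    output_names=['output_name'])
# metric_thresholds_from_metrics_specs collects these into a mapping from
# MetricKey to the thresholds declared for that key across all specs.
sketch_thresholds = metric_specs.metric_thresholds_from_metrics_specs(
    [sketch_spec])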