def example_count_specs(
        model_names: Optional[List[Text]] = None,
        output_names: Optional[List[Text]] = None,
        include_example_count: bool = True,
        include_weighted_example_count: bool = True
) -> List[config.MetricsSpec]:
    """Returns metric specs for example count and weighted example counts.

  Args:
    model_names: Optional list of model names (if multi-model evaluation).
    output_names: Optional list of output names (if multi-output model).
    include_example_count: True to add example_count metric.
    include_weighted_example_count: True to add weighted_example_count metric. A
      weighted example count will be added per output for multi-output models.
  """
    specs = []
    if include_example_count:
        metric_config = _serialize_tfma_metric(example_count.ExampleCount())
        specs.append(
            config.MetricsSpec(metrics=[metric_config],
                               model_names=model_names))
    if include_weighted_example_count:
        metric_config = _serialize_tfma_metric(
            weighted_example_count.WeightedExampleCount())
        specs.append(
            config.MetricsSpec(metrics=[metric_config],
                               model_names=model_names,
                               output_names=output_names))
    return specs
Example #2
def example_count_specs(
    model_names: Optional[List[Text]] = None,
    output_names: Optional[List[Text]] = None,
    output_weights: Optional[Dict[Text, float]] = None,
    include_example_count: bool = True,
    include_weighted_example_count: bool = True) -> List[config.MetricsSpec]:
  """Returns metric specs for example count and weighted example counts.

  Args:
    model_names: Optional list of model names (if multi-model evaluation).
    output_names: Optional list of output names (if multi-output model).
    output_weights: Optional output weights for creating overall metric
      aggregated across outputs (if multi-output model). If a weight is not
      provided for an output, its weight defaults to 0.0 (i.e. output ignored).
    include_example_count: True to add example_count metric.
    include_weighted_example_count: True to add weighted_example_count metric. A
      weighted example count will be added per output for multi-output models.
  """
  specs = []
  if include_example_count:
    metric_config = _serialize_tfma_metric(example_count.ExampleCount())
    specs.append(
        config.MetricsSpec(metrics=[metric_config], model_names=model_names))
  if include_weighted_example_count:
    metric_config = _serialize_tfma_metric(
        weighted_example_count.WeightedExampleCount())
    specs.append(
        config.MetricsSpec(
            metrics=[metric_config],
            model_names=model_names,
            output_names=output_names,
            output_weights=output_weights))
  return specs
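
A minimal usage sketch for the function above (the model/output names and weights are illustrative assumptions, not taken from the source): it builds the two count specs for a single model with two outputs.

# Hypothetical output names; outputs missing from output_weights default to a
# weight of 0.0 and are ignored in the output-aggregated metric.
count_specs = example_count_specs(
    model_names=[''],
    output_names=['clicks', 'conversions'],
    output_weights={'clicks': 1.0})
# With both include_* flags left at their defaults this yields two MetricsSpec
# protos: one for ExampleCount and one for WeightedExampleCount.
assert len(count_specs) == 2
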
Example #3
  def testToComputationsWithMixedAggregationAndNonAggregationMetrics(self):
    computations = metric_specs.to_computations([
        config.MetricsSpec(
            metrics=[config.MetricConfig(class_name='CategoricalAccuracy')]),
        config.MetricsSpec(
            metrics=[config.MetricConfig(class_name='BinaryCrossentropy')],
            binarize=config.BinarizationOptions(class_ids={'values': [1]}),
            aggregate=config.AggregationOptions(micro_average=True))
    ], config.EvalConfig())

    # 3 separate computations should be used (one for aggregated metrics, one
    # for non-aggregated metrics, and one for metrics associated with class 1)
    self.assertLen(computations, 3)
 def testValidateMetricsMetricValueAndThresholdIgnoreUnmatchedSlice(
         self, slicing_specs, slice_key):
     threshold = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             upper_bound={'value': 1}))
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
         ],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(
                 metrics=[
                     config.MetricConfig(
                         class_name='WeightedExampleCount',
                         # 1.5 < 1, NOT OK.
                         per_slice_thresholds=[
                             config.PerSliceMetricThreshold(
                                 slicing_specs=slicing_specs,
                                 threshold=threshold)
                         ]),
                 ],
                 model_names=['']),
         ],
     )
     sliced_metrics = (slice_key, {
         metric_types.MetricKey(name='weighted_example_count'):
         1.5,
     })
     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertTrue(result.validation_ok)
 def testValidateMetricsValueThresholdLowerBoundPass(
         self, slicing_specs, slice_key):
     threshold = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             lower_bound={'value': 1}))
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
         ],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(
                 metrics=[
                     config.MetricConfig(
                         class_name='WeightedExampleCount',
                         # 2 > 1, OK.
                         threshold=threshold
                         if slicing_specs is None else None,
                         per_slice_thresholds=[
                             config.PerSliceMetricThreshold(
                                 slicing_specs=slicing_specs,
                                 threshold=threshold)
                         ]),
                 ],
                 model_names=['']),
         ],
     )
     sliced_metrics = (slice_key, {
         metric_types.MetricKey(name='weighted_example_count'):
         2,
     })
     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertTrue(result.validation_ok)
 def testValidateMetricsCrossSliceThresholdFail(self, cross_slicing_specs,
                                                slice_key):
     threshold = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             upper_bound={'value': 1}))
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
         ],
         cross_slicing_specs=cross_slicing_specs,
         metrics_specs=[
             config.MetricsSpec(
                 metrics=[
                     config.MetricConfig(
                         class_name='WeightedExampleCount',
                         # 1.5 < 1, NOT OK.
                         threshold=(threshold if cross_slicing_specs is None
                                    else None),
                         cross_slice_thresholds=[
                             config.CrossSliceMetricThreshold(
                                 cross_slicing_specs=cross_slicing_specs,
                                 threshold=threshold)
                         ]),
                 ],
                 model_names=['']),
         ],
     )
     sliced_metrics = (slice_key, {
         metric_types.MetricKey(name='weighted_example_count'):
         1.5,
     })
     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertFalse(result.validation_ok)
 def testValidateMetricsChangeThresholdHigherIsBetterFail(self):
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
             config.ModelSpec(name='baseline', is_baseline=True)
         ],
         slicing_specs=[config.SlicingSpec()],
         metrics_specs=[
             config.MetricsSpec(
                 metrics=[
                     config.MetricConfig(
                         class_name='MeanPrediction',
                         # Diff = -.333 > 0, NOT OK.
                         threshold=config.MetricThreshold(
                             change_threshold=config.GenericChangeThreshold(
                                 direction=config.MetricDirection.
                                 HIGHER_IS_BETTER,
                                 absolute={'value': 0}))),
                 ],
                 model_names=['']),
         ],
     )
     sliced_metrics = ((()), {
         metric_types.MetricKey(name='mean_prediction',
                                model_name='baseline'):
         0.333,
         metric_types.MetricKey(name='mean_prediction', is_diff=True):
         -0.333,
     })
     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertFalse(result.validation_ok)
 def testValidateMetricsValueThresholdLowerBoundPass(self):
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
         ],
         slicing_specs=[config.SlicingSpec()],
         metrics_specs=[
             config.MetricsSpec(
                 metrics=[
                     config.MetricConfig(
                         class_name='WeightedExampleCount',
                         # 2 > 1, OK.
                         threshold=config.MetricThreshold(
                             value_threshold=config.GenericValueThreshold(
                                 lower_bound={'value': 1}))),
                 ],
                 model_names=['']),
         ],
     )
     sliced_metrics = ((()), {
         metric_types.MetricKey(name='weighted_example_count'):
         2,
     })
     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertTrue(result.validation_ok)
 def testValidateMetricsChangeThresholdRelativePass(self):
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
             config.ModelSpec(name='baseline', is_baseline=True)
         ],
         slicing_specs=[config.SlicingSpec()],
         metrics_specs=[
             config.MetricsSpec(
                 metrics=[
                     config.MetricConfig(
                         class_name='MeanPrediction',
                         # Diff = -.333
                         # Diff% = -.333/.333 = -100% < 0%, OK.
                         threshold=config.MetricThreshold(
                             change_threshold=config.GenericChangeThreshold(
                                 direction=config.MetricDirection.
                                 LOWER_IS_BETTER,
                                 relative={'value': 0}))),
                 ],
                 model_names=['']),
         ],
     )
     sliced_metrics = ((()), {
         metric_types.MetricKey(name='mean_prediction',
                                model_name='baseline'):
         0.333,
         metric_types.MetricKey(name='mean_prediction', is_diff=True):
         -0.333,
     })
     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertTrue(result.validation_ok)
Example #10
 def testMetricKeysToSkipForConfidenceIntervals(self):
   metrics_specs = [
       config.MetricsSpec(
           metrics=[
               config.MetricConfig(
                   class_name='ExampleCount',
                   config=json.dumps({'name': 'example_count'}),
                   threshold=config.MetricThreshold(
                       value_threshold=config.GenericValueThreshold())),
               config.MetricConfig(
                   class_name='MeanLabel',
                   config=json.dumps({'name': 'mean_label'}),
                   threshold=config.MetricThreshold(
                       change_threshold=config.GenericChangeThreshold())),
               config.MetricConfig(
                   class_name='MeanSquaredError',
                   config=json.dumps({'name': 'mse'}),
                   threshold=config.MetricThreshold(
                       change_threshold=config.GenericChangeThreshold()))
           ],
           # Model names and output_names should be ignored because
           # ExampleCount is model independent.
           model_names=['model_name1', 'model_name2'],
           output_names=['output_name1', 'output_name2']),
   ]
   metrics_specs += metric_specs.specs_from_metrics(
       [tf.keras.metrics.MeanSquaredError('mse')])
   keys = metric_specs.metric_keys_to_skip_for_confidence_intervals(
       metrics_specs)
   self.assertLen(keys, 1)
   self.assertIn(metric_types.MetricKey(name='example_count'), keys)
 def testValidateMetricsMetricTDistributionValueAndThreshold(
         self, slicing_specs, slice_key):
     threshold = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             lower_bound={'value': 0.9}))
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
         ],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(metrics=[
                 config.MetricConfig(
                     class_name='AUC',
                     threshold=threshold if slicing_specs is None else None,
                     per_slice_thresholds=[
                         config.PerSliceMetricThreshold(
                             slicing_specs=slicing_specs,
                             threshold=threshold)
                     ]),
             ],
                                model_names=['']),
         ],
     )
     sliced_metrics = (slice_key, {
         metric_types.MetricKey(name='auc'):
         types.ValueWithTDistribution(sample_mean=0.91, unsampled_value=0.8)
     })
     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertFalse(result.validation_ok)
     expected = text_format.Parse(
         """
     metric_validations_per_slice {
       failures {
         metric_key {
           name: "auc"
         }
         metric_value {
           double_value {
             value: 0.8
           }
         }
       }
     }""", validation_result_pb2.ValidationResult())
     expected.metric_validations_per_slice[0].failures[
         0].metric_threshold.CopyFrom(threshold)
     expected.metric_validations_per_slice[0].slice_key.CopyFrom(
         slicer.serialize_slice_key(slice_key))
     for spec in slicing_specs or [None]:
         if (spec is None or slicer.SingleSliceSpec(
                 spec=spec).is_slice_applicable(slice_key)):
             slicing_details = expected.validation_details.slicing_details.add(
             )
             if spec is not None:
                 slicing_details.slicing_spec.CopyFrom(spec)
             else:
                 slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
             slicing_details.num_matching_slices = 1
     self.assertEqual(result, expected)
    def testGetMissingSlices(self):
        slicing_specs = [
            config.SlicingSpec(),
            config.SlicingSpec(feature_values={'feature1': 'value1'}),
            config.SlicingSpec(feature_values={'feature2': 'value2'})
        ]
        threshold = config.MetricThreshold(
            value_threshold=config.GenericValueThreshold(
                upper_bound={'value': 1}))
        eval_config = config.EvalConfig(
            model_specs=[
                config.ModelSpec(),
            ],
            slicing_specs=slicing_specs,
            metrics_specs=[
                config.MetricsSpec(
                    metrics=[
                        config.MetricConfig(
                            class_name='WeightedExampleCount',
                            # 1.5 < 1, NOT OK.
                            per_slice_thresholds=[
                                config.PerSliceMetricThreshold(
                                    slicing_specs=slicing_specs,
                                    threshold=threshold)
                            ]),
                    ],
                    model_names=['']),
            ],
        )
        sliced_metrics = ((('feature1', 'value1'), ), {
            metric_types.MetricKey(name='weighted_example_count'):
            0,
        })
        result = metrics_validator.validate_metrics(sliced_metrics,
                                                    eval_config)

        expected_checks = text_format.Parse(
            """
        validation_ok: true
        validation_details {
          slicing_details {
            slicing_spec {
              feature_values {
                key: "feature1"
                value: "value1"
              }
            }
            num_matching_slices: 1
          }
        }""", validation_result_pb2.ValidationResult())

        self.assertProtoEquals(expected_checks, result)

        missing = metrics_validator.get_missing_slices(
            result.validation_details.slicing_details, eval_config)
        self.assertLen(missing, 2)
        self.assertProtoEquals(missing[0], slicing_specs[0])
        self.assertProtoEquals(missing[1], slicing_specs[2])
  def testMetricSpecsFromKerasSequential(self):
    export_dir = os.path.join(self._getTempDir(), 'export_dir')
    model = tf.keras.models.Sequential([
        tf.keras.layers.InputLayer(input_shape=(1,), name='test'),
        tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
    ])
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.MeanSquaredError(name='mse')])
    features = [[0.0], [1.0]]
    labels = [[1], [0]]
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    dataset = dataset.shuffle(buffer_size=1).repeat().batch(2)
    model.fit(dataset, steps_per_epoch=1)
    model.save(export_dir, save_format='tf')

    eval_shared_model = self.createTestEvalSharedModel(
        eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

    metrics_specs = (
        keras_util.metrics_specs_from_keras('', eval_shared_model.model_loader))

    # TODO(b/149995449): Keras does not support re-loading metrics with the new
    #   API. Re-enable after this is fixed.
    model = eval_shared_model.model_loader.construct_fn(lambda x: None)()
    if not hasattr(model, 'loss_functions'):
      return

    self.assertLen(metrics_specs, 1)
    self.assertProtoEquals(
        self._comparable_spec(metrics_specs[0]),
        config.MetricsSpec(
            metrics=[
                config.MetricConfig(
                    class_name='BinaryCrossentropy',
                    config=json.dumps(
                        {
                            'from_logits': False,
                            'label_smoothing': 0,
                            'reduction': 'auto',
                            'name': 'binary_crossentropy'
                        },
                        sort_keys=True)),
                config.MetricConfig(
                    class_name='MeanSquaredError',
                    config=json.dumps({
                        'name': 'mse',
                        'dtype': 'float32'
                    },
                                      sort_keys=True))
            ],
            model_names=['']))
 def testValidateMetricsMetricValueAndThreshold(self):
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
         ],
         slicing_specs=[config.SlicingSpec()],
         metrics_specs=[
             config.MetricsSpec(
                 metrics=[
                     config.MetricConfig(
                         class_name='WeightedExampleCount',
                         # 1.5 < 1, NOT OK.
                         threshold=config.MetricThreshold(
                             value_threshold=config.GenericValueThreshold(
                                 upper_bound={'value': 1}))),
                 ],
                 model_names=['']),
         ],
     )
     sliced_metrics = ((()), {
         metric_types.MetricKey(name='weighted_example_count'):
         1.5,
     })
     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertFalse(result.validation_ok)
     expected = text_format.Parse(
         """
     metric_validations_per_slice {
       slice_key {
       }
       failures {
         metric_key {
           name: "weighted_example_count"
         }
         metric_threshold {
           value_threshold {
             upper_bound {
               value: 1.0
             }
           }
         }
         metric_value {
           double_value {
             value: 1.5
           }
         }
       }
     }""", validation_result_pb2.ValidationResult())
     self.assertEqual(result, expected)
  def testMetricsSpecBeamCounter(self):
    with beam.Pipeline() as pipeline:
      metrics_spec = config.MetricsSpec(
          metrics=[config.MetricConfig(class_name='FairnessIndicators')])
      _ = pipeline | counter_util.IncrementMetricsSpecsCounters([metrics_spec])

    result = pipeline.run()
    metric_filter = beam.metrics.metric.MetricsFilter().with_namespace(
        constants.METRICS_NAMESPACE).with_name(
            'metric_computed_FairnessIndicators_v2')
    actual_metrics_count = result.metrics().query(
        filter=metric_filter)['counters'][0].committed

    self.assertEqual(actual_metrics_count, 1)
 def testValidateMetricsMetricValueAndThreshold(self, slicing_specs,
                                                slice_key):
   threshold = config.MetricThreshold(
       value_threshold=config.GenericValueThreshold(upper_bound={'value': 1}))
   eval_config = config.EvalConfig(
       model_specs=[
           config.ModelSpec(),
       ],
       slicing_specs=slicing_specs,
       metrics_specs=[
           config.MetricsSpec(
               metrics=[
                   config.MetricConfig(
                       class_name='WeightedExampleCount',
                       # 1.5 < 1, NOT OK.
                       threshold=threshold if slicing_specs is None else None,
                       per_slice_thresholds=[
                           config.PerSliceMetricThreshold(
                               slicing_specs=slicing_specs,
                               threshold=threshold)
                       ]),
               ],
               model_names=['']),
       ],
   )
   sliced_metrics = (slice_key, {
       metric_types.MetricKey(name='weighted_example_count'): 1.5,
   })
   result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
   self.assertFalse(result.validation_ok)
   expected = text_format.Parse(
       """
       metric_validations_per_slice {
         failures {
           metric_key {
             name: "weighted_example_count"
           }
           metric_value {
             double_value {
               value: 1.5
             }
           }
         }
       }""", validation_result_pb2.ValidationResult())
   expected.metric_validations_per_slice[0].failures[
       0].metric_threshold.CopyFrom(threshold)
   expected.metric_validations_per_slice[0].slice_key.CopyFrom(
       slicer.serialize_slice_key(slice_key))
   self.assertEqual(result, expected)
 def testValidateMetricsInvalidThreshold(self):
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
         ],
         slicing_specs=[config.SlicingSpec()],
         metrics_specs=[
             config.MetricsSpec(
                 thresholds={
                     'invalid_threshold':
                     config.MetricThreshold(
                         value_threshold=config.GenericValueThreshold(
                             lower_bound={'value': 0.2}))
                 })
         ],
     )
     sliced_metrics = ((()), {
         metric_types.MetricKey(name='weighted_example_count'):
         1.5,
     })
     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertFalse(result.validation_ok)
     expected = text_format.Parse(
         """
     metric_validations_per_slice {
       slice_key {
       }
       failures {
         metric_key {
           name: "invalid_threshold"
         }
         metric_threshold {
           value_threshold {
             lower_bound {
               value: 0.2
             }
           }
         }
         message: 'Metric not found.'
       }
     }""", validation_result_pb2.ValidationResult())
     self.assertProtoEquals(expected, result)
 def testValidateMetricsChangeThresholdRelativeFail(self, slicing_specs,
                                                    slice_key):
     threshold = config.MetricThreshold(
         change_threshold=config.GenericChangeThreshold(
             direction=config.MetricDirection.LOWER_IS_BETTER,
             relative={'value': -2}))
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
             config.ModelSpec(name='baseline', is_baseline=True)
         ],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(
                 metrics=[
                     config.MetricConfig(
                         class_name='MeanPrediction',
                         # Diff = -.333
                         # Diff% = -.333/.333 = -100% < -200%, NOT OK.
                         threshold=threshold
                         if slicing_specs is None else None,
                         per_slice_thresholds=[
                             config.PerSliceMetricThreshold(
                                 slicing_specs=slicing_specs,
                                 threshold=threshold)
                         ])
                 ],
                 model_names=['']),
         ],
     )
     sliced_metrics = (slice_key, {
         metric_types.MetricKey(name='mean_prediction',
                                model_name='baseline'):
         0.333,
         metric_types.MetricKey(name='mean_prediction', is_diff=True):
         -0.333,
     })
     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertFalse(result.validation_ok)
 def testValidateMetricsChangeThresholdHigherIsBetterPass(
         self, slicing_specs, slice_key):
     threshold = config.MetricThreshold(
         change_threshold=config.GenericChangeThreshold(
             direction=config.MetricDirection.HIGHER_IS_BETTER,
             absolute={'value': -1}))
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
             config.ModelSpec(name='baseline', is_baseline=True)
         ],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(
                 metrics=[
                     config.MetricConfig(
                         class_name='MeanPrediction',
                         # Diff = -.333 > -1, OK.
                         threshold=threshold
                         if slicing_specs is None else None,
                         per_slice_thresholds=[
                             config.PerSliceMetricThreshold(
                                 slicing_specs=slicing_specs,
                                 threshold=threshold)
                         ])
                 ],
                 model_names=['']),
         ],
     )
     sliced_metrics = (slice_key, {
         metric_types.MetricKey(name='mean_prediction',
                                model_name='baseline'):
         0.333,
         metric_types.MetricKey(name='mean_prediction', is_diff=True):
         -0.333,
     })
     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertTrue(result.validation_ok)
Example #20
 def testValidateMetricsDivByZero(self):
   threshold = config.MetricThreshold(
       change_threshold=config.GenericChangeThreshold(
           direction=config.MetricDirection.HIGHER_IS_BETTER,
           relative={'value': 0.1}))
   slicing_specs = [config.SlicingSpec()]
   eval_config = config.EvalConfig(
       model_specs=[
           config.ModelSpec(name='candidate'),
           config.ModelSpec(name='baseline', is_baseline=True)
       ],
       slicing_specs=slicing_specs,
       metrics_specs=[
           config.MetricsSpec(
               metrics=[
                   config.MetricConfig(
                       class_name='MeanPrediction',
                       threshold=threshold if slicing_specs is None else None,
                       per_slice_thresholds=[
                           config.PerSliceMetricThreshold(
                               slicing_specs=slicing_specs,
                               threshold=threshold)
                       ])
               ],
               model_names=['baseline', 'candidate']),
       ],
   )
   sliced_metrics = ((()), {
       metric_types.MetricKey(name='mean_prediction', model_name='baseline'):
           0.0,
       metric_types.MetricKey(
           name='mean_prediction', model_name='candidate', is_diff=True):
           0.1,
   })
   result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
   self.assertFalse(result.validation_ok)
Example #21
def specs_from_metrics(
    metrics: Union[List[_TFOrTFMAMetric], Dict[Text, List[_TFOrTFMAMetric]]],
    model_names: Optional[List[Text]] = None,
    output_names: Optional[List[Text]] = None,
    output_weights: Optional[Dict[Text, float]] = None,
    binarize: Optional[config.BinarizationOptions] = None,
    aggregate: Optional[config.AggregationOptions] = None,
    query_key: Optional[Text] = None,
    include_example_count: Optional[bool] = None,
    include_weighted_example_count: Optional[bool] = None
) -> List[config.MetricsSpec]:
  """Returns specs for tf.keras.metrics/losses or tfma.metrics classes.

  Examples:

    metrics_specs = specs_from_metrics([
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.AUC(),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tfma.metrics.MeanLabel(),
        tfma.metrics.MeanPrediction()
        ...
    ])

    metrics_specs = specs_from_metrics({
      'output1': [
          tf.keras.metrics.BinaryAccuracy(),
          tf.keras.metrics.AUC(),
          tfma.metrics.MeanLabel(),
          tfma.metrics.MeanPrediction()
          ...
      ],
      'output2': [
          tf.keras.metrics.Precision(),
          tf.keras.metrics.Recall(),
      ]
    })

  Args:
    metrics: List of tf.keras.metrics.Metric, tf.keras.losses.Loss, or
      tfma.metrics.Metric. For multi-output models a dict of metric lists may
      be passed, keyed by output_name.
    model_names: Optional model names (if multi-model evaluation).
    output_names: Optional output names (if multi-output models). If the metrics
      are a dict this should not be set.
    output_weights: Optional output weights for creating overall metric
      aggregated across outputs (if multi-output model). If a weight is not
      provided for an output, its weight defaults to 0.0 (i.e. output ignored).
    binarize: Optional settings for binarizing multi-class/multi-label metrics.
    aggregate: Optional settings for aggregating multi-class/multi-label
      metrics.
    query_key: Optional query key for query/ranking based metrics.
    include_example_count: True to add example_count metric. Default is True.
    include_weighted_example_count: True to add weighted_example_count metric.
      Default is True. A weighted example count will be added per output for
      multi-output models.
  """
  if isinstance(metrics, dict) and output_names:
    raise ValueError('metrics cannot be a dict when output_names is used: '
                     'metrics={}, output_names={}'.format(
                         metrics, output_names))
  if isinstance(metrics, dict):
    specs = []
    for output_name in sorted(metrics.keys()):
      specs.extend(
          specs_from_metrics(
              metrics[output_name],
              model_names=model_names,
              output_names=[output_name],
              binarize=binarize,
              aggregate=aggregate,
              include_example_count=include_example_count,
              include_weighted_example_count=include_weighted_example_count))
      include_example_count = False
    return specs

  if include_example_count is None:
    include_example_count = True
  if include_weighted_example_count is None:
    include_weighted_example_count = True

  # Add the computations for the example counts and weights since they are
  # independent of the model and class ID.
  specs = example_count_specs(
      model_names=model_names,
      output_names=output_names,
      output_weights=output_weights,
      include_example_count=include_example_count,
      include_weighted_example_count=include_weighted_example_count)

  metric_configs = []
  for metric in metrics:
    if isinstance(metric, tf.keras.metrics.Metric):
      metric_configs.append(_serialize_tf_metric(metric))
    elif isinstance(metric, tf.keras.losses.Loss):
      metric_configs.append(_serialize_tf_loss(metric))
    elif isinstance(metric, metric_types.Metric):
      metric_configs.append(_serialize_tfma_metric(metric))
    else:
      raise NotImplementedError('unknown metric type {}: metric={}'.format(
          type(metric), metric))
  specs.append(
      config.MetricsSpec(
          metrics=metric_configs,
          model_names=model_names,
          output_names=output_names,
          output_weights=output_weights,
          binarize=binarize,
          aggregate=aggregate,
          query_key=query_key))

  return specs
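
A hedged sketch of the multi-output (dict) form described in the docstring above; the output names and metric choices are illustrative assumptions. Per the code, metrics cannot be a dict when output_names is set, and the plain example_count spec is only emitted for the first output in sorted order.

metrics_specs = specs_from_metrics(
    {
        'output_1': [tf.keras.metrics.AUC(name='auc')],
        'output_2': [tf.keras.metrics.Precision(name='precision')],
    },
    model_names=[''])
# Expected layout (compare testSpecsFromMetrics in Example #24 below): an
# example_count spec, plus a weighted_example_count spec and a metrics spec
# per output, i.e. 5 MetricsSpec protos for two outputs.
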
Example #22
def to_computations(
    metrics_specs: List[config.MetricsSpec],
    eval_config: Optional[config.EvalConfig] = None,
    schema: Optional[schema_pb2.Schema] = None
) -> metric_types.MetricComputations:
  """Returns computations associated with given metrics specs."""
  computations = []

  #
  # Split into TF metrics and TFMA metrics
  #

  # Dict[Text, Type[tf.keras.metrics.Metric]]
  tf_metric_classes = {}  # class_name -> class
  # Dict[Text, Type[tf.keras.losses.Loss]]
  tf_loss_classes = {}  # class_name -> class
  # List[config.MetricsSpec]
  tf_metrics_specs = []
  # Dict[Text, Type[metric_types.Metric]]
  tfma_metric_classes = metric_types.registered_metrics()  # class_name -> class
  # List[config.MetricsSpec]
  tfma_metrics_specs = []
  #
  # Note: Lists are used instead of Dicts for the following items because
  # protos are not hashable.
  #
  # List[List[_TFOrTFMAMetric]] (offsets align with metrics_specs).
  per_spec_metric_instances = []
  # List[List[_TFMetricOrLoss]] (offsets align with tf_metrics_specs).
  per_tf_spec_metric_instances = []
  # List[List[metric_types.Metric]]] (offsets align with tfma_metrics_specs).
  per_tfma_spec_metric_instances = []
  for spec in metrics_specs:
    tf_spec = config.MetricsSpec()
    tf_spec.CopyFrom(spec)
    del tf_spec.metrics[:]
    tfma_spec = config.MetricsSpec()
    tfma_spec.CopyFrom(spec)
    del tfma_spec.metrics[:]
    for metric in spec.metrics:
      if metric.class_name in tfma_metric_classes:
        tfma_spec.metrics.append(metric)
      elif not metric.module:
        tf_spec.metrics.append(metric)
      else:
        cls = getattr(importlib.import_module(metric.module), metric.class_name)
        if issubclass(cls, tf.keras.metrics.Metric):
          tf_metric_classes[metric.class_name] = cls
          tf_spec.metrics.append(metric)
        elif issubclass(cls, tf.keras.losses.Loss):
          tf_loss_classes[metric.class_name] = cls
          tf_spec.metrics.append(metric)
        else:
          tfma_metric_classes[metric.class_name] = cls
          tfma_spec.metrics.append(metric)

    metric_instances = []
    if tf_spec.metrics:
      tf_metrics_specs.append(tf_spec)
      tf_metric_instances = []
      for m in tf_spec.metrics:
        # To distinguish losses from metrics, losses are required to set the
        # module name.
        if m.module == _TF_LOSSES_MODULE:
          tf_metric_instances.append(_deserialize_tf_loss(m, tf_loss_classes))
        else:
          tf_metric_instances.append(
              _deserialize_tf_metric(m, tf_metric_classes))
      per_tf_spec_metric_instances.append(tf_metric_instances)
      metric_instances.extend(tf_metric_instances)
    if tfma_spec.metrics:
      tfma_metrics_specs.append(tfma_spec)
      tfma_metric_instances = [
          _deserialize_tfma_metric(m, tfma_metric_classes)
          for m in tfma_spec.metrics
      ]
      per_tfma_spec_metric_instances.append(tfma_metric_instances)
      metric_instances.extend(tfma_metric_instances)
    per_spec_metric_instances.append(metric_instances)

  # Process TF specs
  computations.extend(
      _process_tf_metrics_specs(tf_metrics_specs, per_tf_spec_metric_instances,
                                eval_config))

  # Process TFMA specs
  computations.extend(
      _process_tfma_metrics_specs(tfma_metrics_specs,
                                  per_tfma_spec_metric_instances, eval_config,
                                  schema))

  # Process aggregation based metrics (output aggregation and macro averaging).
  # Note that the processing of TF and TFMA specs above was set up to create
  # the binarized metrics that macro averaging depends on.
  for i, spec in enumerate(metrics_specs):
    for aggregation_type, sub_keys in _create_sub_keys(spec).items():
      output_names = spec.output_names or ['']
      output_weights = dict(spec.output_weights)
      if not set(output_weights.keys()).issubset(output_names):
        raise ValueError(
            'one or more output_names used in output_weights does not exist: '
            'output_names={}, output_weights={}'.format(output_names,
                                                        output_weights))
      for model_name in spec.model_names or ['']:
        for sub_key in sub_keys:
          for metric in per_spec_metric_instances[i]:
            if (aggregation_type and (aggregation_type.macro_average or
                                      aggregation_type.weighted_macro_average)):
              class_weights = _class_weights(spec) or {}
              for output_name in output_names:
                sub_keys = _macro_average_sub_keys(sub_key, class_weights)
                if aggregation_type.macro_average:
                  computations.extend(
                      aggregation.macro_average(
                          metric.get_config()['name'],
                          sub_keys=sub_keys,
                          eval_config=eval_config,
                          model_name=model_name,
                          output_name=output_name,
                          sub_key=sub_key,
                          class_weights=class_weights))
                elif aggregation_type.weighted_macro_average:
                  computations.extend(
                      aggregation.weighted_macro_average(
                          metric.get_config()['name'],
                          sub_keys=sub_keys,
                          eval_config=eval_config,
                          model_name=model_name,
                          output_name=output_name,
                          sub_key=sub_key,
                          class_weights=class_weights))
            if output_weights:
              computations.extend(
                  aggregation.output_average(
                      metric.get_config()['name'],
                      output_weights=output_weights,
                      eval_config=eval_config,
                      model_name=model_name,
                      sub_key=sub_key))

  return computations
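
For orientation, a minimal sketch of calling to_computations directly, mirroring testToComputationsWithMixedAggregationAndNonAggregationMetrics near the top of this page; the exact computations returned depend on the TFMA version.

computations = to_computations(
    [config.MetricsSpec(
        metrics=[config.MetricConfig(class_name='ExampleCount')])],
    config.EvalConfig())
# Roughly, each returned computation carries the metric keys it produces plus
# the per-slice preprocessing/combining logic applied by the evaluator.
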
 def testRunModelAnalysisWithModelAgnosticPredictions(self):
     examples = [
         self._makeExample(age=3.0,
                           language='english',
                           label=1.0,
                           prediction=0.9),
         self._makeExample(age=3.0,
                           language='chinese',
                           label=0.0,
                           prediction=0.4),
         self._makeExample(age=4.0,
                           language='english',
                           label=1.0,
                           prediction=0.7),
         self._makeExample(age=5.0,
                           language='chinese',
                           label=1.0,
                           prediction=0.2)
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)
     model_specs = [
         config.ModelSpec(prediction_key='prediction',
                          label_key='label',
                          example_weight_key='age')
     ]
     metrics = [
         config.MetricConfig(class_name='ExampleCount'),
         config.MetricConfig(class_name='WeightedExampleCount'),
         config.MetricConfig(class_name='BinaryAccuracy')
     ]
     slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
     eval_config = config.EvalConfig(
         model_specs=model_specs,
         metrics_specs=[config.MetricsSpec(metrics=metrics)],
         slicing_specs=slicing_specs)
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config,
         data_location=data_location,
         output_path=self._getTempDir())
     expected = {
         (('language', 'chinese'), ): {
             'binary_accuracy': {
                 'doubleValue': 0.375
             },
             'weighted_example_count': {
                 'doubleValue': 8.0
             },
             'example_count': {
                 'doubleValue': 2.0
             },
         },
         (('language', 'english'), ): {
             'binary_accuracy': {
                 'doubleValue': 1.0
             },
             'weighted_example_count': {
                 'doubleValue': 7.0
             },
             'example_count': {
                 'doubleValue': 2.0
             },
         }
     }
     self.assertEqual(eval_result.data_location, data_location)
     self.assertEqual(eval_result.config.slicing_specs[0],
                      config.SlicingSpec(feature_keys=['language']))
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
Example #24
    def testSpecsFromMetrics(self):
        metrics_specs = metric_specs.specs_from_metrics(
            {
                'output_name1': [
                    tf.keras.metrics.MeanSquaredError('mse'),
                    tf.keras.losses.MeanAbsoluteError(name='mae'),
                    calibration.MeanLabel('mean_label')
                ],
                'output_name2': [
                    tf.keras.metrics.RootMeanSquaredError('rmse'),
                    tf.keras.losses.MeanAbsolutePercentageError(name='mape'),
                    calibration.MeanPrediction('mean_prediction')
                ]
            },
            model_names=['model_name1', 'model_name2'],
            binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
            aggregate=config.AggregationOptions(macro_average=True))

        self.assertLen(metrics_specs, 5)
        self.assertProtoEquals(
            metrics_specs[0],
            config.MetricsSpec(metrics=[
                config.MetricConfig(class_name='ExampleCount',
                                    config=json.dumps(
                                        {'name': 'example_count'})),
            ]))
        self.assertProtoEquals(
            metrics_specs[1],
            config.MetricsSpec(metrics=[
                config.MetricConfig(class_name='WeightedExampleCount',
                                    config=json.dumps(
                                        {'name': 'weighted_example_count'})),
            ],
                               model_names=['model_name1', 'model_name2'],
                               output_names=['output_name1']))
        self.assertProtoEquals(
            metrics_specs[2],
            config.MetricsSpec(metrics=[
                config.MetricConfig(class_name='MeanSquaredError',
                                    config=json.dumps(
                                        {
                                            'name': 'mse',
                                            'dtype': 'float32'
                                        },
                                        sort_keys=True)),
                config.MetricConfig(class_name='MeanAbsoluteError',
                                    module=metric_specs._TF_LOSSES_MODULE,
                                    config=json.dumps(
                                        {
                                            'reduction': 'auto',
                                            'name': 'mae'
                                        },
                                        sort_keys=True)),
                config.MetricConfig(class_name='MeanLabel',
                                    config=json.dumps({'name': 'mean_label'}))
            ],
                               model_names=['model_name1', 'model_name2'],
                               output_names=['output_name1'],
                               binarize=config.BinarizationOptions(
                                   class_ids={'values': [0, 1]}),
                               aggregate=config.AggregationOptions(
                                   macro_average=True)))
        self.assertProtoEquals(
            metrics_specs[3],
            config.MetricsSpec(metrics=[
                config.MetricConfig(class_name='WeightedExampleCount',
                                    config=json.dumps(
                                        {'name': 'weighted_example_count'})),
            ],
                               model_names=['model_name1', 'model_name2'],
                               output_names=['output_name2']))
        self.assertProtoEquals(
            metrics_specs[4],
            config.MetricsSpec(
                metrics=[
                    config.MetricConfig(class_name='RootMeanSquaredError',
                                        config=json.dumps(
                                            {
                                                'name': 'rmse',
                                                'dtype': 'float32'
                                            },
                                            sort_keys=True)),
                    config.MetricConfig(
                        class_name='MeanAbsolutePercentageError',
                        module=metric_specs._TF_LOSSES_MODULE,
                        config=json.dumps({
                            'reduction': 'auto',
                            'name': 'mape'
                        },
                                          sort_keys=True)),
                    config.MetricConfig(class_name='MeanPrediction',
                                        config=json.dumps(
                                            {'name': 'mean_prediction'}))
                ],
                model_names=['model_name1', 'model_name2'],
                output_names=['output_name2'],
                binarize=config.BinarizationOptions(
                    class_ids={'values': [0, 1]}),
                aggregate=config.AggregationOptions(macro_average=True)))
Example #25
 def testMetricThresholdsFromMetricsSpecs(self):
     metrics_specs = [
         config.MetricsSpec(
             thresholds={
                 'auc':
                 config.MetricThreshold(
                     value_threshold=config.GenericValueThreshold()),
                 'mean/label':
                 config.MetricThreshold(
                     value_threshold=config.GenericValueThreshold(),
                     change_threshold=config.GenericChangeThreshold()),
                 # The mse metric will be overridden by MetricConfig below.
                 'mse':
                 config.MetricThreshold(
                     change_threshold=config.GenericChangeThreshold())
             },
             model_names=['model_name'],
             output_names=['output_name']),
         config.MetricsSpec(
             metrics=[
                 config.MetricConfig(
                     class_name='ExampleCount',
                     config=json.dumps({'name': 'example_count'}),
                     threshold=config.MetricThreshold(
                         value_threshold=config.GenericValueThreshold()))
             ],
             # Model names and output_names should be ignored because
             # ExampleCount is model independent.
             model_names=['model_name1', 'model_name2'],
             output_names=['output_name1', 'output_name2']),
         config.MetricsSpec(metrics=[
             config.MetricConfig(
                 class_name='WeightedExampleCount',
                 config=json.dumps({'name': 'weighted_example_count'}),
                 threshold=config.MetricThreshold(
                     value_threshold=config.GenericValueThreshold()))
         ],
                            model_names=['model_name1', 'model_name2'],
                            output_names=['output_name1', 'output_name2']),
         config.MetricsSpec(
             metrics=[
                 config.MetricConfig(
                     class_name='MeanSquaredError',
                     config=json.dumps({'name': 'mse'}),
                     threshold=config.MetricThreshold(
                         change_threshold=config.GenericChangeThreshold())),
                 config.MetricConfig(
                     class_name='MeanLabel',
                     config=json.dumps({'name': 'mean_label'}),
                     threshold=config.MetricThreshold(
                         change_threshold=config.GenericChangeThreshold()))
             ],
             model_names=['model_name'],
             output_names=['output_name'],
             binarize=config.BinarizationOptions(
                 class_ids={'values': [0, 1]}),
             aggregate=config.AggregationOptions(macro_average=True))
     ]
     thresholds = metric_specs.metric_thresholds_from_metrics_specs(
         metrics_specs)
     self.assertLen(thresholds, 14)
     self.assertIn(
         metric_types.MetricKey(name='auc',
                                model_name='model_name',
                                output_name='output_name'), thresholds)
     self.assertIn(
         metric_types.MetricKey(name='mean/label',
                                model_name='model_name',
                                output_name='output_name',
                                is_diff=True), thresholds)
     self.assertIn(
         metric_types.MetricKey(name='mean/label',
                                model_name='model_name',
                                output_name='output_name',
                                is_diff=False), thresholds)
     self.assertIn(metric_types.MetricKey(name='example_count'), thresholds)
     self.assertIn(
         metric_types.MetricKey(name='weighted_example_count',
                                model_name='model_name1',
                                output_name='output_name1'), thresholds)
     self.assertIn(
         metric_types.MetricKey(name='weighted_example_count',
                                model_name='model_name1',
                                output_name='output_name2'), thresholds)
     self.assertIn(
         metric_types.MetricKey(name='weighted_example_count',
                                model_name='model_name2',
                                output_name='output_name1'), thresholds)
     self.assertIn(
         metric_types.MetricKey(name='weighted_example_count',
                                model_name='model_name2',
                                output_name='output_name2'), thresholds)
     self.assertIn(
         metric_types.MetricKey(name='mse',
                                model_name='model_name',
                                output_name='output_name',
                                sub_key=metric_types.SubKey(class_id=0),
                                is_diff=True), thresholds)
     self.assertIn(
         metric_types.MetricKey(name='mse',
                                model_name='model_name',
                                output_name='output_name',
                                sub_key=metric_types.SubKey(class_id=1),
                                is_diff=True), thresholds)
     self.assertIn(
         metric_types.MetricKey(name='mse',
                                model_name='model_name',
                                output_name='output_name',
                                is_diff=True), thresholds)
     self.assertIn(
         metric_types.MetricKey(name='mean_label',
                                model_name='model_name',
                                output_name='output_name',
                                sub_key=metric_types.SubKey(class_id=0),
                                is_diff=True), thresholds)
     self.assertIn(
         metric_types.MetricKey(name='mean_label',
                                model_name='model_name',
                                output_name='output_name',
                                sub_key=metric_types.SubKey(class_id=1),
                                is_diff=True), thresholds)
     self.assertIn(
         metric_types.MetricKey(name='mean_label',
                                model_name='model_name',
                                output_name='output_name',
                                is_diff=True), thresholds)
  def testWriteValidationResults(self):
    model_dir, baseline_dir = self._getExportDir(), self._getBaselineDir()
    eval_shared_model = self._build_keras_model(model_dir, mul=0)
    baseline_eval_shared_model = self._build_keras_model(baseline_dir, mul=1)
    validations_file = os.path.join(self._getTempDir(),
                                    constants.VALIDATIONS_KEY)
    examples = [
        self._makeExample(
            input=0.0,
            label=1.0,
            example_weight=1.0,
            extra_feature='non_model_feature'),
        self._makeExample(
            input=1.0,
            label=0.0,
            example_weight=0.5,
            extra_feature='non_model_feature'),
    ]

    eval_config = config.EvalConfig(
        model_specs=[
            config.ModelSpec(
                name='candidate',
                label_key='label',
                example_weight_key='example_weight'),
            config.ModelSpec(
                name='baseline',
                label_key='label',
                example_weight_key='example_weight',
                is_baseline=True)
        ],
        slicing_specs=[config.SlicingSpec()],
        metrics_specs=[
            config.MetricsSpec(
                metrics=[
                    config.MetricConfig(
                        class_name='WeightedExampleCount',
                        # 1.5 < 1, NOT OK.
                        threshold=config.MetricThreshold(
                            value_threshold=config.GenericValueThreshold(
                                upper_bound={'value': 1}))),
                    config.MetricConfig(
                        class_name='ExampleCount',
                        # 2 > 10, NOT OK.
                        threshold=config.MetricThreshold(
                            value_threshold=config.GenericValueThreshold(
                                lower_bound={'value': 10}))),
                    config.MetricConfig(
                        class_name='MeanLabel',
                        # 0 > 0 and 0 > 0%?: NOT OK.
                        threshold=config.MetricThreshold(
                            change_threshold=config.GenericChangeThreshold(
                                direction=config.MetricDirection
                                .HIGHER_IS_BETTER,
                                relative={'value': 0},
                                absolute={'value': 0}))),
                    config.MetricConfig(
                        # MeanPrediction = (0+0)/(1+0.5) = 0
                        class_name='MeanPrediction',
                        # -.01 < 0 < .01, OK.
                        # Diff% = -.333/.333 = -100% < -99%, OK.
                        # Diff = 0 - .333 = -.333 < 0, OK.
                        threshold=config.MetricThreshold(
                            value_threshold=config.GenericValueThreshold(
                                upper_bound={'value': .01},
                                lower_bound={'value': -.01}),
                            change_threshold=config.GenericChangeThreshold(
                                direction=config.MetricDirection
                                .LOWER_IS_BETTER,
                                relative={'value': -.99},
                                absolute={'value': 0})))
                ],
                model_names=['candidate', 'baseline']),
        ],
        options=config.Options(
            disabled_outputs={'values': ['eval_config.json']}),
    )
    slice_spec = [
        slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
    ]
    eval_shared_models = {
        'candidate': eval_shared_model,
        'baseline': baseline_eval_shared_model
    }
    extractors = [
        input_extractor.InputExtractor(eval_config),
        predict_extractor_v2.PredictExtractor(
            eval_shared_model=eval_shared_models, eval_config=eval_config),
        slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
    ]
    evaluators = [
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=eval_config, eval_shared_model=eval_shared_models)
    ]
    output_paths = {
        constants.VALIDATIONS_KEY: validations_file,
    }
    writers = [
        metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
            output_paths, add_metrics_callbacks=[])
    ]

    with beam.Pipeline() as pipeline:

      # pylint: disable=no-value-for-parameter
      _ = (
          pipeline
          | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
          | 'ExtractEvaluateAndWriteResults' >>
          model_eval_lib.ExtractEvaluateAndWriteResults(
              eval_config=eval_config,
              eval_shared_model=eval_shared_model,
              extractors=extractors,
              evaluators=evaluators,
              writers=writers))
      # pylint: enable=no-value-for-parameter

    validation_result = model_eval_lib.load_validation_result(
        os.path.dirname(validations_file))

    expected_validations = [
        text_format.Parse(
            """
            metric_key {
              name: "weighted_example_count"
              model_name: "candidate"
            }
            metric_threshold {
              value_threshold {
                upper_bound {
                  value: 1.0
                }
              }
            }
            metric_value {
              double_value {
                value: 1.5
              }
            }
            """, validation_result_pb2.ValidationFailure()),
        text_format.Parse(
            """
            metric_key {
              name: "example_count"
            }
            metric_threshold {
              value_threshold {
                lower_bound {
                  value: 10.0
                }
              }
            }
            metric_value {
              double_value {
                value: 2.0
              }
            }
            """, validation_result_pb2.ValidationFailure()),
        text_format.Parse(
            """
            metric_key {
              name: "mean_label"
              model_name: "candidate"
              is_diff: true
            }
            metric_threshold {
              change_threshold {
                absolute {
                  value: 0.0
                }
                relative {
                  value: 0.0
                }
                direction: HIGHER_IS_BETTER
              }
            }
            metric_value {
              double_value {
                value: 0.0
              }
            }
            """, validation_result_pb2.ValidationFailure()),
    ]
    self.assertFalse(validation_result.validation_ok)
    self.assertLen(validation_result.metric_validations_per_slice, 1)
    self.assertCountEqual(
        expected_validations,
        validation_result.metric_validations_per_slice[0].failures)
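
The inline comments in the metric thresholds above do the pass/fail arithmetic by hand. The sketch below reproduces those checks outside of TFMA so the expected failures are easy to verify; the helper names are hypothetical and the strictness of the bound comparisons is simplified rather than TFMA's exact semantics.

def value_ok(value, lower_bound=float('-inf'), upper_bound=float('inf')):
    # GenericValueThreshold (simplified): the value must lie within the bounds.
    return lower_bound <= value <= upper_bound


def change_ok(candidate, baseline, absolute, relative, higher_is_better):
    # GenericChangeThreshold (simplified): both the absolute and the relative
    # diff against the baseline must clear the bounds in the given direction.
    diff = candidate - baseline
    rel = diff / baseline if baseline else 0.0
    if higher_is_better:
        return diff > absolute and rel > relative
    return diff < absolute and rel < relative


assert not value_ok(1.5, upper_bound=1)    # weighted_example_count: 1.5 > 1
assert not value_ok(2, lower_bound=10)     # example_count: 2 < 10
assert value_ok(0.0, lower_bound=-.01, upper_bound=.01)  # mean_prediction
assert not change_ok(2 / 3., 2 / 3., absolute=0, relative=0,
                     higher_is_better=True)    # mean_label diff: 0 not > 0
assert change_ok(0.0, 1 / 3., absolute=0, relative=-.99,
                 higher_is_better=False)       # mean_prediction: -100% < -99%
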
    def testSpecsFromMetrics(self):
        metrics_specs = metric_specs.specs_from_metrics(
            {
                'output_name1': [
                    tf.keras.metrics.MeanSquaredError('mse'),
                    calibration.MeanLabel('mean_label')
                ],
                'output_name2': [
                    tf.keras.metrics.RootMeanSquaredError('rmse'),
                    calibration.MeanPrediction('mean_prediction')
                ]
            },
            model_names=['model_name1', 'model_name2'],
            binarize=config.BinarizationOptions(class_ids=[0, 1]),
            aggregate=config.AggregationOptions(macro_average=True))

        self.assertLen(metrics_specs, 5)
        self.assertProtoEquals(
            metrics_specs[0],
            config.MetricsSpec(metrics=[
                config.MetricConfig(class_name='ExampleCount',
                                    config=json.dumps(
                                        {'name': 'example_count'})),
            ]))
        self.assertProtoEquals(
            metrics_specs[1],
            config.MetricsSpec(metrics=[
                config.MetricConfig(class_name='WeightedExampleCount',
                                    config=json.dumps(
                                        {'name': 'weighted_example_count'})),
            ],
                               model_names=['model_name1', 'model_name2'],
                               output_names=['output_name1']))
        self.assertProtoEquals(
            metrics_specs[2],
            config.MetricsSpec(
                metrics=[
                    config.MetricConfig(class_name='MeanSquaredError',
                                        config=json.dumps({
                                            'name': 'mse',
                                            'dtype': 'float32'
                                        })),
                    config.MetricConfig(class_name='MeanLabel',
                                        config=json.dumps(
                                            {'name': 'mean_label'}))
                ],
                model_names=['model_name1', 'model_name2'],
                output_names=['output_name1'],
                binarize=config.BinarizationOptions(class_ids=[0, 1]),
                aggregate=config.AggregationOptions(macro_average=True)))
        self.assertProtoEquals(
            metrics_specs[3],
            config.MetricsSpec(metrics=[
                config.MetricConfig(class_name='WeightedExampleCount',
                                    config=json.dumps(
                                        {'name': 'weighted_example_count'})),
            ],
                               model_names=['model_name1', 'model_name2'],
                               output_names=['output_name2']))
        self.assertProtoEquals(
            metrics_specs[4],
            config.MetricsSpec(
                metrics=[
                    config.MetricConfig(class_name='RootMeanSquaredError',
                                        config=json.dumps({
                                            'name': 'rmse',
                                            'dtype': 'float32'
                                        })),
                    config.MetricConfig(class_name='MeanPrediction',
                                        config=json.dumps(
                                            {'name': 'mean_prediction'}))
                ],
                model_names=['model_name1', 'model_name2'],
                output_names=['output_name2'],
                binarize=config.BinarizationOptions(class_ids=[0, 1]),
                aggregate=config.AggregationOptions(macro_average=True)))
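
The call above exercises the multi-model, multi-output form of specs_from_metrics. For the single-model, single-output case a flat list of metrics is enough; the snippet below is a sketch, and the exact number and ordering of the returned specs may differ between TFMA versions.

import tensorflow as tf
from tensorflow_model_analysis.metrics import calibration
from tensorflow_model_analysis.metrics import metric_specs

simple_specs = metric_specs.specs_from_metrics([
    tf.keras.metrics.MeanSquaredError('mse'),
    calibration.MeanLabel('mean_label'),
])
# Expect the example-count specs followed by one MetricsSpec that carries the
# serialized 'mse' and 'mean_label' configs.
for spec in simple_specs:
    print([m.class_name for m in spec.metrics])
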
Example #28
    def testRunModelAnalysisWithKerasModel(self):
        input_layer = tf.keras.layers.Input(shape=(28 * 28, ), name='data')
        output_layer = tf.keras.layers.Dense(
            10, activation=tf.nn.softmax)(input_layer)
        model = tf.keras.models.Model(input_layer, output_layer)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=.001),
                      loss=tf.keras.losses.categorical_crossentropy)

        features = {'data': [[0.0] * 28 * 28]}
        labels = [[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]]
        example_weights = [1.0]
        dataset = tf.data.Dataset.from_tensor_slices(
            (features, labels, example_weights))
        dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
        model.fit(dataset, steps_per_epoch=1)

        model_location = os.path.join(self._getTempDir(), 'export_dir')
        model.save(model_location, save_format='tf')

        examples = [
            self._makeExample(data=[0.0] * 28 * 28, label=1.0),
            self._makeExample(data=[1.0] * 28 * 28, label=5.0),
            self._makeExample(data=[1.0] * 28 * 28, label=9.0),
        ]
        data_location = self._writeTFExamplesToTFRecords(examples)
        metrics_spec = config.MetricsSpec()
        for metric in (tf.keras.metrics.AUC(), ):
            cfg = tf.keras.utils.serialize_keras_object(metric)
            metrics_spec.metrics.append(
                config.MetricConfig(class_name=cfg['class_name'],
                                    config=json.dumps(cfg['config'])))
        for class_id in (0, 5, 9):
            metrics_spec.binarize.class_ids.append(class_id)
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec(location=data_location)],
            model_specs=[
                config.ModelSpec(location=model_location, label_key='label')
            ],
            output_data_specs=[
                config.OutputDataSpec(default_location=self._getTempDir())
            ],
            metrics_specs=[metrics_spec])
        eval_result = model_eval_lib.run_model_analysis(
            eval_config=eval_config,
            eval_shared_models=[
                model_eval_lib.default_eval_shared_model(
                    eval_saved_model_path=model_location,
                    tags=[tf.saved_model.SERVING])
            ])
        self.assertEqual(eval_result.config.model_specs[0].location,
                         model_location)
        self.assertEqual(eval_result.config.input_data_specs[0].location,
                         data_location)
        self.assertLen(eval_result.slicing_metrics, 1)
        got_slice_key, got_metrics = eval_result.slicing_metrics[0]
        self.assertEqual(got_slice_key, ())
        self.assertIn('', got_metrics)  # output_name
        got_metrics = got_metrics['']
        expected_metrics = {
            'classId:0': {
                'auc': True,
            },
            'classId:5': {
                'auc': True,
            },
            'classId:9': {
                'auc': True,
            },
        }
        for class_id in expected_metrics:
            self.assertIn(class_id, got_metrics)
            for k in expected_metrics[class_id]:
                self.assertIn(k, got_metrics[class_id])
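
The expected_metrics dictionary above shows how binarized results are keyed: by output name first, then by a 'classId:<n>' string, then by metric name. A small sketch of walking an EvalResult with that layout (the helper name is hypothetical):

def print_per_class_auc(eval_result, output_name=''):
    # slicing_metrics is a list of (slice_key, metrics) pairs.
    for slice_key, metrics_by_output in eval_result.slicing_metrics:
        per_output = metrics_by_output.get(output_name, {})
        for sub_key, metrics in per_output.items():
            if sub_key.startswith('classId:') and 'auc' in metrics:
                print(slice_key, sub_key, metrics['auc'])
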
def to_computations(
    metrics_specs: List[config.MetricsSpec],
    eval_config: Optional[config.EvalConfig] = None,
    schema: Optional[schema_pb2.Schema] = None
) -> metric_types.MetricComputations:
    """Returns computations associated with given metrics specs."""
    computations = []

    #
    # Split into TF metrics and TFMA metrics
    #

    # Dict[Text, Type[tf.keras.metrics.Metric]]
    tf_metric_classes = {}  # class_name -> class
    # Dict[Text, Type[tf.keras.losses.Loss]]
    tf_loss_classes = {}  # class_name -> class
    # List[metric_types.MetricsSpec]
    tf_metrics_specs = []
    # Dict[Text, Type[metric_types.Metric]]
    tfma_metric_classes = metric_types.registered_metrics(
    )  # class_name -> class
    # List[metric_types.MetricsSpec]
    tfma_metrics_specs = []
    #
    # Note: Lists are used instead of Dicts for the following items because
    # protos are not hashable.
    #
    # List[List[_TFOrTFMAMetric]] (offsets align with metrics_specs).
    per_spec_metric_instances = []
    # List[List[_TFMetricOrLoss]] (offsets align with tf_metrics_specs).
    per_tf_spec_metric_instances = []
    # List[List[metric_types.Metric]]] (offsets align with tfma_metrics_specs).
    per_tfma_spec_metric_instances = []
    for spec in metrics_specs:
        tf_spec = config.MetricsSpec()
        tf_spec.CopyFrom(spec)
        del tf_spec.metrics[:]
        tfma_spec = config.MetricsSpec()
        tfma_spec.CopyFrom(spec)
        del tfma_spec.metrics[:]
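        # Metrics already registered with TFMA take priority; otherwise an
        # unset module is treated as a Keras metric or loss, and an explicit
        # module is imported and bucketed by its base class.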
        for metric in spec.metrics:
            if metric.class_name in tfma_metric_classes:
                tfma_spec.metrics.append(metric)
            elif not metric.module:
                tf_spec.metrics.append(metric)
            else:
                cls = getattr(importlib.import_module(metric.module),
                              metric.class_name)
                if issubclass(cls, tf.keras.metrics.Metric):
                    tf_metric_classes[metric.class_name] = cls
                    tf_spec.metrics.append(metric)
                elif issubclass(cls, tf.keras.losses.Loss):
                    tf_loss_classes[metric.class_name] = cls
                    tf_spec.metrics.append(metric)
                else:
                    tfma_metric_classes[metric.class_name] = cls
                    tfma_spec.metrics.append(metric)

        metric_instances = []
        if tf_spec.metrics:
            tf_metrics_specs.append(tf_spec)
            tf_metric_instances = []
            for m in tf_spec.metrics:
                # To distinguish losses from metrics, losses are required to set the
                # module name.
                if m.module == _TF_LOSSES_MODULE:
                    tf_metric_instances.append(
                        _deserialize_tf_loss(m, tf_loss_classes))
                else:
                    tf_metric_instances.append(
                        _deserialize_tf_metric(m, tf_metric_classes))
            per_tf_spec_metric_instances.append(tf_metric_instances)
            metric_instances.extend(tf_metric_instances)
        if tfma_spec.metrics:
            tfma_metrics_specs.append(tfma_spec)
            tfma_metric_instances = [
                _deserialize_tfma_metric(m, tfma_metric_classes)
                for m in tfma_spec.metrics
            ]
            per_tfma_spec_metric_instances.append(tfma_metric_instances)
            metric_instances.extend(tfma_metric_instances)
        per_spec_metric_instances.append(metric_instances)

    #
    # Group TF metrics by the subkeys, models and outputs. This is done in reverse
    # because model and subkey processing is done outside of TF and so each unique
    # sub key combination needs to be run through a separate model instance. Note
    # that output_names are handled by the tf_metric_computation since all the
    # outputs are batch calculated in a single model evaluation call.
    #

    # Dict[metric_types.SubKey, Dict[Text, List[int]]]
    tf_spec_indices_by_subkey = {
    }  # SubKey -> model_name -> [index(MetricSpec)]
    for i, spec in enumerate(tf_metrics_specs):
        sub_keys = _create_sub_keys(spec)
        if not sub_keys:
            sub_keys = [None]
        for sub_key in sub_keys:
            if sub_key not in tf_spec_indices_by_subkey:
                tf_spec_indices_by_subkey[sub_key] = {}
            # Dict[Text, List[int]]
            tf_spec_indices_by_model = (tf_spec_indices_by_subkey[sub_key]
                                        )  # model_name -> [index(MetricsSpec)]
            model_names = spec.model_names
            if not model_names:
                model_names = [''
                               ]  # '' is name used when only one model is used
            for model_name in model_names:
                if model_name not in tf_spec_indices_by_model:
                    tf_spec_indices_by_model[model_name] = []
                tf_spec_indices_by_model[model_name].append(i)
    for sub_key, spec_indices_by_model in tf_spec_indices_by_subkey.items():
        for model_name, indices in spec_indices_by_model.items():
            # Class weights are a dict, which is not hashable, so we store the
            # index of the spec containing the class weights.
            metrics_by_class_weights_by_output = collections.defaultdict(dict)
            for i in indices:
                class_weights_i = None
                if tf_metrics_specs[i].HasField('aggregate'):
                    class_weights_i = i
                metrics_by_output = metrics_by_class_weights_by_output[
                    class_weights_i]
                output_names = ['']  # '' is name used when only one output
                if tf_metrics_specs[i].output_names:
                    output_names = tf_metrics_specs[i].output_names
                for output_name in output_names:
                    if output_name not in metrics_by_output:
                        metrics_by_output[output_name] = []
                    metrics_by_output[output_name].extend(
                        per_tf_spec_metric_instances[i])
            for i, metrics_by_output in metrics_by_class_weights_by_output.items(
            ):
                class_weights = None
                if i is not None:
                    class_weights = dict(
                        tf_metrics_specs[i].aggregate.class_weights)
                computations.extend(
                    tf_metric_wrapper.tf_metric_computations(
                        metrics_by_output,
                        eval_config=eval_config,
                        model_name=model_name,
                        sub_key=sub_key,
                        class_weights=class_weights))

    #
    # Group TFMA metric specs by the metric classes
    #

    # Dict[bytes, List[config.MetricsSpec]]
    tfma_specs_by_metric_config = {}  # hash(MetricConfig) -> [MetricsSpec]
    # Dict[bytes, metric_types.Metric]
    hashed_metrics = {}  # hash(MetricConfig) -> Metric
    for i, spec in enumerate(tfma_metrics_specs):
        for metric_config, metric in zip(spec.metrics,
                                         per_tfma_spec_metric_instances[i]):
            # Note that hashing by SerializeToString() is only safe if used within the
            # same process.
            config_hash = metric_config.SerializeToString()
            if config_hash not in tfma_specs_by_metric_config:
                hashed_metrics[config_hash] = metric
                tfma_specs_by_metric_config[config_hash] = []
            tfma_specs_by_metric_config[config_hash].append(spec)
    for config_hash, specs in tfma_specs_by_metric_config.items():
        metric = hashed_metrics[config_hash]
        for spec in specs:
            sub_keys = _create_sub_keys(spec)
            class_weights = None
            if spec.HasField('aggregate'):
                class_weights = dict(spec.aggregate.class_weights)
            computations.extend(
                metric.computations(
                    eval_config=eval_config,
                    schema=schema,
                    model_names=spec.model_names if spec.model_names else [''],
                    output_names=spec.output_names
                    if spec.output_names else [''],
                    sub_keys=sub_keys,
                    class_weights=class_weights,
                    query_key=spec.query_key))

    #
    # Create macro averaging metrics
    #

    for i, spec in enumerate(metrics_specs):
        if spec.aggregate.macro_average or spec.aggregate.weighted_macro_average:
            sub_keys = _create_sub_keys(spec)
            if sub_keys is None:
                raise ValueError(
                    'binarize settings are required when aggregate.macro_average or '
                    'aggregate.weighted_macro_average is used: spec={}'.format(
                        spec))
            for model_name in spec.model_names or ['']:
                for output_name in spec.output_names or ['']:
                    for metric in per_spec_metric_instances[i]:
                        if spec.aggregate.macro_average:
                            computations.extend(
                                aggregation.macro_average(
                                    metric.get_config()['name'],
                                    eval_config=eval_config,
                                    model_name=model_name,
                                    output_name=output_name,
                                    sub_keys=sub_keys,
                                    class_weights=dict(
                                        spec.aggregate.class_weights)))
                        elif spec.aggregate.weighted_macro_average:
                            computations.extend(
                                aggregation.weighted_macro_average(
                                    metric.get_config()['name'],
                                    eval_config=eval_config,
                                    model_name=model_name,
                                    output_name=output_name,
                                    sub_keys=sub_keys,
                                    class_weights=dict(
                                        spec.aggregate.class_weights)))

    return computations
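
As a quick usage sketch of to_computations, a single spec that mixes a Keras metric with a TFMA metric yields both kinds of computations in one flat list. The config import is repeated here so the snippet stands alone; the exact number of computations returned depends on the metrics involved.

from tensorflow_model_analysis import config

example_specs = [
    config.MetricsSpec(metrics=[
        config.MetricConfig(class_name='MeanSquaredError'),  # Keras metric
        config.MetricConfig(class_name='MeanLabel'),  # TFMA metric
    ])
]
example_computations = to_computations(example_specs, config.EvalConfig())
print(len(example_computations))
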
 def testValidateMetricsMetricTDistributionChangeAndThreshold(
         self, slicing_specs, slice_key):
     threshold = config.MetricThreshold(
         change_threshold=config.GenericChangeThreshold(
             direction=config.MetricDirection.LOWER_IS_BETTER,
             absolute={'value': -1}))
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
             config.ModelSpec(name='baseline', is_baseline=True)
         ],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(metrics=[
                 config.MetricConfig(
                     class_name='AUC',
                     threshold=threshold if slicing_specs is None else None,
                     per_slice_thresholds=[
                         config.PerSliceMetricThreshold(
                             slicing_specs=slicing_specs,
                             threshold=threshold)
                     ]),
             ],
                                model_names=['']),
         ],
     )
     sliced_metrics = (
         slice_key,
         {
             # This is the mean of the diff.
             metric_types.MetricKey(name='auc', model_name='baseline'):
             types.ValueWithTDistribution(sample_mean=0.91,
                                          unsampled_value=0.6),
             metric_types.MetricKey(name='auc', is_diff=True):
             types.ValueWithTDistribution(sample_mean=0.1,
                                          unsampled_value=0.1),
         })
     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertFalse(result.validation_ok)
     expected = text_format.Parse(
         """
     metric_validations_per_slice {
       failures {
         metric_key {
           name: "auc"
           is_diff: true
         }
         metric_value {
           double_value {
             value: 0.1
           }
         }
       }
     }""", validation_result_pb2.ValidationResult())
     expected.metric_validations_per_slice[0].failures[
         0].metric_threshold.CopyFrom(threshold)
     expected.metric_validations_per_slice[0].slice_key.CopyFrom(
         slicer.serialize_slice_key(slice_key))
     for spec in slicing_specs or [None]:
         if (spec is None or slicer.SingleSliceSpec(
                 spec=spec).is_slice_applicable(slice_key)):
             slicing_details = expected.validation_details.slicing_details.add(
             )
             if spec is not None:
                 slicing_details.slicing_spec.CopyFrom(spec)
             else:
                 slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
             slicing_details.num_matching_slices = 1
     self.assertAlmostEqual(result, expected)
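
The t-distribution case above validates the diff's sample mean against a change threshold. For reference, a sketch of the simpler plain-double path with a value threshold (same imports as the test; the metric name, bound, and value are illustrative):

def simple_value_threshold_example():
    eval_config = config.EvalConfig(
        model_specs=[config.ModelSpec()],
        slicing_specs=[config.SlicingSpec()],
        metrics_specs=[
            config.MetricsSpec(metrics=[
                config.MetricConfig(
                    class_name='AUC',
                    threshold=config.MetricThreshold(
                        value_threshold=config.GenericValueThreshold(
                            lower_bound={'value': 0.7})))
            ])
        ])
    sliced_metrics = ((), {metric_types.MetricKey(name='auc'): 0.65})
    # 0.65 is below the 0.7 lower bound, so validation_ok should be False.
    return metrics_validator.validate_metrics(sliced_metrics, eval_config)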