Beispiel #1
0
 def testMetricKeysToSkipForConfidenceIntervals(self):
   """Verifies which metric keys are skipped for confidence intervals.

   Builds one spec holding ExampleCount, MeanLabel and MeanSquaredError
   configs (each carrying a threshold), appends the specs derived from a
   Keras MeanSquaredError metric, and asserts that the only key returned by
   metric_keys_to_skip_for_confidence_intervals is 'example_count'.
   """
   example_count_cfg = config.MetricConfig(
       class_name='ExampleCount',
       config=json.dumps({'name': 'example_count'}),
       threshold=config.MetricThreshold(
           value_threshold=config.GenericValueThreshold()))
   mean_label_cfg = config.MetricConfig(
       class_name='MeanLabel',
       config=json.dumps({'name': 'mean_label'}),
       threshold=config.MetricThreshold(
           change_threshold=config.GenericChangeThreshold()))
   mse_cfg = config.MetricConfig(
       class_name='MeanSquaredError',
       config=json.dumps({'name': 'mse'}),
       threshold=config.MetricThreshold(
           change_threshold=config.GenericChangeThreshold()))

   # The model and output names are expected to be ignored for ExampleCount
   # because that metric is model independent.
   all_specs = [
       config.MetricsSpec(
           metrics=[example_count_cfg, mean_label_cfg, mse_cfg],
           model_names=['model_name1', 'model_name2'],
           output_names=['output_name1', 'output_name2'])
   ]
   all_specs.extend(
       metric_specs.specs_from_metrics(
           [tf.keras.metrics.MeanSquaredError('mse')]))

   skipped_keys = metric_specs.metric_keys_to_skip_for_confidence_intervals(
       all_specs)
   self.assertLen(skipped_keys, 1)
   self.assertIn(metric_types.MetricKey(name='example_count'), skipped_keys)
  def testMetricSpecsFromKerasSequential(self):
    """Checks the metrics specs derived from a saved Keras Sequential model.

    Trains and saves a one-layer model compiled with a BinaryCrossentropy loss
    and an 'mse' metric, loads it as an eval shared model, and compares the
    single spec returned by keras_util.metrics_specs_from_keras against the
    expected loss and metric configs. The assertions are not run when the
    re-loaded model has no 'loss_functions' attribute.
    """
    saved_model_dir = os.path.join(self._getTempDir(), 'export_dir')

    layers = [
        tf.keras.layers.InputLayer(input_shape=(1,), name='test'),
        tf.keras.layers.Dense(1, activation=tf.nn.sigmoid),
    ]
    keras_model = tf.keras.models.Sequential(layers)
    keras_model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.MeanSquaredError(name='mse')])

    # Two examples are enough to run the single training step below.
    example_features = [[0.0], [1.0]]
    example_labels = [[1], [0]]
    train_data = tf.data.Dataset.from_tensor_slices(
        (example_features, example_labels))
    train_data = train_data.shuffle(buffer_size=1)
    train_data = train_data.repeat()
    train_data = train_data.batch(2)
    keras_model.fit(train_data, steps_per_epoch=1)
    keras_model.save(saved_model_dir, save_format='tf')

    eval_shared_model = self.createTestEvalSharedModel(
        eval_saved_model_path=saved_model_dir, tags=[tf.saved_model.SERVING])
    metrics_specs = keras_util.metrics_specs_from_keras(
        '', eval_shared_model.model_loader)

    # TODO(b/149995449): Keras does not support re-loading metrics with the new
    #   API. Re-enable after this is fixed.
    reloaded_model = eval_shared_model.model_loader.construct_fn(
        lambda x: None)()
    if not hasattr(reloaded_model, 'loss_functions'):
      return

    self.assertLen(metrics_specs, 1)

    loss_cfg = config.MetricConfig(
        class_name='BinaryCrossentropy',
        config=json.dumps(
            {
                'from_logits': False,
                'label_smoothing': 0,
                'reduction': 'auto',
                'name': 'binary_crossentropy'
            },
            sort_keys=True))
    mse_cfg = config.MetricConfig(
        class_name='MeanSquaredError',
        config=json.dumps({
            'name': 'mse',
            'dtype': 'float32'
        }, sort_keys=True))
    expected_spec = config.MetricsSpec(
        metrics=[loss_cfg, mse_cfg], model_names=[''])
    self.assertProtoEquals(
        self._comparable_spec(metrics_specs[0]), expected_spec)
Beispiel #3
0
  def testToComputationsWithMixedAggregationAndNonAggregationMetrics(self):
    """Checks the computation count for mixed aggregated/plain metric specs.

    Passes one spec without aggregation (CategoricalAccuracy) and one spec
    with binarization on class 1 plus micro averaging (BinaryCrossentropy) to
    metric_specs.to_computations and asserts that three computations result.
    """
    plain_spec = config.MetricsSpec(
        metrics=[config.MetricConfig(class_name='CategoricalAccuracy')])
    aggregated_spec = config.MetricsSpec(
        metrics=[config.MetricConfig(class_name='BinaryCrossentropy')],
        binarize=config.BinarizationOptions(class_ids={'values': [1]}),
        aggregate=config.AggregationOptions(micro_average=True))

    computations = metric_specs.to_computations(
        [plain_spec, aggregated_spec], config.EvalConfig())

    # Three computations are expected: one for the aggregated metrics, one for
    # the non-aggregated metrics, and one for the metrics tied to class 1.
    self.assertLen(computations, 3)
 def testValidateMetricsMetricValueAndThresholdIgnoreUnmatchedSlice(
         self, slicing_specs, slice_key):
     """Expects validation to pass although the value exceeds the bound.

     A WeightedExampleCount upper bound of 1 is attached only through
     per_slice_thresholds built from `slicing_specs`. The metric value
     reported under `slice_key` is 1.5, and the test asserts that
     validation_ok is True.

     Args:
       slicing_specs: Slicing specs used for both the eval config and the
         per-slice threshold.
       slice_key: Slice key the metric value is reported under.
     """
     upper_bound_one = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             upper_bound={'value': 1}))
     # The bound is attached per slice only; no global threshold is set.
     example_count_cfg = config.MetricConfig(
         class_name='WeightedExampleCount',
         per_slice_thresholds=[
             config.PerSliceMetricThreshold(
                 slicing_specs=slicing_specs, threshold=upper_bound_one)
         ])
     eval_config = config.EvalConfig(
         model_specs=[config.ModelSpec()],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(
                 metrics=[example_count_cfg], model_names=[''])
         ])

     # 1.5 is above the upper bound of 1, yet success is expected here.
     metrics_for_slice = {
         metric_types.MetricKey(name='weighted_example_count'): 1.5,
     }
     validation = metrics_validator.validate_metrics(
         (slice_key, metrics_for_slice), eval_config)
     self.assertTrue(validation.validation_ok)
 def testValidateMetricsValueThresholdLowerBoundPass(
         self, slicing_specs, slice_key):
     """Expects validation to pass when the value is above a lower bound.

     WeightedExampleCount gets a lower bound of 1 and the reported value is 2,
     so the test asserts that validation_ok is True.

     Args:
       slicing_specs: Slicing specs for the eval config and the per-slice
         threshold; when None the bound is also set as the global threshold.
       slice_key: Slice key the metric value is reported under.
     """
     lower_bound_one = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             lower_bound={'value': 1}))
     global_threshold = None
     if slicing_specs is None:
         global_threshold = lower_bound_one
     example_count_cfg = config.MetricConfig(
         class_name='WeightedExampleCount',
         threshold=global_threshold,
         per_slice_thresholds=[
             config.PerSliceMetricThreshold(
                 slicing_specs=slicing_specs, threshold=lower_bound_one)
         ])
     eval_config = config.EvalConfig(
         model_specs=[config.ModelSpec()],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(
                 metrics=[example_count_cfg], model_names=[''])
         ])

     # The value 2 satisfies the lower bound of 1.
     metrics_for_slice = {
         metric_types.MetricKey(name='weighted_example_count'): 2,
     }
     validation = metrics_validator.validate_metrics(
         (slice_key, metrics_for_slice), eval_config)
     self.assertTrue(validation.validation_ok)
 def testValidateMetricsCrossSliceThresholdFail(self, cross_slicing_specs,
                                                slice_key):
     """Expects validation to fail when a cross-slice upper bound is exceeded.

     WeightedExampleCount gets an upper bound of 1 through
     cross_slice_thresholds and the reported value is 1.5, so the test asserts
     that validation_ok is False.

     Args:
       cross_slicing_specs: Cross-slicing specs for the eval config and the
         cross-slice threshold; when None the bound is also set as the global
         threshold.
       slice_key: Slice key the metric value is reported under.
     """
     upper_bound_one = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             upper_bound={'value': 1}))
     global_threshold = None
     if cross_slicing_specs is None:
         global_threshold = upper_bound_one
     example_count_cfg = config.MetricConfig(
         class_name='WeightedExampleCount',
         threshold=global_threshold,
         cross_slice_thresholds=[
             config.CrossSliceMetricThreshold(
                 cross_slicing_specs=cross_slicing_specs,
                 threshold=upper_bound_one)
         ])
     eval_config = config.EvalConfig(
         model_specs=[config.ModelSpec()],
         cross_slicing_specs=cross_slicing_specs,
         metrics_specs=[
             config.MetricsSpec(
                 metrics=[example_count_cfg], model_names=[''])
         ])

     # The value 1.5 violates the upper bound of 1.
     metrics_for_slice = {
         metric_types.MetricKey(name='weighted_example_count'): 1.5,
     }
     validation = metrics_validator.validate_metrics(
         (slice_key, metrics_for_slice), eval_config)
     self.assertFalse(validation.validation_ok)
Beispiel #7
0
def _serialize_tf_metric(
    metric: tf.keras.metrics.Metric) -> config.MetricConfig:
  """Serializes a TF metric into a MetricConfig.

  Args:
    metric: Keras metric passed to metric_util.serialize_metric.

  Returns:
    A MetricConfig holding the serialized class name and the config encoded
    as JSON with sorted keys.
  """
  serialized = metric_util.serialize_metric(metric)
  class_name = serialized['class_name']
  # Keys are sorted so the JSON text is stable for a given config.
  config_json = json.dumps(serialized['config'], sort_keys=True)
  return config.MetricConfig(class_name=class_name, config=config_json)
 def testValidateMetricsChangeThresholdHigherIsBetterFail(self):
     """Expects failure for a negative diff under HIGHER_IS_BETTER.

     MeanPrediction carries an absolute change threshold of 0 with direction
     HIGHER_IS_BETTER. The reported diff is -0.333, and the test asserts that
     validation_ok is False.
     """
     change_threshold = config.GenericChangeThreshold(
         direction=config.MetricDirection.HIGHER_IS_BETTER,
         absolute={'value': 0})
     mean_prediction_cfg = config.MetricConfig(
         class_name='MeanPrediction',
         threshold=config.MetricThreshold(change_threshold=change_threshold))
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
             config.ModelSpec(name='baseline', is_baseline=True),
         ],
         slicing_specs=[config.SlicingSpec()],
         metrics_specs=[
             config.MetricsSpec(
                 metrics=[mean_prediction_cfg], model_names=[''])
         ])

     # The diff of -0.333 is not above the required change of 0.
     overall_metrics = {
         metric_types.MetricKey(name='mean_prediction',
                                model_name='baseline'): 0.333,
         metric_types.MetricKey(name='mean_prediction', is_diff=True): -0.333,
     }
     validation = metrics_validator.validate_metrics(
         ((), overall_metrics), eval_config)
     self.assertFalse(validation.validation_ok)
 def testValidateMetricsChangeThresholdRelativePass(self):
     """Expects success for a negative relative diff under LOWER_IS_BETTER.

     MeanPrediction carries a relative change threshold of 0 with direction
     LOWER_IS_BETTER. The baseline value is 0.333 and the diff is -0.333, and
     the test asserts that validation_ok is True.
     """
     change_threshold = config.GenericChangeThreshold(
         direction=config.MetricDirection.LOWER_IS_BETTER,
         relative={'value': 0})
     mean_prediction_cfg = config.MetricConfig(
         class_name='MeanPrediction',
         threshold=config.MetricThreshold(change_threshold=change_threshold))
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
             config.ModelSpec(name='baseline', is_baseline=True),
         ],
         slicing_specs=[config.SlicingSpec()],
         metrics_specs=[
             config.MetricsSpec(
                 metrics=[mean_prediction_cfg], model_names=[''])
         ])

     # Relative diff is -0.333 / 0.333 = -100%, which is below the 0% limit.
     overall_metrics = {
         metric_types.MetricKey(name='mean_prediction',
                                model_name='baseline'): 0.333,
         metric_types.MetricKey(name='mean_prediction', is_diff=True): -0.333,
     }
     validation = metrics_validator.validate_metrics(
         ((), overall_metrics), eval_config)
     self.assertTrue(validation.validation_ok)
 def testValidateMetricsValueThresholdLowerBoundPass(self):
     """Expects validation to pass when the value is above a lower bound.

     WeightedExampleCount carries a lower bound of 1 and the reported value
     is 2, so the test asserts that validation_ok is True.
     """
     lower_bound_one = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             lower_bound={'value': 1}))
     example_count_cfg = config.MetricConfig(
         class_name='WeightedExampleCount', threshold=lower_bound_one)
     eval_config = config.EvalConfig(
         model_specs=[config.ModelSpec()],
         slicing_specs=[config.SlicingSpec()],
         metrics_specs=[
             config.MetricsSpec(
                 metrics=[example_count_cfg], model_names=[''])
         ])

     # The value 2 satisfies the lower bound of 1.
     overall_metrics = {
         metric_types.MetricKey(name='weighted_example_count'): 2,
     }
     validation = metrics_validator.validate_metrics(
         ((), overall_metrics), eval_config)
     self.assertTrue(validation.validation_ok)
 def testValidateMetricsMetricTDistributionValueAndThreshold(
         self, slicing_specs, slice_key):
     """Checks the failure reported for a t-distribution AUC value.

     AUC carries a lower bound of 0.9 while the metric is a
     ValueWithTDistribution with sample_mean 0.91 and unsampled_value 0.8.
     The test asserts that validation fails and that the full result matches
     an expected proto reporting 0.8 as the failing value.

     Args:
       slicing_specs: Slicing specs for the eval config and the per-slice
         threshold; when None the bound is also set as the global threshold.
       slice_key: Slice key the metric value is reported under.
     """
     threshold = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             lower_bound={'value': 0.9}))
     global_threshold = None
     if slicing_specs is None:
         global_threshold = threshold
     auc_cfg = config.MetricConfig(
         class_name='AUC',
         threshold=global_threshold,
         per_slice_thresholds=[
             config.PerSliceMetricThreshold(
                 slicing_specs=slicing_specs, threshold=threshold)
         ])
     eval_config = config.EvalConfig(
         model_specs=[config.ModelSpec()],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(metrics=[auc_cfg], model_names=[''])
         ])

     metrics_for_slice = {
         metric_types.MetricKey(name='auc'):
             types.ValueWithTDistribution(
                 sample_mean=0.91, unsampled_value=0.8)
     }
     result = metrics_validator.validate_metrics(
         (slice_key, metrics_for_slice), eval_config)
     self.assertFalse(result.validation_ok)

     # Start from the static part of the expected result; the threshold, the
     # slice key and the slicing details are filled in below.
     expected_text = """
     metric_validations_per_slice {
       failures {
         metric_key {
           name: "auc"
         }
         metric_value {
           double_value {
             value: 0.8
           }
         }
       }
     }"""
     expected = text_format.Parse(
         expected_text, validation_result_pb2.ValidationResult())
     per_slice = expected.metric_validations_per_slice[0]
     per_slice.failures[0].metric_threshold.CopyFrom(threshold)
     per_slice.slice_key.CopyFrom(slicer.serialize_slice_key(slice_key))

     # One slicing_details entry per spec that applies to slice_key; with no
     # specs a single entry with an empty SlicingSpec is expected.
     for spec in slicing_specs or [None]:
         applies = spec is None or slicer.SingleSliceSpec(
             spec=spec).is_slice_applicable(slice_key)
         if not applies:
             continue
         details = expected.validation_details.slicing_details.add()
         details.slicing_spec.CopyFrom(
             config.SlicingSpec() if spec is None else spec)
         details.num_matching_slices = 1
     self.assertEqual(result, expected)
Beispiel #12
0
def _serialize_tf_loss(loss: tf.keras.losses.Loss) -> config.MetricConfig:
  """Serializes a TF loss into a MetricConfig.

  Args:
    loss: Keras loss passed to metric_util.serialize_loss.

  Returns:
    A MetricConfig holding the serialized class name, the module of the
    loss's class, and the config encoded as JSON with sorted keys.
  """
  serialized = metric_util.serialize_loss(loss)
  class_name = serialized['class_name']
  # The module is recorded alongside the class name.
  module_name = loss.__class__.__module__
  config_json = json.dumps(serialized['config'], sort_keys=True)
  return config.MetricConfig(
      class_name=class_name, module=module_name, config=config_json)
def _serialize_tfma_metric(metric: metric_types.Metric) -> config.MetricConfig:
    """Serializes a TFMA metric into a MetricConfig.

    Args:
      metric: Metric passed to tf.keras.utils.serialize_keras_object.

    Returns:
      A MetricConfig holding the serialized class name and the JSON-encoded
      config.
    """
    # Deliberately kept as its own function, apart from the TF metric
    # serializer, to mirror deserialization where separate implementations
    # are required (and to stay consistent with the keras implementation).
    serialized = tf.keras.utils.serialize_keras_object(metric)
    class_name = serialized['class_name']
    config_json = json.dumps(serialized['config'])
    return config.MetricConfig(class_name=class_name, config=config_json)
    def testGetMissingSlices(self):
        """Checks which slicing specs are reported as missing.

        Three slicing specs are configured (overall, feature1=value1 and
        feature2=value2) but metrics are supplied only for the
        feature1=value1 slice. The test asserts the validation result and
        then that get_missing_slices returns the first and third specs.
        """
        slicing_specs = [
            config.SlicingSpec(),
            config.SlicingSpec(feature_values={'feature1': 'value1'}),
            config.SlicingSpec(feature_values={'feature2': 'value2'}),
        ]
        upper_bound_one = config.MetricThreshold(
            value_threshold=config.GenericValueThreshold(
                upper_bound={'value': 1}))
        example_count_cfg = config.MetricConfig(
            class_name='WeightedExampleCount',
            per_slice_thresholds=[
                config.PerSliceMetricThreshold(
                    slicing_specs=slicing_specs, threshold=upper_bound_one)
            ])
        eval_config = config.EvalConfig(
            model_specs=[config.ModelSpec()],
            slicing_specs=slicing_specs,
            metrics_specs=[
                config.MetricsSpec(
                    metrics=[example_count_cfg], model_names=[''])
            ])

        # Only the feature1=value1 slice has metrics; 0 is within the bound.
        feature1_key = (('feature1', 'value1'), )
        feature1_metrics = {
            metric_types.MetricKey(name='weighted_example_count'): 0,
        }
        result = metrics_validator.validate_metrics(
            (feature1_key, feature1_metrics), eval_config)

        expected_text = """
        validation_ok: true
        validation_details {
          slicing_details {
            slicing_spec {
              feature_values {
                key: "feature1"
                value: "value1"
              }
            }
            num_matching_slices: 1
          }
        }"""
        expected_checks = text_format.Parse(
            expected_text, validation_result_pb2.ValidationResult())
        self.assertProtoEquals(expected_checks, result)

        missing_specs = metrics_validator.get_missing_slices(
            result.validation_details.slicing_details, eval_config)
        self.assertLen(missing_specs, 2)
        self.assertProtoEquals(missing_specs[0], slicing_specs[0])
        self.assertProtoEquals(missing_specs[1], slicing_specs[2])
 def testValidateMetricsMetricValueAndThreshold(self):
     """Checks the failure details for a value above an upper bound.

     WeightedExampleCount carries an upper bound of 1 and the reported value
     is 1.5. The test asserts that validation fails and that the result
     equals an expected proto holding the metric key, threshold and value.
     """
     upper_bound_one = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             upper_bound={'value': 1}))
     example_count_cfg = config.MetricConfig(
         class_name='WeightedExampleCount', threshold=upper_bound_one)
     eval_config = config.EvalConfig(
         model_specs=[config.ModelSpec()],
         slicing_specs=[config.SlicingSpec()],
         metrics_specs=[
             config.MetricsSpec(
                 metrics=[example_count_cfg], model_names=[''])
         ])

     # The value 1.5 violates the upper bound of 1.
     overall_metrics = {
         metric_types.MetricKey(name='weighted_example_count'): 1.5,
     }
     result = metrics_validator.validate_metrics(
         ((), overall_metrics), eval_config)
     self.assertFalse(result.validation_ok)

     expected_text = """
     metric_validations_per_slice {
       slice_key {
       }
       failures {
         metric_key {
           name: "weighted_example_count"
         }
         metric_threshold {
           value_threshold {
             upper_bound {
               value: 1.0
             }
           }
         }
         metric_value {
           double_value {
             value: 1.5
           }
         }
       }
     }"""
     expected = text_format.Parse(
         expected_text, validation_result_pb2.ValidationResult())
     self.assertEqual(result, expected)
  def testMetricsSpecBeamCounter(self):
    """Checks the Beam counter recorded for a FairnessIndicators spec.

    Applies counter_util.IncrementMetricsSpecsCounters to a pipeline with a
    single FairnessIndicators metrics spec, runs the pipeline, and asserts
    that the committed value of the first counter named
    'metric_computed_FairnessIndicators_v2' in the TFMA metrics namespace
    is 1.
    """
    fairness_spec = config.MetricsSpec(
        metrics=[config.MetricConfig(class_name='FairnessIndicators')])
    with beam.Pipeline() as pipeline:
      _ = pipeline | counter_util.IncrementMetricsSpecsCounters(
          [fairness_spec])

    run_result = pipeline.run()

    counter_filter = beam.metrics.metric.MetricsFilter()
    counter_filter = counter_filter.with_namespace(
        constants.METRICS_NAMESPACE)
    counter_filter = counter_filter.with_name(
        'metric_computed_FairnessIndicators_v2')
    matching_counters = run_result.metrics().query(
        filter=counter_filter)['counters']

    self.assertEqual(matching_counters[0].committed, 1)
 def testValidateMetricsMetricValueAndThreshold(self, slicing_specs,
                                                slice_key):
   """Checks the failure details for a value above an upper bound.

   WeightedExampleCount carries an upper bound of 1 and the reported value is
   1.5. The test asserts that validation fails and that the result equals an
   expected proto carrying the threshold and the serialized slice key.

   Args:
     slicing_specs: Slicing specs for the eval config and the per-slice
       threshold; when None the bound is also set as the global threshold.
     slice_key: Slice key the metric value is reported under.
   """
   threshold = config.MetricThreshold(
       value_threshold=config.GenericValueThreshold(upper_bound={'value': 1}))
   global_threshold = None
   if slicing_specs is None:
     global_threshold = threshold
   example_count_cfg = config.MetricConfig(
       class_name='WeightedExampleCount',
       threshold=global_threshold,
       per_slice_thresholds=[
           config.PerSliceMetricThreshold(
               slicing_specs=slicing_specs, threshold=threshold)
       ])
   eval_config = config.EvalConfig(
       model_specs=[config.ModelSpec()],
       slicing_specs=slicing_specs,
       metrics_specs=[
           config.MetricsSpec(metrics=[example_count_cfg], model_names=[''])
       ])

   # The value 1.5 violates the upper bound of 1.
   metrics_for_slice = {
       metric_types.MetricKey(name='weighted_example_count'): 1.5,
   }
   result = metrics_validator.validate_metrics(
       (slice_key, metrics_for_slice), eval_config)
   self.assertFalse(result.validation_ok)

   # The threshold and slice key depend on the inputs, so they are copied in
   # after parsing the static part of the expected result.
   expected_text = """
       metric_validations_per_slice {
         failures {
           metric_key {
             name: "weighted_example_count"
           }
           metric_value {
             double_value {
               value: 1.5
             }
           }
         }
       }"""
   expected = text_format.Parse(
       expected_text, validation_result_pb2.ValidationResult())
   per_slice = expected.metric_validations_per_slice[0]
   per_slice.failures[0].metric_threshold.CopyFrom(threshold)
   per_slice.slice_key.CopyFrom(slicer.serialize_slice_key(slice_key))
   self.assertEqual(result, expected)
 def testValidateMetricsChangeThresholdRelativeFail(self, slicing_specs,
                                                    slice_key):
     """Expects failure when the relative diff misses a LOWER_IS_BETTER limit.

     MeanPrediction carries a relative change threshold of -2 with direction
     LOWER_IS_BETTER. The baseline value is 0.333 and the diff is -0.333, and
     the test asserts that validation_ok is False.

     Args:
       slicing_specs: Slicing specs for the eval config and the per-slice
         threshold; when None the threshold is also set globally.
       slice_key: Slice key the metric values are reported under.
     """
     threshold = config.MetricThreshold(
         change_threshold=config.GenericChangeThreshold(
             direction=config.MetricDirection.LOWER_IS_BETTER,
             relative={'value': -2}))
     global_threshold = None
     if slicing_specs is None:
         global_threshold = threshold
     mean_prediction_cfg = config.MetricConfig(
         class_name='MeanPrediction',
         threshold=global_threshold,
         per_slice_thresholds=[
             config.PerSliceMetricThreshold(
                 slicing_specs=slicing_specs, threshold=threshold)
         ])
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
             config.ModelSpec(name='baseline', is_baseline=True),
         ],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(
                 metrics=[mean_prediction_cfg], model_names=[''])
         ])

     # Relative diff is -0.333 / 0.333 = -100%, which is not below -200%.
     metrics_for_slice = {
         metric_types.MetricKey(name='mean_prediction',
                                model_name='baseline'): 0.333,
         metric_types.MetricKey(name='mean_prediction', is_diff=True): -0.333,
     }
     validation = metrics_validator.validate_metrics(
         (slice_key, metrics_for_slice), eval_config)
     self.assertFalse(validation.validation_ok)
 def testValidateMetricsChangeThresholdHigherIsBetterPass(
         self, slicing_specs, slice_key):
     """Expects success when the diff clears a HIGHER_IS_BETTER limit.

     MeanPrediction carries an absolute change threshold of -1 with direction
     HIGHER_IS_BETTER. The reported diff is -0.333, and the test asserts that
     validation_ok is True.

     Args:
       slicing_specs: Slicing specs for the eval config and the per-slice
         threshold; when None the threshold is also set globally.
       slice_key: Slice key the metric values are reported under.
     """
     threshold = config.MetricThreshold(
         change_threshold=config.GenericChangeThreshold(
             direction=config.MetricDirection.HIGHER_IS_BETTER,
             absolute={'value': -1}))
     global_threshold = None
     if slicing_specs is None:
         global_threshold = threshold
     mean_prediction_cfg = config.MetricConfig(
         class_name='MeanPrediction',
         threshold=global_threshold,
         per_slice_thresholds=[
             config.PerSliceMetricThreshold(
                 slicing_specs=slicing_specs, threshold=threshold)
         ])
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
             config.ModelSpec(name='baseline', is_baseline=True),
         ],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(
                 metrics=[mean_prediction_cfg], model_names=[''])
         ])

     # The diff of -0.333 is above the required change of -1.
     metrics_for_slice = {
         metric_types.MetricKey(name='mean_prediction',
                                model_name='baseline'): 0.333,
         metric_types.MetricKey(name='mean_prediction', is_diff=True): -0.333,
     }
     validation = metrics_validator.validate_metrics(
         (slice_key, metrics_for_slice), eval_config)
     self.assertTrue(validation.validation_ok)
Beispiel #20
0
 def testValidateMetricsDivByZero(self):
   """Expects validation to fail when the baseline metric value is 0.0.

   MeanPrediction carries a relative change threshold of 0.1 with direction
   HIGHER_IS_BETTER. The baseline value is 0.0 and the candidate diff is 0.1,
   and the test asserts that validation_ok is False. Judging by the test
   name, this presumably guards the relative-change computation against
   dividing by a zero baseline.
   """
   threshold = config.MetricThreshold(
       change_threshold=config.GenericChangeThreshold(
           direction=config.MetricDirection.HIGHER_IS_BETTER,
           relative={'value': 0.1}))
   slicing_specs = [config.SlicingSpec()]
   # slicing_specs is a non-None list here, so the global threshold stays
   # None and only the per-slice threshold is in effect.
   global_threshold = None
   if slicing_specs is None:
     global_threshold = threshold
   mean_prediction_cfg = config.MetricConfig(
       class_name='MeanPrediction',
       threshold=global_threshold,
       per_slice_thresholds=[
           config.PerSliceMetricThreshold(
               slicing_specs=slicing_specs, threshold=threshold)
       ])
   eval_config = config.EvalConfig(
       model_specs=[
           config.ModelSpec(name='candidate'),
           config.ModelSpec(name='baseline', is_baseline=True),
       ],
       slicing_specs=slicing_specs,
       metrics_specs=[
           config.MetricsSpec(
               metrics=[mean_prediction_cfg],
               model_names=['baseline', 'candidate'])
       ])

   overall_metrics = {
       metric_types.MetricKey(name='mean_prediction', model_name='baseline'):
           0.0,
       metric_types.MetricKey(
           name='mean_prediction', model_name='candidate', is_diff=True):
           0.1,
   }
   validation = metrics_validator.validate_metrics(
       ((), overall_metrics), eval_config)
   self.assertFalse(validation.validation_ok)
    def testSpecsFromMetrics(self):
        """Checks the specs produced from a dict of per-output metrics.

        Calls metric_specs.specs_from_metrics with two outputs (two metrics
        each), two model names, binarization over class ids 0 and 1 and macro
        averaging. Asserts that five specs come back: an ExampleCount spec,
        then for each output a WeightedExampleCount spec followed by a spec
        holding that output's metrics with the binarize/aggregate options.
        """
        metrics_specs = metric_specs.specs_from_metrics(
            {
                'output_name1': [
                    tf.keras.metrics.MeanSquaredError('mse'),
                    calibration.MeanLabel('mean_label')
                ],
                'output_name2': [
                    tf.keras.metrics.RootMeanSquaredError('rmse'),
                    calibration.MeanPrediction('mean_prediction')
                ]
            },
            model_names=['model_name1', 'model_name2'],
            binarize=config.BinarizationOptions(class_ids=[0, 1]),
            aggregate=config.AggregationOptions(macro_average=True))

        self.assertLen(metrics_specs, 5)

        def weighted_count_spec(output_name):
            # Expected WeightedExampleCount spec for a single output.
            return config.MetricsSpec(
                metrics=[
                    config.MetricConfig(
                        class_name='WeightedExampleCount',
                        config=json.dumps(
                            {'name': 'weighted_example_count'})),
                ],
                model_names=['model_name1', 'model_name2'],
                output_names=[output_name])

        def output_metrics_spec(output_name, metric_configs):
            # Expected spec for the metrics supplied for a single output.
            return config.MetricsSpec(
                metrics=metric_configs,
                model_names=['model_name1', 'model_name2'],
                output_names=[output_name],
                binarize=config.BinarizationOptions(class_ids=[0, 1]),
                aggregate=config.AggregationOptions(macro_average=True))

        expected_specs = [
            config.MetricsSpec(metrics=[
                config.MetricConfig(
                    class_name='ExampleCount',
                    config=json.dumps({'name': 'example_count'})),
            ]),
            weighted_count_spec('output_name1'),
            output_metrics_spec('output_name1', [
                config.MetricConfig(
                    class_name='MeanSquaredError',
                    config=json.dumps({
                        'name': 'mse',
                        'dtype': 'float32'
                    })),
                config.MetricConfig(
                    class_name='MeanLabel',
                    config=json.dumps({'name': 'mean_label'})),
            ]),
            weighted_count_spec('output_name2'),
            output_metrics_spec('output_name2', [
                config.MetricConfig(
                    class_name='RootMeanSquaredError',
                    config=json.dumps({
                        'name': 'rmse',
                        'dtype': 'float32'
                    })),
                config.MetricConfig(
                    class_name='MeanPrediction',
                    config=json.dumps({'name': 'mean_prediction'})),
            ]),
        ]
        for position, expected_spec in enumerate(expected_specs):
            self.assertProtoEquals(metrics_specs[position], expected_spec)
def _serialize_tf_metric(
        metric: tf.keras.metrics.Metric) -> config.MetricConfig:
    """Builds a MetricConfig from a Keras metric.

    The metric is passed through `tf.keras.metrics.serialize`; the resulting
    'class_name' entry becomes the config's class name and the 'config'
    entry is stored as a JSON string.

    Args:
        metric: Keras metric instance to serialize.

    Returns:
        A `config.MetricConfig` with `class_name` and `config` populated.
    """
    serialized = tf.keras.metrics.serialize(metric)
    class_name = serialized['class_name']
    settings_json = json.dumps(serialized['config'])
    return config.MetricConfig(class_name=class_name, config=settings_json)
Beispiel #23
0
    def testSpecsFromMetrics(self):
        """Verifies specs_from_metrics for metrics keyed by output name.

        Each output is given a Keras metric, a Keras loss and a TFMA metric.
        Five specs are expected back: one holding ExampleCount, then for each
        output a WeightedExampleCount spec followed by a spec holding the
        three requested metrics with the binarize/aggregate options applied.
        """
        metrics_by_output = {
            'output_name1': [
                tf.keras.metrics.MeanSquaredError('mse'),
                tf.keras.losses.MeanAbsoluteError(name='mae'),
                calibration.MeanLabel('mean_label'),
            ],
            'output_name2': [
                tf.keras.metrics.RootMeanSquaredError('rmse'),
                tf.keras.losses.MeanAbsolutePercentageError(name='mape'),
                calibration.MeanPrediction('mean_prediction'),
            ],
        }
        metrics_specs = metric_specs.specs_from_metrics(
            metrics_by_output,
            model_names=['model_name1', 'model_name2'],
            binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
            aggregate=config.AggregationOptions(macro_average=True))

        expected_model_names = ['model_name1', 'model_name2']

        def sorted_json(**settings):
            # Keras-derived configs are compared using key-sorted JSON.
            return json.dumps(settings, sort_keys=True)

        def weighted_example_count_spec(output_name):
            # Spec that only carries WeightedExampleCount for one output.
            return config.MetricsSpec(
                metrics=[
                    config.MetricConfig(
                        class_name='WeightedExampleCount',
                        config=json.dumps(
                            {'name': 'weighted_example_count'})),
                ],
                model_names=expected_model_names,
                output_names=[output_name])

        def output_metrics_spec(output_name, metric_configs):
            # Spec carrying the user-requested metrics for one output.
            return config.MetricsSpec(
                metrics=metric_configs,
                model_names=expected_model_names,
                output_names=[output_name],
                binarize=config.BinarizationOptions(
                    class_ids={'values': [0, 1]}),
                aggregate=config.AggregationOptions(macro_average=True))

        expected_specs = [
            config.MetricsSpec(metrics=[
                config.MetricConfig(
                    class_name='ExampleCount',
                    config=json.dumps({'name': 'example_count'})),
            ]),
            weighted_example_count_spec('output_name1'),
            output_metrics_spec('output_name1', [
                config.MetricConfig(
                    class_name='MeanSquaredError',
                    config=sorted_json(name='mse', dtype='float32')),
                config.MetricConfig(
                    class_name='MeanAbsoluteError',
                    module=metric_specs._TF_LOSSES_MODULE,
                    config=sorted_json(reduction='auto', name='mae')),
                config.MetricConfig(
                    class_name='MeanLabel',
                    config=json.dumps({'name': 'mean_label'})),
            ]),
            weighted_example_count_spec('output_name2'),
            output_metrics_spec('output_name2', [
                config.MetricConfig(
                    class_name='RootMeanSquaredError',
                    config=sorted_json(name='rmse', dtype='float32')),
                config.MetricConfig(
                    class_name='MeanAbsolutePercentageError',
                    module=metric_specs._TF_LOSSES_MODULE,
                    config=sorted_json(reduction='auto', name='mape')),
                config.MetricConfig(
                    class_name='MeanPrediction',
                    config=json.dumps({'name': 'mean_prediction'})),
            ]),
        ]

        self.assertLen(metrics_specs, 5)
        for actual_spec, expected_spec in zip(metrics_specs, expected_specs):
            self.assertProtoEquals(actual_spec, expected_spec)
Beispiel #24
0
 def testMetricThresholdsFromMetricsSpecs(self):
     """Checks the keys returned by metric_thresholds_from_metrics_specs.

     Thresholds are declared both directly on a MetricsSpec (by metric name)
     and on individual MetricConfig entries. The result is expected to hold
     exactly fourteen metric keys, each of which is asserted below.
     """
     value_only = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold())
     change_only = config.MetricThreshold(
         change_threshold=config.GenericChangeThreshold())
     value_and_change = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(),
         change_threshold=config.GenericChangeThreshold())

     spec_level_thresholds = config.MetricsSpec(
         thresholds={
             'auc': value_only,
             'mean/label': value_and_change,
             # The mse metric will be overridden by MetricConfig below.
             'mse': change_only,
         },
         model_names=['model_name'],
         output_names=['output_name'])
     # Model names and output_names should be ignored because
     # ExampleCount is model independent.
     example_count_spec = config.MetricsSpec(
         metrics=[
             config.MetricConfig(
                 class_name='ExampleCount',
                 config=json.dumps({'name': 'example_count'}),
                 threshold=value_only),
         ],
         model_names=['model_name1', 'model_name2'],
         output_names=['output_name1', 'output_name2'])
     weighted_example_count_spec = config.MetricsSpec(
         metrics=[
             config.MetricConfig(
                 class_name='WeightedExampleCount',
                 config=json.dumps({'name': 'weighted_example_count'}),
                 threshold=value_only),
         ],
         model_names=['model_name1', 'model_name2'],
         output_names=['output_name1', 'output_name2'])
     binarized_spec = config.MetricsSpec(
         metrics=[
             config.MetricConfig(
                 class_name='MeanSquaredError',
                 config=json.dumps({'name': 'mse'}),
                 threshold=change_only),
             config.MetricConfig(
                 class_name='MeanLabel',
                 config=json.dumps({'name': 'mean_label'}),
                 threshold=change_only),
         ],
         model_names=['model_name'],
         output_names=['output_name'],
         binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
         aggregate=config.AggregationOptions(macro_average=True))
     metrics_specs = [
         spec_level_thresholds,
         example_count_spec,
         weighted_example_count_spec,
         binarized_spec,
     ]

     thresholds = metric_specs.metric_thresholds_from_metrics_specs(
         metrics_specs)

     def single_model_key(name, **extra):
         # Key for the specs that use model_name / output_name.
         return metric_types.MetricKey(name=name,
                                       model_name='model_name',
                                       output_name='output_name',
                                       **extra)

     expected_keys = [
         single_model_key('auc'),
         single_model_key('mean/label', is_diff=True),
         single_model_key('mean/label', is_diff=False),
         metric_types.MetricKey(name='example_count'),
     ]
     # One weighted_example_count key per (model, output) combination.
     expected_keys += [
         metric_types.MetricKey(name='weighted_example_count',
                                model_name=model_name,
                                output_name=output_name)
         for model_name in ('model_name1', 'model_name2')
         for output_name in ('output_name1', 'output_name2')
     ]
     # The binarized spec yields one key per class id plus one without a
     # sub key, all of them diff keys.
     for metric_name in ('mse', 'mean_label'):
         expected_keys += [
             single_model_key(metric_name,
                              sub_key=metric_types.SubKey(class_id=0),
                              is_diff=True),
             single_model_key(metric_name,
                              sub_key=metric_types.SubKey(class_id=1),
                              is_diff=True),
             single_model_key(metric_name, is_diff=True),
         ]

     self.assertLen(thresholds, 14)
     for expected_key in expected_keys:
         self.assertIn(expected_key, thresholds)
  def testWriteValidationResults(self):
    """Runs an evaluation with thresholds and checks the written failures.

    A candidate and a baseline Keras model are evaluated on two examples.
    Four metrics carry thresholds; the validation result loaded back from
    disk is expected to be not OK and to list three failures
    (weighted_example_count, example_count and the mean_label diff).
    """
    candidate_dir, baseline_dir = self._getExportDir(), self._getBaselineDir()
    candidate_model = self._build_keras_model(candidate_dir, mul=0)
    baseline_model = self._build_keras_model(baseline_dir, mul=1)
    validations_file = os.path.join(self._getTempDir(),
                                    constants.VALIDATIONS_KEY)
    # (input, label, example_weight) for each example.
    examples = [
        self._makeExample(
            input=feature,
            label=label,
            example_weight=weight,
            extra_feature='non_model_feature')
        for feature, label, weight in ((0.0, 1.0, 1.0), (1.0, 0.0, 0.5))
    ]

    model_specs = [
        config.ModelSpec(
            name='candidate',
            label_key='label',
            example_weight_key='example_weight'),
        config.ModelSpec(
            name='baseline',
            label_key='label',
            example_weight_key='example_weight',
            is_baseline=True),
    ]

    # Expected value 1.5 is above the upper bound of 1: NOT OK.
    weighted_example_count = config.MetricConfig(
        class_name='WeightedExampleCount',
        threshold=config.MetricThreshold(
            value_threshold=config.GenericValueThreshold(
                upper_bound={'value': 1})))
    # Expected value 2 is below the lower bound of 10: NOT OK.
    example_count = config.MetricConfig(
        class_name='ExampleCount',
        threshold=config.MetricThreshold(
            value_threshold=config.GenericValueThreshold(
                lower_bound={'value': 10})))
    # 0 > 0 and 0 > 0%?: NOT OK.
    mean_label = config.MetricConfig(
        class_name='MeanLabel',
        threshold=config.MetricThreshold(
            change_threshold=config.GenericChangeThreshold(
                direction=config.MetricDirection.HIGHER_IS_BETTER,
                relative={'value': 0},
                absolute={'value': 0})))
    # MeanPrediction = (0+0)/(1+0.5) = 0
    # -.01 < 0 < .01, OK.
    # Diff% = -.333/.333 = -100% < -99%, OK.
    # Diff = 0 - .333 = -.333 < 0, OK.
    mean_prediction = config.MetricConfig(
        class_name='MeanPrediction',
        threshold=config.MetricThreshold(
            value_threshold=config.GenericValueThreshold(
                upper_bound={'value': .01}, lower_bound={'value': -.01}),
            change_threshold=config.GenericChangeThreshold(
                direction=config.MetricDirection.LOWER_IS_BETTER,
                relative={'value': -.99},
                absolute={'value': 0})))

    eval_config = config.EvalConfig(
        model_specs=model_specs,
        slicing_specs=[config.SlicingSpec()],
        metrics_specs=[
            config.MetricsSpec(
                metrics=[
                    weighted_example_count,
                    example_count,
                    mean_label,
                    mean_prediction,
                ],
                model_names=['candidate', 'baseline']),
        ],
        options=config.Options(
            disabled_outputs={'values': ['eval_config.json']}),
    )
    slice_spec = [
        slicer.SingleSliceSpec(spec=spec) for spec in eval_config.slicing_specs
    ]
    models_by_name = {
        'candidate': candidate_model,
        'baseline': baseline_model,
    }
    extractors = [
        input_extractor.InputExtractor(eval_config),
        predict_extractor_v2.PredictExtractor(
            eval_shared_model=models_by_name, eval_config=eval_config),
        slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec),
    ]
    evaluators = [
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=eval_config, eval_shared_model=models_by_name),
    ]
    # Only the validations output is given a path.
    writers = [
        metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
            {constants.VALIDATIONS_KEY: validations_file},
            add_metrics_callbacks=[]),
    ]

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      serialized_examples = (
          pipeline
          | 'Create' >> beam.Create([e.SerializeToString() for e in examples]))
      _ = (
          serialized_examples
          | 'ExtractEvaluateAndWriteResults' >>
          model_eval_lib.ExtractEvaluateAndWriteResults(
              eval_config=eval_config,
              eval_shared_model=candidate_model,
              extractors=extractors,
              evaluators=evaluators,
              writers=writers))
      # pylint: enable=no-value-for-parameter

    validation_result = model_eval_lib.load_validation_result(
        os.path.dirname(validations_file))

    # Text-format ValidationFailure protos, one per expected failure.
    expected_failure_texts = (
        """
            metric_key {
              name: "weighted_example_count"
              model_name: "candidate"
            }
            metric_threshold {
              value_threshold {
                upper_bound {
                  value: 1.0
                }
              }
            }
            metric_value {
              double_value {
                value: 1.5
              }
            }
            """,
        """
            metric_key {
              name: "example_count"
            }
            metric_threshold {
              value_threshold {
                lower_bound {
                  value: 10.0
                }
              }
            }
            metric_value {
              double_value {
                value: 2.0
              }
            }
            """,
        """
            metric_key {
              name: "mean_label"
              model_name: "candidate"
              is_diff: true
            }
            metric_threshold {
              change_threshold {
                absolute {
                  value: 0.0
                }
                relative {
                  value: 0.0
                }
                direction: HIGHER_IS_BETTER
              }
            }
            metric_value {
              double_value {
                value: 0.0
              }
            }
            """,
    )
    expected_validations = [
        text_format.Parse(text, validation_result_pb2.ValidationFailure())
        for text in expected_failure_texts
    ]

    self.assertFalse(validation_result.validation_ok)
    per_slice_validations = validation_result.metric_validations_per_slice
    self.assertLen(per_slice_validations, 1)
    self.assertCountEqual(expected_validations,
                          per_slice_validations[0].failures)
Beispiel #26
0
    def testMetricThresholdsFromMetricsSpecs(self):
        """Checks keys and threshold counts for spec, slice and cross-slice.

        Thresholds are declared on MetricsSpec (plain, per-slice and
        cross-slice maps) and on MetricConfig entries. For every expected
        metric key the number of thresholds returned by
        metric_thresholds_from_metrics_specs is compared with a fixed count.
        """
        slice_specs = [
            config.SlicingSpec(feature_keys=['feature1']),
            config.SlicingSpec(feature_values={'feature2': 'value1'}),
        ]
        # For cross slice tests.
        baseline_slice_spec = config.SlicingSpec(feature_keys=['feature3'])

        value_only = config.MetricThreshold(
            value_threshold=config.GenericValueThreshold())
        change_only = config.MetricThreshold(
            change_threshold=config.GenericChangeThreshold())
        value_and_change = config.MetricThreshold(
            value_threshold=config.GenericValueThreshold(),
            change_threshold=config.GenericChangeThreshold())

        def per_slice(threshold):
            # Per-slice threshold bound to the shared slice specs.
            return config.PerSliceMetricThreshold(slicing_specs=slice_specs,
                                                  threshold=threshold)

        def cross_slice(threshold):
            # Cross-slice threshold comparing slice_specs to the baseline.
            return config.CrossSliceMetricThreshold(
                cross_slicing_specs=[
                    config.CrossSlicingSpec(
                        baseline_spec=baseline_slice_spec,
                        slicing_specs=slice_specs),
                ],
                threshold=threshold)

        multi_model_names = ['model_name1', 'model_name2']
        multi_output_names = ['output_name1', 'output_name2']

        spec_level_thresholds = config.MetricsSpec(
            thresholds={
                'auc': value_only,
                'mean/label': value_and_change,
                'mse': change_only,
            },
            per_slice_thresholds={
                'auc':
                config.PerSliceMetricThresholds(
                    thresholds=[per_slice(value_only)]),
                'mean/label':
                config.PerSliceMetricThresholds(
                    thresholds=[per_slice(value_and_change)]),
            },
            cross_slice_thresholds={
                'auc':
                config.CrossSliceMetricThresholds(
                    thresholds=[cross_slice(value_and_change)]),
                'mse':
                config.CrossSliceMetricThresholds(thresholds=[
                    cross_slice(change_only),
                    # Test for duplicate cross_slicing_spec.
                    cross_slice(value_only),
                ]),
            },
            model_names=['model_name'],
            output_names=['output_name'])
        example_count_spec = config.MetricsSpec(
            metrics=[
                config.MetricConfig(
                    class_name='ExampleCount',
                    config=json.dumps({'name': 'example_count'}),
                    threshold=value_only),
            ],
            model_names=multi_model_names,
            output_names=multi_output_names)
        weighted_example_count_spec = config.MetricsSpec(
            metrics=[
                config.MetricConfig(
                    class_name='WeightedExampleCount',
                    config=json.dumps({'name': 'weighted_example_count'}),
                    threshold=value_only),
            ],
            model_names=multi_model_names,
            output_names=multi_output_names)
        binarized_spec = config.MetricsSpec(
            metrics=[
                config.MetricConfig(
                    class_name='MeanSquaredError',
                    config=json.dumps({'name': 'mse'}),
                    threshold=change_only),
                config.MetricConfig(
                    class_name='MeanLabel',
                    config=json.dumps({'name': 'mean_label'}),
                    threshold=change_only,
                    per_slice_thresholds=[per_slice(change_only)],
                    cross_slice_thresholds=[cross_slice(change_only)]),
            ],
            model_names=['model_name'],
            output_names=['output_name'],
            binarize=config.BinarizationOptions(
                class_ids={'values': [0, 1]}),
            aggregate=config.AggregationOptions(
                macro_average=True, class_weights={
                    0: 1.0,
                    1: 1.0
                }))
        metrics_specs = [
            spec_level_thresholds,
            example_count_spec,
            weighted_example_count_spec,
            binarized_spec,
        ]

        thresholds = metric_specs.metric_thresholds_from_metrics_specs(
            metrics_specs)

        def single_model_key(name, **extra):
            # Key for the specs that use model_name / output_name.
            return metric_types.MetricKey(name=name,
                                          model_name='model_name',
                                          output_name='output_name',
                                          **extra)

        macro_average = metric_types.AggregationType(macro_average=True)

        # Maps each expected key to the number of thresholds expected for it.
        expected_counts = {
            single_model_key('auc', is_diff=False): 4,
            single_model_key('auc', is_diff=True): 1,
            single_model_key('mean/label', is_diff=True): 3,
            single_model_key('mean/label', is_diff=False): 3,
        }
        # One key with a single threshold per (model, output) combination.
        for metric_name in ('example_count', 'weighted_example_count'):
            for model_name in multi_model_names:
                for output_name in multi_output_names:
                    multi_model_key = metric_types.MetricKey(
                        name=metric_name,
                        model_name=model_name,
                        output_name=output_name)
                    expected_counts[multi_model_key] = 1
        expected_counts.update({
            single_model_key('mse',
                             sub_key=metric_types.SubKey(class_id=0),
                             is_diff=True):
            1,
            single_model_key('mse',
                             sub_key=metric_types.SubKey(class_id=1),
                             is_diff=True):
            1,
            single_model_key('mse', is_diff=True):
            2,
            single_model_key('mse', is_diff=False):
            1,
            single_model_key('mse',
                             aggregation_type=macro_average,
                             is_diff=True):
            1,
            single_model_key('mean_label',
                             sub_key=metric_types.SubKey(class_id=0),
                             is_diff=True):
            4,
            single_model_key('mean_label',
                             sub_key=metric_types.SubKey(class_id=1),
                             is_diff=True):
            4,
            single_model_key('mean_label',
                             aggregation_type=macro_average,
                             is_diff=True):
            4,
        })

        self.assertLen(thresholds, len(expected_counts))
        for expected_key, expected_count in expected_counts.items():
            self.assertIn(expected_key, thresholds)
            self.assertLen(thresholds[expected_key], expected_count,
                           'failed for key {}'.format(expected_key))
Beispiel #27
0
  def testWriteValidationResults(self, output_file_format):
    """Verifies that failed thresholds are written out and can be read back.

    A candidate and a baseline model are evaluated on two in-memory examples
    against four thresholded metrics. The validation result loaded back from
    the output directory must be marked as not OK and must list exactly the
    three expected failures for a single slice.

    Args:
      output_file_format: File format forwarded to the validations writer.
    """
    candidate_dir = self._getExportDir()
    baseline_dir = self._getBaselineDir()
    candidate_model = self._build_keras_model(candidate_dir, mul=0)
    baseline_model = self._build_keras_model(baseline_dir, mul=1)
    validations_path = os.path.join(self._getTempDir(),
                                    constants.VALIDATIONS_KEY)

    # Schema describing the raw examples and how 'input' maps to a tensor.
    schema_text = """
        tensor_representation_group {
          key: ""
          value {
            tensor_representation {
              key: "input"
              value {
                dense_tensor {
                  column_name: "input"
                  shape { dim { size: 1 } }
                }
              }
            }
          }
        }
        feature {
          name: "input"
          type: FLOAT
        }
        feature {
          name: "label"
          type: FLOAT
        }
        feature {
          name: "example_weight"
          type: FLOAT
        }
        feature {
          name: "extra_feature"
          type: BYTES
        }
        """
    schema = text_format.Parse(schema_text, schema_pb2.Schema())
    record_source = test_util.InMemoryTFExampleRecord(
        schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN)
    adapter_config = tensor_adapter.TensorAdapterConfig(
        arrow_schema=record_source.ArrowSchema(),
        tensor_representations=record_source.TensorRepresentations())

    # (input, label, example_weight) for each of the two examples.
    example_rows = ((0.0, 1.0, 1.0), (1.0, 0.0, 0.5))
    examples = [
        self._makeExample(
            input=input_value,
            label=label_value,
            example_weight=weight_value,
            extra_feature='non_model_feature')
        for input_value, label_value, weight_value in example_rows
    ]

    candidate_spec = config.ModelSpec(
        name='candidate',
        label_key='label',
        example_weight_key='example_weight')
    baseline_spec = config.ModelSpec(
        name='baseline',
        label_key='label',
        example_weight_key='example_weight',
        is_baseline=True)

    # Weighted count is 1.5, which is above the upper bound of 1: NOT OK.
    weighted_count_metric = config.MetricConfig(
        class_name='WeightedExampleCount',
        threshold=config.MetricThreshold(
            value_threshold=config.GenericValueThreshold(
                upper_bound={'value': 1})))
    # Example count is 2, which is below the lower bound of 10: NOT OK.
    example_count_metric = config.MetricConfig(
        class_name='ExampleCount',
        threshold=config.MetricThreshold(
            value_threshold=config.GenericValueThreshold(
                lower_bound={'value': 10})))
    # The label diff is 0; neither 0 > 0 nor 0 > 0% holds: NOT OK.
    mean_label_metric = config.MetricConfig(
        class_name='MeanLabel',
        threshold=config.MetricThreshold(
            change_threshold=config.GenericChangeThreshold(
                direction=config.MetricDirection.HIGHER_IS_BETTER,
                relative={'value': 0},
                absolute={'value': 0})))
    # MeanPrediction = (0+0)/(1+0.5) = 0, so -.01 < 0 < .01: OK.
    # Diff% = -.333/.333 = -100% < -99%: OK.
    # Diff = 0 - .333 = -.333 < 0: OK.
    mean_prediction_metric = config.MetricConfig(
        class_name='MeanPrediction',
        threshold=config.MetricThreshold(
            value_threshold=config.GenericValueThreshold(
                upper_bound={'value': .01}, lower_bound={'value': -.01}),
            change_threshold=config.GenericChangeThreshold(
                direction=config.MetricDirection.LOWER_IS_BETTER,
                relative={'value': -.99},
                absolute={'value': 0})))

    eval_config = config.EvalConfig(
        model_specs=[candidate_spec, baseline_spec],
        slicing_specs=[config.SlicingSpec()],
        metrics_specs=[
            config.MetricsSpec(
                metrics=[
                    weighted_count_metric,
                    example_count_metric,
                    mean_label_metric,
                    mean_prediction_metric,
                ],
                model_names=['candidate', 'baseline']),
        ],
        options=config.Options(
            disabled_outputs={'values': ['eval_config.json']}),
    )

    slice_specs = [
        slicer.SingleSliceSpec(spec=slicing_spec)
        for slicing_spec in eval_config.slicing_specs
    ]
    models_by_name = {
        'candidate': candidate_model,
        'baseline': baseline_model
    }
    extractors = [
        batched_input_extractor.BatchedInputExtractor(eval_config),
        batched_predict_extractor_v2.BatchedPredictExtractor(
            eval_shared_model=models_by_name,
            eval_config=eval_config,
            tensor_adapter_config=adapter_config),
        unbatch_extractor.UnbatchExtractor(),
        slice_key_extractor.SliceKeyExtractor(slice_spec=slice_specs)
    ]
    evaluators = [
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=eval_config, eval_shared_model=models_by_name)
    ]
    writers = [
        metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
            {constants.VALIDATIONS_KEY: validations_path},
            add_metrics_callbacks=[],
            output_file_format=output_file_format)
    ]

    serialized_examples = [example.SerializeToString() for example in examples]
    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      _ = (
          pipeline
          | 'Create' >> beam.Create(serialized_examples)
          | 'BatchExamples' >> record_source.BeamSource()
          | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
          | 'ExtractEvaluate' >> model_eval_lib.ExtractAndEvaluate(
              extractors=extractors, evaluators=evaluators)
          | 'WriteResults' >> model_eval_lib.WriteResults(writers=writers))
      # pylint: enable=no-value-for-parameter

    # The loader takes the directory that contains the validations file.
    output_dir = os.path.dirname(validations_path)
    validation_result = (
        metrics_plots_and_validations_writer
        .load_and_deserialize_validation_result(output_dir))

    # One text proto per expected failure; MeanPrediction passes and is absent.
    expected_failure_texts = [
        """
            metric_key {
              name: "weighted_example_count"
              model_name: "candidate"
            }
            metric_threshold {
              value_threshold {
                upper_bound {
                  value: 1.0
                }
              }
            }
            metric_value {
              double_value {
                value: 1.5
              }
            }
            """,
        """
            metric_key {
              name: "example_count"
              model_name: "candidate"
            }
            metric_threshold {
              value_threshold {
                lower_bound {
                  value: 10.0
                }
              }
            }
            metric_value {
              double_value {
                value: 2.0
              }
            }
            """,
        """
            metric_key {
              name: "mean_label"
              model_name: "candidate"
              is_diff: true
            }
            metric_threshold {
              change_threshold {
                absolute {
                  value: 0.0
                }
                relative {
                  value: 0.0
                }
                direction: HIGHER_IS_BETTER
              }
            }
            metric_value {
              double_value {
                value: 0.0
              }
            }
            """,
    ]
    expected_validations = [
        text_format.Parse(failure_text,
                          validation_result_pb2.ValidationFailure())
        for failure_text in expected_failure_texts
    ]

    self.assertFalse(validation_result.validation_ok)
    self.assertLen(validation_result.metric_validations_per_slice, 1)
    self.assertCountEqual(
        expected_validations,
        validation_result.metric_validations_per_slice[0].failures)
 def testRunModelAnalysisWithModelAgnosticPredictions(self):
     """Runs model analysis on examples that already carry predictions.

     No model is loaded: the model spec reads predictions, labels and example
     weights (the 'age' feature) straight from the examples. Metrics are
     sliced by 'language' and compared against hand-computed values.
     """
     # (age, language, label, prediction) for each example.
     rows = (
         (3.0, 'english', 1.0, 0.9),
         (3.0, 'chinese', 0.0, 0.4),
         (4.0, 'english', 1.0, 0.7),
         (5.0, 'chinese', 1.0, 0.2),
     )
     examples = [
         self._makeExample(age=age,
                           language=language,
                           label=label,
                           prediction=prediction)
         for age, language, label, prediction in rows
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)

     metric_configs = [
         config.MetricConfig(class_name=metric_class)
         for metric_class in ('ExampleCount', 'WeightedExampleCount',
                              'BinaryAccuracy')
     ]
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(prediction_key='prediction',
                              label_key='label',
                              example_weight_key='age')
         ],
         metrics_specs=[config.MetricsSpec(metrics=metric_configs)],
         slicing_specs=[config.SlicingSpec(feature_keys=['language'])])

     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config,
         data_location=data_location,
         output_path=self._getTempDir())

     # Chinese slice: weights 3 (correct) and 5 (wrong) -> accuracy 3/8.
     chinese_metrics = {
         'binary_accuracy': {'doubleValue': 0.375},
         'weighted_example_count': {'doubleValue': 8.0},
         'example_count': {'doubleValue': 2.0},
     }
     # English slice: weights 3 and 4, both correct -> accuracy 1.0.
     english_metrics = {
         'binary_accuracy': {'doubleValue': 1.0},
         'weighted_example_count': {'doubleValue': 7.0},
         'example_count': {'doubleValue': 2.0},
     }
     expected = {
         (('language', 'chinese'), ): chinese_metrics,
         (('language', 'english'), ): english_metrics,
     }

     self.assertEqual(eval_result.data_location, data_location)
     self.assertEqual(eval_result.config.slicing_specs[0],
                      config.SlicingSpec(feature_keys=['language']))
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
 def testValidateMetricsMetricTDistributionChangeAndThreshold(
         self, slicing_specs, slice_key):
     """Checks that a change threshold is validated on t-distribution values.

     The AUC diff of 0.1 violates a LOWER_IS_BETTER change threshold with an
     absolute bound of -1, so validation must fail and report that diff
     together with the threshold, the slice key and the slicing details.

     Args:
       slicing_specs: Slicing specs to attach the threshold to, or None to
         use the metric-level threshold instead of per-slice thresholds.
       slice_key: Slice key of the metrics being validated.
     """
     threshold = config.MetricThreshold(
         change_threshold=config.GenericChangeThreshold(
             direction=config.MetricDirection.LOWER_IS_BETTER,
             absolute={'value': -1}))
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
             config.ModelSpec(name='baseline', is_baseline=True)
         ],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(metrics=[
                 config.MetricConfig(
                     class_name='AUC',
                     # The metric-level threshold is only set when no
                     # slicing specs were given.
                     threshold=threshold if slicing_specs is None else None,
                     per_slice_thresholds=[
                         config.PerSliceMetricThreshold(
                             slicing_specs=slicing_specs,
                             threshold=threshold)
                     ]),
             ],
                                model_names=['']),
         ],
     )
     sliced_metrics = (
         slice_key,
         {
             # This is the mean of the diff.
             metric_types.MetricKey(name='auc', model_name='baseline'):
             types.ValueWithTDistribution(sample_mean=0.91,
                                          unsampled_value=0.6),
             metric_types.MetricKey(name='auc', is_diff=True):
             types.ValueWithTDistribution(sample_mean=0.1,
                                          unsampled_value=0.1),
         })
     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertFalse(result.validation_ok)

     # Build the expected result: one failure for the diff key, completed
     # below with the threshold, slice key and slicing details.
     expected = text_format.Parse(
         """
     metric_validations_per_slice {
       failures {
         metric_key {
           name: "auc"
           is_diff: true
         }
         metric_value {
           double_value {
             value: 0.1
           }
         }
       }
     }""", validation_result_pb2.ValidationResult())
     expected.metric_validations_per_slice[0].failures[
         0].metric_threshold.CopyFrom(threshold)
     expected.metric_validations_per_slice[0].slice_key.CopyFrom(
         slicer.serialize_slice_key(slice_key))
     for spec in slicing_specs or [None]:
         # Specs that do not apply to this slice contribute no details.
         if spec is not None and not slicer.SingleSliceSpec(
                 spec=spec).is_slice_applicable(slice_key):
             continue
         slicing_details = expected.validation_details.slicing_details.add()
         # With no spec given, an empty SlicingSpec is recorded.
         slicing_details.slicing_spec.CopyFrom(
             spec if spec is not None else config.SlicingSpec())
         slicing_details.num_matching_slices = 1
     # Protos are not numeric: assertAlmostEqual would raise TypeError on a
     # mismatch instead of reporting the difference, so compare for equality.
     self.assertEqual(result, expected)
Beispiel #30
0
    def testRunModelAnalysisWithKerasModel(self):
        """Runs model analysis end to end on a saved Keras softmax model.

        A small 10-class model is fitted for one step, saved, and evaluated
        with AUC binarized for class ids 0, 5 and 9. The test checks that the
        resulting config keeps the model and data locations and that an 'auc'
        entry is reported for each requested class id.
        """
        input_layer = tf.keras.layers.Input(shape=(28 * 28, ), name='data')
        output_layer = tf.keras.layers.Dense(
            10, activation=tf.nn.softmax)(input_layer)
        model = tf.keras.models.Model(input_layer, output_layer)
        # `learning_rate` is the supported argument name; `lr` is a
        # deprecated alias.
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=.001),
                      loss=tf.keras.losses.categorical_crossentropy)

        # A single training example is enough to build and save the model.
        features = {'data': [[0.0] * 28 * 28]}
        labels = [[0, 0, 0, 0, 0, 0, 0, 1, 0, 0]]
        example_weights = [1.0]
        dataset = tf.data.Dataset.from_tensor_slices(
            (features, labels, example_weights))
        dataset = dataset.shuffle(buffer_size=1).repeat().batch(1)
        model.fit(dataset, steps_per_epoch=1)

        model_location = os.path.join(self._getTempDir(), 'export_dir')
        model.save(model_location, save_format='tf')

        examples = [
            self._makeExample(data=[0.0] * 28 * 28, label=1.0),
            self._makeExample(data=[1.0] * 28 * 28, label=5.0),
            self._makeExample(data=[1.0] * 28 * 28, label=9.0),
        ]
        data_location = self._writeTFExamplesToTFRecords(examples)

        # Describe each Keras metric by its serialized class name and config.
        metrics_spec = config.MetricsSpec()
        for metric in (tf.keras.metrics.AUC(), ):
            cfg = tf.keras.utils.serialize_keras_object(metric)
            metrics_spec.metrics.append(
                config.MetricConfig(class_name=cfg['class_name'],
                                    config=json.dumps(cfg['config'])))
        for class_id in (0, 5, 9):
            metrics_spec.binarize.class_ids.append(class_id)

        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec(location=data_location)],
            model_specs=[
                config.ModelSpec(location=model_location, label_key='label')
            ],
            output_data_specs=[
                config.OutputDataSpec(default_location=self._getTempDir())
            ],
            metrics_specs=[metrics_spec])
        eval_result = model_eval_lib.run_model_analysis(
            eval_config=eval_config,
            eval_shared_models=[
                model_eval_lib.default_eval_shared_model(
                    eval_saved_model_path=model_location,
                    tags=[tf.saved_model.SERVING])
            ])

        self.assertEqual(eval_result.config.model_specs[0].location,
                         model_location)
        self.assertEqual(eval_result.config.input_data_specs[0].location,
                         data_location)
        self.assertLen(eval_result.slicing_metrics, 1)
        got_slice_key, got_metrics = eval_result.slicing_metrics[0]
        self.assertEqual(got_slice_key, ())
        self.assertIn('', got_metrics)  # output_name
        got_metrics = got_metrics['']

        # Only the presence of each metric is checked, not its value.
        expected_metrics = {
            'classId:0': {
                'auc': True,
            },
            'classId:5': {
                'auc': True,
            },
            'classId:9': {
                'auc': True,
            },
        }
        for class_id in expected_metrics:
            self.assertIn(class_id, got_metrics)
            for k in expected_metrics[class_id]:
                self.assertIn(k, got_metrics[class_id])