def testValidateMetricsCrossSliceThresholdFail(self, cross_slicing_specs,
                                               slice_key):
     """Expects validation to fail when 1.5 breaks a cross-slice upper bound of 1.

     Args:
       cross_slicing_specs: Cross slicing specs for the eval config, or None;
         when None the threshold is also set at the top level of the metric.
       slice_key: Slice key attached to the metrics under validation.
     """
     upper_bound_of_one = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             upper_bound={'value': 1}))
     # The top-level threshold is only populated without cross slicing specs.
     top_level_threshold = None
     if cross_slicing_specs is None:
         top_level_threshold = upper_bound_of_one
     metric_config = config.MetricConfig(
         class_name='WeightedExampleCount',
         threshold=top_level_threshold,
         cross_slice_thresholds=[
             config.CrossSliceMetricThreshold(
                 cross_slicing_specs=cross_slicing_specs,
                 threshold=upper_bound_of_one)
         ])
     eval_config = config.EvalConfig(
         model_specs=[config.ModelSpec()],
         cross_slicing_specs=cross_slicing_specs,
         metrics_specs=[
             config.MetricsSpec(metrics=[metric_config], model_names=['']),
         ])
     # The required condition 1.5 < 1 does not hold, so the result is NOT OK.
     observed = {metric_types.MetricKey(name='weighted_example_count'): 1.5}
     result = metrics_validator.validate_metrics((slice_key, observed),
                                                 eval_config)
     self.assertFalse(result.validation_ok)
 def testValidateMetricsValueThresholdLowerBoundPass(
         self, slicing_specs, slice_key):
     """Expects validation to pass when the value 2 meets a lower bound of 1.

     Args:
       slicing_specs: Slicing specs for the eval config, or None; when None
         the threshold is also set at the top level of the metric config.
       slice_key: Slice key attached to the metrics under validation.
     """
     lower_bound_of_one = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             lower_bound={'value': 1}))
     top_level_threshold = None
     if slicing_specs is None:
         top_level_threshold = lower_bound_of_one
     metric_config = config.MetricConfig(
         class_name='WeightedExampleCount',
         threshold=top_level_threshold,
         per_slice_thresholds=[
             config.PerSliceMetricThreshold(
                 slicing_specs=slicing_specs,
                 threshold=lower_bound_of_one)
         ])
     eval_config = config.EvalConfig(
         model_specs=[config.ModelSpec()],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(metrics=[metric_config], model_names=['']),
         ])
     # 2 > 1, OK.
     observed = {metric_types.MetricKey(name='weighted_example_count'): 2}
     result = metrics_validator.validate_metrics((slice_key, observed),
                                                 eval_config)
     self.assertTrue(result.validation_ok)
 def testValidateMetricsChangeThresholdHigherIsBetterFail(self):
     """Expects a -0.333 diff to fail a HIGHER_IS_BETTER absolute bound of 0."""
     no_regression = config.MetricThreshold(
         change_threshold=config.GenericChangeThreshold(
             direction=config.MetricDirection.HIGHER_IS_BETTER,
             absolute={'value': 0}))
     mean_prediction = config.MetricConfig(
         class_name='MeanPrediction', threshold=no_regression)
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
             config.ModelSpec(name='baseline', is_baseline=True),
         ],
         slicing_specs=[config.SlicingSpec()],
         metrics_specs=[
             config.MetricsSpec(metrics=[mean_prediction], model_names=['']),
         ])
     baseline_key = metric_types.MetricKey(
         name='mean_prediction', model_name='baseline')
     diff_key = metric_types.MetricKey(name='mean_prediction', is_diff=True)
     # The required condition Diff = -.333 > 0 does not hold, NOT OK.
     observed = {baseline_key: 0.333, diff_key: -0.333}
     # An empty tuple occupies the slice-key position of the metrics pair.
     result = metrics_validator.validate_metrics(((), observed), eval_config)
     self.assertFalse(result.validation_ok)
 def testValidateMetricsMetricValueAndThresholdIgnoreUnmatchedSlice(
         self, slicing_specs, slice_key):
     """Expects validation to pass although 1.5 is above the per-slice bound of 1.

     The threshold is configured only under per_slice_thresholds. Going by the
     test name, slice_key is presumably a slice that slicing_specs does not
     match, so the bound should not be applied -- the parameters are supplied
     from outside this method; confirm against the parameterization.

     Args:
       slicing_specs: Slicing specs used for the eval config and the threshold.
       slice_key: Slice key attached to the metrics under validation.
     """
     upper_bound_of_one = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             upper_bound={'value': 1}))
     # No top-level threshold here: the bound only exists per slice.
     metric_config = config.MetricConfig(
         class_name='WeightedExampleCount',
         per_slice_thresholds=[
             config.PerSliceMetricThreshold(
                 slicing_specs=slicing_specs,
                 threshold=upper_bound_of_one)
         ])
     eval_config = config.EvalConfig(
         model_specs=[config.ModelSpec()],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(metrics=[metric_config], model_names=['']),
         ])
     observed = {metric_types.MetricKey(name='weighted_example_count'): 1.5}
     result = metrics_validator.validate_metrics((slice_key, observed),
                                                 eval_config)
     self.assertTrue(result.validation_ok)
 def testValidateMetricsMetricTDistributionValueAndThreshold(
         self, slicing_specs, slice_key):
     """Expects a t-distribution AUC to be judged by its unsampled value.

     The sample mean (0.91) is above the lower bound of 0.9 while the
     unsampled value (0.8) is below it; the test expects a failure whose
     recorded metric value is 0.8.

     Args:
       slicing_specs: Slicing specs for the eval config, or None; when None
         the threshold is also set at the top level of the metric config.
       slice_key: Slice key attached to the metrics under validation.
     """
     auc_floor = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             lower_bound={'value': 0.9}))
     auc_config = config.MetricConfig(
         class_name='AUC',
         threshold=auc_floor if slicing_specs is None else None,
         per_slice_thresholds=[
             config.PerSliceMetricThreshold(
                 slicing_specs=slicing_specs,
                 threshold=auc_floor)
         ])
     eval_config = config.EvalConfig(
         model_specs=[config.ModelSpec()],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(metrics=[auc_config], model_names=['']),
         ])
     observed = {
         metric_types.MetricKey(name='auc'):
         types.ValueWithTDistribution(sample_mean=0.91, unsampled_value=0.8)
     }
     result = metrics_validator.validate_metrics((slice_key, observed),
                                                 eval_config)
     self.assertFalse(result.validation_ok)

     # Static part of the expected result; threshold, slice key and slicing
     # details depend on the test parameters and are filled in below.
     expected_text = """
     metric_validations_per_slice {
       failures {
         metric_key {
           name: "auc"
         }
         metric_value {
           double_value {
             value: 0.8
           }
         }
       }
     }"""
     expected = text_format.Parse(expected_text,
                                  validation_result_pb2.ValidationResult())
     per_slice = expected.metric_validations_per_slice[0]
     per_slice.failures[0].metric_threshold.CopyFrom(auc_floor)
     per_slice.slice_key.CopyFrom(slicer.serialize_slice_key(slice_key))
     # One slicing_details entry per spec applicable to slice_key; with no
     # specs at all a single default SlicingSpec entry is expected.
     for spec in slicing_specs or [None]:
         if spec is not None and not slicer.SingleSliceSpec(
                 spec=spec).is_slice_applicable(slice_key):
             continue
         details = expected.validation_details.slicing_details.add()
         details.slicing_spec.CopyFrom(
             config.SlicingSpec() if spec is None else spec)
         details.num_matching_slices = 1
     self.assertEqual(result, expected)
 def testValidateMetricsChangeThresholdRelativePass(self):
     """Expects a -100% relative diff to pass a LOWER_IS_BETTER bound of 0."""
     no_increase = config.MetricThreshold(
         change_threshold=config.GenericChangeThreshold(
             direction=config.MetricDirection.LOWER_IS_BETTER,
             relative={'value': 0}))
     mean_prediction = config.MetricConfig(
         class_name='MeanPrediction', threshold=no_increase)
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
             config.ModelSpec(name='baseline', is_baseline=True),
         ],
         slicing_specs=[config.SlicingSpec()],
         metrics_specs=[
             config.MetricsSpec(metrics=[mean_prediction], model_names=['']),
         ])
     baseline_key = metric_types.MetricKey(
         name='mean_prediction', model_name='baseline')
     diff_key = metric_types.MetricKey(name='mean_prediction', is_diff=True)
     # Diff = -.333
     # Diff% = -.333/.333 = -100% < 0%, OK.
     observed = {baseline_key: 0.333, diff_key: -0.333}
     # An empty tuple occupies the slice-key position of the metrics pair.
     result = metrics_validator.validate_metrics(((), observed), eval_config)
     self.assertTrue(result.validation_ok)
 def testValidateMetricsValueThresholdLowerBoundPass(self):
     """Expects the value 2 to pass a top-level lower bound of 1.

     Uses a single default SlicingSpec and an empty slice key.
     """
     lower_bound_of_one = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             lower_bound={'value': 1}))
     metric_config = config.MetricConfig(
         class_name='WeightedExampleCount', threshold=lower_bound_of_one)
     eval_config = config.EvalConfig(
         model_specs=[config.ModelSpec()],
         slicing_specs=[config.SlicingSpec()],
         metrics_specs=[
             config.MetricsSpec(metrics=[metric_config], model_names=['']),
         ])
     # 2 > 1, OK.
     observed = {metric_types.MetricKey(name='weighted_example_count'): 2}
     result = metrics_validator.validate_metrics(((), observed), eval_config)
     self.assertTrue(result.validation_ok)
    def testGetMissingSlices(self):
        """Checks that get_missing_slices returns the specs that saw no data.

        Three slicing specs are configured but metrics are validated for the
        feature1=value1 slice only; the default spec and the feature2=value2
        spec are expected back as missing, in configuration order.
        """
        overall_spec = config.SlicingSpec()
        feature1_spec = config.SlicingSpec(
            feature_values={'feature1': 'value1'})
        feature2_spec = config.SlicingSpec(
            feature_values={'feature2': 'value2'})
        slicing_specs = [overall_spec, feature1_spec, feature2_spec]

        upper_bound_of_one = config.MetricThreshold(
            value_threshold=config.GenericValueThreshold(
                upper_bound={'value': 1}))
        metric_config = config.MetricConfig(
            class_name='WeightedExampleCount',
            per_slice_thresholds=[
                config.PerSliceMetricThreshold(
                    slicing_specs=slicing_specs,
                    threshold=upper_bound_of_one)
            ])
        eval_config = config.EvalConfig(
            model_specs=[config.ModelSpec()],
            slicing_specs=slicing_specs,
            metrics_specs=[
                config.MetricsSpec(metrics=[metric_config],
                                   model_names=['']),
            ])

        # The observed value 0 is expected to validate OK (see the expected
        # proto below), leaving only the slicing details in the result.
        feature1_slice = (('feature1', 'value1'), )
        observed = {metric_types.MetricKey(name='weighted_example_count'): 0}
        result = metrics_validator.validate_metrics(
            (feature1_slice, observed), eval_config)

        expected_checks = text_format.Parse(
            """
        validation_ok: true
        validation_details {
          slicing_details {
            slicing_spec {
              feature_values {
                key: "feature1"
                value: "value1"
              }
            }
            num_matching_slices: 1
          }
        }""", validation_result_pb2.ValidationResult())
        self.assertProtoEquals(expected_checks, result)

        missing = metrics_validator.get_missing_slices(
            result.validation_details.slicing_details, eval_config)
        self.assertLen(missing, 2)
        self.assertProtoEquals(missing[0], overall_spec)
        self.assertProtoEquals(missing[1], feature2_spec)
 def testValidateMetricsMetricValueAndThreshold(self):
     """Expects 1.5 to fail an upper bound of 1 and checks the failure proto.

     The expected result records the metric key, the threshold that was
     applied and the offending value under an empty slice key.
     """
     upper_bound_of_one = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             upper_bound={'value': 1}))
     metric_config = config.MetricConfig(
         class_name='WeightedExampleCount', threshold=upper_bound_of_one)
     eval_config = config.EvalConfig(
         model_specs=[config.ModelSpec()],
         slicing_specs=[config.SlicingSpec()],
         metrics_specs=[
             config.MetricsSpec(metrics=[metric_config], model_names=['']),
         ])
     # The required condition 1.5 < 1 does not hold, so the result is NOT OK.
     observed = {metric_types.MetricKey(name='weighted_example_count'): 1.5}
     result = metrics_validator.validate_metrics(((), observed), eval_config)
     self.assertFalse(result.validation_ok)

     expected_text = """
     metric_validations_per_slice {
       slice_key {
       }
       failures {
         metric_key {
           name: "weighted_example_count"
         }
         metric_threshold {
           value_threshold {
             upper_bound {
               value: 1.0
             }
           }
         }
         metric_value {
           double_value {
             value: 1.5
           }
         }
       }
     }"""
     expected = text_format.Parse(expected_text,
                                  validation_result_pb2.ValidationResult())
     self.assertEqual(result, expected)
 def testValidateMetricsMetricValueAndThreshold(self, slicing_specs,
                                                slice_key):
   """Expects 1.5 to fail an upper bound of 1 and checks the failure proto.

   Args:
     slicing_specs: Slicing specs for the eval config, or None; when None the
       threshold is also set at the top level of the metric config.
     slice_key: Slice key attached to the metrics under validation.
   """
   upper_bound_of_one = config.MetricThreshold(
       value_threshold=config.GenericValueThreshold(upper_bound={'value': 1}))
   top_level_threshold = None
   if slicing_specs is None:
     top_level_threshold = upper_bound_of_one
   metric_config = config.MetricConfig(
       class_name='WeightedExampleCount',
       threshold=top_level_threshold,
       per_slice_thresholds=[
           config.PerSliceMetricThreshold(
               slicing_specs=slicing_specs, threshold=upper_bound_of_one)
       ])
   eval_config = config.EvalConfig(
       model_specs=[config.ModelSpec()],
       slicing_specs=slicing_specs,
       metrics_specs=[
           config.MetricsSpec(metrics=[metric_config], model_names=['']),
       ])
   # The required condition 1.5 < 1 does not hold, so the result is NOT OK.
   observed = {metric_types.MetricKey(name='weighted_example_count'): 1.5}
   result = metrics_validator.validate_metrics((slice_key, observed),
                                               eval_config)
   self.assertFalse(result.validation_ok)

   # Static part of the expected result; the threshold and the slice key
   # depend on the test parameters and are copied in afterwards.
   expected_text = """
        metric_validations_per_slice {
          failures {
            metric_key {
              name: "weighted_example_count"
            }
            metric_value {
              double_value {
                value: 1.5
              }
            }
          }
        }"""
   expected = text_format.Parse(expected_text,
                                validation_result_pb2.ValidationResult())
   per_slice = expected.metric_validations_per_slice[0]
   per_slice.failures[0].metric_threshold.CopyFrom(upper_bound_of_one)
   per_slice.slice_key.CopyFrom(slicer.serialize_slice_key(slice_key))
   self.assertEqual(result, expected)
 def testValidateMetricsInvalidThreshold(self):
     """Expects a threshold on an absent metric to fail with 'Metric not found.'.

     The threshold is keyed by 'invalid_threshold', a name that does not
     appear among the metrics handed to the validator.
     """
     orphan_threshold = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             lower_bound={'value': 0.2}))
     metrics_spec = config.MetricsSpec(
         thresholds={'invalid_threshold': orphan_threshold})
     eval_config = config.EvalConfig(
         model_specs=[config.ModelSpec()],
         slicing_specs=[config.SlicingSpec()],
         metrics_specs=[metrics_spec])
     observed = {metric_types.MetricKey(name='weighted_example_count'): 1.5}
     result = metrics_validator.validate_metrics(((), observed), eval_config)
     self.assertFalse(result.validation_ok)

     expected_text = """
     metric_validations_per_slice {
       slice_key {
       }
       failures {
         metric_key {
           name: "invalid_threshold"
         }
         metric_threshold {
           value_threshold {
             lower_bound {
               value: 0.2
             }
           }
         }
         message: 'Metric not found.'
       }
     }"""
     expected = text_format.Parse(expected_text,
                                  validation_result_pb2.ValidationResult())
     self.assertProtoEquals(expected, result)
 def testValidateMetricsChangeThresholdRelativeFail(self, slicing_specs,
                                                    slice_key):
     """Expects a -100% relative diff to fail a LOWER_IS_BETTER bound of -200%.

     Args:
       slicing_specs: Slicing specs for the eval config, or None; when None
         the threshold is also set at the top level of the metric config.
       slice_key: Slice key attached to the metrics under validation.
     """
     must_halve_twice = config.MetricThreshold(
         change_threshold=config.GenericChangeThreshold(
             direction=config.MetricDirection.LOWER_IS_BETTER,
             relative={'value': -2}))
     top_level_threshold = None
     if slicing_specs is None:
         top_level_threshold = must_halve_twice
     mean_prediction = config.MetricConfig(
         class_name='MeanPrediction',
         threshold=top_level_threshold,
         per_slice_thresholds=[
             config.PerSliceMetricThreshold(
                 slicing_specs=slicing_specs,
                 threshold=must_halve_twice)
         ])
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
             config.ModelSpec(name='baseline', is_baseline=True),
         ],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(metrics=[mean_prediction], model_names=['']),
         ])
     baseline_key = metric_types.MetricKey(
         name='mean_prediction', model_name='baseline')
     diff_key = metric_types.MetricKey(name='mean_prediction', is_diff=True)
     # Diff = -.333
     # Diff% = -.333/.333 = -100%; the required -100% < -200% does not hold,
     # NOT OK.
     observed = {baseline_key: 0.333, diff_key: -0.333}
     result = metrics_validator.validate_metrics((slice_key, observed),
                                                 eval_config)
     self.assertFalse(result.validation_ok)
 def testValidateMetricsChangeThresholdHigherIsBetterPass(
         self, slicing_specs, slice_key):
     """Expects a -0.333 diff to pass a HIGHER_IS_BETTER absolute bound of -1.

     Args:
       slicing_specs: Slicing specs for the eval config, or None; when None
         the threshold is also set at the top level of the metric config.
       slice_key: Slice key attached to the metrics under validation.
     """
     tolerate_drop_of_one = config.MetricThreshold(
         change_threshold=config.GenericChangeThreshold(
             direction=config.MetricDirection.HIGHER_IS_BETTER,
             absolute={'value': -1}))
     top_level_threshold = None
     if slicing_specs is None:
         top_level_threshold = tolerate_drop_of_one
     mean_prediction = config.MetricConfig(
         class_name='MeanPrediction',
         threshold=top_level_threshold,
         per_slice_thresholds=[
             config.PerSliceMetricThreshold(
                 slicing_specs=slicing_specs,
                 threshold=tolerate_drop_of_one)
         ])
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
             config.ModelSpec(name='baseline', is_baseline=True),
         ],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(metrics=[mean_prediction], model_names=['']),
         ])
     baseline_key = metric_types.MetricKey(
         name='mean_prediction', model_name='baseline')
     diff_key = metric_types.MetricKey(name='mean_prediction', is_diff=True)
     # Diff = -.333 > -1, OK.
     observed = {baseline_key: 0.333, diff_key: -0.333}
     result = metrics_validator.validate_metrics((slice_key, observed),
                                                 eval_config)
     self.assertTrue(result.validation_ok)
Example #14
0
 def testValidateMetricsDivByZero(self):
   """Expects a relative change threshold to fail when the baseline is 0.0.

   The baseline mean_prediction is 0.0 and the threshold is relative, which
   (going by the test name) makes the relative change a division by zero;
   the test expects validation to report not-OK.
   """
   threshold = config.MetricThreshold(
       change_threshold=config.GenericChangeThreshold(
           direction=config.MetricDirection.HIGHER_IS_BETTER,
           relative={'value': 0.1}))
   slicing_specs = [config.SlicingSpec()]
   eval_config = config.EvalConfig(
       model_specs=[
           config.ModelSpec(name='candidate'),
           config.ModelSpec(name='baseline', is_baseline=True)
       ],
       slicing_specs=slicing_specs,
       metrics_specs=[
           config.MetricsSpec(
               metrics=[
                   config.MetricConfig(
                       class_name='MeanPrediction',
                       # slicing_specs is always set in this test, so the
                       # threshold is configured per slice only; no
                       # top-level threshold is passed.
                       per_slice_thresholds=[
                           config.PerSliceMetricThreshold(
                               slicing_specs=slicing_specs,
                               threshold=threshold)
                       ])
               ],
               model_names=['baseline', 'candidate']),
       ],
   )
   sliced_metrics = ((), {
       metric_types.MetricKey(name='mean_prediction', model_name='baseline'):
           0.0,
       metric_types.MetricKey(
           name='mean_prediction', model_name='candidate', is_diff=True):
           0.1,
   })
   result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
   self.assertFalse(result.validation_ok)
 def testValidateMetricsMetricTDistributionChangeAndThreshold(
         self, slicing_specs, slice_key):
     """Expects a t-distribution AUC diff of 0.1 to fail a LOWER_IS_BETTER bound.

     The diff of 0.1 is above the absolute change bound of -1, so the test
     expects a failure whose recorded metric value is 0.1.

     Args:
       slicing_specs: Slicing specs for the eval config, or None; when None
         the threshold is also set at the top level of the metric config.
       slice_key: Slice key attached to the metrics under validation.
     """
     threshold = config.MetricThreshold(
         change_threshold=config.GenericChangeThreshold(
             direction=config.MetricDirection.LOWER_IS_BETTER,
             absolute={'value': -1}))
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
             config.ModelSpec(name='baseline', is_baseline=True)
         ],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(metrics=[
                 config.MetricConfig(
                     class_name='AUC',
                     threshold=threshold if slicing_specs is None else None,
                     per_slice_thresholds=[
                         config.PerSliceMetricThreshold(
                             slicing_specs=slicing_specs,
                             threshold=threshold)
                     ]),
             ],
                                model_names=['']),
         ],
     )
     sliced_metrics = (
         slice_key,
         {
             metric_types.MetricKey(name='auc', model_name='baseline'):
             types.ValueWithTDistribution(sample_mean=0.91,
                                          unsampled_value=0.6),
             # This is the mean of the diff.
             metric_types.MetricKey(name='auc', is_diff=True):
             types.ValueWithTDistribution(sample_mean=0.1,
                                          unsampled_value=0.1),
         })
     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertFalse(result.validation_ok)
     # Static part of the expected result; threshold, slice key and slicing
     # details depend on the test parameters and are filled in below.
     expected = text_format.Parse(
         """
     metric_validations_per_slice {
       failures {
         metric_key {
           name: "auc"
           is_diff: true
         }
         metric_value {
           double_value {
             value: 0.1
           }
         }
       }
     }""", validation_result_pb2.ValidationResult())
     expected.metric_validations_per_slice[0].failures[
         0].metric_threshold.CopyFrom(threshold)
     expected.metric_validations_per_slice[0].slice_key.CopyFrom(
         slicer.serialize_slice_key(slice_key))
     # One slicing_details entry per spec applicable to slice_key; with no
     # specs at all a single default SlicingSpec entry is expected.
     for spec in slicing_specs or [None]:
         if (spec is None or slicer.SingleSliceSpec(
                 spec=spec).is_slice_applicable(slice_key)):
             slicing_details = expected.validation_details.slicing_details.add(
             )
             if spec is not None:
                 slicing_details.slicing_spec.CopyFrom(spec)
             else:
                 slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
             slicing_details.num_matching_slices = 1
     # assertEqual rather than assertAlmostEqual: protos cannot be
     # subtracted, so on a mismatch assertAlmostEqual raises a TypeError
     # instead of reporting a readable failure.
     self.assertEqual(result, expected)
Example #16
0
 def testValidateMetricsChangeThresholdEqualPass(self, slicing_specs,
                                                 slice_key):
   """Expects validation to pass when every metric exactly equals its bound.

   Covers change thresholds in both directions (absolute and relative bounds
   of -.333 against a diff of -.333 on a baseline of 1) and value thresholds
   with a lower and an upper bound of 1 against a value of 1.

   Args:
     slicing_specs: Slicing specs for the eval config, or None; when None the
       thresholds are also set at the top level of each metric config.
     slice_key: Slice key attached to the metrics under validation.
   """

   def _change_threshold(direction):
     # Absolute and relative bounds are both -.333.
     return config.MetricThreshold(
         change_threshold=config.GenericChangeThreshold(
             direction=direction,
             absolute={'value': -.333},
             relative={'value': -.333}))

   def _metric_config(class_name, metric_threshold):
     # Top-level threshold only without slicing specs; always per slice.
     return config.MetricConfig(
         class_name=class_name,
         threshold=metric_threshold if slicing_specs is None else None,
         per_slice_thresholds=[
             config.PerSliceMetricThreshold(
                 slicing_specs=slicing_specs, threshold=metric_threshold)
         ])

   # Change thresholds.
   higher_is_better = _change_threshold(
       config.MetricDirection.HIGHER_IS_BETTER)
   lower_is_better = _change_threshold(config.MetricDirection.LOWER_IS_BETTER)
   # Value thresholds.
   at_least_one = config.MetricThreshold(
       value_threshold=config.GenericValueThreshold(lower_bound={'value': 1}))
   at_most_one = config.MetricThreshold(
       value_threshold=config.GenericValueThreshold(upper_bound={'value': 1}))

   metric_configs = [
       # Diff = -.333 == -.333, OK.
       _metric_config('MeanPrediction', higher_is_better),
       # Diff = -.333 == -.333, OK.
       _metric_config('MeanLabel', lower_is_better),
       # 1 == 1, OK.
       _metric_config('ExampleCount', at_least_one),
       # 1 == 1, OK.
       _metric_config('WeightedExampleCount', at_most_one),
   ]
   eval_config = config.EvalConfig(
       model_specs=[
           config.ModelSpec(name='candidate'),
           config.ModelSpec(name='baseline', is_baseline=True),
       ],
       slicing_specs=slicing_specs,
       metrics_specs=[
           config.MetricsSpec(
               metrics=metric_configs, model_names=['candidate']),
       ])

   observed = {}
   # Candidate value, baseline value and candidate diff for each mean metric.
   for mean_name in ('mean_prediction', 'mean_label'):
     observed[metric_types.MetricKey(
         name=mean_name, model_name='candidate')] = 0.677
     observed[metric_types.MetricKey(
         name=mean_name, model_name='baseline')] = 1
     observed[metric_types.MetricKey(
         name=mean_name, is_diff=True, model_name='candidate')] = -0.333
   # Plain candidate values for the count metrics.
   for count_name in ('example_count', 'weighted_example_count'):
     observed[metric_types.MetricKey(
         name=count_name, model_name='candidate')] = 1

   result = metrics_validator.validate_metrics((slice_key, observed),
                                               eval_config)
   self.assertTrue(result.validation_ok)