def testCalculateConfidenceInterval(self):
  sampling_data_list = [
      np.array([
          [0, 0, 2, 7, 0.77777779, 1],
          [1, 0, 2, 6, 0.75, 0.85714287],
          [4, 0, 2, 3, 0.60000002, 0.42857143],
          [4, 2, 0, 3, 1, 0.42857143],
          [7, 2, 0, 0, float('nan'), 0],
      ]),
      np.array([
          [7, 2, 0, 0, float('nan'), 0],
          [0, 0, 2, 7, 0.77777779, 1],
          [1, 0, 2, 6, 0.75, 0.85714287],
          [4, 0, 2, 3, 0.60000002, 0.42857143],
          [4, 2, 0, 3, 1, 0.42857143],
      ]),
  ]
  unsampled_data = np.array([
      [4, 2, 0, 3, 1, 0.42857143],
      [7, 2, 0, 0, float('nan'), 0],
      [0, 0, 2, 7, 0.77777779, 1],
      [1, 0, 2, 6, 0.75, 0.85714287],
      [4, 0, 2, 3, 0.60000002, 0.42857143],
  ])
  result = poisson_bootstrap._calculate_t_distribution(
      sampling_data_list, unsampled_data)
  self.assertIsInstance(result, np.ndarray)
  self.assertEqual(result.shape, (5, 6))
  self.assertAlmostEqual(result[0][0].sample_mean, 3.5, delta=0.1)
  self.assertAlmostEqual(
      result[0][0].sample_standard_deviation, 4.94, delta=0.1)
  self.assertEqual(result[0][0].sample_degrees_of_freedom, 1)
  self.assertEqual(result[0][0].unsampled_value, 4.0)
  self.assertAlmostEqual(result[0][4].sample_mean, 0.77, delta=0.1)
  self.assertTrue(np.isnan(result[0][4].sample_standard_deviation))
  self.assertEqual(result[0][4].sample_degrees_of_freedom, 0)
  self.assertEqual(result[0][4].unsampled_value, 1.0)

  sampling_data_list = [
      np.array([1, 2]),
      np.array([1, 2]),
      np.array([1, float('nan')])
  ]
  unsampled_data = np.array([1, 2])
  result = poisson_bootstrap._calculate_t_distribution(
      sampling_data_list, unsampled_data)
  self.assertIsInstance(result, np.ndarray)
  self.assertEqual(result.tolist(), [
      types.ValueWithTDistribution(
          sample_mean=1.0,
          sample_standard_deviation=0.0,
          sample_degrees_of_freedom=2,
          unsampled_value=1),
      types.ValueWithTDistribution(
          sample_mean=2.0,
          sample_standard_deviation=0.0,
          sample_degrees_of_freedom=1,
          unsampled_value=2)
  ])
def testCalculateConfidenceInterval(self):
  np.testing.assert_almost_equal(
      math_util.calculate_confidence_interval(
          types.ValueWithTDistribution(10, 2, 9, 10)),
      (10, 5.4756856744035902196, 14.524314325596410669))
  mid, lb, ub = math_util.calculate_confidence_interval(
      types.ValueWithTDistribution(-1, -1, -1, -1))
  self.assertEqual(mid, -1)
  self.assertTrue(math.isnan(lb))
  self.assertTrue(math.isnan(ub))
def testCalculateConfidenceInterval(self):
  self.assertEqual(
      math_util.calculate_confidence_interval(
          types.ValueWithTDistribution(10, 2, 9, 10)),
      (10, 8.5692861880948552, 11.430713811905145))
  mean, lb, ub = math_util.calculate_confidence_interval(
      types.ValueWithTDistribution(-1, -1, -1, -1))
  self.assertEqual(mean, -1)
  self.assertTrue(math.isnan(lb))
  self.assertTrue(math.isnan(ub))
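The two tests above pin down two different interval conventions for calculate_confidence_interval. The following is a hand re-derivation of the expected bounds, not the library implementation, assuming a two-sided 95% interval computed via scipy.stats:

import scipy.stats as st

# Critical value for a two-sided 95% interval with 9 degrees of freedom.
t_crit = st.t.ppf(0.975, df=9)  # ~2.2621571

# First test: sample_standard_deviation is used directly as the standard
# error, giving 10 +/- t_crit * 2 = (5.4756857, 14.5243143).
lb = 10 - t_crit * 2
ub = 10 + t_crit * 2

# Second test: the standard deviation is first scaled to a standard error
# by sqrt(n), with n = sample_degrees_of_freedom + 1, giving
# 10 +/- t_crit * 2 / sqrt(10) = (8.5692862, 11.4307138).
lb2 = 10 - t_crit * 2 / (9 + 1)**0.5
ub2 = 10 + t_crit * 2 / (9 + 1)**0.5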
def testUncertaintyValuedMetrics(self):
  slice_key = _make_slice_key()
  slice_metrics = {
      'one_dim':
          types.ValueWithTDistribution(2.0, 1.0, 3, 2.0),
      'nans':
          types.ValueWithTDistribution(
              float('nan'), float('nan'), -1, float('nan')),
  }
  expected_metrics_for_slice = text_format.Parse(
      """
      slice_key {}
      metrics {
        key: "one_dim"
        value {
          bounded_value {
            value { value: 2.0 }
            lower_bound { value: -1.1824463 }
            upper_bound { value: 5.1824463 }
            methodology: POISSON_BOOTSTRAP
          }
        }
      }
      metrics {
        key: "nans"
        value {
          bounded_value {
            value { value: nan }
            lower_bound { value: nan }
            upper_bound { value: nan }
            methodology: POISSON_BOOTSTRAP
          }
        }
      }
      """, metrics_for_slice_pb2.MetricsForSlice())
  got = metrics_and_plots_serialization._serialize_metrics(
      (slice_key, slice_metrics), [])
  self.assertProtoEquals(
      expected_metrics_for_slice,
      metrics_for_slice_pb2.MetricsForSlice.FromString(got))
def check_result(got_pcoll):
  expected_pcoll = [
      ((slice_key1,), {
          x_key:
              types.ValueWithTDistribution(
                  sample_mean=1.5,
                  # (((100 - 100/2)/(100/2))*np.var([1, 2]))**0.5
                  sample_standard_deviation=.5,
                  sample_degrees_of_freedom=1,
                  unsampled_value=1.6),
          y_key:
              types.ValueWithTDistribution(
                  sample_mean=15,
                  # (((100 - 100/2)/(100/2))*np.var([10, 20]))**0.5
                  sample_standard_deviation=5,
                  sample_degrees_of_freedom=1,
                  unsampled_value=16),
          cm_key: cm_metric,
          example_count_key: 100,
      }),
      ((slice_key2,), {
          x_key:
              types.ValueWithTDistribution(
                  sample_mean=3,
                  # (((1000 - 1000/2)/(1000/2))*np.var([2, 4]))**0.5
                  sample_standard_deviation=1,
                  sample_degrees_of_freedom=1,
                  unsampled_value=3.3),
          y_key:
              types.ValueWithTDistribution(
                  sample_mean=30,
                  # (((1000 - 1000/2)/(1000/2))*np.var([20, 40]))**0.5
                  sample_standard_deviation=10,
                  sample_degrees_of_freedom=1,
                  unsampled_value=33),
          cm_key: cm_metric,
          example_count_key: 1000,
      }),
  ]
  self.assertCountEqual(expected_pcoll, got_pcoll)
def extract_output(self, accumulator):
  # Compute the jackknife standard error for each metric.
  # See delete-d bootstrap method described in:
  # https://www.stat.berkeley.edu/~hhuang/STAT152/Jackknife-Bootstrap.pdf
  # Rather than normalize by all possible n-choose-d samples, we normalize
  # by the actual number of samples.
  self._num_slices_counter.inc(1)
  unsampled_values = accumulator.unsampled_values
  assert _JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY in unsampled_values, (
      'Expected unsampled jackknife values to contain the example count key: '
      '"{}". Instead, found keys: {}'.format(
          _JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY, unsampled_values.keys()))
  n = unsampled_values.pop(_JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY)

  result = {}
  missing_samples = False
  # If we don't get at least one example in each sample, don't compute CI.
  if accumulator.num_samples < self._num_jackknife_samples:
    self._missing_samples_counter.inc(1)
    missing_samples = True
    result[metric_types.MetricKey(metric_keys.ERROR_METRIC)] = (
        'CI not computed because only {num_samples} samples were non-empty. '
        'Expected {num_jackknife_samples}.'.format(
            num_samples=accumulator.num_samples,
            num_jackknife_samples=self._num_jackknife_samples))

  # Set d to the expected size of a sample holdout.
  d = n / float(accumulator.num_samples)
  if d < n**0.5:
    # If d < sqrt(n), the jackknife standard error will behave poorly for
    # some metrics (including the median).
    self._small_samples_counter.inc(1)

  jackknife_scaling_factor = (n - d) / d
  dof = accumulator.num_samples - 1
  num_samples = accumulator.num_samples

  for metric_key, unsampled_value in unsampled_values.items():
    if (missing_samples or metric_key not in accumulator.sums or
        (self._skip_ci_metric_keys and
         metric_key in self._skip_ci_metric_keys)):
      result[metric_key] = unsampled_value
    else:
      mean = accumulator.sums[metric_key] / num_samples
      sum_of_squares = accumulator.sums_of_squares[metric_key]
      # one-pass variance formula with num_samples degrees of freedom
      sample_variance = sum_of_squares / float(num_samples) - mean * mean
      if sample_variance < 0:
        self._negative_variance_dist.update(n)
      standard_error = (jackknife_scaling_factor * sample_variance)**0.5
      if standard_error == 0:
        self._zero_variance_dist.update(n)
      result[metric_key] = types.ValueWithTDistribution(
          sample_mean=mean,
          sample_standard_deviation=standard_error,
          sample_degrees_of_freedom=dof,
          unsampled_value=unsampled_value)
  return result
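To make the delete-d arithmetic in extract_output concrete, here is a small self-contained sketch. All numbers are invented for illustration; none come from the source:

import numpy as np

# Hypothetical slice: 1000 examples split across 20 non-empty jackknife
# samples, with per-sample metric values drawn around 0.8.
n = 1000.0
num_samples = 20
sample_values = np.random.RandomState(0).normal(0.8, 0.01, num_samples)

d = n / num_samples                    # expected holdout size: 50.0
scaling = (n - d) / d                  # (1000 - 50) / 50 = 19.0
mean = sample_values.mean()
# Same one-pass variance as the combiner: E[x^2] - mean^2.
sample_variance = (sample_values**2).mean() - mean**2
standard_error = (scaling * sample_variance)**0.5
dof = num_samples - 1                  # 19 degrees of freedom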
def testValidateMetricsMetricTDistributionValueAndThreshold(
    self, slicing_specs, slice_key):
  threshold = config.MetricThreshold(
      value_threshold=config.GenericValueThreshold(lower_bound={'value': 0.9}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='AUC',
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ]),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (slice_key, {
      metric_types.MetricKey(name='auc'):
          types.ValueWithTDistribution(sample_mean=0.91, unsampled_value=0.8)
  })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
  expected = text_format.Parse(
      """
      metric_validations_per_slice {
        failures {
          metric_key {
            name: "auc"
          }
          metric_value {
            double_value {
              value: 0.8
            }
          }
        }
      }""", validation_result_pb2.ValidationResult())
  expected.metric_validations_per_slice[0].failures[
      0].metric_threshold.CopyFrom(threshold)
  expected.metric_validations_per_slice[0].slice_key.CopyFrom(
      slicer.serialize_slice_key(slice_key))
  for spec in slicing_specs or [None]:
    if (spec is None or
        slicer.SingleSliceSpec(spec=spec).is_slice_applicable(slice_key)):
      slicing_details = expected.validation_details.slicing_details.add()
      if spec is not None:
        slicing_details.slicing_spec.CopyFrom(spec)
      else:
        slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
      slicing_details.num_matching_slices = 1
  self.assertEqual(result, expected)
def _get_metrics_as_dict(
    metrics: metrics_for_slice_pb2.MetricsForSlice
) -> Dict[metric_types.MetricKey, types.ValueWithTDistribution]:
  """Converts slice metrics to a dict of types.ValueWithTDistribution.

  For metrics missing the confidence interval message, an empty
  ValueWithTDistribution will be created and the double_value or
  bounded_value.value will be set as the unsampled value. Any metrics which
  are not represented as double_values or bounded_values will have NaN as
  their unsampled value.

  Args:
    metrics: The MetricsForSlice proto to be converted.

  Returns:
    A dict from metric keys to ValueWithTDistributions.
  """
  result = {}
  for metric in metrics.metric_keys_and_values:
    value_type = metric.value.WhichOneof('type')
    unsampled_value = float('nan')
    if value_type == 'bounded_value':
      unsampled_value = metric.value.bounded_value.value.value
    elif value_type == 'double_value':
      unsampled_value = metric.value.double_value.value
    t_distribution_value = metric.value.confidence_interval.t_distribution_value
    result[metric_types.MetricKey.from_proto(
        metric.key)] = types.ValueWithTDistribution(
            sample_mean=t_distribution_value.sample_mean.value,
            sample_standard_deviation=t_distribution_value
            .sample_standard_deviation.value,
            sample_degrees_of_freedom=t_distribution_value
            .sample_degrees_of_freedom.value,
            unsampled_value=unsampled_value)
  return result
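A minimal usage sketch for _get_metrics_as_dict. The text-format field names below are inferred from the accessors in the function above (metric_keys_and_values, key, double_value) and are an assumption, not a verified copy of the real proto schema:

from google.protobuf import text_format

# Hypothetical input proto with a single double-valued metric.
mfs = text_format.Parse(
    """
    metric_keys_and_values {
      key { name: "accuracy" }
      value { double_value { value: 0.8 } }
    }
    """, metrics_for_slice_pb2.MetricsForSlice())
metrics = _get_metrics_as_dict(mfs)
# With no confidence_interval message present, the t-distribution fields
# read back as proto defaults (0), and only unsampled_value (0.8) carries
# information.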
def check_result(got_pcoll):
  expected_pcoll = [
      ((slice_key,), {
          metric_key:
              types.ValueWithTDistribution(
                  sample_mean=5293977041.15,
                  sample_standard_deviation=12845957824.018991,
                  sample_degrees_of_freedom=19,
                  unsampled_value=1),
      }),
  ]
  self.assertCountEqual(expected_pcoll, got_pcoll)
def check_result(got_pcoll):
  expected_pcoll = [
      {
          metric_key:
              types.ValueWithTDistribution(
                  sample_mean=5293977041.15,
                  sample_standard_deviation=3023624729.537024,
                  sample_degrees_of_freedom=19,
                  unsampled_value=1),
      },
  ]
  self.assertCountEqual(expected_pcoll, got_pcoll)
def _get_metrics_as_dict(metrics):
  """Converts slice metrics to a dict of types.ValueWithTDistribution."""
  result = {}
  for metric in metrics.metric_keys_and_values:
    value_type = metric.value.WhichOneof('type')
    if value_type == 'bounded_value':
      t_distribution_value = (
          metric.value.confidence_interval.t_distribution_value)
      result[metric.key.name] = types.ValueWithTDistribution(
          sample_mean=t_distribution_value.sample_mean.value,
          sample_standard_deviation=t_distribution_value
          .sample_standard_deviation.value,
          sample_degrees_of_freedom=t_distribution_value
          .sample_degrees_of_freedom.value,
          unsampled_value=t_distribution_value.unsampled_value.value)
    elif value_type == 'double_value':
      result[metric.key.name] = types.ValueWithTDistribution(
          sample_mean=-1,
          sample_standard_deviation=-1,
          sample_degrees_of_freedom=-1,
          unsampled_value=metric.value.double_value.value)
  return result
def _calculate_t_distribution(  # pylint: disable=invalid-name
    sampling_data_list: List[Union[int, float, np.ndarray]],
    unsampled_data: Union[int, float, np.ndarray]):
  """Calculates t-distribution statistics for the sampled data.

  Args:
    sampling_data_list: A list of numbers or np.ndarrays.
    unsampled_data: An individual number or np.ndarray. Its format should
      match the format of the elements inside sampling_data_list.

  Returns:
    A types.ValueWithTDistribution (or an np.ndarray of them, matching the
    shape of the inputs) holding the statistics needed to compute a
    confidence interval.
  """
  if isinstance(sampling_data_list[0], (np.ndarray, list)):
    merged_data = sampling_data_list[0][:]
    if isinstance(sampling_data_list[0], np.ndarray):
      merged_data = merged_data.astype(object)
    for index in range(len(merged_data)):
      merged_data[index] = _calculate_t_distribution(
          [data[index] for data in sampling_data_list], unsampled_data[index])
    return merged_data
  else:
    # Data has to be numeric, so throw out NaN values.
    sampling_data_list = [
        data for data in sampling_data_list if not np.isnan(data)
    ]
    n_samples = len(sampling_data_list)
    if n_samples:
      sample_mean = np.mean(sampling_data_list)
      sample_std = np.std(sampling_data_list, ddof=1)
      return types.ValueWithTDistribution(sample_mean, sample_std,
                                          n_samples - 1, unsampled_data)
    else:
      return types.ValueWithTDistribution(
          float('nan'), float('nan'), -1, float('nan'))
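Usage drawn from the unit test at the top of this section: NaNs in a sample position are dropped before computing the mean and standard deviation, which reduces the degrees of freedom for that position.

import numpy as np

# Three bootstrap samples; the third has a NaN in its second position.
result = poisson_bootstrap._calculate_t_distribution(
    [np.array([1, 2]), np.array([1, 2]), np.array([1, float('nan')])],
    np.array([1, 2]))
# result[0]: sample_mean=1.0, sample_standard_deviation=0.0,
#            sample_degrees_of_freedom=2, unsampled_value=1
# result[1]: the NaN is dropped, so only two samples remain and
#            sample_degrees_of_freedom drops to 1.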
def check_result(got_pcoll):
  expected_pcoll = [
      (slice_key1, {
          x_key:
              types.ValueWithTDistribution(
                  sample_mean=1.5,
                  sample_standard_deviation=0.5,
                  sample_degrees_of_freedom=1,
                  unsampled_value=1.6),
          y_key:
              types.ValueWithTDistribution(
                  sample_mean=15.,
                  sample_standard_deviation=5,
                  sample_degrees_of_freedom=1,
                  unsampled_value=16),
          cm_key:
              types.ValueWithTDistribution(
                  sample_mean=cm_metric,
                  sample_standard_deviation=(
                      binary_confusion_matrices.Matrices(
                          thresholds=[0.5], tp=[1], fp=[1], tn=[1], fn=[1])),
                  sample_degrees_of_freedom=1,
                  unsampled_value=cm_metric),
      }),
      (slice_key2, {
          x_key:
              types.ValueWithTDistribution(
                  sample_mean=3.,
                  sample_standard_deviation=1,
                  sample_degrees_of_freedom=1,
                  unsampled_value=3.3),
          y_key:
              types.ValueWithTDistribution(
                  sample_mean=30.,
                  sample_standard_deviation=10,
                  sample_degrees_of_freedom=1,
                  unsampled_value=33),
          cm_key:
              types.ValueWithTDistribution(
                  sample_mean=cm_metric,
                  sample_standard_deviation=(
                      binary_confusion_matrices.Matrices(
                          thresholds=[0.5],
                          tp=[10],
                          fp=[10],
                          tn=[10],
                          fn=[10])),
                  sample_degrees_of_freedom=1,
                  unsampled_value=cm_metric),
      }),
  ]
  self.assertCountEqual(expected_pcoll, got_pcoll)
def check_result(got_pcoll):
  expected_pcoll = [
      (slice_key1, {
          x_key:
              types.ValueWithTDistribution(
                  sample_mean=1.5,
                  # sample_standard_deviation=0.5
                  sample_standard_deviation=np.std([1, 2], ddof=1),
                  sample_degrees_of_freedom=1,
                  unsampled_value=1.6),
          y_key:
              types.ValueWithTDistribution(
                  sample_mean=15.,
                  # sample_standard_deviation=5,
                  sample_standard_deviation=np.std([10, 20], ddof=1),
                  sample_degrees_of_freedom=1,
                  unsampled_value=16),
          cm_key:
              types.ValueWithTDistribution(
                  sample_mean=cm_metric,
                  sample_standard_deviation=cm_metric * 0,
                  sample_degrees_of_freedom=1,
                  unsampled_value=cm_metric),
          skipped_metric_key: 100,
      }),
      (slice_key2, {
          x_key:
              types.ValueWithTDistribution(
                  sample_mean=3.,
                  # sample_standard_deviation=1,
                  sample_standard_deviation=np.std([2, 4], ddof=1),
                  sample_degrees_of_freedom=1,
                  unsampled_value=3.3),
          y_key:
              types.ValueWithTDistribution(
                  sample_mean=30.,
                  # sample_standard_deviation=10,
                  sample_standard_deviation=np.std([20, 40], ddof=1),
                  sample_degrees_of_freedom=1,
                  unsampled_value=33),
          cm_key:
              types.ValueWithTDistribution(
                  sample_mean=cm_metric,
                  sample_standard_deviation=cm_metric * 0,
                  sample_degrees_of_freedom=1,
                  unsampled_value=cm_metric),
          skipped_metric_key: 1000,
      }),
  ]
  self.assertCountEqual(expected_pcoll, got_pcoll)
def testCalculateConfidenceIntervalConfusionMatrices(self):
  mid, lb, ub = math_util.calculate_confidence_interval(
      types.ValueWithTDistribution(
          sample_mean=binary_confusion_matrices.Matrices(
              thresholds=[0.5], tp=[0.0], tn=[2.0], fp=[1.0], fn=[1.0]),
          sample_standard_deviation=binary_confusion_matrices.Matrices(
              thresholds=[0.5],
              tp=[0.0],
              tn=[2.051956704170308],
              fp=[1.025978352085154],
              fn=[1.2139539573337679]),
          sample_degrees_of_freedom=19,
          unsampled_value=binary_confusion_matrices.Matrices(
              thresholds=[0.5], tp=[0.0], tn=[2.0], fp=[1.0], fn=[1.0])))
  expected_mid = binary_confusion_matrices.Matrices(
      thresholds=[0.5], tp=[0.0], tn=[2.0], fp=[1.0], fn=[1.0])
  self.assertEqual(expected_mid, mid)

  expected_lb = binary_confusion_matrices.Matrices(
      thresholds=[0.5],
      tp=[0.0],
      tn=[-2.2947947404327547],
      fp=[-1.1473973702163773],
      fn=[-1.5408348336436783])
  self.assertEqual(expected_lb.thresholds, lb.thresholds)
  np.testing.assert_almost_equal(lb.tp, expected_lb.tp)
  np.testing.assert_almost_equal(lb.fp, expected_lb.fp)
  np.testing.assert_almost_equal(lb.tn, expected_lb.tn)
  np.testing.assert_almost_equal(lb.fn, expected_lb.fn)

  expected_ub = binary_confusion_matrices.Matrices(
      thresholds=[0.5],
      tp=[0.0],
      tn=[6.294794740432755],
      fp=[3.1473973702163773],
      fn=[3.5408348336436783])
  self.assertEqual(expected_ub.thresholds, ub.thresholds)
  np.testing.assert_almost_equal(ub.tp, expected_ub.tp)
  np.testing.assert_almost_equal(ub.fp, expected_ub.fp)
  np.testing.assert_almost_equal(ub.tn, expected_ub.tn)
  np.testing.assert_almost_equal(ub.fn, expected_ub.fn)
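A hand check of the tn bounds in the test above (not the implementation itself), assuming the interval is mean +/- t * sample_standard_deviation applied elementwise to each matrix cell:

import scipy.stats as st

# Critical value for a two-sided 95% interval with 19 degrees of freedom.
t_crit = st.t.ppf(0.975, df=19)               # ~2.0930241
lower_tn = 2.0 - t_crit * 2.051956704170308   # ~-2.2947947
upper_tn = 2.0 + t_crit * 2.051956704170308   # ~6.2947947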
def extract_output(self, accumulator):
  # Compute the jackknife standard error for each metric.
  # See delete-d bootstrap method described in:
  # https://www.stat.berkeley.edu/~hhuang/STAT152/Jackknife-Bootstrap.pdf
  # Rather than normalize by all possible n-choose-d samples, we normalize
  # by the actual number of samples.
  assert _JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY in accumulator.unsampled_values, (
      'Expected unsampled jackknife values to contain the example count key: '
      '"{}". Instead, found keys: {}'.format(
          _JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY,
          accumulator.unsampled_values.keys()))
  n = accumulator.unsampled_values.pop(_JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY)

  # Set d to the expected size of a sample holdout.
  d = n / float(accumulator.num_samples)
  if d < n**0.5:
    # If d < sqrt(n), the jackknife standard error will behave poorly for
    # some metrics (including the median).
    self._bad_samples_counter.inc(1)
  jackknife_scaling_factor = (n - d) / d
  dof = accumulator.num_samples - 1
  num_samples = accumulator.num_samples

  result = {}
  for metric_key, unsampled_value in accumulator.unsampled_values.items():
    if (metric_key not in accumulator.sums or
        (self._skip_ci_metric_keys and
         metric_key in self._skip_ci_metric_keys)):
      result[metric_key] = unsampled_value
    else:
      mean = accumulator.sums[metric_key] / accumulator.num_samples
      sum_of_squares = accumulator.sums_of_squares[metric_key]
      # one-pass variance formula with num_samples degrees of freedom
      sample_variance = sum_of_squares / float(num_samples) - mean * mean
      standard_error = (jackknife_scaling_factor * sample_variance)**0.5
      result[metric_key] = types.ValueWithTDistribution(
          sample_mean=mean,
          sample_standard_deviation=standard_error,
          sample_degrees_of_freedom=dof,
          unsampled_value=unsampled_value)
  return result
def extract_output(
    self,
    accumulator: confidence_intervals_util.SampleCombineFn._SampleAccumulator
) -> metric_types.MetricsDict:
  accumulator = self._validate_accumulator(accumulator)
  result = {}
  dof = self._num_samples - 1
  for key, point_estimate in accumulator.point_estimates.items():
    if key not in accumulator.metric_samples:
      result[key] = point_estimate
    else:
      mean, std_error = confidence_intervals_util.mean_and_std(
          accumulator.metric_samples[key], ddof=1)
      result[key] = types.ValueWithTDistribution(
          sample_mean=mean,
          sample_standard_deviation=std_error,
          unsampled_value=point_estimate,
          sample_degrees_of_freedom=dof)
  return result
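mean_and_std is a helper from confidence_intervals_util. A plausible scalar stand-in is sketched below as an assumption; the real helper may also handle structured metric values (e.g. confusion matrices) elementwise:

import numpy as np

# Hypothetical stand-in for confidence_intervals_util.mean_and_std.
# With ddof=1 this is the usual unbiased sample standard deviation.
def mean_and_std(values, ddof):
  values = np.asarray(values, dtype=float)
  return values.mean(), values.std(ddof=ddof)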
def extract_output(
    self,
    accumulator: confidence_intervals_util.SampleCombineFn._SampleAccumulator
) -> metric_types.MetricsDict:
  accumulator = self._validate_accumulator(accumulator)
  result = {}
  num_buckets = self._num_samples
  for key, point_estimate in accumulator.point_estimates.items():
    if key not in accumulator.metric_samples:
      result[key] = point_estimate
    else:
      # See jackknife cookie bucket method described in:
      # go/rasta-confidence-intervals
      pseudo_values = []
      total = None
      for sample_value in accumulator.metric_samples[key]:
        if total is None:
          total = sample_value
        else:
          total = total + sample_value
        pseudo_values.append(point_estimate * num_buckets -
                             sample_value * (num_buckets - 1))
      _, std_dev = confidence_intervals_util.mean_and_std(
          pseudo_values, ddof=1)
      # Here we use Student's t-distribution to estimate the standard error
      # with n - 1 degrees of freedom as S.E. = S.D. / sqrt(n).
      # In the case of the delete-d jackknife, the standard error is
      # inversely proportional to the square root of the number of data
      # partitions.
      std_error = std_dev / (num_buckets**0.5)
      mean = total / num_buckets
      result[key] = types.ValueWithTDistribution(
          sample_mean=mean,
          sample_standard_deviation=std_error,
          unsampled_value=point_estimate,
          sample_degrees_of_freedom=num_buckets - 1)
  return result
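A toy walk-through of the pseudo-value construction above, with invented numbers: given leave-one-bucket-out estimates s_i and a full-data point estimate p over n buckets, each pseudo-value is p * n - s_i * (n - 1).

import numpy as np

# Hypothetical values chosen only to exercise the formulas above.
num_buckets = 4
point_estimate = 0.80                        # metric over all buckets
bucket_estimates = [0.78, 0.82, 0.79, 0.81]  # leave-one-out metric values

pseudo_values = [
    point_estimate * num_buckets - s * (num_buckets - 1)
    for s in bucket_estimates
]
std_dev = np.std(pseudo_values, ddof=1)
std_error = std_dev / num_buckets**0.5       # S.E. = S.D. / sqrt(n)
mean = sum(bucket_estimates) / num_buckets   # matches total / num_buckets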
def testUncertaintyValuedMetrics(self):
  slice_key = _make_slice_key()
  slice_metrics = {
      'one_dim':
          types.ValueWithTDistribution(2.0, 1.0, 3, 2.0),
      'nans':
          types.ValueWithTDistribution(
              float('nan'), float('nan'), -1, float('nan')),
  }
  expected_metrics_for_slice = text_format.Parse(
      """
      slice_key {}
      metrics {
        key: "one_dim"
        value {
          bounded_value {
            value { value: 2.0 }
            lower_bound { value: -1.1824463 }
            upper_bound { value: 5.1824463 }
            methodology: POISSON_BOOTSTRAP
          }
          confidence_interval {
            lower_bound { value: -1.1824463 }
            upper_bound { value: 5.1824463 }
            t_distribution_value {
              sample_mean { value: 2.0 }
              sample_standard_deviation { value: 1.0 }
              sample_degrees_of_freedom { value: 3 }
              unsampled_value { value: 2.0 }
            }
          }
        }
      }
      metrics {
        key: "nans"
        value {
          bounded_value {
            value { value: nan }
            lower_bound { value: nan }
            upper_bound { value: nan }
            methodology: POISSON_BOOTSTRAP
          }
          confidence_interval {
            lower_bound { value: nan }
            upper_bound { value: nan }
            t_distribution_value {
              sample_mean { value: nan }
              sample_standard_deviation { value: nan }
              sample_degrees_of_freedom { value: -1 }
              unsampled_value { value: nan }
            }
          }
        }
      }
      """, metrics_for_slice_pb2.MetricsForSlice())
  got = metrics_plots_and_validations_writer.convert_slice_metrics_to_proto(
      (slice_key, slice_metrics), [])
  self.assertProtoEquals(expected_metrics_for_slice, got)
def testValidateMetricsMetricTDistributionChangeAndThreshold(
    self, slicing_specs, slice_key):
  threshold = config.MetricThreshold(
      change_threshold=config.GenericChangeThreshold(
          direction=config.MetricDirection.LOWER_IS_BETTER,
          absolute={'value': -1}))
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(),
          config.ModelSpec(name='baseline', is_baseline=True)
      ],
      slicing_specs=slicing_specs,
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='AUC',
                      threshold=threshold if slicing_specs is None else None,
                      per_slice_thresholds=[
                          config.PerSliceMetricThreshold(
                              slicing_specs=slicing_specs,
                              threshold=threshold)
                      ]),
              ],
              model_names=['']),
      ],
  )
  sliced_metrics = (
      slice_key,
      {
          # This is the mean of the diff.
          metric_types.MetricKey(name='auc', model_name='baseline'):
              types.ValueWithTDistribution(
                  sample_mean=0.91, unsampled_value=0.6),
          metric_types.MetricKey(name='auc', is_diff=True):
              types.ValueWithTDistribution(
                  sample_mean=0.1, unsampled_value=0.1),
      })
  result = metrics_validator.validate_metrics(sliced_metrics, eval_config)
  self.assertFalse(result.validation_ok)
  expected = text_format.Parse(
      """
      metric_validations_per_slice {
        failures {
          metric_key {
            name: "auc"
            is_diff: true
          }
          metric_value {
            double_value {
              value: 0.1
            }
          }
        }
      }""", validation_result_pb2.ValidationResult())
  expected.metric_validations_per_slice[0].failures[
      0].metric_threshold.CopyFrom(threshold)
  expected.metric_validations_per_slice[0].slice_key.CopyFrom(
      slicer.serialize_slice_key(slice_key))
  for spec in slicing_specs or [None]:
    if (spec is None or
        slicer.SingleSliceSpec(spec=spec).is_slice_applicable(slice_key)):
      slicing_details = expected.validation_details.slicing_details.add()
      if spec is not None:
        slicing_details.slicing_spec.CopyFrom(spec)
      else:
        slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
      slicing_details.num_matching_slices = 1
  # Protos should be compared with assertEqual; assertAlmostEqual falls back
  # to arithmetic on the operands when they differ, which protos don't
  # support.
  self.assertEqual(result, expected)
def testConvertSliceMetricsToProtoMetricsRanges(self):
  slice_key = _make_slice_key('age', 5, 'language', 'english', 'price', 0.3)
  slice_metrics = {
      'accuracy': types.ValueWithTDistribution(0.8, 0.1, 9, 0.8),
      metric_keys.AUPRC: 0.1,
      metric_keys.lower_bound_key(metric_keys.AUPRC): 0.05,
      metric_keys.upper_bound_key(metric_keys.AUPRC): 0.17,
      metric_keys.AUC: 0.2,
      metric_keys.lower_bound_key(metric_keys.AUC): 0.1,
      metric_keys.upper_bound_key(metric_keys.AUC): 0.3
  }
  expected_metrics_for_slice = text_format.Parse(
      string.Template("""
      slice_key {
        single_slice_keys {
          column: 'age'
          int64_value: 5
        }
        single_slice_keys {
          column: 'language'
          bytes_value: 'english'
        }
        single_slice_keys {
          column: 'price'
          float_value: 0.3
        }
      }
      metrics {
        key: "accuracy"
        value {
          bounded_value {
            value { value: 0.8 }
            lower_bound { value: 0.5737843 }
            upper_bound { value: 1.0262157 }
            methodology: POISSON_BOOTSTRAP
          }
          confidence_interval {
            lower_bound { value: 0.5737843 }
            upper_bound { value: 1.0262157 }
            t_distribution_value {
              sample_mean { value: 0.8 }
              sample_standard_deviation { value: 0.1 }
              sample_degrees_of_freedom { value: 9 }
              unsampled_value { value: 0.8 }
            }
          }
        }
      }
      metrics {
        key: "$auc"
        value {
          bounded_value {
            lower_bound { value: 0.1 }
            upper_bound { value: 0.3 }
            value { value: 0.2 }
            methodology: RIEMANN_SUM
          }
        }
      }
      metrics {
        key: "$auprc"
        value {
          bounded_value {
            lower_bound { value: 0.05 }
            upper_bound { value: 0.17 }
            value { value: 0.1 }
            methodology: RIEMANN_SUM
          }
        }
      }""").substitute(auc=metric_keys.AUC, auprc=metric_keys.AUPRC),
      metrics_for_slice_pb2.MetricsForSlice())
  got = metrics_plots_and_validations_writer.convert_slice_metrics_to_proto(
      (slice_key, slice_metrics),
      [post_export_metrics.auc(),
       post_export_metrics.auc(curve='PR')])
  self.assertProtoEquals(expected_metrics_for_slice, got)
def testSerializeMetricsRanges(self):
  slice_key = _make_slice_key('age', 5, 'language', 'english', 'price', 0.3)
  slice_metrics = {
      'accuracy': types.ValueWithTDistribution(0.8, 0.1, 9, 0.8),
      metric_keys.AUPRC: 0.1,
      metric_keys.lower_bound_key(metric_keys.AUPRC): 0.05,
      metric_keys.upper_bound_key(metric_keys.AUPRC): 0.17,
      metric_keys.AUC: 0.2,
      metric_keys.lower_bound_key(metric_keys.AUC): 0.1,
      metric_keys.upper_bound_key(metric_keys.AUC): 0.3
  }
  expected_metrics_for_slice = text_format.Parse(
      string.Template("""
      slice_key {
        single_slice_keys {
          column: 'age'
          int64_value: 5
        }
        single_slice_keys {
          column: 'language'
          bytes_value: 'english'
        }
        single_slice_keys {
          column: 'price'
          float_value: 0.3
        }
      }
      metrics {
        key: "accuracy"
        value {
          bounded_value {
            value { value: 0.8 }
            lower_bound { value: 0.5737843 }
            upper_bound { value: 1.0262157 }
            methodology: POISSON_BOOTSTRAP
          }
        }
      }
      metrics {
        key: "$auc"
        value {
          bounded_value {
            lower_bound { value: 0.1 }
            upper_bound { value: 0.3 }
            value { value: 0.2 }
            methodology: RIEMANN_SUM
          }
        }
      }
      metrics {
        key: "$auprc"
        value {
          bounded_value {
            lower_bound { value: 0.05 }
            upper_bound { value: 0.17 }
            value { value: 0.1 }
            methodology: RIEMANN_SUM
          }
        }
      }""").substitute(auc=metric_keys.AUC, auprc=metric_keys.AUPRC),
      metrics_for_slice_pb2.MetricsForSlice())
  got = metrics_and_plots_serialization._serialize_metrics(
      (slice_key, slice_metrics),
      [post_export_metrics.auc(),
       post_export_metrics.auc(curve='PR')])
  self.assertProtoEquals(
      expected_metrics_for_slice,
      metrics_for_slice_pb2.MetricsForSlice.FromString(got))