def testCalculateConfidenceInterval(self):
    sampling_data_list = [
        np.array([
            [0, 0, 2, 7, 0.77777779, 1],
            [1, 0, 2, 6, 0.75, 0.85714287],
            [4, 0, 2, 3, 0.60000002, 0.42857143],
            [4, 2, 0, 3, 1, 0.42857143],
            [7, 2, 0, 0, float('nan'), 0],
        ]),
        np.array([
            [7, 2, 0, 0, float('nan'), 0],
            [0, 0, 2, 7, 0.77777779, 1],
            [1, 0, 2, 6, 0.75, 0.85714287],
            [4, 0, 2, 3, 0.60000002, 0.42857143],
            [4, 2, 0, 3, 1, 0.42857143],
        ]),
    ]
    unsampled_data = np.array([
        [4, 2, 0, 3, 1, 0.42857143],
        [7, 2, 0, 0, float('nan'), 0],
        [0, 0, 2, 7, 0.77777779, 1],
        [1, 0, 2, 6, 0.75, 0.85714287],
        [4, 0, 2, 3, 0.60000002, 0.42857143],
    ])
    result = poisson_bootstrap._calculate_t_distribution(
        sampling_data_list, unsampled_data)
    self.assertIsInstance(result, np.ndarray)
    self.assertEqual(result.shape, (5, 6))
    self.assertAlmostEqual(result[0][0].sample_mean, 3.5, delta=0.1)
    self.assertAlmostEqual(
        result[0][0].sample_standard_deviation, 4.94, delta=0.1)
    self.assertEqual(result[0][0].sample_degrees_of_freedom, 1)
    self.assertEqual(result[0][0].unsampled_value, 4.0)
    self.assertAlmostEqual(result[0][4].sample_mean, 0.77, delta=0.1)
    self.assertTrue(np.isnan(result[0][4].sample_standard_deviation))
    self.assertEqual(result[0][4].sample_degrees_of_freedom, 0)
    self.assertEqual(result[0][4].unsampled_value, 1.0)

    sampling_data_list = [
        np.array([1, 2]),
        np.array([1, 2]),
        np.array([1, float('nan')])
    ]
    unsampled_data = np.array([1, 2])
    result = poisson_bootstrap._calculate_t_distribution(
        sampling_data_list, unsampled_data)
    self.assertIsInstance(result, np.ndarray)
    self.assertEqual(result.tolist(), [
        types.ValueWithTDistribution(
            sample_mean=1.0,
            sample_standard_deviation=0.0,
            sample_degrees_of_freedom=2,
            unsampled_value=1),
        types.ValueWithTDistribution(
            sample_mean=2.0,
            sample_standard_deviation=0.0,
            sample_degrees_of_freedom=1,
            unsampled_value=2)
    ])
Example 2
 def testCalculateConfidenceInterval(self):
     np.testing.assert_almost_equal(
         math_util.calculate_confidence_interval(
             types.ValueWithTDistribution(10, 2, 9, 10)),
         (10, 5.4756856744035902196, 14.524314325596410669))
     mid, lb, ub = math_util.calculate_confidence_interval(
         types.ValueWithTDistribution(-1, -1, -1, -1))
     self.assertEqual(mid, -1)
     self.assertTrue(math.isnan(lb))
     self.assertTrue(math.isnan(ub))
Example 3
 def testCalculateConfidenceInterval(self):
     self.assertEqual(
         math_util.calculate_confidence_interval(
             types.ValueWithTDistribution(10, 2, 9, 10)),
         (10, 8.5692861880948552, 11.430713811905145))
     mean, lb, ub = math_util.calculate_confidence_interval(
         types.ValueWithTDistribution(-1, -1, -1, -1))
     self.assertEqual(mean, -1)
     self.assertTrue(math.isnan(lb))
     self.assertTrue(math.isnan(ub))
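
The bounds in Examples 2 and 3 follow the standard two-sided Student-t construction. A minimal sketch, assuming SciPy is available: Example 2's numbers match mean +/- t_crit * sample_standard_deviation, and Example 3's match the same formula after the deviation is first scaled to a standard error by 1/sqrt(sample_degrees_of_freedom + 1).

from scipy import stats

def t_interval(mean, std, dof, alpha=0.05):
    # Two-sided Student-t interval: mean +/- t_{1 - alpha/2, dof} * std.
    t_crit = stats.t.ppf(1.0 - alpha / 2.0, dof)
    return mean, mean - t_crit * std, mean + t_crit * std

print(t_interval(10, 2, 9))                   # ~(10, 5.4756857, 14.5243143), as in Example 2
print(t_interval(10, 2 / (9 + 1) ** 0.5, 9))  # ~(10, 8.5692862, 11.4307138), as in Example 3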
Example 4
 def testUncertaintyValuedMetrics(self):
     slice_key = _make_slice_key()
     slice_metrics = {
         'one_dim':
         types.ValueWithTDistribution(2.0, 1.0, 3, 2.0),
         'nans':
         types.ValueWithTDistribution(float('nan'), float('nan'), -1,
                                      float('nan')),
     }
     expected_metrics_for_slice = text_format.Parse(
         """
     slice_key {}
     metrics {
       key: "one_dim"
       value {
         bounded_value {
           value {
             value: 2.0
           }
           lower_bound {
             value: -1.1824463
           }
           upper_bound {
             value: 5.1824463
           }
           methodology: POISSON_BOOTSTRAP
         }
       }
     }
     metrics {
       key: "nans"
       value {
         bounded_value {
           value {
             value: nan
           }
           lower_bound {
             value: nan
           }
           upper_bound {
             value: nan
           }
           methodology: POISSON_BOOTSTRAP
         }
       }
     }
     """, metrics_for_slice_pb2.MetricsForSlice())
     got = metrics_and_plots_serialization._serialize_metrics(
         (slice_key, slice_metrics), [])
     self.assertProtoEquals(
         expected_metrics_for_slice,
         metrics_for_slice_pb2.MetricsForSlice.FromString(got))
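
The one_dim bounds above are consistent with a 95% two-sided t-interval around the sample mean; a quick check, assuming SciPy:

from scipy import stats

# ValueWithTDistribution(2.0, 1.0, 3, 2.0):
# 2.0 +/- stats.t.ppf(0.975, 3) * 1.0  ->  (-1.1824463, 5.1824463)
print(2.0 - stats.t.ppf(0.975, 3), 2.0 + stats.t.ppf(0.975, 3))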
Example 5
 def check_result(got_pcoll):
     expected_pcoll = [
         (
             (slice_key1, ),
             {
                 x_key:
                 types.ValueWithTDistribution(
                     sample_mean=1.5,
                     # (((100 - 100/2)/(100/2))*np.var([1, 2]))**0.5
                     sample_standard_deviation=.5,
                     sample_degrees_of_freedom=1,
                     unsampled_value=1.6),
                 y_key:
                 types.ValueWithTDistribution(
                     sample_mean=15,
                     # (((100 - 100/2)/(100/2))*np.var([10, 20]))**0.5
                     sample_standard_deviation=5,
                     sample_degrees_of_freedom=1,
                     unsampled_value=16),
                 cm_key:
                 cm_metric,
                 example_count_key:
                 100,
             }),
         (
             (slice_key2, ),
             {
                 x_key:
                 types.ValueWithTDistribution(
                     sample_mean=3,
                     # (((1000 - 1000/2)/(1000/2))*np.var([2, 4]))**0.5
                     sample_standard_deviation=1,
                     sample_degrees_of_freedom=1,
                     unsampled_value=3.3),
                 y_key:
                 types.ValueWithTDistribution(
                     sample_mean=30,
                     # (((1000 - 1000/2)/(1000/2))*np.var([20, 40]))**0.5
                     sample_standard_deviation=10,
                     sample_degrees_of_freedom=1,
                     unsampled_value=33),
                 cm_key:
                 cm_metric,
                 example_count_key:
                 1000,
             }),
     ]
     self.assertCountEqual(expected_pcoll, got_pcoll)
Example 6
    def extract_output(self, accumulator):
        # Compute the jackknife standard error for each metric.
        # See delete-d bootstrap method described in:
        # https://www.stat.berkeley.edu/~hhuang/STAT152/Jackknife-Bootstrap.pdf
        # Rather than normalize by all possible n-choose-d samples, we normalize by
        # the actual number of samples.
        self._num_slices_counter.inc(1)
        unsampled_values = accumulator.unsampled_values
        assert _JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY in unsampled_values, (
            'Expected unsampled jackknife values to contain the example count key: '
            '"{}". Instead, found keys: {}'.format(
                _JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY, unsampled_values.keys()))
        n = unsampled_values.pop(_JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY)

        result = {}
        missing_samples = False
        # If we don't get at least one example in each sample, don't compute CI.
        if accumulator.num_samples < self._num_jackknife_samples:
            self._missing_samples_counter.inc(1)
            missing_samples = True
            result[metric_types.MetricKey(metric_keys.ERROR_METRIC)] = (
                'CI not computed because only {num_samples} samples were non-empty. '
                'Expected {num_jackknife_samples}.'.format(
                    num_samples=accumulator.num_samples,
                    num_jackknife_samples=self._num_jackknife_samples))

        # set d to expected size of a sample holdout
        d = n / float(accumulator.num_samples)
        if d < n**0.5:
            # if d < sqrt(n) the jackknife standard error will behave poorly for some
            # metrics (including the median).
            self._small_samples_counter.inc(1)

        jackknife_scaling_factor = (n - d) / d
        dof = accumulator.num_samples - 1
        num_samples = accumulator.num_samples

        for metric_key, unsampled_value in unsampled_values.items():
            if (missing_samples or metric_key not in accumulator.sums
                    or (self._skip_ci_metric_keys
                        and metric_key in self._skip_ci_metric_keys)):
                result[metric_key] = unsampled_value
            else:
                mean = accumulator.sums[metric_key] / num_samples
                sum_of_squares = accumulator.sums_of_squares[metric_key]
                # one-pass variance formula with num_samples degrees of freedom
                sample_variance = sum_of_squares / float(
                    num_samples) - mean * mean
                if sample_variance < 0:
                    self._negative_variance_dist.update(n)
                standard_error = (jackknife_scaling_factor *
                                  sample_variance)**0.5
                if standard_error == 0:
                    self._zero_variance_dist.update(n)
                result[metric_key] = types.ValueWithTDistribution(
                    sample_mean=mean,
                    sample_standard_deviation=standard_error,
                    sample_degrees_of_freedom=dof,
                    unsampled_value=unsampled_value)
        return result
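
The combiner above implements the delete-d jackknife standard error. A self-contained sketch with hypothetical numbers (the example count, per-sample metric values, and the metric itself are assumptions, not values from the tests above):

import numpy as np

n = 1000.0                                    # total example count
samples = np.array([0.61, 0.59, 0.63, 0.58])  # metric value per jackknife sample
num_samples = len(samples)
d = n / num_samples                           # expected size of each sample holdout
mean = samples.sum() / num_samples
sum_of_squares = float((samples ** 2).sum())
# One-pass variance with num_samples (not num_samples - 1) in the denominator.
sample_variance = sum_of_squares / num_samples - mean * mean
standard_error = (((n - d) / d) * sample_variance) ** 0.5
dof = num_samples - 1  # degrees of freedom reported with the t-distribution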
Example 7

 def testValidateMetricsMetricTDistributionValueAndThreshold(
         self, slicing_specs, slice_key):
     threshold = config.MetricThreshold(
         value_threshold=config.GenericValueThreshold(
             lower_bound={'value': 0.9}))
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
         ],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(metrics=[
                 config.MetricConfig(
                     class_name='AUC',
                     threshold=threshold if slicing_specs is None else None,
                     per_slice_thresholds=[
                         config.PerSliceMetricThreshold(
                             slicing_specs=slicing_specs,
                             threshold=threshold)
                     ]),
             ],
                                model_names=['']),
         ],
     )
     sliced_metrics = (slice_key, {
         metric_types.MetricKey(name='auc'):
         types.ValueWithTDistribution(sample_mean=0.91, unsampled_value=0.8)
     })
     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertFalse(result.validation_ok)
     expected = text_format.Parse(
         """
     metric_validations_per_slice {
       failures {
         metric_key {
           name: "auc"
         }
         metric_value {
           double_value {
             value: 0.8
           }
         }
       }
     }""", validation_result_pb2.ValidationResult())
     expected.metric_validations_per_slice[0].failures[
         0].metric_threshold.CopyFrom(threshold)
     expected.metric_validations_per_slice[0].slice_key.CopyFrom(
         slicer.serialize_slice_key(slice_key))
     for spec in slicing_specs or [None]:
         if (spec is None or slicer.SingleSliceSpec(
                 spec=spec).is_slice_applicable(slice_key)):
             slicing_details = expected.validation_details.slicing_details.add(
             )
             if spec is not None:
                 slicing_details.slicing_spec.CopyFrom(spec)
             else:
                 slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
             slicing_details.num_matching_slices = 1
     self.assertEqual(result, expected)
Example 8
def _get_metrics_as_dict(
    metrics: metrics_for_slice_pb2.MetricsForSlice
) -> Dict[metric_types.MetricKey, types.ValueWithTDistribution]:
    """Convert slice metrics to a Dict of types.ValueWithTDistribution.

  For metrics missing the confidence interval message, an empty
  ValueWithTDistribution will be created and the double_value or
  bounded_value.value will be set as the unsampled value. Any metrics which are
  not represented as double_values or bounded_values will be omitted from the
  result.

  Args:
    metrics: The MetricsForSlice proto to be converted.

  Returns:
    A dict from metric keys to ValueWithTDistributions.
  """
    result = {}
    for metric in metrics.metric_keys_and_values:
        value_type = metric.value.WhichOneof('type')
        unsampled_value = float('nan')
        if value_type == 'bounded_value':
            unsampled_value = metric.value.bounded_value.value.value
        elif value_type == 'double_value':
            unsampled_value = metric.value.double_value.value
        t_distribution_value = metric.value.confidence_interval.t_distribution_value
        result[metric_types.MetricKey.from_proto(
            metric.key)] = types.ValueWithTDistribution(
                sample_mean=t_distribution_value.sample_mean.value,
                sample_standard_deviation=t_distribution_value.
                sample_standard_deviation.value,
                sample_degrees_of_freedom=t_distribution_value.
                sample_degrees_of_freedom.value,
                unsampled_value=unsampled_value)
    return result
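
A minimal usage sketch for the helper above, assuming the MetricsForSlice layout it reads (the 'auc' key is hypothetical):

metrics = metrics_for_slice_pb2.MetricsForSlice()
kv = metrics.metric_keys_and_values.add()
kv.key.name = 'auc'
kv.value.double_value.value = 0.8

by_key = _get_metrics_as_dict(metrics)
# The double_value becomes unsampled_value; the t-distribution fields default
# to 0.0 because no confidence_interval message was set.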
Example 9
 def check_result(got_pcoll):
     expected_pcoll = [
         ((slice_key, ), {
             metric_key:
             types.ValueWithTDistribution(
                 sample_mean=5293977041.15,
                 sample_standard_deviation=12845957824.018991,
                 sample_degrees_of_freedom=19,
                 unsampled_value=1),
         }),
     ]
     self.assertCountEqual(expected_pcoll, got_pcoll)
Example 10

 def check_result(got_pcoll):
   expected_pcoll = [
       {
           metric_key:
               types.ValueWithTDistribution(
                   sample_mean=5293977041.15,
                   sample_standard_deviation=3023624729.537024,
                   sample_degrees_of_freedom=19,
                   unsampled_value=1),
       },
   ]
   self.assertCountEqual(expected_pcoll, got_pcoll)
Example 11

def _get_metrics_as_dict(metrics):
    """Convert slice metrics to a Dict of types.ValueWithTDistribution."""
    result = {}
    for metric in metrics.metric_keys_and_values:
        value_type = metric.value.WhichOneof('type')
        if value_type == 'bounded_value':
            t_distribution_value = (
                metric.value.confidence_interval.t_distribution_value)
            result[metric.key.name] = types.ValueWithTDistribution(
                sample_mean=t_distribution_value.sample_mean.value,
                sample_standard_deviation=t_distribution_value.
                sample_standard_deviation.value,
                sample_degrees_of_freedom=t_distribution_value.
                sample_degrees_of_freedom.value,
                unsampled_value=t_distribution_value.unsampled_value.value)
        elif value_type == 'double_value':
            result[metric.key.name] = types.ValueWithTDistribution(
                sample_mean=-1,
                sample_standard_deviation=-1,
                sample_degrees_of_freedom=-1,
                unsampled_value=metric.value.double_value.value)
    return result
Example 12
def _calculate_t_distribution(  # pylint: disable=invalid-name
        sampling_data_list: List[Union[int, float, np.ndarray]],
        unsampled_data: Union[int, float, np.ndarray]):
    """Calculate the confidence interval of the data.

  Args:
    sampling_data_list: A list of number or np.ndarray.
    unsampled_data: Individual number or np.ndarray. The format of the
      unsampled_data should match the format of the element inside
      sampling_data_list.

  Returns:
    The t-distribution statistics needed to derive a confidence interval,
    stored inside types.ValueWithTDistribution.
  """
    if isinstance(sampling_data_list[0], (np.ndarray, list)):
        merged_data = sampling_data_list[0][:]
        if isinstance(sampling_data_list[0], np.ndarray):
            merged_data = merged_data.astype(object)
        for index in range(len(merged_data)):
            merged_data[index] = _calculate_t_distribution(
                [data[index] for data in sampling_data_list],
                unsampled_data[index])
        return merged_data
    else:
        # Data has to be numeric, so drop NaN values first.
        sampling_data_list = [
            data for data in sampling_data_list if not np.isnan(data)
        ]
        n_samples = len(sampling_data_list)
        if n_samples:
            sample_mean = np.mean(sampling_data_list)
            sample_std = np.std(sampling_data_list, ddof=1)
            return types.ValueWithTDistribution(sample_mean, sample_std,
                                                n_samples - 1, unsampled_data)
        else:
            return types.ValueWithTDistribution(float('nan'), float('nan'), -1,
                                                float('nan'))
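
For a quick sense of the scalar branch, assuming the function above is importable:

v = _calculate_t_distribution([1.0, 2.0, 3.0], 2.0)
# v.sample_mean == 2.0
# v.sample_standard_deviation == 1.0  (np.std([1.0, 2.0, 3.0], ddof=1))
# v.sample_degrees_of_freedom == 2
# v.unsampled_value == 2.0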
Example 13
 def check_result(got_pcoll):
     expected_pcoll = [
         (slice_key1, {
             x_key:
             types.ValueWithTDistribution(
                 sample_mean=1.5,
                 sample_standard_deviation=0.5,
                 sample_degrees_of_freedom=1,
                 unsampled_value=1.6),
             y_key:
             types.ValueWithTDistribution(
                 sample_mean=15.,
                 sample_standard_deviation=5,
                 sample_degrees_of_freedom=1,
                 unsampled_value=16),
             cm_key:
             types.ValueWithTDistribution(
                 sample_mean=cm_metric,
                 sample_standard_deviation=(
                     binary_confusion_matrices.Matrices(
                         thresholds=[0.5],
                         tp=[1],
                         fp=[1],
                         tn=[1],
                         fn=[1])),
                 sample_degrees_of_freedom=1,
                 unsampled_value=cm_metric),
         }),
         (slice_key2, {
             x_key:
             types.ValueWithTDistribution(
                 sample_mean=3.,
                 sample_standard_deviation=1,
                 sample_degrees_of_freedom=1,
                 unsampled_value=3.3),
             y_key:
             types.ValueWithTDistribution(
                 sample_mean=30.,
                 sample_standard_deviation=10,
                 sample_degrees_of_freedom=1,
                 unsampled_value=33),
             cm_key:
             types.ValueWithTDistribution(
                 sample_mean=cm_metric,
                 sample_standard_deviation=(
                     binary_confusion_matrices.Matrices(
                         thresholds=[0.5],
                         tp=[10],
                         fp=[10],
                         tn=[10],
                         fn=[10])),
                 sample_degrees_of_freedom=1,
                 unsampled_value=cm_metric),
         }),
     ]
     self.assertCountEqual(expected_pcoll, got_pcoll)
Example 14

 def check_result(got_pcoll):
   expected_pcoll = [
       (
           slice_key1,
           {
               x_key:
                   types.ValueWithTDistribution(
                       sample_mean=1.5,
                       # sample_standard_deviation=0.5
                       sample_standard_deviation=np.std([1, 2], ddof=1),
                       sample_degrees_of_freedom=1,
                       unsampled_value=1.6),
               y_key:
                   types.ValueWithTDistribution(
                       sample_mean=15.,
                       # sample_standard_deviation=5,
                       sample_standard_deviation=np.std([10, 20], ddof=1),
                       sample_degrees_of_freedom=1,
                       unsampled_value=16),
               cm_key:
                   types.ValueWithTDistribution(
                       sample_mean=cm_metric,
                       sample_standard_deviation=cm_metric * 0,
                       sample_degrees_of_freedom=1,
                       unsampled_value=cm_metric),
               skipped_metric_key:
                   100,
           }),
       (
           slice_key2,
           {
               x_key:
                   types.ValueWithTDistribution(
                       sample_mean=3.,
                       # sample_standard_deviation=1,
                       sample_standard_deviation=np.std([2, 4], ddof=1),
                       sample_degrees_of_freedom=1,
                       unsampled_value=3.3),
               y_key:
                   types.ValueWithTDistribution(
                       sample_mean=30.,
                       # sample_standard_deviation=10,
                       sample_standard_deviation=np.std([20, 40], ddof=1),
                       sample_degrees_of_freedom=1,
                       unsampled_value=33),
               cm_key:
                   types.ValueWithTDistribution(
                       sample_mean=cm_metric,
                       sample_standard_deviation=cm_metric * 0,
                       sample_degrees_of_freedom=1,
                       unsampled_value=cm_metric),
               skipped_metric_key:
                   1000,
           }),
   ]
   self.assertCountEqual(expected_pcoll, got_pcoll)
Example 15
    def testCalculateConfidenceIntervalConfusionMatrices(self):
        mid, lb, ub = math_util.calculate_confidence_interval(
            types.ValueWithTDistribution(
                sample_mean=binary_confusion_matrices.Matrices(
                    thresholds=[0.5], tp=[0.0], tn=[2.0], fp=[1.0], fn=[1.0]),
                sample_standard_deviation=binary_confusion_matrices.Matrices(
                    thresholds=[0.5],
                    tp=[0.0],
                    tn=[2.051956704170308],
                    fp=[1.025978352085154],
                    fn=[1.2139539573337679]),
                sample_degrees_of_freedom=19,
                unsampled_value=binary_confusion_matrices.Matrices(
                    thresholds=[0.5], tp=[0.0], tn=[2.0], fp=[1.0], fn=[1.0])))

        expected_mid = binary_confusion_matrices.Matrices(thresholds=[0.5],
                                                          tp=[0.0],
                                                          tn=[2.0],
                                                          fp=[1.0],
                                                          fn=[1.0])
        self.assertEqual(expected_mid, mid)

        expected_lb = binary_confusion_matrices.Matrices(
            thresholds=[0.5],
            tp=[0.0],
            tn=[-2.2947947404327547],
            fp=[-1.1473973702163773],
            fn=[-1.5408348336436783])
        self.assertEqual(expected_lb.thresholds, lb.thresholds)
        np.testing.assert_almost_equal(lb.tp, expected_lb.tp)
        np.testing.assert_almost_equal(lb.fp, expected_lb.fp)
        np.testing.assert_almost_equal(lb.tn, expected_lb.tn)
        np.testing.assert_almost_equal(lb.fn, expected_lb.fn)

        expected_ub = binary_confusion_matrices.Matrices(
            thresholds=[0.5],
            tp=[0.0],
            tn=[6.294794740432755],
            fp=[3.1473973702163773],
            fn=[3.5408348336436783])
        self.assertEqual(expected_ub.thresholds, ub.thresholds)
        np.testing.assert_almost_equal(ub.tp, expected_ub.tp)
        np.testing.assert_almost_equal(ub.fp, expected_ub.fp)
        np.testing.assert_almost_equal(ub.tn, expected_ub.tn)
        np.testing.assert_almost_equal(ub.fn, expected_ub.fn)
Example 16
    def extract_output(self, accumulator):
        # Compute the jackknife standard error for each metric.
        # See delete-d bootstrap method described in:
        # https://www.stat.berkeley.edu/~hhuang/STAT152/Jackknife-Bootstrap.pdf
        # Rather than normalize by all possible n-choose-d samples, we normalize by
        # the actual number of samples.
        assert _JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY in accumulator.unsampled_values, (
            'Expected unsampled jackknife values to contain the example count '
            'key: "{}". Instead, found keys: {}'.format(
                _JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY,
                accumulator.unsampled_values.keys()))
        n = accumulator.unsampled_values.pop(
            _JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY)
        # set d to expected size of a sample holdout
        d = n / float(accumulator.num_samples)
        if d < n**0.5:
            # if d < sqrt(n) the jackknife standard error will behave poorly for some
            # metrics (including the median).
            self._bad_samples_counter.inc(1)

        jackknife_scaling_factor = (n - d) / d
        dof = accumulator.num_samples - 1
        num_samples = accumulator.num_samples

        result = {}
        for metric_key, unsampled_value in accumulator.unsampled_values.items(
        ):
            if (metric_key not in accumulator.sums
                    or (self._skip_ci_metric_keys
                        and metric_key in self._skip_ci_metric_keys)):
                result[metric_key] = unsampled_value
            else:
                mean = accumulator.sums[metric_key] / accumulator.num_samples
                sum_of_squares = accumulator.sums_of_squares[metric_key]
                # one-pass variance formula with num_samples degrees of freedom
                sample_variance = sum_of_squares / float(
                    num_samples) - mean * mean
                standard_error = (jackknife_scaling_factor *
                                  sample_variance)**0.5
                result[metric_key] = types.ValueWithTDistribution(
                    sample_mean=mean,
                    sample_standard_deviation=standard_error,
                    sample_degrees_of_freedom=dof,
                    unsampled_value=unsampled_value)
        return result
Example 17
 def extract_output(
     self,
     accumulator: confidence_intervals_util.SampleCombineFn._SampleAccumulator
 ) -> metric_types.MetricsDict:
   accumulator = self._validate_accumulator(accumulator)
   result = {}
   dof = self._num_samples - 1
   for key, point_estimate in accumulator.point_estimates.items():
     if key not in accumulator.metric_samples:
       result[key] = point_estimate
     else:
       mean, std_error = confidence_intervals_util.mean_and_std(
           accumulator.metric_samples[key], ddof=1)
       result[key] = types.ValueWithTDistribution(
           sample_mean=mean,
           sample_standard_deviation=std_error,
           unsampled_value=point_estimate,
           sample_degrees_of_freedom=dof)
   return result
Example 18
 def extract_output(
     self,
     accumulator: confidence_intervals_util.SampleCombineFn._SampleAccumulator
 ) -> metric_types.MetricsDict:
   accumulator = self._validate_accumulator(accumulator)
   result = {}
   num_buckets = self._num_samples
   for key, point_estimate in accumulator.point_estimates.items():
     if key not in accumulator.metric_samples:
       result[key] = point_estimate
     else:
       # See jackknife cookie bucket method described in:
       # go/rasta-confidence-intervals
       pseudo_values = []
       total = None
       for sample_value in accumulator.metric_samples[key]:
         if total is None:
           total = sample_value
         else:
           total = total + sample_value
         pseudo_values.append(point_estimate * num_buckets - sample_value *
                              (num_buckets - 1))
       _, std_dev = confidence_intervals_util.mean_and_std(
           pseudo_values, ddof=1)
        # Here we use Student's t-distribution to estimate the standard
        # error with n - 1 degrees of freedom as S.E. = S.D. / sqrt(n).
        # In the case of the delete-d jackknife, the standard error is
        # inversely proportional to the square root of the number of data
        # partitions.
       std_error = std_dev / (num_buckets**0.5)
       mean = total / num_buckets
       result[key] = types.ValueWithTDistribution(
           sample_mean=mean,
           sample_standard_deviation=std_error,
           unsampled_value=point_estimate,
           sample_degrees_of_freedom=num_buckets - 1)
   return result
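
A toy rendering of the pseudo-value loop above (all numbers hypothetical):

import numpy as np

num_buckets = 4
point_estimate = 0.60                        # metric over the full dataset
bucket_estimates = [0.58, 0.61, 0.59, 0.62]  # metric per leave-one-bucket-out sample
pseudo_values = [num_buckets * point_estimate - (num_buckets - 1) * s
                 for s in bucket_estimates]
std_dev = np.std(pseudo_values, ddof=1)
std_error = std_dev / num_buckets ** 0.5     # S.E. = S.D. / sqrt(n)
mean = np.mean(bucket_estimates)             # equals total / num_buckets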
Example 19
 def testUncertaintyValuedMetrics(self):
   slice_key = _make_slice_key()
   slice_metrics = {
       'one_dim':
           types.ValueWithTDistribution(2.0, 1.0, 3, 2.0),
       'nans':
           types.ValueWithTDistribution(
               float('nan'), float('nan'), -1, float('nan')),
   }
   expected_metrics_for_slice = text_format.Parse(
       """
       slice_key {}
       metrics {
         key: "one_dim"
         value {
           bounded_value {
             value {
               value: 2.0
             }
             lower_bound {
               value: -1.1824463
             }
             upper_bound {
               value: 5.1824463
             }
             methodology: POISSON_BOOTSTRAP
           }
           confidence_interval {
             lower_bound {
               value: -1.1824463
             }
             upper_bound {
               value: 5.1824463
             }
             t_distribution_value {
               sample_mean {
                 value: 2.0
               }
               sample_standard_deviation {
                 value: 1.0
               }
               sample_degrees_of_freedom {
                 value: 3
               }
               unsampled_value {
                 value: 2.0
               }
             }
           }
         }
       }
       metrics {
         key: "nans"
         value {
           bounded_value {
             value {
               value: nan
             }
             lower_bound {
               value: nan
             }
             upper_bound {
               value: nan
             }
             methodology: POISSON_BOOTSTRAP
           }
           confidence_interval {
             lower_bound {
               value: nan
             }
             upper_bound {
               value: nan
             }
             t_distribution_value {
               sample_mean {
                 value: nan
               }
               sample_standard_deviation {
                 value: nan
               }
               sample_degrees_of_freedom {
                 value: -1
               }
               unsampled_value {
                 value: nan
               }
             }
           }
         }
       }
       """, metrics_for_slice_pb2.MetricsForSlice())
   got = metrics_plots_and_validations_writer.convert_slice_metrics_to_proto(
       (slice_key, slice_metrics), [])
   self.assertProtoEquals(expected_metrics_for_slice, got)
Example 20

 def testValidateMetricsMetricTDistributionChangeAndThreshold(
         self, slicing_specs, slice_key):
     threshold = config.MetricThreshold(
         change_threshold=config.GenericChangeThreshold(
             direction=config.MetricDirection.LOWER_IS_BETTER,
             absolute={'value': -1}))
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
             config.ModelSpec(name='baseline', is_baseline=True)
         ],
         slicing_specs=slicing_specs,
         metrics_specs=[
             config.MetricsSpec(metrics=[
                 config.MetricConfig(
                     class_name='AUC',
                     threshold=threshold if slicing_specs is None else None,
                     per_slice_thresholds=[
                         config.PerSliceMetricThreshold(
                             slicing_specs=slicing_specs,
                             threshold=threshold)
                     ]),
             ],
                                model_names=['']),
         ],
     )
     sliced_metrics = (
         slice_key,
         {
             # This is the mean of the diff.
             metric_types.MetricKey(name='auc', model_name='baseline'):
             types.ValueWithTDistribution(sample_mean=0.91,
                                          unsampled_value=0.6),
             metric_types.MetricKey(name='auc', is_diff=True):
             types.ValueWithTDistribution(sample_mean=0.1,
                                          unsampled_value=0.1),
         })
     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertFalse(result.validation_ok)
     expected = text_format.Parse(
         """
     metric_validations_per_slice {
       failures {
         metric_key {
           name: "auc"
           is_diff: true
         }
         metric_value {
           double_value {
             value: 0.1
           }
         }
       }
     }""", validation_result_pb2.ValidationResult())
     expected.metric_validations_per_slice[0].failures[
         0].metric_threshold.CopyFrom(threshold)
     expected.metric_validations_per_slice[0].slice_key.CopyFrom(
         slicer.serialize_slice_key(slice_key))
     for spec in slicing_specs or [None]:
         if (spec is None or slicer.SingleSliceSpec(
                 spec=spec).is_slice_applicable(slice_key)):
             slicing_details = expected.validation_details.slicing_details.add(
             )
             if spec is not None:
                 slicing_details.slicing_spec.CopyFrom(spec)
             else:
                 slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
             slicing_details.num_matching_slices = 1
     self.assertEqual(result, expected)
Example 21
  def testConvertSliceMetricsToProtoMetricsRanges(self):
    slice_key = _make_slice_key('age', 5, 'language', 'english', 'price', 0.3)
    slice_metrics = {
        'accuracy': types.ValueWithTDistribution(0.8, 0.1, 9, 0.8),
        metric_keys.AUPRC: 0.1,
        metric_keys.lower_bound_key(metric_keys.AUPRC): 0.05,
        metric_keys.upper_bound_key(metric_keys.AUPRC): 0.17,
        metric_keys.AUC: 0.2,
        metric_keys.lower_bound_key(metric_keys.AUC): 0.1,
        metric_keys.upper_bound_key(metric_keys.AUC): 0.3
    }
    expected_metrics_for_slice = text_format.Parse(
        string.Template("""
        slice_key {
          single_slice_keys {
            column: 'age'
            int64_value: 5
          }
          single_slice_keys {
            column: 'language'
            bytes_value: 'english'
          }
          single_slice_keys {
            column: 'price'
            float_value: 0.3
          }
        }
        metrics {
          key: "accuracy"
          value {
            bounded_value {
              value {
                value: 0.8
              }
              lower_bound {
                value: 0.5737843
              }
              upper_bound {
                value: 1.0262157
              }
              methodology: POISSON_BOOTSTRAP
            }
            confidence_interval {
              lower_bound {
                value: 0.5737843
              }
              upper_bound {
                value: 1.0262157
              }
              t_distribution_value {
                sample_mean {
                  value: 0.8
                }
                sample_standard_deviation {
                  value: 0.1
                }
                sample_degrees_of_freedom {
                  value: 9
                }
                unsampled_value {
                  value: 0.8
                }
              }
            }
          }
        }
        metrics {
          key: "$auc"
          value {
            bounded_value {
              lower_bound {
                value: 0.1
              }
              upper_bound {
                value: 0.3
              }
              value {
                value: 0.2
              }
              methodology: RIEMANN_SUM
            }
          }
        }
        metrics {
          key: "$auprc"
          value {
            bounded_value {
              lower_bound {
                value: 0.05
              }
              upper_bound {
                value: 0.17
              }
              value {
                value: 0.1
              }
              methodology: RIEMANN_SUM
            }
          }
        }""").substitute(auc=metric_keys.AUC, auprc=metric_keys.AUPRC),
        metrics_for_slice_pb2.MetricsForSlice())

    got = metrics_plots_and_validations_writer.convert_slice_metrics_to_proto(
        (slice_key, slice_metrics),
        [post_export_metrics.auc(),
         post_export_metrics.auc(curve='PR')])
    self.assertProtoEquals(expected_metrics_for_slice, got)
Example 22
    def testSerializeMetricsRanges(self):
        slice_key = _make_slice_key('age', 5, 'language', 'english', 'price',
                                    0.3)
        slice_metrics = {
            'accuracy': types.ValueWithTDistribution(0.8, 0.1, 9, 0.8),
            metric_keys.AUPRC: 0.1,
            metric_keys.lower_bound_key(metric_keys.AUPRC): 0.05,
            metric_keys.upper_bound_key(metric_keys.AUPRC): 0.17,
            metric_keys.AUC: 0.2,
            metric_keys.lower_bound_key(metric_keys.AUC): 0.1,
            metric_keys.upper_bound_key(metric_keys.AUC): 0.3
        }
        expected_metrics_for_slice = text_format.Parse(
            string.Template("""
        slice_key {
          single_slice_keys {
            column: 'age'
            int64_value: 5
          }
          single_slice_keys {
            column: 'language'
            bytes_value: 'english'
          }
          single_slice_keys {
            column: 'price'
            float_value: 0.3
          }
        }
        metrics {
          key: "accuracy"
          value {
            bounded_value {
              value {
                value: 0.8
              }
              lower_bound {
                value: 0.5737843
              }
              upper_bound {
                value: 1.0262157
              }
              methodology: POISSON_BOOTSTRAP
            }
          }
        }
        metrics {
          key: "$auc"
          value {
            bounded_value {
              lower_bound {
                value: 0.1
              }
              upper_bound {
                value: 0.3
              }
              value {
                value: 0.2
              }
              methodology: RIEMANN_SUM
            }
          }
        }
        metrics {
          key: "$auprc"
          value {
            bounded_value {
              lower_bound {
                value: 0.05
              }
              upper_bound {
                value: 0.17
              }
              value {
                value: 0.1
              }
              methodology: RIEMANN_SUM
            }
          }
        }""").substitute(auc=metric_keys.AUC, auprc=metric_keys.AUPRC),
            metrics_for_slice_pb2.MetricsForSlice())

        got = metrics_and_plots_serialization._serialize_metrics(
            (slice_key, slice_metrics),
            [post_export_metrics.auc(),
             post_export_metrics.auc(curve='PR')])
        self.assertProtoEquals(
            expected_metrics_for_slice,
            metrics_for_slice_pb2.MetricsForSlice.FromString(got))