Esempio n. 1
0
def _example_count(
    name: Text = EXAMPLE_COUNT_NAME) -> metric_types.MetricComputations:
  """Builds the computation used to count examples.

  Args:
    name: Name placed on the metric key of the computation.

  Returns:
    A one-element list holding a MetricComputation that pairs an
    _ExampleCountPreprocessor with an _ExampleCountCombiner for that key.
  """
  count_key = metric_types.MetricKey(name=name)
  computation = metric_types.MetricComputation(
      keys=[count_key],
      preprocessor=_ExampleCountPreprocessor(),
      combiner=_ExampleCountCombiner(count_key))
  return [computation]
            def check_metrics(got):
                """Verifies the single overall slice and its metric values.

                Args:
                    got: Sequence of (slice_key, metrics) pairs under test.

                Raises:
                    util.BeamAssertException: If any assertion fails.
                """
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())
                    example_count_key = metric_types.MetricKey(
                        name='example_count')
                    weighted_example_count_key = metric_types.MetricKey(
                        name='weighted_example_count')
                    label_key = metric_types.MetricKey(name='mean_label')
                    expected = {
                        example_count_key: 2,
                        weighted_example_count_key: (1.0 + 0.5),
                        label_key: (1.0 * 1.0 + 0.0 * 0.5) / (1.0 + 0.5),
                    }
                    self.assertDictElementsAlmostEqual(got_metrics, expected)

                except AssertionError as err:
                    # Chain explicitly so the failed assertion is recorded as
                    # the cause of the re-raised exception.
                    raise util.BeamAssertException(err) from err
Esempio n. 3
0
 def testMetricKeyFromProto(self):
     """Checks that MetricKeys survive a to_proto/from_proto round trip."""
     basic_key = metric_types.MetricKey(name='metric_name')
     diff_key = metric_types.MetricKey(
         name='metric_name',
         model_name='model_name',
         output_name='output_name',
         sub_key=metric_types.SubKey(class_id=1),
         is_diff=True)
     micro_average_key = metric_types.MetricKey(
         name='metric_name',
         model_name='model_name',
         output_name='output_name',
         sub_key=metric_types.SubKey(top_k=2),
         aggregation_type=metric_types.AggregationType(micro_average=True))
     for original in [basic_key, diff_key, micro_average_key]:
         round_tripped = metric_types.MetricKey.from_proto(
             original.to_proto())
         self.assertEqual(original, round_tripped,
                          '{} != {}'.format(original, round_tripped))
Esempio n. 4
0
            def check_result(got):
                """Verifies the overall slice has 8 metrics and NaN lookups.

                Args:
                    got: Sequence of (slice_key, metrics) pairs under test.

                Raises:
                    util.BeamAssertException: If any assertion fails.
                """
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())
                    self.assertLen(got_metrics, 8)  # 1 threshold * 8 metrics
                    # NOTE(review): both metric names below look mangled by an
                    # e-mail obfuscator ("[email protected]"); confirm the
                    # intended names against the upstream test.
                    self.assertTrue(
                        math.isnan(got_metrics[metric_types.MetricKey(
                            name=
                            'fairness_indicators_metrics/[email protected]'
                        )]))
                    self.assertTrue(
                        math.isnan(got_metrics[metric_types.MetricKey(
                            name=
                            'fairness_indicators_metrics/[email protected]'
                        )]))

                except AssertionError as err:
                    # Chain explicitly so the failed assertion is recorded as
                    # the cause of the re-raised exception.
                    raise util.BeamAssertException(err) from err
            def check_result(got):
                """Verifies the overall slice's accuracy and mean metrics.

                Args:
                    got: Sequence of (slice_key, metrics) pairs under test.

                Raises:
                    util.BeamAssertException: If any assertion fails.
                """
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())
                    expected = {
                        metric_types.MetricKey(name='accuracy'): 1.0,
                        metric_types.MetricKey(name='label/mean'): 0.5,
                        metric_types.MetricKey(name='my_mean_age'): 3.75,
                        metric_types.MetricKey(
                            name='my_mean_age_times_label'): 1.75,
                    }
                    self.assertDictElementsAlmostEqual(got_metrics, expected)

                except AssertionError as err:
                    # Chain explicitly so the failed assertion is recorded as
                    # the cause of the re-raised exception.
                    raise util.BeamAssertException(err) from err
Esempio n. 6
0
      def check_result(got):
        """Verifies the overall slice reports the expected 'mse' value.

        Args:
          got: Sequence of (slice_key, metrics) pairs under test.

        Raises:
          util.BeamAssertException: If any assertion fails.
        """
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          mse_key = metric_types.MetricKey(name='mse')
          self.assertDictElementsAlmostEqual(got_metrics, {mse_key: 0.1875})

        except AssertionError as err:
          # Chain explicitly so the failed assertion is recorded as the cause
          # of the re-raised exception.
          raise util.BeamAssertException(err) from err
  def testGetMissingSlices(self):
    """Checks that slicing specs without a matching slice are reported.

    Three slicing specs are configured but metrics are validated for the
    'feature1' slice only, so the other two specs are expected back from
    get_missing_slices.
    """
    slicing_specs = [
        config.SlicingSpec(),
        config.SlicingSpec(feature_values={'feature1': 'value1'}),
        config.SlicingSpec(feature_values={'feature2': 'value2'}),
    ]
    value_threshold = config.GenericValueThreshold(upper_bound={'value': 1})
    threshold = config.MetricThreshold(value_threshold=value_threshold)
    per_slice_threshold = config.PerSliceMetricThreshold(
        slicing_specs=slicing_specs, threshold=threshold)
    # The metric value used below (0) is within the upper bound of 1.
    weighted_count_config = config.MetricConfig(
        class_name='WeightedExampleCount',
        per_slice_thresholds=[per_slice_threshold])
    metrics_spec = config.MetricsSpec(
        metrics=[weighted_count_config], model_names=[''])
    eval_config = config.EvalConfig(
        model_specs=[config.ModelSpec()],
        slicing_specs=slicing_specs,
        metrics_specs=[metrics_spec],
    )

    feature1_slice_key = (('feature1', 'value1'),)
    feature1_metrics = {
        metric_types.MetricKey(name='weighted_example_count'): 0,
    }
    sliced_metrics = (feature1_slice_key, feature1_metrics)
    result = metrics_validator.validate_metrics(sliced_metrics, eval_config)

    expected_checks = text_format.Parse(
        """
        validation_ok: true
        validation_details {
          slicing_details {
            slicing_spec {
              feature_values {
                key: "feature1"
                value: "value1"
              }
            }
            num_matching_slices: 1
          }
        }""", validation_result_pb2.ValidationResult())
    self.assertProtoEquals(expected_checks, result)

    slicing_details = result.validation_details.slicing_details
    missing = metrics_validator.get_missing_slices(slicing_details,
                                                   eval_config)
    self.assertLen(missing, 2)
    # The overall spec and the 'feature2' spec had no matching slice.
    overall_spec, _, feature2_spec = slicing_specs
    self.assertProtoEquals(missing[0], overall_spec)
    self.assertProtoEquals(missing[1], feature2_spec)
Esempio n. 8
0
  def extract_output(self, accumulator):
    """Converts accumulated jackknife samples into per-metric outputs.

    Args:
      accumulator: Object exposing `unsampled_values`, `num_samples`, `sums`
        and `sums_of_squares`. The example-count entry is popped from
        `unsampled_values`, so the accumulator is mutated.

    Returns:
      Dict keyed by metric key. A value is either the unsampled value (when
      samples are missing, no sum was recorded for the key, or the key is in
      the skip list) or a types.ValueWithTDistribution. When samples are
      missing an error-metric entry describing the problem is added as well.
    """
    # Compute the jackknife standard error for each metric.
    # See delete-d bootstrap method described in:
    # https://www.stat.berkeley.edu/~hhuang/STAT152/Jackknife-Bootstrap.pdf
    # Rather than normalize by all possible n-choose-d samples, we normalize by
    # the actual number of samples.
    self._num_slices_counter.inc(1)
    unsampled_values = accumulator.unsampled_values
    assert _JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY in unsampled_values, (
        'Expected unsampled jackknife values to contain the example count key: '
        '"{}". Instead, found keys: {}'.format(
            _JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY, unsampled_values.keys()))
    n = unsampled_values.pop(_JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY)
    sample_count = accumulator.num_samples

    output = {}
    # Without at least one example in every sample no CI is computed.
    has_missing_samples = sample_count < self._num_jackknife_samples
    if has_missing_samples:
      self._missing_samples_counter.inc(1)
      output[metric_types.MetricKey(metric_keys.ERROR_METRIC)] = (
          'CI not computed because only {num_samples} samples were non-empty. '
          'Expected {num_jackknife_samples}.'.format(
              num_samples=sample_count,
              num_jackknife_samples=self._num_jackknife_samples))

    # d is the expected size of a sample holdout.
    # NOTE(review): a zero sample count (or zero n) is not guarded against in
    # the divisions below -- confirm callers never produce that state.
    d = n / float(sample_count)
    if d < n**0.5:
      # With d < sqrt(n) the jackknife standard error behaves poorly for some
      # metrics (including the median).
      self._small_samples_counter.inc(1)

    jackknife_scaling_factor = (n - d) / d
    dof = sample_count - 1

    for metric_key, unsampled_value in unsampled_values.items():
      use_unsampled_only = (
          has_missing_samples or metric_key not in accumulator.sums or
          (self._skip_ci_metric_keys and
           metric_key in self._skip_ci_metric_keys))
      if use_unsampled_only:
        output[metric_key] = unsampled_value
        continue
      mean = accumulator.sums[metric_key] / sample_count
      sum_of_squares = accumulator.sums_of_squares[metric_key]
      # One-pass variance formula with num_samples degrees of freedom.
      # NOTE(review): rounding could make this slightly negative, which the
      # square root below does not guard against -- TODO confirm.
      sample_variance = sum_of_squares / float(sample_count) - mean * mean
      standard_error = (jackknife_scaling_factor * sample_variance)**0.5
      output[metric_key] = types.ValueWithTDistribution(
          sample_mean=mean,
          sample_standard_deviation=standard_error,
          sample_degrees_of_freedom=dof,
          unsampled_value=unsampled_value)
    return output
      def check_result(got):
        """Verifies query/document count metrics for the overall slice.

        Args:
          got: Sequence of (slice_key, metrics) pairs under test.

        Raises:
          util.BeamAssertException: If any assertion fails.
        """
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          total_queries_key = metric_types.MetricKey(name='total_queries')
          total_documents_key = metric_types.MetricKey(name='total_documents')
          min_documents_key = metric_types.MetricKey(name='min_documents')
          max_documents_key = metric_types.MetricKey(name='max_documents')
          expected = {
              total_queries_key: 3,
              total_documents_key: 6,
              min_documents_key: 1,
              max_documents_key: 3,
          }
          self.assertDictElementsAlmostEqual(got_metrics, expected, places=5)

        except AssertionError as err:
          # Chain explicitly so the failed assertion is recorded as the cause
          # of the re-raised exception.
          raise util.BeamAssertException(err) from err
            def check_result(got):
                """Verifies the example-weighted 'custom' and 'mse' metrics.

                Args:
                    got: Sequence of (slice_key, metrics) pairs under test.

                Raises:
                    util.BeamAssertException: If any assertion fails.
                """
                try:
                    self.assertLen(got, 1, 'got: %s' % got)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())

                    custom_key = metric_types.MetricKey(name='custom',
                                                        example_weighted=True)
                    mse_key = metric_types.MetricKey(name='mse',
                                                     example_weighted=True)
                    expected = {
                        custom_key: (0.0 + 0.5 + 0.3 + 0.9 + 0.0) /
                                    (1.0 + 1.0 + 1.0 + 1.0 + 0.0),
                        mse_key: 0.1875,
                    }
                    self.assertDictElementsAlmostEqual(got_metrics, expected)

                except AssertionError as err:
                    # Chain explicitly so the failed assertion is recorded as
                    # the cause of the re-raised exception.
                    raise util.BeamAssertException(err) from err
Esempio n. 11
0
 def testMetricKeyStrForMetricKeyWithAllFields(self):
     """Checks str() of a MetricKey that sets model, output, sub key, diff."""
     populated_key = metric_types.MetricKey(
         name='metric_name',
         model_name='model_name',
         output_name='output_name',
         sub_key=metric_types.SubKey(class_id=1),
         is_diff=True)
     expected_text = (
         'name: "metric_name" output_name: "output_name" '
         'sub_key: { class_id: { value: 1 } } model_name: "model_name" '
         'is_diff: true')
     self.assertEqual(str(populated_key), expected_text)
Esempio n. 12
0
      def check_result(got):
        """Verifies the sparse categorical crossentropy of the overall slice.

        Args:
          got: Sequence of (slice_key, metrics) pairs under test.

        Raises:
          util.BeamAssertException: If any assertion fails.
        """
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          key = metric_types.MetricKey(name='sparse_categorical_crossentropy')
          # 0*log(.3) -1*log(0.6)-0*log(.1) = 0.51
          self.assertDictElementsAlmostEqual(got_metrics, {key: 0.51083})

        except AssertionError as err:
          # Chain explicitly so the failed assertion is recorded as the cause
          # of the re-raised exception.
          raise util.BeamAssertException(err) from err
      def check_result(got):
        """Verifies 'min_label_position' is present and NaN.

        Args:
          got: Sequence of (slice_key, metrics) pairs under test.

        Raises:
          util.BeamAssertException: If any assertion fails.
        """
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          key = metric_types.MetricKey(name='min_label_position')
          self.assertIn(key, got_metrics)
          self.assertTrue(math.isnan(got_metrics[key]))

        except AssertionError as err:
          # Chain explicitly so the failed assertion is recorded as the cause
          # of the re-raised exception.
          raise util.BeamAssertException(err) from err
Esempio n. 14
0
      def check_confusion_result(got):
        """Verifies the 'auc' metric of the overall slice to 5 places.

        Args:
          got: Sequence of (slice_key, metrics) pairs under test.

        Raises:
          util.BeamAssertException: If any assertion fails.
        """
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          auc_key = metric_types.MetricKey(name='auc')
          self.assertDictElementsAlmostEqual(
              got_metrics, {auc_key: 0.75}, places=5)

        except AssertionError as err:
          # Chain explicitly so the failed assertion is recorded as the cause
          # of the re-raised exception.
          raise util.BeamAssertException(err) from err
Esempio n. 15
0
      def check_result(got):
        """Verifies the enclosing test's metric matches its expected value.

        Reads `metric_name` and `expected_value` from the enclosing scope.

        Args:
          got: Sequence of (slice_key, metrics) pairs under test.

        Raises:
          util.BeamAssertException: If any assertion fails.
        """
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          key = metric_types.MetricKey(name=metric_name)
          self.assertDictElementsAlmostEqual(
              got_metrics, {key: expected_value}, places=5)

        except AssertionError as err:
          # Chain explicitly so the failed assertion is recorded as the cause
          # of the re-raised exception.
          raise util.BeamAssertException(err) from err
Esempio n. 16
0
      def check_metrics(got):
        """Verifies metrics for the overall slice and two 'fixed_string' slices.

        Args:
          got: Sequence of three (slice_key, metrics) pairs under test.

        Raises:
          util.BeamAssertException: If any assertion fails.
        """
        try:
          self.assertLen(got, 3)
          # Index the results by slice key so each slice can be checked by name.
          slices = dict(got)
          overall_slice = ()
          fixed_string1_slice = (('fixed_string', b'fixed_string1'),)
          fixed_string2_slice = (('fixed_string', b'fixed_string2'),)
          self.assertCountEqual(
              list(slices),
              [overall_slice, fixed_string1_slice, fixed_string2_slice])
          example_count_key = metric_types.MetricKey(name='example_count')
          weighted_example_count_key = metric_types.MetricKey(
              name='weighted_example_count')
          label_key = metric_types.MetricKey(name='mean_label')
          pred_key = metric_types.MetricKey(name='mean_prediction')
          self.assertDictElementsWithTDistributionAlmostEqual(
              slices[overall_slice], {
                  example_count_key: 3,
                  weighted_example_count_key: 4.0,
                  label_key: (1.0 + 0.0 + 2 * 0.0) / (1.0 + 1.0 + 2.0),
                  pred_key: (0.2 + 0.8 + 2 * 0.5) / (1.0 + 1.0 + 2.0),
              })
          self.assertDictElementsWithTDistributionAlmostEqual(
              slices[fixed_string1_slice], {
                  example_count_key: 2,
                  weighted_example_count_key: 2.0,
                  label_key: (1.0 + 0.0) / (1.0 + 1.0),
                  pred_key: (0.2 + 0.8) / (1.0 + 1.0),
              })
          self.assertDictElementsWithTDistributionAlmostEqual(
              slices[fixed_string2_slice], {
                  example_count_key: 1,
                  weighted_example_count_key: 2.0,
                  label_key: (2 * 0.0) / 2.0,
                  pred_key: (2 * 0.5) / 2.0,
              })

        except AssertionError as err:
          # Chain explicitly so the failed assertion is recorded as the cause
          # of the re-raised exception.
          raise util.BeamAssertException(err) from err
      def check_result(got):
        """Verifies the example-weighted 'fixed_size_sample' metric.

        Args:
          got: Sequence of (slice_key, metrics) pairs under test.

        Raises:
          util.BeamAssertException: If any assertion fails.
        """
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          fixed_sized_sample_key = metric_types.MetricKey(
              name='fixed_size_sample', example_weighted=True)
          np.testing.assert_equal(got_metrics,
                                  {fixed_sized_sample_key: np.array([4, 3])})

        except AssertionError as err:
          # Chain explicitly so the failed assertion is recorded as the cause
          # of the re-raised exception.
          raise util.BeamAssertException(err) from err
            def check_result(got):
                """Verifies the 'custom' metric of the overall slice.

                Args:
                    got: Sequence of (slice_key, metrics) pairs under test.

                Raises:
                    util.BeamAssertException: If any assertion fails.
                """
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())

                    custom_key = metric_types.MetricKey(name='custom')
                    self.assertDictElementsAlmostEqual(
                        got_metrics, {custom_key: 1.0 / (1.0 + 1.0)})

                except AssertionError as err:
                    # Chain explicitly so the failed assertion is recorded as
                    # the cause of the re-raised exception.
                    raise util.BeamAssertException(err) from err
 def testValidateMetricsChangeThresholdRelativePass(self, slicing_specs,
                                                    slice_key):
     """Checks that a relative change threshold passes validation.

     Args:
         slicing_specs: Slicing specs for the eval config and the per-slice
             threshold; when None the threshold is also set directly on the
             metric config.
         slice_key: Slice key paired with the metrics being validated.
     """
     change_threshold = config.GenericChangeThreshold(
         direction=config.MetricDirection.LOWER_IS_BETTER,
         relative={'value': 0})
     threshold = config.MetricThreshold(change_threshold=change_threshold)
     per_slice_threshold = config.PerSliceMetricThreshold(
         slicing_specs=slicing_specs, threshold=threshold)
     # Diff = -.333
     # Diff% = -.333/.333 = -100% < 0%, OK.
     mean_prediction_config = config.MetricConfig(
         class_name='MeanPrediction',
         threshold=threshold if slicing_specs is None else None,
         per_slice_thresholds=[per_slice_threshold])
     metrics_spec = config.MetricsSpec(
         metrics=[mean_prediction_config], model_names=[''])
     eval_config = config.EvalConfig(
         model_specs=[
             config.ModelSpec(),
             config.ModelSpec(name='baseline', is_baseline=True),
         ],
         slicing_specs=slicing_specs,
         metrics_specs=[metrics_spec],
     )

     baseline_key = metric_types.MetricKey(name='mean_prediction',
                                           model_name='baseline')
     diff_key = metric_types.MetricKey(name='mean_prediction', is_diff=True)
     sliced_metrics = (slice_key, {baseline_key: 0.333, diff_key: -0.333})

     result = metrics_validator.validate_metrics(sliced_metrics,
                                                 eval_config)
     self.assertTrue(result.validation_ok)
Esempio n. 20
0
 def check_result(got):
     """Verifies flip-count values and example-id arrays of the overall slice.

     Args:
         got: Sequence of (slice_key, metrics) pairs under test.

     Raises:
         util.BeamAssertException: If any assertion fails.
     """
     try:
         self.assertLen(got, 1)
         got_slice_key, got_metrics = got[0]
         self.assertEqual(got_slice_key, ())
         self.assertLen(got_metrics, 6)
         # NOTE(review): every metric name in this function looks mangled by
         # an e-mail obfuscator ("[email protected]"). As written, the four
         # dict entries below share one key, so only the last value survives.
         # Confirm the intended names against the upstream test.
         self.assertDictElementsAlmostEqual(
             got_metrics, {
                 metric_types.MetricKey(name='flip_count/[email protected]'):
                 5.0,
                 metric_types.MetricKey(name='flip_count/[email protected]'):
                 7.0,
                 metric_types.MetricKey(name='flip_count/[email protected]'):
                 6.0,
                 metric_types.MetricKey(name='flip_count/[email protected]'):
                 7.0,
             })
         self.assertAllEqual(
             got_metrics[metric_types.MetricKey(
                 name=
                 'flip_count/[email protected]'
             )], np.array([['id_2'], ['id_3']]))
         self.assertAllEqual(
             got_metrics[metric_types.MetricKey(
                 name=
                 'flip_count/[email protected]'
             )], np.array([['id_2'], ['id_3'], ['id_4']]))
     except AssertionError as err:
         # Chain explicitly so the failed assertion is recorded as the cause
         # of the re-raised exception.
         raise util.BeamAssertException(err) from err
            def check_result(got):
                """Verifies accuracy and label mean for three output heads.

                Args:
                    got: Sequence of (slice_key, metrics) pairs under test.

                Raises:
                    util.BeamAssertException: If any assertion fails.
                """
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())
                    chinese_accuracy_key = metric_types.MetricKey(
                        name='accuracy', output_name='chinese_head')
                    chinese_mean_label_key = metric_types.MetricKey(
                        name='label/mean', output_name='chinese_head')
                    english_accuracy_key = metric_types.MetricKey(
                        name='accuracy', output_name='english_head')
                    english_mean_label_key = metric_types.MetricKey(
                        name='label/mean', output_name='english_head')
                    other_accuracy_key = metric_types.MetricKey(
                        name='accuracy', output_name='other_head')
                    other_mean_label_key = metric_types.MetricKey(
                        name='label/mean', output_name='other_head')
                    expected = {
                        chinese_accuracy_key: 0.75,
                        chinese_mean_label_key: 0.5,
                        english_accuracy_key: 1.0,
                        english_mean_label_key: 0.5,
                        other_accuracy_key: 1.0,
                        other_mean_label_key: 0.25,
                    }
                    self.assertDictElementsAlmostEqual(got_metrics, expected)

                except AssertionError as err:
                    # Chain explicitly so the failed assertion is recorded as
                    # the cause of the re-raised exception.
                    raise util.BeamAssertException(err) from err
Esempio n. 22
0
def _keys_for_metric(
    metric_name: Text, spec: config.MetricsSpec,
    sub_keys: Optional[List[metric_types.SubKey]]
) -> Iterator[metric_types.MetricKey]:
    """Yields all non-diff keys for a specific metric name.

    One key is produced for every (model name, output name, sub key)
    combination. An empty `spec.model_names` or `spec.output_names` falls
    back to a single empty-string name.

    Args:
        metric_name: Name placed on every yielded key.
        spec: Metrics spec supplying the model and output names.
        sub_keys: Sub keys to expand. `None` is treated as a single "no sub
            key" entry rather than failing on iteration; an empty list still
            yields nothing.

    Yields:
        A MetricKey for each combination.
    """
    # The parameter is declared Optional, so accept None instead of raising
    # TypeError when iterating it.
    if sub_keys is None:
        sub_keys = [None]
    for model_name in spec.model_names or ['']:
        for output_name in spec.output_names or ['']:
            for sub_key in sub_keys:
                yield metric_types.MetricKey(name=metric_name,
                                             model_name=model_name,
                                             output_name=output_name,
                                             sub_key=sub_key)
Esempio n. 23
0
 def result(
     metrics: Dict[metric_types.MetricKey, Any]
 ) -> Dict[metric_types.MetricKey, float]:
   """Returns weighted macro average.

   For every sub key from the enclosing scope, the child metric is looked up
   under the metric name (or under the name prefixed with '_' when absent),
   weighted, and folded into a weighted mean.

   Args:
     metrics: Computed metrics; must hold the class-weights-from-labels entry
       and one child metric per sub key.

   Returns:
     Single-entry dict mapping the enclosing scope's `key` to the weighted
     average, or NaN when the total weight is zero.
   """
   label_weights = metrics[class_weights_from_labels_key]
   weighted_sum = 0.0
   weight_sum = 0.0
   for sub_key in sub_keys:
     shared_fields = dict(
         model_name=model_name,
         output_name=output_name,
         sub_key=sub_key,
         example_weighted=example_weighted)
     child_key = metric_types.MetricKey(name=metric_name, **shared_fields)
     if child_key not in metrics:
       # Use private name if not found under metric name
       child_key = metric_types.MetricKey(
           name='_' + metric_name, **shared_fields)
     child_sub_key = child_key.sub_key

     # The weight index is the class id when set, otherwise k.
     offset = None
     if child_sub_key is not None:
       if child_sub_key.class_id is not None:
         offset = child_sub_key.class_id
       elif child_sub_key.k is not None:
         offset = child_sub_key.k

     weight = 0.0 if class_weights else 1.0
     if offset is not None:
       # NOTE(review): membership is tested with class_id while the lookup
       # uses offset; for k-based sub keys class_id is None -- confirm this
       # is intended.
       class_id = child_sub_key.class_id
       if label_weights and class_id in label_weights:
         weight = label_weights[offset]
       if class_weights and class_id in class_weights:
         weight *= class_weights[offset]

     weighted_sum += _to_float(metrics[child_key]) * weight
     weight_sum += weight

   if weight_sum:
     average = weighted_sum / weight_sum
   else:
     average = float('nan')
   return {key: average}
      def check_result(got):
        """Verifies the 'min_label_position' value to 5 places.

        Args:
          got: Sequence of (slice_key, metrics) pairs under test.

        Raises:
          util.BeamAssertException: If any assertion fails.
        """
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          key = metric_types.MetricKey(name='min_label_position')
          self.assertDictElementsAlmostEqual(
              got_metrics, {
                  key: 0.66667,
              }, places=5)

        except AssertionError as err:
          # Chain explicitly so the failed assertion is recorded as the cause
          # of the re-raised exception.
          raise util.BeamAssertException(err) from err
Esempio n. 25
0
            def check_result(got):
                """Verifies the overall slice against `expected_values`.

                `expected_values` comes from the enclosing scope and maps
                metric names to expected values.

                Args:
                    got: Sequence of (slice_key, metrics) pairs under test.

                Raises:
                    util.BeamAssertException: If any assertion fails.
                """
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())
                    expected = {
                        metric_types.MetricKey(name=name): value
                        for name, value in expected_values.items()
                    }
                    self.assertDictElementsAlmostEqual(got_metrics, expected)

                except AssertionError as err:
                    # Chain explicitly so the failed assertion is recorded as
                    # the cause of the re-raised exception.
                    raise util.BeamAssertException(err) from err
Esempio n. 26
0
 def check_result(got):
     """Verifies the example ids recorded under a flip-rate metric.

     Args:
         got: Sequence of (slice_key, metrics) pairs under test.

     Raises:
         util.BeamAssertException: If any assertion fails.
     """
     try:
         self.assertLen(got, 1)
         got_slice_key, got_metrics = got[0]
         self.assertEqual(got_slice_key, ())
         self.assertLen(got_metrics, 6)
         # NOTE(review): the metric name looks mangled by an e-mail
         # obfuscator ("[email protected]"); confirm the intended name.
         self.assertSameElements(
             got_metrics[metric_types.MetricKey(
                 name='flip_rate/[email protected]',
                 example_weighted=False)],
             np.array([['id_2'], ['id_3'], ['id_4']]))
     except AssertionError as err:
         # Chain explicitly so the failed assertion is recorded as the cause
         # of the re-raised exception.
         raise util.BeamAssertException(err) from err
Esempio n. 27
0
      def check_result(got):
        """Verifies the weighted 'mse' value of the overall slice.

        Args:
          got: Sequence of (slice_key, metrics) pairs under test.

        Raises:
          util.BeamAssertException: If any assertion fails.
        """
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          mse_key = metric_types.MetricKey(name='mse')
          # numerator = (0.1*0**2 + 0.2*0.5**2 + 0.3*0.7**2 + 0.4*0.9**2)
          # denominator = (.1 + .2 + 0.3 + 0.4)
          # numerator / denominator = 0.521
          self.assertDictElementsAlmostEqual(got_metrics, {mse_key: 0.521})

        except AssertionError as err:
          # Chain explicitly so the failed assertion is recorded as the cause
          # of the re-raised exception.
          raise util.BeamAssertException(err) from err
      def check_result(got):
        """Verifies the '_binary_confusion_matrices' entry.

        Compares it with `expected_matrices` from the enclosing scope.

        Args:
          got: Sequence of (slice_key, metrics) pairs under test.

        Raises:
          util.BeamAssertException: If any assertion fails.
        """
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          self.assertLen(got_metrics, 1)
          key = metric_types.MetricKey(name='_binary_confusion_matrices')
          self.assertIn(key, got_metrics)
          got_matrices = got_metrics[key]
          self.assertEqual(got_matrices, expected_matrices)

        except AssertionError as err:
          # Chain explicitly so the failed assertion is recorded as the cause
          # of the re-raised exception.
          raise util.BeamAssertException(err) from err
Esempio n. 29
0
def multi_class_confusion_matrices(
        thresholds: Optional[List[float]] = None,
        num_thresholds: Optional[int] = None,
        name: str = MULTI_CLASS_CONFUSION_MATRICES,
        eval_config: Optional[config_pb2.EvalConfig] = None,
        model_name: str = '',
        output_name: str = '',
        example_weighted: bool = False) -> metric_types.MetricComputations:
    """Returns computations for multi-class confusion matrices.

    When neither `thresholds` nor `num_thresholds` is given, a single
    threshold of 0.0 is used.

    Args:
      thresholds: A specific set of thresholds to use. The caller is
        responsible for marking the boundaries with +/-epsilon if desired.
        Only one of num_thresholds or thresholds should be used.
      num_thresholds: Number of thresholds to use. Thresholds will be
        calculated using linear interpolation between 0.0 and 1.0 with
        equidistant values and boundaries at -epsilon and 1.0+epsilon. Values
        must be > 0. Only one of num_thresholds or thresholds should be used.
      name: Metric name.
      eval_config: Eval config.
      model_name: Optional model name (if multi-model evaluation).
      output_name: Optional output name (if multi-output model type).
      example_weighted: True if example weights should be applied.

    Returns:
      A one-element list with the MetricComputation for the confusion
      matrices; it has no preprocessor.

    Raises:
      ValueError: If both num_thresholds and thresholds are set at the same
        time.
    """
    if thresholds is not None and num_thresholds is not None:
        raise ValueError(
            'only one of thresholds or num_thresholds can be set at a time')

    if num_thresholds is not None:
        # Equidistant interior points, bracketed by the epsilon boundaries.
        interior = [(i + 1) * 1.0 / (num_thresholds - 1)
                    for i in range(num_thresholds - 2)]
        thresholds = [-_EPSILON] + interior + [1.0 + _EPSILON]
    elif thresholds is None:
        thresholds = [0.0]

    key = metric_types.MetricKey(name=name,
                                 model_name=model_name,
                                 output_name=output_name,
                                 example_weighted=example_weighted)
    computation = metric_types.MetricComputation(
        keys=[key],
        preprocessor=None,
        combiner=_MultiClassConfusionMatrixCombiner(
            key=key,
            eval_config=eval_config,
            example_weighted=example_weighted,
            thresholds=thresholds))
    return [computation]
            def check_result(got):
                """Verifies the example-weighted 'custom' metric.

                Args:
                    got: Sequence of (slice_key, metrics) pairs under test.

                Raises:
                    util.BeamAssertException: If any assertion fails.
                """
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())

                    custom_key = metric_types.MetricKey(name='custom',
                                                        example_weighted=True)
                    self.assertDictElementsAlmostEqual(got_metrics, {
                        custom_key: (0.2 + 0.8 + 2 * 0.5) / (1.0 + 1.0 + 2.0)
                    })

                except AssertionError as err:
                    # Chain explicitly so the failed assertion is recorded as
                    # the cause of the re-raised exception.
                    raise util.BeamAssertException(err) from err