def testBinaryConfusionMatrices(self, kwargs, expected_matrices):
        computations = binary_confusion_matrices.binary_confusion_matrices(
            **kwargs)
        histogram = computations[0]
        matrices = computations[1]

        example1 = {
            'labels': np.array([0.0]),
            'predictions': np.array([0.0]),
            'example_weights': np.array([1.0])
        }
        example2 = {
            'labels': np.array([0.0]),
            'predictions': np.array([0.5]),
            'example_weights': np.array([1.0])
        }
        example3 = {
            'labels': np.array([1.0]),
            'predictions': np.array([0.3]),
            'example_weights': np.array([1.0])
        }
        example4 = {
            'labels': np.array([1.0]),
            'predictions': np.array([0.9]),
            'example_weights': np.array([1.0])
        }

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(
                    [example1, example2, example3, example4])
                | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
                | 'AddSlice' >> beam.Map(lambda x: ((), x))
                | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
                | 'ComputeMatrices' >> beam.Map(lambda x:
                                                (x[0], matrices.result(x[1])))
            )  # pyformat: disable

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())
                    self.assertLen(got_metrics, 1)
                    name = '_binary_confusion_matrices_{}'.format(
                        kwargs['num_thresholds'] if 'num_thresholds' in
                        kwargs else kwargs['thresholds'])
                    key = metric_types.MetricKey(name=name)
                    self.assertIn(key, got_metrics)
                    got_matrices = got_metrics[key]
                    self.assertEqual(got_matrices, expected_matrices)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
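For reference, the expected counts for the four examples above can be tallied by hand at a single threshold. The sketch below does this with plain numpy for a hypothetical threshold of 0.5, assuming the convention that only predictions strictly greater than the threshold count as positive (the tie handling at exactly 0.5 is an assumption of this sketch, not something stated by the test).

import numpy as np

# Hand tally of the four test examples at an assumed threshold of 0.5.
labels = np.array([0.0, 0.0, 1.0, 1.0])
predictions = np.array([0.0, 0.5, 0.3, 0.9])
positive = predictions > 0.5  # assumed tie convention: 0.5 is not positive

tp = int(np.sum((labels == 1.0) & positive))   # 1 (example4)
fp = int(np.sum((labels == 0.0) & positive))   # 0
tn = int(np.sum((labels == 0.0) & ~positive))  # 2 (example1, example2)
fn = int(np.sum((labels == 1.0) & ~positive))  # 1 (example3)
print(tp, fp, tn, fn)  # 1 0 2 1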
def _confusion_matrix_at_thresholds(
    thresholds: List[float],
    name: Text = CONFUSION_MATRIX_AT_THRESHOLDS_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for confusion matrix at thresholds."""
    key = metric_types.MetricKey(name=name,
                                 model_name=model_name,
                                 output_name=output_name,
                                 sub_key=sub_key)

    # Make sure matrices are calculated.
    matrices_computations = binary_confusion_matrices.binary_confusion_matrices(
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        thresholds=thresholds)
    matrices_key = matrices_computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey,
                      metrics_for_slice_pb2.ConfusionMatrixAtThresholds]
    ) -> Dict[metric_types.MetricKey, Any]:
        return {key: to_proto(thresholds, metrics[matrices_key])}

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations = matrices_computations
    computations.append(derived_computation)
    return computations
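The value looked up under matrices_key holds per-threshold tp/fp/tn/fn counts. A minimal numpy stand-in for that computation (illustrative only; the real combiner works off a shared histogram rather than rescanning examples per threshold):

import numpy as np

def confusion_counts(labels, predictions, thresholds, example_weights=None):
    """Illustrative per-threshold, weighted tp/fp/tn/fn counts."""
    labels = np.asarray(labels, dtype=float)
    predictions = np.asarray(predictions, dtype=float)
    if example_weights is None:
        example_weights = np.ones_like(labels)
    tp, fp, tn, fn = [], [], [], []
    for t in thresholds:
        positive = predictions > t
        tp.append(float(np.sum(example_weights * (labels == 1.0) * positive)))
        fp.append(float(np.sum(example_weights * (labels == 0.0) * positive)))
        tn.append(float(np.sum(example_weights * (labels == 0.0) * ~positive)))
        fn.append(float(np.sum(example_weights * (labels == 1.0) * ~positive)))
    return tp, fp, tn, fn

print(confusion_counts([0, 0, 1, 1], [0.0, 0.5, 0.3, 0.9], [0.25, 0.75]))
# ([2.0, 1.0], [1.0, 0.0], [1.0, 2.0], [0.0, 1.0])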
def _confusion_matrix_plot(
    num_thresholds: int = DEFAULT_NUM_THRESHOLDS,
    name: Text = CONFUSION_MATRIX_PLOT_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for confusion matrix plots."""
    key = metric_types.PlotKey(name=name,
                               model_name=model_name,
                               output_name=output_name,
                               sub_key=sub_key)

    # The interpolation strategy used here matches how the legacy post export
    # metrics calculated their plots (see the threshold-grid sketch after this
    # function).
    thresholds = [
        i * 1.0 / num_thresholds for i in range(0, num_thresholds + 1)
    ]
    thresholds = [-1e-6] + thresholds

    # Make sure matrices are calculated.
    matrices_computations = binary_confusion_matrices.binary_confusion_matrices(
        # Use a custom name since the custom thresholds used for interpolation
        # would otherwise make the default name chosen by the binary confusion
        # matrices very long.
        name=(binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME + '_' +
              name),
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        aggregation_type=aggregation_type,
        class_weights=class_weights,
        thresholds=thresholds)
    matrices_key = matrices_computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey,
              metrics_for_slice_pb2.ConfusionMatrixAtThresholds]:
        return {
            key:
            confusion_matrix_metrics.to_proto(thresholds,
                                              metrics[matrices_key])
        }

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations = matrices_computations
    computations.append(derived_computation)
    return computations
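A minimal sketch of the threshold grid built above, using num_thresholds=5 purely for illustration; the leading -1e-6 bucket is there so that a prediction of exactly 0.0 still falls on the positive side of the first threshold.

# Illustration only: the grid produced for num_thresholds=5.
num_thresholds = 5
thresholds = [i * 1.0 / num_thresholds for i in range(0, num_thresholds + 1)]
thresholds = [-1e-6] + thresholds
print(thresholds)  # [-1e-06, 0.0, 0.2, 0.4, 0.6, 0.8, 1.0]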
    def _metric_computation(
        self,
        thresholds: Optional[List[float]] = None,
        name: Text = '',
        eval_config: Optional[config.EvalConfig] = None,
        model_name: Text = '',
        output_name: Text = '',
        sub_key: Optional[metric_types.SubKey] = None,
        aggregation_type: Optional[metric_types.AggregationType] = None,
        class_weights: Optional[Dict[int, float]] = None
    ) -> metric_types.MetricComputations:
        """Returns metric computations for specificity."""
        key = metric_types.MetricKey(name=name,
                                     model_name=model_name,
                                     output_name=output_name,
                                     sub_key=sub_key)

        if not thresholds:
            thresholds = [0.5]

        # Make sure matrices are calculated.
        matrices_computations = binary_confusion_matrices.binary_confusion_matrices(
            eval_config=eval_config,
            model_name=model_name,
            output_name=output_name,
            sub_key=sub_key,
            aggregation_type=aggregation_type,
            class_weights=class_weights,
            thresholds=thresholds)
        matrices_key = matrices_computations[-1].keys[-1]

        def result(
            metrics: Dict[metric_types.MetricKey, Any]
        ) -> Dict[metric_types.MetricKey, Union[float, np.ndarray]]:
            matrices = metrics[matrices_key]
            values = []
            for i in range(len(thresholds)):
                values.append(
                    self.result(matrices.tp[i], matrices.tn[i], matrices.fp[i],
                                matrices.fn[i]))
            return {
                key: values[0] if len(thresholds) == 1 else np.array(values)
            }

        derived_computation = metric_types.DerivedMetricComputation(
            keys=[key], result=result)
        computations = matrices_computations
        computations.append(derived_computation)
        return computations
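As a concrete example of the result() pattern above, a specificity-style self.result would reduce each threshold's counts to tn / (tn + fp). The helper and counts below are hypothetical stand-ins for illustration, not the actual implementation.

import numpy as np

def specificity(tp, tn, fp, fn):
    """Hypothetical stand-in for self.result: specificity = tn / (tn + fp)."""
    del tp, fn  # unused by specificity
    denominator = tn + fp
    return tn / denominator if denominator > 0 else float('nan')

# Made-up counts for two thresholds.
tp, tn, fp, fn = [8.0, 5.0], [3.0, 7.0], [4.0, 1.0], [1.0, 3.0]
values = [specificity(tp[i], tn[i], fp[i], fn[i]) for i in range(2)]
print(np.array(values))  # [0.42857143 0.875     ]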
def _auc_plot(
    num_thresholds: int = DEFAULT_NUM_THRESHOLDS,
    name: Text = AUC_PLOT_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for AUC plots."""
    key = metric_types.PlotKey(name=name,
                               model_name=model_name,
                               output_name=output_name,
                               sub_key=sub_key)

    # The interpolation strategy used here matches how the legacy post export
    # metrics calculated their plots.
    thresholds = [
        i * 1.0 / num_thresholds for i in range(0, num_thresholds + 1)
    ]
    thresholds = [-1e-6] + thresholds

    # Make sure matrices are calculated.
    matrices_computations = binary_confusion_matrices.binary_confusion_matrices(
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        class_weights=class_weights,
        thresholds=thresholds)
    matrices_key = matrices_computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey,
              metrics_for_slice_pb2.ConfusionMatrixAtThresholds]:
        return {
            key:
            confusion_matrix_at_thresholds.to_proto(thresholds,
                                                    metrics[matrices_key])
        }

    derived_computation = metric_types.DerivedMetricComputation(keys=[key],
                                                                result=result)
    computations = matrices_computations
    computations.append(derived_computation)
    return computations
def _wrap_confusion_matrix_metric(
    metric: tf.keras.metrics.Metric, eval_config: config_pb2.EvalConfig,
    model_name: Text, output_name: Text, sub_key: Optional[metric_types.SubKey],
    aggregation_type: Optional[metric_types.AggregationType],
    class_weights: Optional[Dict[int,
                                 float]]) -> metric_types.MetricComputations:
  """Returns confusion matrix metric wrapped in a more efficient computation."""

  # Special handling for AUC metric which supports aggregation inherently via
  # multi_label flag.
  if (isinstance(metric, tf.keras.metrics.AUC) and
      hasattr(metric, 'label_weights')):
    if metric.label_weights:
      if class_weights:
        raise ValueError(
            'class weights are configured in two different places: (1) via the '
            'tf.keras.metrics.AUC class (using "label_weights") and (2) via '
            'the MetricsSpecs (using "aggregate.class_weights"). Either remove '
            'the label_weights settings in the AUC class or remove the '
            'class_weights from the AggregationOptions: metric={}, '
            'class_weights={}'.format(metric, class_weights))
      class_weights = {i: v for i, v in enumerate(metric.label_weights)}
    if metric.multi_label:
      raise NotImplementedError('AUC.multi_label=True is not implemented yet.')

  sub_key = _verify_and_update_sub_key(model_name, output_name, sub_key, metric)
  key = metric_types.MetricKey(
      name=metric.name,
      model_name=model_name,
      output_name=output_name,
      aggregation_type=aggregation_type,
      sub_key=sub_key)

  metric_config = tf.keras.metrics.serialize(metric)

  thresholds = None
  num_thresholds = None
  # The top_k metrics have special settings. If the top_k value is being set
  # outside of keras (i.e. using BinarizeOptions), then we need to set the
  # special threshold ourselves; otherwise the default threshold of 0.5 is used.
  if (sub_key and sub_key.top_k is not None and
      _get_config_value(_TOP_K_KEY, metric_config) is None and
      _get_config_value(_THRESHOLDS_KEY, metric_config) is None and
      _get_config_value(_NUM_THRESHOLDS_KEY, metric_config) is None):
    thresholds = [float('-inf')]
  elif hasattr(metric, _THRESHOLDS_KEY):
    thresholds = metric.thresholds
  # Only one of thresholds or num_thresholds should be used. Keras AUC allows
  # both, but thresholds takes precedence.
  if thresholds is None and hasattr(metric, _NUM_THRESHOLDS_KEY):
    num_thresholds = metric.num_thresholds

  # Make sure matrices are calculated.
  computations = binary_confusion_matrices.binary_confusion_matrices(
      num_thresholds=num_thresholds,
      thresholds=thresholds,
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      aggregation_type=aggregation_type,
      class_weights=class_weights)
  matrices_key = computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Any]:
    """Returns result derived from binary confustion matrices."""
    matrices = metrics[matrices_key]

    metric = tf.keras.metrics.deserialize(metric_config)
    if (isinstance(metric, tf.keras.metrics.AUC) or
        isinstance(metric, tf.keras.metrics.SpecificityAtSensitivity) or
        isinstance(metric, tf.keras.metrics.SensitivityAtSpecificity)):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.true_negatives.assign(np.array(matrices.tn))
      metric.false_positives.assign(np.array(matrices.fp))
      metric.false_negatives.assign(np.array(matrices.fn))
    elif isinstance(metric, tf.keras.metrics.Precision):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.false_positives.assign(np.array(matrices.fp))
    elif isinstance(metric, tf.keras.metrics.Recall):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.false_negatives.assign(np.array(matrices.fn))
    elif isinstance(metric, tf.keras.metrics.TruePositives):
      metric.accumulator.assign(np.array(matrices.tp))
    elif isinstance(metric, tf.keras.metrics.FalsePositives):
      metric.accumulator.assign(np.array(matrices.fp))
    elif isinstance(metric, tf.keras.metrics.TrueNegatives):
      metric.accumulator.assign(np.array(matrices.tn))
    elif isinstance(metric, tf.keras.metrics.FalseNegatives):
      metric.accumulator.assign(np.array(matrices.fn))
    return {key: metric.result().numpy()}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations.append(derived_computation)
  return computations
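The result() function above re-creates the keras metric and overwrites its internal count variables instead of replaying examples. A minimal sketch of that trick with tf.keras.metrics.Precision; the counts are made up, and the variable names true_positives/false_positives are keras internals this code relies on.

import numpy as np
import tensorflow as tf

metric = tf.keras.metrics.Precision(thresholds=[0.25, 0.5, 0.75])
metric.true_positives.assign(np.array([3.0, 2.0, 1.0]))
metric.false_positives.assign(np.array([2.0, 1.0, 0.0]))
print(metric.result().numpy())  # precision per threshold: approx [0.6, 0.667, 1.0]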
def _fairness_indicators_metrics_at_thresholds(
    thresholds: List[float],
    name: Text = FAIRNESS_INDICATORS_METRICS_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns computations for fairness metrics at thresholds."""
    metric_key_by_name_by_threshold = collections.defaultdict(dict)
    keys = []
    digits_num = calculate_digits(thresholds)
    for t in thresholds:
        for m in FAIRNESS_INDICATORS_SUB_METRICS:
            key = metric_types.MetricKey(
                name='%s/%s@%.*f' %
                (name, m, digits_num,
                 t),  # e.g. "fairness_indicators_metrics/false_positive_rate@0.1"
                model_name=model_name,
                output_name=output_name,
                sub_key=sub_key)
            keys.append(key)
            metric_key_by_name_by_threshold[t][m] = key

    # Make sure matrices are calculated.
    computations = binary_confusion_matrices.binary_confusion_matrices(
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        class_weights=class_weights,
        thresholds=thresholds)
    confusion_matrices_key = computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Any]:
        """Returns fairness metrics values."""
        metric = metrics[confusion_matrices_key]
        output = {}

        for i, threshold in enumerate(thresholds):
            num_positives = metric.tp[i] + metric.fn[i]
            num_negatives = metric.tn[i] + metric.fp[i]

            tpr = metric.tp[i] / (num_positives or float('nan'))
            tnr = metric.tn[i] / (num_negatives or float('nan'))
            fpr = metric.fp[i] / (num_negatives or float('nan'))
            fnr = metric.fn[i] / (num_positives or float('nan'))
            pr = (metric.tp[i] + metric.fp[i]) / (
                (num_positives + num_negatives) or float('nan'))
            nr = (metric.tn[i] + metric.fn[i]) / (
                (num_positives + num_negatives) or float('nan'))

            fdr = metric.fp[i] / (
                (metric.fp[i] + metric.tp[i]) or float('nan'))
            fomr = metric.fn[i] / (
                (metric.fn[i] + metric.tn[i]) or float('nan'))

            output[metric_key_by_name_by_threshold[threshold]
                   ['false_positive_rate']] = fpr
            output[metric_key_by_name_by_threshold[threshold]
                   ['false_negative_rate']] = fnr
            output[metric_key_by_name_by_threshold[threshold]
                   ['true_positive_rate']] = tpr
            output[metric_key_by_name_by_threshold[threshold]
                   ['true_negative_rate']] = tnr
            output[metric_key_by_name_by_threshold[threshold]
                   ['positive_rate']] = pr
            output[metric_key_by_name_by_threshold[threshold]
                   ['negative_rate']] = nr
            output[metric_key_by_name_by_threshold[threshold]
                   ['false_discovery_rate']] = fdr
            output[metric_key_by_name_by_threshold[threshold]
                   ['false_omission_rate']] = fomr

        return output

    derived_computation = metric_types.DerivedMetricComputation(keys=keys,
                                                                result=result)

    computations.append(derived_computation)
    return computations
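The rate computations above rely on Python's `or` to turn a zero denominator into NaN rather than raising ZeroDivisionError. A small worked example:

# 0.0 is falsy, so a zero denominator is replaced by NaN before dividing.
tp, fn = 0.0, 0.0
num_positives = tp + fn
print(tp / (num_positives or float('nan')))  # nan

tp, fn = 3.0, 1.0
print(tp / ((tp + fn) or float('nan')))  # 0.75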
def _wrap_confusion_matrix_metric(
    metric: tf.keras.metrics.Metric, eval_config: config.EvalConfig,
    model_name: Text, output_name: Text, sub_key: Optional[metric_types.SubKey],
    class_weights: Optional[Dict[int,
                                 float]]) -> metric_types.MetricComputations:
  """Returns confusion matrix metric wrapped in a more efficient computation."""

  # Special handling for AUC metric which supports aggregation inherently via
  # multi_label flag.
  if (isinstance(metric, tf.keras.metrics.AUC) and
      hasattr(metric, 'label_weights')):
    if metric.label_weights:
      if class_weights:
        raise ValueError(
            'class weights are configured in two different places: (1) via the '
            'tf.keras.metrics.AUC class (using "label_weights") and (2) via '
            'the MetricsSpecs (using "aggregate.class_weights"). Either remove '
            'the label_weights settings in the AUC class or remove the '
            'class_weights from the AggregationOptions: metric={}, '
            'class_weights={}'.format(metric, class_weights))
      class_weights = {i: v for i, v in enumerate(metric.label_weights)}
    if metric.multi_label:
      raise NotImplementedError('AUC.multi_label=True is not implemented yet.')

  sub_key = _verify_and_update_sub_key(model_name, output_name, sub_key, metric)
  key = metric_types.MetricKey(
      name=metric.name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key)

  metric_config = tf.keras.metrics.serialize(metric)

  thresholds = None
  num_thresholds = None
  if hasattr(metric, _THRESHOLDS_KEY):
    if (len(
        metric.thresholds) == binary_confusion_matrices.DEFAULT_NUM_THRESHOLDS):
      num_thresholds = binary_confusion_matrices.DEFAULT_NUM_THRESHOLDS
    else:
      thresholds = metric.thresholds
  # Only one of thresholds or num_thresholds should be used. Keras AUC allows
  # both, but thresholds takes precedence.
  if thresholds is None and hasattr(metric, _NUM_THRESHOLDS_KEY):
    num_thresholds = metric.num_thresholds

  # By default use separate computations for the confusion matrices since the
  # metrics might be using different thresholds (note that the underlying
  # histogram the confusion matrices are based on is still only calculated once).
  if (num_thresholds is not None and
      num_thresholds == binary_confusion_matrices.DEFAULT_NUM_THRESHOLDS):
    name = binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME
  else:
    name = '_{}{}'.format(
        metric.name, binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME)

  # Make sure matrices are calculated. Note that the use of class_weights here
  # implies that micro averaging is being performed.
  computations = binary_confusion_matrices.binary_confusion_matrices(
      num_thresholds=num_thresholds,
      thresholds=thresholds,
      name=name,
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      class_weights=class_weights)
  matrices_key = computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Any]:
    """Returns AUC derived from binary confustion matrices."""
    matrices = metrics[matrices_key]

    metric = tf.keras.metrics.deserialize(metric_config)
    if (isinstance(metric, tf.keras.metrics.AUC) or
        isinstance(metric, tf.keras.metrics.SpecificityAtSensitivity) or
        isinstance(metric, tf.keras.metrics.SensitivityAtSpecificity)):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.true_negatives.assign(np.array(matrices.tn))
      metric.false_positives.assign(np.array(matrices.fp))
      metric.false_negatives.assign(np.array(matrices.fn))
    elif isinstance(metric, tf.keras.metrics.Precision):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.false_positives.assign(np.array(matrices.fp))
    elif isinstance(metric, tf.keras.metrics.Recall):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.false_negatives.assign(np.array(matrices.fn))
    elif isinstance(metric, tf.keras.metrics.TruePositives):
      metric.accumulator.assign(np.array(matrices.tp))
    elif isinstance(metric, tf.keras.metrics.FalsePositives):
      metric.accumulator.assign(np.array(matrices.fp))
    elif isinstance(metric, tf.keras.metrics.TrueNegatives):
      metric.accumulator.assign(np.array(matrices.tn))
    elif isinstance(metric, tf.keras.metrics.FalseNegatives):
      metric.accumulator.assign(np.array(matrices.fn))
    return {key: metric.result().numpy()}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations.append(derived_computation)
  return computations
def _wrap_confusion_matrix_metric(
    metric: tf.keras.metrics.Metric, model_name: Text, output_name: Text,
    sub_key: Optional[metric_types.SubKey],
    class_weights: Optional[Dict[int,
                                 float]]) -> metric_types.MetricComputations:
  """Returns confusion matrix metric wrapped in a more efficient computation."""

  sub_key = _verify_and_update_sub_key(model_name, output_name, sub_key, metric)
  key = metric_types.MetricKey(
      name=metric.name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key)

  metric_config = tf.keras.metrics.serialize(metric)

  # By default use separate computations for the confusion matrices since the
  # metrics might be using different thresholds (note that the underlying
  # histogram the confusion matrices are based on is still only calculated once).
  name = '_{}{}'.format(
      metric.name, binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME)
  thresholds = None
  if hasattr(metric, _THRESHOLDS_KEY):
    thresholds = metric.thresholds
  num_thresholds = None
  if hasattr(metric, _NUM_THRESHOLDS_KEY):
    num_thresholds = metric.num_thresholds
  # Increase the default number of thresholds if keras defaults were used (this
  # also allows us to share the computation with other confusion-based metrics).
  if (num_thresholds == _DEFAULT_NUM_THRESHOLDS_IN_KERAS and
      _CONFIG_KEY in metric_config and
      _NUM_THRESHOLDS_KEY in metric_config[_CONFIG_KEY]):
    name = binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME
    num_thresholds = binary_confusion_matrices.DEFAULT_NUM_THRESHOLDS
    metric_config[_CONFIG_KEY][_NUM_THRESHOLDS_KEY] = num_thresholds
    thresholds = None
    if _THRESHOLDS_KEY in metric_config[_CONFIG_KEY]:
      metric_config[_CONFIG_KEY][_THRESHOLDS_KEY] = None
  # Only one of thresholds or num_thresholds should be used. Keras AUC allows
  # both, but thresholds takes precedence.
  if thresholds is not None and num_thresholds is not None:
    num_thresholds = None

  # Make sure matrices are calculated.
  computations = binary_confusion_matrices.binary_confusion_matrices(
      num_thresholds=num_thresholds,
      thresholds=thresholds,
      name=name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      class_weights=class_weights)
  matrices_key = computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Any]:
    """Returns AUC derived from binary confustion matrices."""
    matrices = metrics[matrices_key]

    metric = tf.keras.metrics.deserialize(metric_config)
    if (isinstance(metric, tf.keras.metrics.AUC) or
        isinstance(metric, tf.keras.metrics.SpecificityAtSensitivity) or
        isinstance(metric, tf.keras.metrics.SensitivityAtSpecificity)):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.true_negatives.assign(np.array(matrices.tn))
      metric.false_positives.assign(np.array(matrices.fp))
      metric.false_negatives.assign(np.array(matrices.fn))
    elif isinstance(metric, tf.keras.metrics.Precision):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.false_positives.assign(np.array(matrices.fp))
    elif isinstance(metric, tf.keras.metrics.Recall):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.false_negatives.assign(np.array(matrices.fn))
    elif isinstance(metric, tf.keras.metrics.TruePositives):
      metric.accumulator.assign(np.array(matrices.tp))
    elif isinstance(metric, tf.keras.metrics.FalsePositives):
      metric.accumulator.assign(np.array(matrices.fp))
    elif isinstance(metric, tf.keras.metrics.TrueNegatives):
      metric.accumulator.assign(np.array(matrices.tn))
    elif isinstance(metric, tf.keras.metrics.FalseNegatives):
      metric.accumulator.assign(np.array(matrices.fn))
    return {key: metric.result().numpy()}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations.append(derived_computation)
  return computations
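The config rewriting above operates on the dict returned by tf.keras.metrics.serialize. A short sketch of that round trip; the 'class_name' and 'config' key names reflect the assumptions behind _CONFIG_KEY and _NUM_THRESHOLDS_KEY in this snippet and may vary across keras versions.

import tensorflow as tf

metric_config = tf.keras.metrics.serialize(tf.keras.metrics.AUC(num_thresholds=200))
print(metric_config['class_name'])                # 'AUC'
print(metric_config['config']['num_thresholds'])  # 200 (the keras default)
rebuilt = tf.keras.metrics.deserialize(metric_config)
print(rebuilt.num_thresholds)                     # 200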
    def testBinaryConfusionMatricesTopK(self):
        computations = binary_confusion_matrices.binary_confusion_matrices(
            thresholds=[float('-inf')],
            sub_key=metric_types.SubKey(top_k=3),
            use_histogram=True)
        histogram = computations[0]
        matrices = computations[1]

        example1 = {
            'labels': np.array([2]),
            'predictions': np.array([0.1, 0.2, 0.1, 0.25, 0.35]),
            'example_weights': np.array([1.0])
        }
        example2 = {
            'labels': np.array([1]),
            'predictions': np.array([0.2, 0.3, 0.05, 0.15, 0.3]),
            'example_weights': np.array([1.0])
        }
        example3 = {
            'labels': np.array([3]),
            'predictions': np.array([0.01, 0.2, 0.09, 0.5, 0.2]),
            'example_weights': np.array([1.0])
        }
        example4 = {
            'labels': np.array([4]),
            'predictions': np.array([0.3, 0.2, 0.05, 0.4, 0.05]),
            'example_weights': np.array([1.0])
        }

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(
                    [example1, example2, example3, example4])
                | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
                | 'AddSlice' >> beam.Map(lambda x: ((), x))
                | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
                | 'ComputeMatrices' >> beam.Map(lambda x:
                                                (x[0], matrices.result(x[1])))
            )  # pyformat: disable

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())
                    self.assertLen(got_metrics, 1)
                    key = metric_types.MetricKey(
                        name='_binary_confusion_matrices_[-inf]',
                        sub_key=metric_types.SubKey(top_k=3))
                    self.assertIn(key, got_metrics)
                    got_matrices = got_metrics[key]
                    self.assertEqual(
                        got_matrices,
                        binary_confusion_matrices.Matrices(
                            thresholds=[float('-inf')],
                            tp=[2.0],
                            fp=[10.0],
                            tn=[6.0],
                            fn=[2.0],
                            tp_examples=[],
                            tn_examples=[],
                            fp_examples=[],
                            fn_examples=[]))

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
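For reference, the expected tp=2, fp=10, tn=6, fn=2 above can be reproduced by treating the 3 highest-scoring classes of each example as positive predictions and one-hot encoding the label; the sketch below is an illustrative re-derivation, not the library's implementation.

import numpy as np

labels = [2, 1, 3, 4]
predictions = np.array([
    [0.1, 0.2, 0.1, 0.25, 0.35],
    [0.2, 0.3, 0.05, 0.15, 0.3],
    [0.01, 0.2, 0.09, 0.5, 0.2],
    [0.3, 0.2, 0.05, 0.4, 0.05],
])
tp = fp = tn = fn = 0
for label, scores in zip(labels, predictions):
    top3 = set(np.argsort(-scores)[:3])  # indices of the 3 highest scores
    for cls in range(len(scores)):
        predicted, actual = cls in top3, cls == label
        tp += predicted and actual
        fp += predicted and not actual
        tn += not predicted and not actual
        fn += not predicted and actual
print(tp, fp, tn, fn)  # 2 10 6 2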
def flip_count(
        counterfactual_prediction_key: Optional[str] = None,
        example_id_key: Optional[str] = None,
        example_ids_count: int = DEFAULT_NUM_EXAMPLE_IDS,
        name: str = FLIP_COUNT_NAME,
        thresholds: Sequence[float] = DEFAULT_THRESHOLDS,
        model_name: str = '',
        output_name: str = '',
        eval_config: Optional[config_pb2.EvalConfig] = None,
        example_weighted: bool = False) -> metric_types.MetricComputations:
    """Returns metric computations for computing flip counts."""
    keys, metric_key_by_name_by_threshold = create_metric_keys(
        thresholds, METRICS_LIST, name, model_name, output_name,
        example_weighted)

    feature_keys = [counterfactual_prediction_key]
    if example_id_key:
        feature_keys.append(example_id_key)

    def extract_label_prediction_and_weight(
        inputs: metric_types.StandardMetricInputs,
        eval_config: Optional[config_pb2.EvalConfig] = None,
        model_name: str = '',
        output_name: str = '',
        sub_key: Optional[metric_types.SubKey] = None,
        aggregation_type: Optional[metric_types.AggregationType] = None,
        class_weights: Optional[Dict[int, float]] = None,
        example_weighted: bool = False,
        fractional_labels: bool = False,
        flatten: bool = True,
    ) -> Iterator[Tuple[np.ndarray, np.ndarray, np.ndarray]]:
        """Yields label, prediction, and example weights to be used in calculations.

    This is a customized version of metric_util.to_label_prediction_example_weight
    which yields the original prediction as the label and the counterfactual
    prediction as the prediction, so that flip count metrics can be derived from
    the false positives and false negatives of the binary confusion matrix.

    Args:
      inputs: Standard metric inputs.
      eval_config: Eval config
      model_name: Optional model name (if multi-model evaluation).
      output_name: Optional output name (if multi-output model type).
      sub_key: Optional sub key. (unused)
      aggregation_type: Optional aggregation type. (unused)
      class_weights: Optional class weights to apply to multi-class /
        multi-label labels and predictions. (unused)
      example_weighted: True if example weights should be applied.
      fractional_labels: If true, each incoming tuple of (label, prediction,
        example weight) will be split into two tuples as follows (where l, p, w
        represent the resulting label, prediction, and example weight values):
          (1) l = 0.0, p = prediction, and w = example_weight * (1.0 - label)
          (2) l = 1.0, p = prediction, and w = example_weight * label If
          enabled, an exception will be raised if labels are not within [0, 1].
          The implementation is such that tuples associated with a weight of
          zero are not yielded. This means it is safe to enable
          fractional_labels even when the labels only take on the values of 0.0
          or 1.0. (unused)
      flatten: True to flatten the final label and prediction outputs so that
        the yielded values are always arrays of size 1. For example, multi-class
        /multi-label outputs would be converted into label and prediction pairs
        that could then be processed by a binary classification metric in order
        to compute a micro average over all classes. (unused)

    Yields:
      Tuple of (label, prediction, example_weight).

    Raises:
      ValueError: If counterfactual prediction key is not found within either
        the features or predictions.
      ValueError: If predictions is None or empty.
    """
        del (sub_key, aggregation_type, class_weights, fractional_labels,
             flatten)  # unused

        # TODO(sokeefe): Look into removing the option to pass counterfactual
        # predictions as a feature in favor of passing them as a baseline model.
        if (counterfactual_prediction_key is not None
                and counterfactual_prediction_key in inputs.features):
            counterfactual_prediction = inputs.features[
                counterfactual_prediction_key]
        elif eval_config is not None:
            counterfactual_model_spec = model_util.get_baseline_model_spec(
                eval_config)
            if counterfactual_model_spec is not None:
                _, counterfactual_prediction, _ = next(
                    metric_util.to_label_prediction_example_weight(
                        inputs,
                        eval_config=eval_config,
                        model_name=counterfactual_model_spec.name,
                        output_name=output_name,
                        example_weighted=example_weighted,
                        fractional_labels=
                        False,  # Labels are ignored for flip counts.
                        flatten=False,  # Flattened below
                        allow_none=True,  # Allow None labels
                        require_single_example_weight=True))
            else:
                raise ValueError(
                    'The Counterfactual model must be listed with '
                    f'`is_baseline` equal to `True`. Found: {eval_config}')
        else:
            raise ValueError(
                '`counterfactual_prediction` was not found within the provided '
                'inputs. It must be included as either a feature key or within the '
                'predictions. Found:\n'
                f'`counterfactual_prediction_key`: {counterfactual_prediction_key}\n'
                f'`inputs.prediction`:{inputs.prediction}')

        if counterfactual_prediction is None:
            raise ValueError(
                '%s feature key is None (required for FlipCount metric)' %
                counterfactual_prediction_key)

        def get_by_keys(value: Any, keys: List[str]) -> Any:
            if isinstance(value, dict):
                new_value = util.get_by_keys(value, keys, optional=True)
                if new_value is not None:
                    return new_value
            return value

        if model_name:
            counterfactual_prediction = get_by_keys(counterfactual_prediction,
                                                    [model_name])
        if output_name:
            counterfactual_prediction = get_by_keys(counterfactual_prediction,
                                                    [output_name])

        _, prediction, example_weight = next(
            metric_util.to_label_prediction_example_weight(
                inputs,
                eval_config=eval_config,
                model_name=model_name,
                output_name=output_name,
                example_weighted=example_weighted,
                fractional_labels=False,  # Labels are ignored for flip counts.
                flatten=False,  # Flattened below
                allow_none=True,  # Allow None labels
                require_single_example_weight=True))

        if prediction.size != counterfactual_prediction.size:
            raise ValueError(
                'prediction and counterfactual_prediction size should be the same '
                'for FlipCount metric, %d != %d' %
                (prediction.size, counterfactual_prediction.size))

        if prediction.size == 0:
            raise ValueError(
                'prediction is empty (required for FlipCount metric)')
        else:  # Always flatten
            example_weight = np.array(
                [float(example_weight) for i in range(prediction.shape[-1])])
            for p, cfp, w in zip(prediction.flatten(),
                                 counterfactual_prediction.flatten(),
                                 example_weight.flatten()):
                yield np.array([p]), np.array([cfp]), np.array([w])

    # Set fractional_labels to False since the prediction is being used as the
    # label and it can be a non-binary value.
    computations = binary_confusion_matrices.binary_confusion_matrices(
        thresholds=list(thresholds),
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        example_weighted=example_weighted,
        extract_label_prediction_and_weight=extract_label_prediction_and_weight,
        preprocessor=metric_types.FeaturePreprocessor(
            feature_keys=feature_keys),
        example_id_key=example_id_key,
        example_ids_count=example_ids_count,
        fractional_labels=False)
    examples_metric_key, matrices_metric_key = computations[-1].keys

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Any]:
        """Returns flip count metrics values."""
        matrix = metrics[matrices_metric_key]
        examples = metrics[examples_metric_key]

        output = {}
        for i, threshold in enumerate(matrix.thresholds):
            output[metric_key_by_name_by_threshold[threshold]
                   ['positive_to_negative']] = matrix.fn[i]
            output[metric_key_by_name_by_threshold[threshold]
                   ['negative_to_positive']] = matrix.fp[i]
            output[metric_key_by_name_by_threshold[threshold]
                   ['positive_to_negative_examples_ids']] = np.array(
                       examples.fn_examples[i])
            output[metric_key_by_name_by_threshold[threshold]
                   ['negative_to_positive_examples_ids']] = np.array(
                       examples.fp_examples[i])
            output[metric_key_by_name_by_threshold[threshold]
                   ['positive_examples_count']] = matrix.fn[i] + matrix.tp[i]
            output[metric_key_by_name_by_threshold[threshold]
                   ['negative_examples_count']] = matrix.fp[i] + matrix.tn[i]

        return output

    derived_computation = metric_types.DerivedMetricComputation(keys=keys,
                                                                result=result)

    computations.append(derived_computation)
    return computations
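Conceptually, the flip counts above measure how many examples cross the decision threshold when the counterfactual prediction is substituted for the original one; because the original prediction plays the role of the label, positive_to_negative maps to fn and negative_to_positive to fp. A simplified, unweighted sketch with made-up scores (the real computation goes through binary_confusion_matrices):

import numpy as np

threshold = 0.5
original = np.array([0.9, 0.8, 0.2, 0.4])        # hypothetical original predictions
counterfactual = np.array([0.3, 0.9, 0.7, 0.1])  # hypothetical counterfactual predictions

orig_positive = original > threshold
cf_positive = counterfactual > threshold
positive_to_negative = int(np.sum(orig_positive & ~cf_positive))  # 1 (0.9 -> 0.3)
negative_to_positive = int(np.sum(~orig_positive & cf_positive))  # 1 (0.2 -> 0.7)
print(positive_to_negative, negative_to_positive)  # 1 1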