def default_regression_specs(
    model_names: Optional[List[Text]] = None,
    output_names: Optional[List[Text]] = None,
    loss_functions: Optional[List[Union[tf.keras.metrics.Metric,
                                        tf.keras.losses.Loss]]] = None,
    min_value: Optional[float] = None,
    max_value: Optional[float] = None) -> List[config.MetricsSpec]:
  """Builds the default metric specs for regression problems.

  The metric list is accuracy, mean label, mean prediction and calibration,
  followed by the loss functions and, only when both `min_value` and
  `max_value` are given, a calibration plot.

  Args:
    model_names: Optional model names (if multi-model evaluation).
    output_names: Optional list of output names (if multi-output model).
    loss_functions: Loss functions to use (if None MSE is used).
    min_value: Passed as `left` to the calibration plot (if None no plot will
      be created).
    max_value: Passed as `right` to the calibration plot (if None no plot will
      be created).

  Returns:
    Whatever `specs_from_metrics` returns for the assembled metrics.
  """
  # Fall back to mean squared error when the caller supplies no losses.
  if loss_functions is None:
    loss_functions = [tf.keras.metrics.MeanSquaredError(name='mse')]

  metrics = [
      tf.keras.metrics.Accuracy(name='accuracy'),
      calibration.MeanLabel(name='mean_label'),
      calibration.MeanPrediction(name='mean_prediction'),
      calibration.Calibration(name='calibration'),
  ]
  metrics.extend(loss_functions)

  # The plot needs both bounds; a single bound is silently ignored.
  plot_requested = not (min_value is None or max_value is None)
  if plot_requested:
    plot = calibration_plot.CalibrationPlot(
        name='calibration_plot', left=min_value, right=max_value)
    metrics.append(plot)

  return specs_from_metrics(
      metrics, model_names=model_names, output_names=output_names)
 def testCalibrationPlotWithSchema(self, eval_config, schema, model_names,
                                   output_names, expected_left,
                                   expected_range):
     """Checks the histogram combiner bounds of a 10-bucket CalibrationPlot.

     The plot's computations are requested with the given eval_config,
     schema, model_names and output_names; the first computation's combiner
     must then expose `expected_left` as `_left` and `expected_range` as
     `_range`.
     """
     plot_metric = calibration_plot.CalibrationPlot(num_buckets=10)
     all_computations = plot_metric.computations(
         eval_config=eval_config,
         schema=schema,
         model_names=model_names,
         output_names=output_names)
     # The first computation is the one whose combiner is inspected here.
     histogram_computation = all_computations[0]
     self.assertEqual(expected_left, histogram_computation.combiner._left)
     self.assertEqual(expected_range, histogram_computation.combiner._range)
Exemple #3
0
def default_binary_classification_specs(
        model_names: Optional[List[Text]] = None,
        output_names: Optional[List[Text]] = None,
        output_weights: Optional[Dict[Text, float]] = None,
        binarize: Optional[config.BinarizationOptions] = None,
        aggregate: Optional[config.AggregationOptions] = None,
        include_loss: bool = True) -> List[config.MetricsSpec]:
    """Builds the default metric specs for binary classification problems.

    Args:
      model_names: Optional model names (if multi-model evaluation).
      output_names: Optional list of output names (if multi-output model).
      output_weights: Optional output weights for creating overall metric
        aggregated across outputs (if multi-output model). If a weight is not
        provided for an output, its weight defaults to 0.0 (i.e. output
        ignored).
      binarize: Optional settings for binarizing multi-class/multi-label
        metrics.
      aggregate: Optional settings for aggregating multi-class/multi-label
        metrics.
      include_loss: True to append a binary cross-entropy metric named 'loss'.

    Returns:
      Whatever `specs_from_metrics` returns for the assembled metrics.
    """
    # Both AUC metrics share the same threshold count.
    threshold_count = binary_confusion_matrices.DEFAULT_NUM_THRESHOLDS

    metrics = [
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.AUC(name='auc', num_thresholds=threshold_count),
        # Original note: matches default name used by estimator.
        # NOTE(review): the name is spelled 'precison' (sic). It is kept
        # unchanged because results are keyed by this string -- confirm
        # against the estimator's metric name before renaming.
        tf.keras.metrics.AUC(
            name='auc_precison_recall',
            curve='PR',
            num_thresholds=threshold_count),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        calibration.MeanLabel(name='mean_label'),
        calibration.MeanPrediction(name='mean_prediction'),
        calibration.Calibration(name='calibration'),
        confusion_matrix_plot.ConfusionMatrixPlot(
            name='confusion_matrix_plot'),
        calibration_plot.CalibrationPlot(name='calibration_plot'),
    ]
    # The loss metric always comes last, and only when requested.
    if include_loss:
        metrics += [tf.keras.metrics.BinaryCrossentropy(name='loss')]

    spec_options = dict(
        model_names=model_names,
        output_names=output_names,
        output_weights=output_weights,
        binarize=binarize,
        aggregate=aggregate,
    )
    return specs_from_metrics(metrics, **spec_options)
def default_binary_classification_specs(
        model_names: Optional[List[Text]] = None,
        output_names: Optional[List[Text]] = None,
        class_ids: Optional[List[int]] = None,
        k_list: Optional[List[int]] = None,
        top_k_list: Optional[List[int]] = None,
        include_loss: bool = True) -> List[config.MetricsSpec]:
    """Builds the default metric specs for binary classification problems.

    Args:
      model_names: Optional model names (if multi-model evaluation).
      output_names: Optional list of output names (if multi-output model).
      class_ids: Optional class IDs to compute metrics for particular classes
        in a multi-class model. If output_names are provided, all outputs are
        assumed to use the same class IDs.
      k_list: Optional list of k values to compute metrics for the kth
        predicted values of a multi-class model prediction. If output_names
        are provided, all outputs are assumed to use the same k value.
      top_k_list: Optional list of top_k values to compute metrics for the top
        k predicted values in a multi-class model prediction. If output_names
        are provided, all outputs are assumed to use the same top_k value.
        Metrics and plots will be based on treating each predicted value in
        the top_k as though they were separate predictions.
      include_loss: True to append a binary cross-entropy metric named 'loss'.

    Returns:
      Whatever `specs_from_metrics` returns for the assembled metrics.
    """
    metrics = [
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.AUC(name='auc_pr', curve='PR'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        calibration.MeanLabel(name='mean_label'),
        calibration.MeanPrediction(name='mean_prediction'),
        calibration.Calibration(name='calibration'),
        auc_plot.AUCPlot(name='auc_plot'),
        calibration_plot.CalibrationPlot(name='calibration_plot'),
    ]
    # The loss metric always comes last, and only when requested.
    if include_loss:
        metrics += [tf.keras.metrics.BinaryCrossentropy(name='loss')]

    spec_options = dict(
        model_names=model_names,
        output_names=output_names,
        class_ids=class_ids,
        k_list=k_list,
        top_k_list=top_k_list,
    )
    return specs_from_metrics(metrics, **spec_options)
def default_binary_classification_specs(
        model_names: Optional[List[Text]] = None,
        output_names: Optional[List[Text]] = None,
        binarize: Optional[config.BinarizationOptions] = None,
        aggregate: Optional[config.AggregationOptions] = None,
        include_loss: bool = True) -> List[config.MetricsSpec]:
    """Builds the default metric specs for binary classification problems.

    Args:
      model_names: Optional model names (if multi-model evaluation).
      output_names: Optional list of output names (if multi-output model).
      binarize: Optional settings for binarizing multi-class/multi-label
        metrics.
      aggregate: Optional settings for aggregating multi-class/multi-label
        metrics.
      include_loss: True to append a binary cross-entropy metric named 'loss'.

    Returns:
      Whatever `specs_from_metrics` returns for the assembled metrics.
    """
    metrics = [
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.AUC(name='auc_pr', curve='PR'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        calibration.MeanLabel(name='mean_label'),
        calibration.MeanPrediction(name='mean_prediction'),
        calibration.Calibration(name='calibration'),
        auc_plot.AUCPlot(name='auc_plot'),
        calibration_plot.CalibrationPlot(name='calibration_plot'),
    ]
    # The loss metric always comes last, and only when requested.
    if include_loss:
        metrics += [tf.keras.metrics.BinaryCrossentropy(name='loss')]

    spec_options = dict(
        model_names=model_names,
        output_names=output_names,
        binarize=binarize,
        aggregate=aggregate,
    )
    return specs_from_metrics(metrics, **spec_options)
    def testCalibrationPlot(self):
        """Runs a 10-bucket CalibrationPlot over eight weighted examples.

        The examples are combined per slice with the histogram combiner, the
        plot is derived from the combined histogram, and the result is
        compared against the expected bucket proto text.
        """
        plot_computations = calibration_plot.CalibrationPlot(
            num_buckets=10).computations()
        histogram = plot_computations[0]
        plot = plot_computations[1]

        # One row per example: (label, prediction, example weight).
        example_rows = [
            (0.0, 0.2, 1.0),
            (1.0, 0.8, 2.0),
            (0.0, 0.5, 3.0),
            (1.0, -0.1, 4.0),
            (1.0, 0.5, 5.0),
            (1.0, 0.8, 6.0),
            (0.0, 0.2, 7.0),
            (1.0, 1.1, 8.0),
        ]
        examples = [{
            'labels': np.array([label]),
            'predictions': np.array([prediction]),
            'example_weights': np.array([weight])
        } for label, prediction, weight in example_rows]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(examples)
                | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
                | 'AddSlice' >> beam.Map(lambda inputs: ((), inputs))
                | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
                | 'ComputePlot' >> beam.Map(
                    lambda keyed: (keyed[0], plot.result(keyed[1]))))

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                expected_plot_text = """
              buckets {
                lower_threshold_inclusive: -inf
                upper_threshold_exclusive: 0.0
                total_weighted_label {
                  value: 4.0
                }
                total_weighted_refined_prediction {
                  value: -0.4
                }
                num_weighted_examples {
                  value: 4.0
                }
              }
              buckets {
                lower_threshold_inclusive: 0.0
                upper_threshold_exclusive: 0.1
                total_weighted_label {
                }
                total_weighted_refined_prediction {
                }
                num_weighted_examples {
                }
              }
              buckets {
                lower_threshold_inclusive: 0.1
                upper_threshold_exclusive: 0.2
                total_weighted_label {
                }
                total_weighted_refined_prediction {
                }
                num_weighted_examples {
                }
              }
              buckets {
                lower_threshold_inclusive: 0.2
                upper_threshold_exclusive: 0.3
                total_weighted_label {
                }
                total_weighted_refined_prediction {
                  value: 1.6
                }
                num_weighted_examples {
                  value: 8.0
                }
              }
              buckets {
                lower_threshold_inclusive: 0.3
                upper_threshold_exclusive: 0.4
                total_weighted_label {
                }
                total_weighted_refined_prediction {
                }
                num_weighted_examples {
                }
              }
              buckets {
                lower_threshold_inclusive: 0.4
                upper_threshold_exclusive: 0.5
                total_weighted_label {
                }
                total_weighted_refined_prediction {
                }
                num_weighted_examples {
                }
              }
              buckets {
                lower_threshold_inclusive: 0.5
                upper_threshold_exclusive: 0.6
                total_weighted_label {
                  value: 5.0
                }
                total_weighted_refined_prediction {
                  value: 4.0
                }
                num_weighted_examples {
                  value: 8.0
                }
              }
              buckets {
                lower_threshold_inclusive: 0.6
                upper_threshold_exclusive: 0.7
                total_weighted_label {
                }
                total_weighted_refined_prediction {
                }
                num_weighted_examples {
                }
              }
              buckets {
                lower_threshold_inclusive: 0.7
                upper_threshold_exclusive: 0.8
                total_weighted_label {
                }
                total_weighted_refined_prediction {
                }
                num_weighted_examples {
                }
              }
              buckets {
                lower_threshold_inclusive: 0.8
                upper_threshold_exclusive: 0.9
                total_weighted_label {
                  value: 8.0
                }
                total_weighted_refined_prediction {
                  value: 6.4
                }
                num_weighted_examples {
                  value: 8.0
                }
              }
              buckets {
                lower_threshold_inclusive: 0.9
                upper_threshold_exclusive: 1.0
                total_weighted_label {
                }
                total_weighted_refined_prediction {
                }
                num_weighted_examples {
                }
              }
              buckets {
                lower_threshold_inclusive: 1.0
                upper_threshold_exclusive: inf
                total_weighted_label {
                  value: 8.0
                }
                total_weighted_refined_prediction {
                  value: 8.8
                }
                num_weighted_examples {
                  value: 8.0
                }
              }
          """
                try:
                    # Exactly one (slice key, plots) pair for the empty slice.
                    self.assertLen(got, 1)
                    slice_key, plots = got[0]
                    self.assertEqual(slice_key, ())
                    self.assertLen(plots, 1)
                    plot_key = metric_types.PlotKey(name='calibration_plot')
                    self.assertIn(plot_key, plots)
                    self.assertProtoEquals(expected_plot_text,
                                           plots[plot_key])

                except AssertionError as err:
                    # Re-raised as the Beam assertion type used with
                    # util.assert_that below.
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
    def testEvaluateWithBinaryClassificationModel(self):
        """Evaluates a simple DNN classifier export end to end.

        Three examples, weighted by their 'age' feature, are run through the
        extractors and the metrics-and-plots evaluator. The test then checks
        the example counts, the weighted mean label and the number of
        calibration plot buckets for the overall slice.
        """
        class_count = 2
        scratch_dir = self._getExportDir()
        _, export_dir = dnn_classifier.simple_dnn_classifier(
            None, scratch_dir, n_classes=class_count)

        # Add mean_label, example_count, weighted_example_count, calibration_plot
        model_spec = config.ModelSpec(location=export_dir,
                                      label_key='label',
                                      example_weight_key='age')
        overall_slice = config.SlicingSpec()
        requested_metrics = [
            calibration.MeanLabel('mean_label'),
            calibration_plot.CalibrationPlot(name='calibration_plot',
                                             num_buckets=10)
        ]
        eval_config = config.EvalConfig(
            model_specs=[model_spec],
            slicing_specs=[overall_slice],
            metrics_specs=metric_specs.specs_from_metrics(requested_metrics))
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

        slice_spec = [
            slicer.SingleSliceSpec(spec=spec)
            for spec in eval_config.slicing_specs
        ]
        shared_models = [eval_shared_model]
        extractors = [
            input_extractor.InputExtractor(eval_config=eval_config),
            predict_extractor_v2.PredictExtractor(
                eval_config=eval_config, eval_shared_models=shared_models),
            slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
        ]
        evaluators = [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config, eval_shared_models=shared_models)
        ]

        # One row per example: (age, language, label); age is the weight key.
        example_rows = [
            (1.0, 'english', 0.0),
            (2.0, 'chinese', 1.0),
            (3.0, 'chinese', 0.0),
        ]
        examples = [
            self._makeExample(age=age, language=language, label=label)
            for age, language, label in example_rows
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            serialized_examples = [
                example.SerializeToString() for example in examples
            ]
            metrics_and_plots = (
                pipeline
                | 'Create' >> beam.Create(serialized_examples)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                    extractors=extractors, evaluators=evaluators))

            # pylint: enable=no-value-for-parameter

            def check_metrics(got):
                try:
                    self.assertLen(got, 1)
                    slice_key, metrics = got[0]
                    self.assertEqual(slice_key, ())

                    # Weights are the ages 1.0, 2.0, 3.0; labels are 0, 1, 0.
                    total_weight = 1.0 + 2.0 + 3.0
                    weighted_label_sum = 0 * 1.0 + 1 * 2.0 + 0 * 3.0
                    expected_metrics = {
                        metric_types.MetricKey(name='example_count'):
                        3,
                        metric_types.MetricKey(name='weighted_example_count'):
                        total_weight,
                        metric_types.MetricKey(name='mean_label'):
                        weighted_label_sum / total_weight,
                    }
                    self.assertDictElementsAlmostEqual(metrics,
                                                       expected_metrics)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            def check_plots(got):
                try:
                    self.assertLen(got, 1)
                    slice_key, plots = got[0]
                    self.assertEqual(slice_key, ())
                    calibration_key = metric_types.PlotKey('calibration_plot')
                    self.assertIn(calibration_key, plots)
                    # 10 buckets + 2 for edge cases
                    self.assertLen(plots[calibration_key].buckets, 12)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            metrics_output = metrics_and_plots[constants.METRICS_KEY]
            plots_output = metrics_and_plots[constants.PLOTS_KEY]
            util.assert_that(metrics_output, check_metrics, label='metrics')
            util.assert_that(plots_output, check_plots, label='plots')