Example 1
def default_evaluators(  # pylint: disable=invalid-name
    eval_shared_model: types.EvalSharedModel,
    desired_batch_size: Optional[int] = None,
    num_bootstrap_samples: Optional[int] = None,
    k_anonymization_count: int = 1) -> List[evaluator.Evaluator]:
  """Returns the default evaluators for use in ExtractAndEvaluate.

  Args:
    eval_shared_model: Shared model parameters for EvalSavedModel.
    desired_batch_size: Optional batch size for batching in Aggregate.
    num_bootstrap_samples: Number of bootstrap samples to draw. If more than 1,
      confidence intervals will be computed for metrics. Suggested value is at
      least 20.
    k_anonymization_count: If the number of examples in a specific slice is
      less than k_anonymization_count, then an error will be returned for that
      slice. This helps preserve privacy by not displaying aggregated data for
      slices with too few examples.
  """
  return [
      metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
          eval_shared_model,
          desired_batch_size,
          num_bootstrap_samples=num_bootstrap_samples,
          k_anonymization_count=k_anonymization_count)
  ]
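
A minimal usage sketch for this variant: it wires the evaluators returned above into a Beam pipeline through ExtractAndEvaluate, requesting bootstrap confidence intervals with the suggested 20 samples. It assumes TFMA's model_eval_lib with signatures matching this snippet, an EvalSavedModel already exported to the placeholder eval_export_dir, and serialized tf.Examples at the placeholder data_location; transform and module names can differ between releases.

# Sketch only: `eval_export_dir` and `data_location` are placeholders, and the
# transform names assume the TFMA release this snippet was taken from.
import apache_beam as beam
from tensorflow_model_analysis.api import model_eval_lib

eval_shared_model = model_eval_lib.default_eval_shared_model(
    eval_saved_model_path=eval_export_dir)
extractors = model_eval_lib.default_extractors(
    eval_shared_model=eval_shared_model)
# At least 20 samples are suggested for meaningful confidence intervals, per
# the docstring above.
evaluators = model_eval_lib.default_evaluators(
    eval_shared_model, num_bootstrap_samples=20)

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'ReadExamples' >> beam.io.ReadFromTFRecord(data_location)
      | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
      | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
          extractors=extractors, evaluators=evaluators))

In a real pipeline the resulting Evaluation would be handed to a writer (for example via WriteResults) rather than discarded as it is here.
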
Example 2
def default_evaluators(  # pylint: disable=invalid-name
        eval_shared_model: Optional[types.EvalSharedModel] = None,
        eval_shared_models: Optional[List[types.EvalSharedModel]] = None,
        eval_config: config.EvalConfig = None,
        desired_batch_size: Optional[int] = None,
        compute_confidence_intervals: Optional[bool] = False,
        k_anonymization_count: int = 1) -> List[evaluator.Evaluator]:
    """Returns the default evaluators for use in ExtractAndEvaluate.

    Args:
      eval_shared_model: Shared model (single-model evaluation).
      eval_shared_models: Shared models (multi-model evaluation).
      eval_config: Eval config.
      desired_batch_size: Deprecated (use eval_config).
      compute_confidence_intervals: Deprecated (use eval_config).
      k_anonymization_count: Deprecated (use eval_config).
    """
    # TODO(b/141016373): Add support for multiple models.
    if eval_shared_model is not None:
        eval_shared_models = [eval_shared_model]
    if not eval_config or not eval_config.metrics_specs:
        # Backwards compatibility for previous EvalSavedModel implementation.
        if eval_config is not None:
            desired_batch_size = eval_config.desired_batch_size
            compute_confidence_intervals = eval_config.compute_confidence_intervals
            k_anonymization_count = eval_config.k_anonymization_count
        return [
            metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
                eval_shared_models[0],
                desired_batch_size,
                compute_confidence_intervals=compute_confidence_intervals,
                k_anonymization_count=k_anonymization_count)
        ]
    else:
        raise NotImplementedError('metrics_specs not implemented yet.')
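
A hedged sketch of the transitional eval_config path in this variant. The EvalConfig field names below are inferred from the attribute reads in the function above rather than taken from a released API, and the saved-model path is a placeholder. Because no metrics_specs are set, the call takes the backwards-compatible branch and returns a single legacy MetricsAndPlotsEvaluator instead of raising NotImplementedError.

# Hypothetical sketch: EvalConfig field names inferred from the function above;
# the saved-model path is a placeholder.
from tensorflow_model_analysis import config
from tensorflow_model_analysis.api import model_eval_lib

eval_shared_model = model_eval_lib.default_eval_shared_model(
    eval_saved_model_path='/path/to/eval_saved_model')
eval_config = config.EvalConfig(
    desired_batch_size=100,
    compute_confidence_intervals=True,
    k_anonymization_count=50)

# No metrics_specs on the config, so the legacy evaluator is returned.
evaluators = default_evaluators(
    eval_shared_model=eval_shared_model, eval_config=eval_config)
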
Example 3
def default_evaluators(  # pylint: disable=invalid-name
    eval_shared_model: Optional[Union[types.EvalSharedModel,
                                      Dict[Text,
                                           types.EvalSharedModel]]] = None,
    eval_config: config.EvalConfig = None,
    compute_confidence_intervals: Optional[bool] = False,
    k_anonymization_count: int = 1,
    desired_batch_size: Optional[int] = None,
    serialize: bool = False,
    random_seed_for_testing: Optional[int] = None) -> List[
        evaluator.Evaluator]:
    """Returns the default evaluators for use in ExtractAndEvaluate.

    Args:
      eval_shared_model: Optional shared model (single-model evaluation) or
        dict of shared models keyed by model name (multi-model evaluation).
        Only required if there are metrics to be computed in-graph using the
        model.
      eval_config: Eval config.
      compute_confidence_intervals: Deprecated (use eval_config).
      k_anonymization_count: Deprecated (use eval_config).
      desired_batch_size: Optional batch size for batching in combiner.
      serialize: Deprecated.
      random_seed_for_testing: Provide for deterministic tests only.
    """
    disabled_outputs = []
    if eval_config and eval_config.options:
        disabled_outputs = eval_config.options.disabled_outputs
    if (constants.METRICS_KEY in disabled_outputs
            and constants.PLOTS_KEY in disabled_outputs):
        return []
    if (eval_shared_model and not isinstance(eval_shared_model, dict) and
        ((not eval_shared_model.model_loader.tags
          or eval_constants.EVAL_TAG in eval_shared_model.model_loader.tags)
         and (not eval_config or not eval_config.metrics_specs))):
        # Backwards compatibility for previous add_metrics_callbacks implementation.
        if eval_config is not None:
            if eval_config.options.HasField('compute_confidence_intervals'):
                compute_confidence_intervals = (
                    eval_config.options.compute_confidence_intervals.value)
            if eval_config.options.HasField('k_anonymization_count'):
                k_anonymization_count = eval_config.options.k_anonymization_count.value
        return [
            metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
                eval_shared_model,
                compute_confidence_intervals=compute_confidence_intervals,
                k_anonymization_count=k_anonymization_count,
                desired_batch_size=desired_batch_size,
                serialize=serialize,
                random_seed_for_testing=random_seed_for_testing)
        ]
    else:
        return [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config, eval_shared_model=eval_shared_model)
        ]
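Example 4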
def default_evaluators(  # pylint: disable=invalid-name
        eval_shared_model: Optional[types.EvalSharedModel] = None,
        eval_shared_models: Optional[List[types.EvalSharedModel]] = None,
        eval_config: config.EvalConfig = None,
        desired_batch_size: Optional[int] = None,
        compute_confidence_intervals: Optional[bool] = False,
        k_anonymization_count: int = 1,
        serialize: bool = False) -> List[evaluator.Evaluator]:
    """Returns the default evaluators for use in ExtractAndEvaluate.

    Args:
      eval_shared_model: Optional shared model (single-model evaluation).
        Required if any of the metrics are derived or computed using the
        model.
      eval_shared_models: Optional shared models (multi-model evaluation).
        Required if any of the metrics are derived or computed using the
        model.
      eval_config: Eval config.
      desired_batch_size: Deprecated (use eval_config).
      compute_confidence_intervals: Deprecated (use eval_config).
      k_anonymization_count: Deprecated (use eval_config).
      serialize: Deprecated.
    """
    # TODO(b/141016373): Add support for multiple models.
    if eval_shared_model is not None:
        eval_shared_models = [eval_shared_model]
    disabled_outputs = []
    if eval_config and eval_config.output_data_specs:
        disabled_outputs = eval_config.output_data_specs[0].disabled_outputs
    if (constants.METRICS_KEY in disabled_outputs
            and constants.PLOTS_KEY in disabled_outputs):
        return []
    if ((not eval_shared_models[0].model_loader.tags
         or eval_constants.EVAL_TAG in eval_shared_models[0].model_loader.tags)
            and (not eval_config or not eval_config.metrics_specs)):
        # Backwards compatibility for previous EvalSavedModel implementation.
        if eval_config is not None:
            if eval_config.options.HasField('desired_batch_size'):
                desired_batch_size = eval_config.options.desired_batch_size.value
            if eval_config.options.HasField('compute_confidence_intervals'):
                compute_confidence_intervals = (
                    eval_config.options.compute_confidence_intervals.value)
            if eval_config.options.HasField('k_anonymization_count'):
                k_anonymization_count = eval_config.options.k_anonymization_count.value
        return [
            metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
                eval_shared_models[0],
                desired_batch_size,
                compute_confidence_intervals=compute_confidence_intervals,
                k_anonymization_count=k_anonymization_count,
                serialize=serialize)
        ]
    else:
        return [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config, eval_shared_models=eval_shared_models)
        ]
Example 5
def default_evaluators(  # pylint: disable=invalid-name
        eval_shared_model,
        desired_batch_size=None):
    """Returns the default evaluators for use in ExtractAndEvaluate.

    Args:
      eval_shared_model: Shared model parameters for EvalSavedModel.
      desired_batch_size: Optional batch size for batching in Aggregate.
    """
    return [
        metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
            eval_shared_model, desired_batch_size)
    ]
Example 6
def default_evaluators(  # pylint: disable=invalid-name
        eval_shared_model,
        desired_batch_size=None,
        num_bootstrap_samples=None):
    """Returns the default evaluators for use in ExtractAndEvaluate.

    Args:
      eval_shared_model: Shared model parameters for EvalSavedModel.
      desired_batch_size: Optional batch size for batching in Aggregate.
      num_bootstrap_samples: Number of bootstrap samples to draw. If more than
        1, confidence intervals will be computed for metrics. Suggested value
        is at least 20.
    """
    return [
        metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
            eval_shared_model,
            desired_batch_size,
            num_bootstrap_samples=num_bootstrap_samples)
    ]
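Example 7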
  def testWriteMetricsAndPlots(self):
    metrics_file = os.path.join(self._getTempDir(), 'metrics')
    plots_file = os.path.join(self._getTempDir(), 'plots')
    temp_eval_export_dir = os.path.join(self._getTempDir(), 'eval_export_dir')

    _, eval_export_dir = (
        fixed_prediction_estimator.simple_fixed_prediction_estimator(
            None, temp_eval_export_dir))
    eval_config = config.EvalConfig(
        model_specs=[config.ModelSpec()],
        options=config.Options(
            disabled_outputs={'values': ['eval_config.json']}))
    eval_shared_model = self.createTestEvalSharedModel(
        eval_saved_model_path=eval_export_dir,
        add_metrics_callbacks=[
            post_export_metrics.example_count(),
            post_export_metrics.calibration_plot_and_prediction_histogram(
                num_buckets=2)
        ])
    extractors = [
        predict_extractor.PredictExtractor(eval_shared_model),
        slice_key_extractor.SliceKeyExtractor()
    ]
    evaluators = [
        metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(eval_shared_model)
    ]
    output_paths = {
        constants.METRICS_KEY: metrics_file,
        constants.PLOTS_KEY: plots_file
    }
    writers = [
        metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
            output_paths, eval_shared_model.add_metrics_callbacks)
    ]

    with beam.Pipeline() as pipeline:
      example1 = self._makeExample(prediction=0.0, label=1.0)
      example2 = self._makeExample(prediction=1.0, label=1.0)

      # pylint: disable=no-value-for-parameter
      _ = (
          pipeline
          | 'Create' >> beam.Create([
              example1.SerializeToString(),
              example2.SerializeToString(),
          ])
          | 'ExtractEvaluateAndWriteResults' >>
          model_eval_lib.ExtractEvaluateAndWriteResults(
              eval_config=eval_config,
              eval_shared_model=eval_shared_model,
              extractors=extractors,
              evaluators=evaluators,
              writers=writers))
      # pylint: enable=no-value-for-parameter

    expected_metrics_for_slice = text_format.Parse(
        """
        slice_key {}
        metrics {
          key: "average_loss"
          value {
            double_value {
              value: 0.5
            }
          }
        }
        metrics {
          key: "post_export_metrics/example_count"
          value {
            double_value {
              value: 2.0
            }
          }
        }
        """, metrics_for_slice_pb2.MetricsForSlice())

    metric_records = []
    for record in tf.compat.v1.python_io.tf_record_iterator(metrics_file):
      metric_records.append(
          metrics_for_slice_pb2.MetricsForSlice.FromString(record))
    self.assertEqual(1, len(metric_records), 'metrics: %s' % metric_records)
    self.assertProtoEquals(expected_metrics_for_slice, metric_records[0])

    expected_plots_for_slice = text_format.Parse(
        """
      slice_key {}
      plots {
        key: "post_export_metrics"
        value {
          calibration_histogram_buckets {
            buckets {
              lower_threshold_inclusive: -inf
              num_weighted_examples {}
              total_weighted_label {}
              total_weighted_refined_prediction {}
            }
            buckets {
              upper_threshold_exclusive: 0.5
              num_weighted_examples {
                value: 1.0
              }
              total_weighted_label {
                value: 1.0
              }
              total_weighted_refined_prediction {}
            }
            buckets {
              lower_threshold_inclusive: 0.5
              upper_threshold_exclusive: 1.0
              num_weighted_examples {
              }
              total_weighted_label {}
              total_weighted_refined_prediction {}
            }
            buckets {
              lower_threshold_inclusive: 1.0
              upper_threshold_exclusive: inf
              num_weighted_examples {
                value: 1.0
              }
              total_weighted_label {
                value: 1.0
              }
              total_weighted_refined_prediction {
                value: 1.0
              }
            }
          }
        }
      }
    """, metrics_for_slice_pb2.PlotsForSlice())

    plot_records = []
    for record in tf.compat.v1.python_io.tf_record_iterator(plots_file):
      plot_records.append(
          metrics_for_slice_pb2.PlotsForSlice.FromString(record))
    self.assertEqual(1, len(plot_records), 'plots: %s' % plot_records)
    self.assertProtoEquals(expected_plots_for_slice, plot_records[0])
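Example 8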
  def testRunModelAnalysisWithQueryExtractor(self):
    model_location = self._exportEvalSavedModel(
        linear_classifier.simple_linear_classifier)
    examples = [
        self._makeExample(age=3.0, language='english', label=1.0),
        self._makeExample(age=3.0, language='chinese', label=0.0),
        self._makeExample(age=4.0, language='english', label=0.0),
        self._makeExample(age=5.0, language='chinese', label=1.0)
    ]
    data_location = self._writeTFExamplesToTFRecords(examples)
    slice_spec = [slicer.SingleSliceSpec()]
    eval_shared_model = model_eval_lib.default_eval_shared_model(
        eval_saved_model_path=model_location, example_weight_key='age')
    eval_result = model_eval_lib.run_model_analysis(
        eval_shared_model=eval_shared_model,
        data_location=data_location,
        slice_spec=slice_spec,
        evaluators=[
            metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
                eval_shared_model),
            query_based_metrics_evaluator.QueryBasedMetricsEvaluator(
                query_id='language',
                prediction_key='logistic',
                combine_fns=[
                    query_statistics.QueryStatisticsCombineFn(),
                    ndcg.NdcgMetricCombineFn(
                        at_vals=[1], gain_key='label', weight_key='')
                ]),
        ])
    # We only check some of the metrics to ensure that the end-to-end
    # pipeline works.
    expected = {
        (): {
            'post_export_metrics/total_queries': {
                'doubleValue': 2.0
            },
            'post_export_metrics/min_documents': {
                'doubleValue': 2.0
            },
            'post_export_metrics/max_documents': {
                'doubleValue': 2.0
            },
            'post_export_metrics/total_documents': {
                'doubleValue': 4.0
            },
            'post_export_metrics/ndcg@1': {
                'doubleValue': 0.5
            },
            'post_export_metrics/example_weight': {
                'doubleValue': 15.0
            },
            'post_export_metrics/example_count': {
                'doubleValue': 4.0
            },
        }
    }
    self.assertEqual(eval_result.config.model_location, model_location)
    self.assertEqual(eval_result.config.data_location, data_location)
    self.assertEqual(eval_result.config.slice_spec, slice_spec)
    self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
    self.assertFalse(eval_result.plots)
Example 9
def default_evaluators(  # pylint: disable=invalid-name
    eval_shared_model: Optional[Union[types.EvalSharedModel,
                                      Dict[Text,
                                           types.EvalSharedModel]]] = None,
    eval_config: config.EvalConfig = None,
    compute_confidence_intervals: Optional[bool] = False,
    k_anonymization_count: int = 1,
    desired_batch_size: Optional[int] = None,
    serialize: bool = False,
    random_seed_for_testing: Optional[int] = None) -> List[evaluator.Evaluator]:
  """Returns the default evaluators for use in ExtractAndEvaluate.

  Args:
    eval_shared_model: Optional shared model (single-model evaluation) or dict
      of shared models keyed by model name (multi-model evaluation). Only
      required if there are metrics to be computed in-graph using the model.
    eval_config: Eval config.
    compute_confidence_intervals: Deprecated (use eval_config).
    k_anonymization_count: Deprecated (use eval_config).
    desired_batch_size: Optional batch size for batching in combiner.
    serialize: Deprecated.
    random_seed_for_testing: Provide for deterministic tests only.
  """
  disabled_outputs = []
  if eval_config:
    eval_config = config.update_eval_config_with_defaults(eval_config)
    disabled_outputs = eval_config.options.disabled_outputs.values
    if model_util.get_model_types(eval_config) == set([constants.TF_LITE]):
      # no in-graph metrics present when tflite is used.
      if eval_shared_model:
        if isinstance(eval_shared_model, dict):
          eval_shared_model = {
              k: v._replace(include_default_metrics=False)
              for k, v in eval_shared_model.items()
          }
        else:
          eval_shared_model = eval_shared_model._replace(
              include_default_metrics=False)
  if (constants.METRICS_KEY in disabled_outputs and
      constants.PLOTS_KEY in disabled_outputs):
    return []
  if _is_legacy_eval(eval_shared_model, eval_config):
    # Backwards compatibility for previous add_metrics_callbacks implementation.
    if eval_config is not None:
      if eval_config.options.HasField('compute_confidence_intervals'):
        compute_confidence_intervals = (
            eval_config.options.compute_confidence_intervals.value)
      if eval_config.options.HasField('k_anonymization_count'):
        k_anonymization_count = eval_config.options.k_anonymization_count.value
    return [
        metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
            eval_shared_model,
            compute_confidence_intervals=compute_confidence_intervals,
            k_anonymization_count=k_anonymization_count,
            desired_batch_size=desired_batch_size,
            serialize=serialize,
            random_seed_for_testing=random_seed_for_testing)
    ]
  else:
    return [
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=eval_config, eval_shared_model=eval_shared_model)
    ]
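
A minimal end-to-end sketch for this newer API, following the pattern of the testWriteMetricsAndPlots example above. The paths (eval_export_dir, data_location, output_path) are placeholders and the keyword arguments of ExtractEvaluateAndWriteResults are assumed from the surrounding examples; they may differ across releases. Whether the legacy evaluator or the v2 MetricsAndPlotsEvaluator ends up being used depends on how the model was exported and on whether the EvalConfig carries metrics_specs, exactly as decided by the function above.

# Sketch only: paths are placeholders and parameter names are assumed from the
# examples above rather than from a specific pinned TFMA release.
import apache_beam as beam
from tensorflow_model_analysis import config
from tensorflow_model_analysis.api import model_eval_lib

eval_config = config.EvalConfig(
    model_specs=[config.ModelSpec()],
    options=config.Options(
        disabled_outputs={'values': ['eval_config.json']}))
eval_shared_model = model_eval_lib.default_eval_shared_model(
    eval_saved_model_path=eval_export_dir)
evaluators = model_eval_lib.default_evaluators(
    eval_shared_model=eval_shared_model, eval_config=eval_config)

with beam.Pipeline() as pipeline:
  _ = (
      pipeline
      | 'ReadSerializedExamples' >> beam.io.ReadFromTFRecord(data_location)
      | 'ExtractEvaluateAndWriteResults' >>
      model_eval_lib.ExtractEvaluateAndWriteResults(
          eval_config=eval_config,
          eval_shared_model=eval_shared_model,
          evaluators=evaluators,
          output_path=output_path))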