def default_evaluators(  # pylint: disable=invalid-name
    eval_shared_model: types.EvalSharedModel,
    desired_batch_size: Optional[int] = None,
    num_bootstrap_samples: Optional[int] = None,
    k_anonymization_count: int = 1) -> List[evaluator.Evaluator]:
  """Returns the default evaluators for use in ExtractAndEvaluate.

  Args:
    eval_shared_model: Shared model parameters for EvalSavedModel.
    desired_batch_size: Optional batch size for batching in Aggregate.
    num_bootstrap_samples: Number of bootstrap samples to draw. If more than 1,
      confidence intervals will be computed for metrics. Suggested value is at
      least 20.
    k_anonymization_count: If the number of examples in a specific slice is
      less than k_anonymization_count, an error will be returned for that
      slice. This helps preserve privacy by not displaying aggregated data for
      slices with too few examples.
  """
  return [
      metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
          eval_shared_model,
          desired_batch_size,
          num_bootstrap_samples=num_bootstrap_samples,
          k_anonymization_count=k_anonymization_count)
  ]
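# Usage sketch (not part of the snippets above): one way these evaluators
# might be passed to run_model_analysis, mirroring the pattern in
# testRunModelAnalysisWithQueryExtractor below. default_eval_shared_model and
# run_model_analysis are assumed to live in the same model_eval_lib module as
# default_evaluators, and the paths are placeholders.
eval_shared_model = default_eval_shared_model(
    eval_saved_model_path='/path/to/eval_saved_model')
eval_result = run_model_analysis(
    eval_shared_model=eval_shared_model,
    data_location='/path/to/examples.tfrecord',
    evaluators=default_evaluators(
        eval_shared_model,
        num_bootstrap_samples=20,  # More than 1 enables confidence intervals.
        k_anonymization_count=50))  # Suppress slices with fewer than 50 examples.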
def default_evaluators(  # pylint: disable=invalid-name
    eval_shared_model: Optional[types.EvalSharedModel] = None,
    eval_shared_models: Optional[List[types.EvalSharedModel]] = None,
    eval_config: Optional[config.EvalConfig] = None,
    desired_batch_size: Optional[int] = None,
    compute_confidence_intervals: Optional[bool] = False,
    k_anonymization_count: int = 1) -> List[evaluator.Evaluator]:
  """Returns the default evaluators for use in ExtractAndEvaluate.

  Args:
    eval_shared_model: Shared model (single-model evaluation).
    eval_shared_models: Shared models (multi-model evaluation).
    eval_config: Eval config.
    desired_batch_size: Deprecated (use eval_config).
    compute_confidence_intervals: Deprecated (use eval_config).
    k_anonymization_count: Deprecated (use eval_config).
  """
  # TODO(b/141016373): Add support for multiple models.
  if eval_shared_model is not None:
    eval_shared_models = [eval_shared_model]
  if not eval_config or not eval_config.metrics_specs:
    # Backwards compatibility for the previous EvalSavedModel implementation.
    if eval_config is not None:
      desired_batch_size = eval_config.desired_batch_size
      compute_confidence_intervals = eval_config.compute_confidence_intervals
      k_anonymization_count = eval_config.k_anonymization_count
    return [
        metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
            eval_shared_models[0],
            desired_batch_size,
            compute_confidence_intervals=compute_confidence_intervals,
            k_anonymization_count=k_anonymization_count)
    ]
  else:
    raise NotImplementedError('metrics_specs not implemented yet.')
def default_evaluators(  # pylint: disable=invalid-name
    eval_shared_model: Optional[Union[types.EvalSharedModel,
                                      Dict[Text,
                                           types.EvalSharedModel]]] = None,
    eval_config: Optional[config.EvalConfig] = None,
    compute_confidence_intervals: Optional[bool] = False,
    k_anonymization_count: int = 1,
    desired_batch_size: Optional[int] = None,
    serialize: bool = False,
    random_seed_for_testing: Optional[int] = None
) -> List[evaluator.Evaluator]:
  """Returns the default evaluators for use in ExtractAndEvaluate.

  Args:
    eval_shared_model: Optional shared model (single-model evaluation) or dict
      of shared models keyed by model name (multi-model evaluation). Only
      required if there are metrics to be computed in-graph using the model.
    eval_config: Eval config.
    compute_confidence_intervals: Deprecated (use eval_config).
    k_anonymization_count: Deprecated (use eval_config).
    desired_batch_size: Optional batch size for batching in combiner.
    serialize: Deprecated.
    random_seed_for_testing: Provide for deterministic tests only.
  """
  disabled_outputs = []
  if eval_config and eval_config.options:
    disabled_outputs = eval_config.options.disabled_outputs
  if (constants.METRICS_KEY in disabled_outputs and
      constants.PLOTS_KEY in disabled_outputs):
    return []
  if (eval_shared_model and not isinstance(eval_shared_model, dict) and
      ((not eval_shared_model.model_loader.tags or
        eval_constants.EVAL_TAG in eval_shared_model.model_loader.tags) and
       (not eval_config or not eval_config.metrics_specs))):
    # Backwards compatibility for the previous add_metrics_callbacks
    # implementation.
    if eval_config is not None:
      if eval_config.options.HasField('compute_confidence_intervals'):
        compute_confidence_intervals = (
            eval_config.options.compute_confidence_intervals.value)
      if eval_config.options.HasField('k_anonymization_count'):
        k_anonymization_count = eval_config.options.k_anonymization_count.value
    return [
        metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
            eval_shared_model,
            compute_confidence_intervals=compute_confidence_intervals,
            k_anonymization_count=k_anonymization_count,
            desired_batch_size=desired_batch_size,
            serialize=serialize,
            random_seed_for_testing=random_seed_for_testing)
    ]
  else:
    return [
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=eval_config, eval_shared_model=eval_shared_model)
    ]
def default_evaluators(  # pylint: disable=invalid-name
    eval_shared_model: Optional[types.EvalSharedModel] = None,
    eval_shared_models: Optional[List[types.EvalSharedModel]] = None,
    eval_config: Optional[config.EvalConfig] = None,
    desired_batch_size: Optional[int] = None,
    compute_confidence_intervals: Optional[bool] = False,
    k_anonymization_count: int = 1,
    serialize: bool = False) -> List[evaluator.Evaluator]:
  """Returns the default evaluators for use in ExtractAndEvaluate.

  Args:
    eval_shared_model: Optional shared model (single-model evaluation).
      Required if any of the metrics are derived or computed using the model.
    eval_shared_models: Optional shared models (multi-model evaluation).
      Required if any of the metrics are derived or computed using the model.
    eval_config: Eval config.
    desired_batch_size: Deprecated (use eval_config).
    compute_confidence_intervals: Deprecated (use eval_config).
    k_anonymization_count: Deprecated (use eval_config).
    serialize: Deprecated.
  """
  # TODO(b/141016373): Add support for multiple models.
  if eval_shared_model is not None:
    eval_shared_models = [eval_shared_model]
  disabled_outputs = []
  if eval_config and eval_config.output_data_specs:
    disabled_outputs = eval_config.output_data_specs[0].disabled_outputs
  if (constants.METRICS_KEY in disabled_outputs and
      constants.PLOTS_KEY in disabled_outputs):
    return []
  if ((not eval_shared_models[0].model_loader.tags or
       eval_constants.EVAL_TAG in eval_shared_models[0].model_loader.tags) and
      (not eval_config or not eval_config.metrics_specs)):
    # Backwards compatibility for the previous EvalSavedModel implementation.
    if eval_config is not None:
      if eval_config.options.HasField('desired_batch_size'):
        desired_batch_size = eval_config.options.desired_batch_size.value
      if eval_config.options.HasField('compute_confidence_intervals'):
        compute_confidence_intervals = (
            eval_config.options.compute_confidence_intervals.value)
      if eval_config.options.HasField('k_anonymization_count'):
        k_anonymization_count = eval_config.options.k_anonymization_count.value
    return [
        metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
            eval_shared_models[0],
            desired_batch_size,
            compute_confidence_intervals=compute_confidence_intervals,
            k_anonymization_count=k_anonymization_count,
            serialize=serialize)
    ]
  else:
    return [
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=eval_config, eval_shared_models=eval_shared_models)
    ]
def default_evaluators(  # pylint: disable=invalid-name
    eval_shared_model, desired_batch_size=None):
  """Returns the default evaluators for use in ExtractAndEvaluate.

  Args:
    eval_shared_model: Shared model parameters for EvalSavedModel.
    desired_batch_size: Optional batch size for batching in Aggregate.
  """
  return [
      metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
          eval_shared_model, desired_batch_size)
  ]
def default_evaluators(  # pylint: disable=invalid-name
    eval_shared_model, desired_batch_size=None, num_bootstrap_samples=None):
  """Returns the default evaluators for use in ExtractAndEvaluate.

  Args:
    eval_shared_model: Shared model parameters for EvalSavedModel.
    desired_batch_size: Optional batch size for batching in Aggregate.
    num_bootstrap_samples: Number of bootstrap samples to draw. If more than 1,
      confidence intervals will be computed for metrics. Suggested value is at
      least 20.
  """
  return [
      metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
          eval_shared_model,
          desired_batch_size,
          num_bootstrap_samples=num_bootstrap_samples)
  ]
def testWriteMetricsAndPlots(self):
  metrics_file = os.path.join(self._getTempDir(), 'metrics')
  plots_file = os.path.join(self._getTempDir(), 'plots')
  temp_eval_export_dir = os.path.join(self._getTempDir(), 'eval_export_dir')

  _, eval_export_dir = (
      fixed_prediction_estimator.simple_fixed_prediction_estimator(
          None, temp_eval_export_dir))
  eval_config = config.EvalConfig(
      model_specs=[config.ModelSpec()],
      options=config.Options(
          disabled_outputs={'values': ['eval_config.json']}))
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir,
      add_metrics_callbacks=[
          post_export_metrics.example_count(),
          post_export_metrics.calibration_plot_and_prediction_histogram(
              num_buckets=2)
      ])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor()
  ]
  evaluators = [
      metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(eval_shared_model)
  ]
  output_paths = {
      constants.METRICS_KEY: metrics_file,
      constants.PLOTS_KEY: plots_file
  }
  writers = [
      metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
          output_paths, eval_shared_model.add_metrics_callbacks)
  ]

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(prediction=0.0, label=1.0)
    example2 = self._makeExample(prediction=1.0, label=1.0)

    # pylint: disable=no-value-for-parameter
    _ = (
        pipeline
        | 'Create' >> beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
        ])
        | 'ExtractEvaluateAndWriteResults' >>
        model_eval_lib.ExtractEvaluateAndWriteResults(
            eval_config=eval_config,
            eval_shared_model=eval_shared_model,
            extractors=extractors,
            evaluators=evaluators,
            writers=writers))
    # pylint: enable=no-value-for-parameter

  expected_metrics_for_slice = text_format.Parse(
      """
      slice_key {}
      metrics {
        key: "average_loss"
        value {
          double_value {
            value: 0.5
          }
        }
      }
      metrics {
        key: "post_export_metrics/example_count"
        value {
          double_value {
            value: 2.0
          }
        }
      }
      """, metrics_for_slice_pb2.MetricsForSlice())

  metric_records = []
  for record in tf.compat.v1.python_io.tf_record_iterator(metrics_file):
    metric_records.append(
        metrics_for_slice_pb2.MetricsForSlice.FromString(record))
  self.assertEqual(1, len(metric_records), 'metrics: %s' % metric_records)
  self.assertProtoEquals(expected_metrics_for_slice, metric_records[0])

  expected_plots_for_slice = text_format.Parse(
      """
      slice_key {}
      plots {
        key: "post_export_metrics"
        value {
          calibration_histogram_buckets {
            buckets {
              lower_threshold_inclusive: -inf
              num_weighted_examples {}
              total_weighted_label {}
              total_weighted_refined_prediction {}
            }
            buckets {
              upper_threshold_exclusive: 0.5
              num_weighted_examples {
                value: 1.0
              }
              total_weighted_label {
                value: 1.0
              }
              total_weighted_refined_prediction {}
            }
            buckets {
              lower_threshold_inclusive: 0.5
              upper_threshold_exclusive: 1.0
              num_weighted_examples {}
              total_weighted_label {}
              total_weighted_refined_prediction {}
            }
            buckets {
              lower_threshold_inclusive: 1.0
              upper_threshold_exclusive: inf
              num_weighted_examples {
                value: 1.0
              }
              total_weighted_label {
                value: 1.0
              }
              total_weighted_refined_prediction {
                value: 1.0
              }
            }
          }
        }
      }
      """, metrics_for_slice_pb2.PlotsForSlice())

  plot_records = []
  for record in tf.compat.v1.python_io.tf_record_iterator(plots_file):
    plot_records.append(
        metrics_for_slice_pb2.PlotsForSlice.FromString(record))
  self.assertEqual(1, len(plot_records), 'plots: %s' % plot_records)
  self.assertProtoEquals(expected_plots_for_slice, plot_records[0])
def testRunModelAnalysisWithQueryExtractor(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=3.0, language='chinese', label=0.0),
      self._makeExample(age=4.0, language='english', label=0.0),
      self._makeExample(age=5.0, language='chinese', label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slice_spec = [slicer.SingleSliceSpec()]
  eval_shared_model = model_eval_lib.default_eval_shared_model(
      eval_saved_model_path=model_location, example_weight_key='age')
  eval_result = model_eval_lib.run_model_analysis(
      eval_shared_model=eval_shared_model,
      data_location=data_location,
      slice_spec=slice_spec,
      evaluators=[
          metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
              eval_shared_model),
          query_based_metrics_evaluator.QueryBasedMetricsEvaluator(
              query_id='language',
              prediction_key='logistic',
              combine_fns=[
                  query_statistics.QueryStatisticsCombineFn(),
                  ndcg.NdcgMetricCombineFn(
                      at_vals=[1], gain_key='label', weight_key='')
              ]),
      ])
  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected = {
      (): {
          'post_export_metrics/total_queries': {
              'doubleValue': 2.0
          },
          'post_export_metrics/min_documents': {
              'doubleValue': 2.0
          },
          'post_export_metrics/max_documents': {
              'doubleValue': 2.0
          },
          'post_export_metrics/total_documents': {
              'doubleValue': 4.0
          },
          'post_export_metrics/ndcg@1': {
              'doubleValue': 0.5
          },
          'post_export_metrics/example_weight': {
              'doubleValue': 15.0
          },
          'post_export_metrics/example_count': {
              'doubleValue': 4.0
          },
      }
  }
  self.assertEqual(eval_result.config.model_location, model_location)
  self.assertEqual(eval_result.config.data_location, data_location)
  self.assertEqual(eval_result.config.slice_spec, slice_spec)
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
  self.assertFalse(eval_result.plots)
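# Illustration only (assumption about the result structure): reading a single
# metric back out of the eval_result above. This assumes slicing_metrics maps
# a slice-key tuple to a dict of metric name -> {'doubleValue': ...}, matching
# the `expected` value checked in the test.
overall_metrics = dict(eval_result.slicing_metrics)[()]
ndcg_at_1 = overall_metrics['post_export_metrics/ndcg@1']['doubleValue']
example_count = overall_metrics['post_export_metrics/example_count'][
    'doubleValue']
print('ndcg@1 = %.3f over %d examples' % (ndcg_at_1, example_count))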
def default_evaluators(  # pylint: disable=invalid-name
    eval_shared_model: Optional[Union[types.EvalSharedModel,
                                      Dict[Text,
                                           types.EvalSharedModel]]] = None,
    eval_config: Optional[config.EvalConfig] = None,
    compute_confidence_intervals: Optional[bool] = False,
    k_anonymization_count: int = 1,
    desired_batch_size: Optional[int] = None,
    serialize: bool = False,
    random_seed_for_testing: Optional[int] = None
) -> List[evaluator.Evaluator]:
  """Returns the default evaluators for use in ExtractAndEvaluate.

  Args:
    eval_shared_model: Optional shared model (single-model evaluation) or dict
      of shared models keyed by model name (multi-model evaluation). Only
      required if there are metrics to be computed in-graph using the model.
    eval_config: Eval config.
    compute_confidence_intervals: Deprecated (use eval_config).
    k_anonymization_count: Deprecated (use eval_config).
    desired_batch_size: Optional batch size for batching in combiner.
    serialize: Deprecated.
    random_seed_for_testing: Provide for deterministic tests only.
  """
  disabled_outputs = []
  if eval_config:
    eval_config = config.update_eval_config_with_defaults(eval_config)
    disabled_outputs = eval_config.options.disabled_outputs.values
    if model_util.get_model_types(eval_config) == set([constants.TF_LITE]):
      # No in-graph metrics are present when TFLite is used.
      if eval_shared_model:
        if isinstance(eval_shared_model, dict):
          eval_shared_model = {
              k: v._replace(include_default_metrics=False)
              for k, v in eval_shared_model.items()
          }
        else:
          eval_shared_model = eval_shared_model._replace(
              include_default_metrics=False)
  if (constants.METRICS_KEY in disabled_outputs and
      constants.PLOTS_KEY in disabled_outputs):
    return []
  if _is_legacy_eval(eval_shared_model, eval_config):
    # Backwards compatibility for the previous add_metrics_callbacks
    # implementation.
    if eval_config is not None:
      if eval_config.options.HasField('compute_confidence_intervals'):
        compute_confidence_intervals = (
            eval_config.options.compute_confidence_intervals.value)
      if eval_config.options.HasField('k_anonymization_count'):
        k_anonymization_count = eval_config.options.k_anonymization_count.value
    return [
        metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
            eval_shared_model,
            compute_confidence_intervals=compute_confidence_intervals,
            k_anonymization_count=k_anonymization_count,
            desired_batch_size=desired_batch_size,
            serialize=serialize,
            random_seed_for_testing=random_seed_for_testing)
    ]
  else:
    return [
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=eval_config, eval_shared_model=eval_shared_model)
    ]
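# Hedged example (illustration only) of the disabled_outputs short-circuit
# above: when both the metrics and the plots outputs are disabled in the
# EvalConfig options, default_evaluators returns an empty list. The Options
# construction mirrors the one used in testWriteMetricsAndPlots above.
no_output_config = config.EvalConfig(
    model_specs=[config.ModelSpec()],
    options=config.Options(
        disabled_outputs={
            'values': [constants.METRICS_KEY, constants.PLOTS_KEY]
        }))
assert not default_evaluators(eval_config=no_output_config)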