def testSerializeDeserializeEvalConfig(self):
  output_path = self._getTempDir()
  options = config.Options()
  options.compute_confidence_intervals.value = False
  options.k_anonymization_count.value = 1
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location='/path/to/data')],
      model_specs=[config.ModelSpec(location='/path/to/model')],
      output_data_specs=[
          config.OutputDataSpec(default_location=output_path)
      ],
      slicing_specs=[
          config.SlicingSpec(
              feature_keys=['country'],
              feature_values={
                  'age': '5',
                  'gender': 'f'
              }),
          config.SlicingSpec(
              feature_keys=['interest'],
              feature_values={
                  'age': '6',
                  'gender': 'm'
              })
      ],
      options=options)
  with tf.io.gfile.GFile(os.path.join(output_path, 'eval_config.json'),
                         'w') as f:
    f.write(model_eval_lib._serialize_eval_config(eval_config))
  got_eval_config = model_eval_lib.load_eval_config(output_path)
  self.assertEqual(eval_config, got_eval_config)
def testSerializeDeserializeEvalConfig(self):
  output_path = self._getTempDir()
  options = config.Options()
  options.compute_confidence_intervals.value = False
  options.k_anonymization_count.value = 1
  eval_config = config.EvalConfig(
      slicing_specs=[
          config.SlicingSpec(
              feature_keys=['country'],
              feature_values={
                  'age': '5',
                  'gender': 'f'
              }),
          config.SlicingSpec(
              feature_keys=['interest'],
              feature_values={
                  'age': '6',
                  'gender': 'm'
              })
      ],
      options=options)
  data_location = '/path/to/data'
  file_format = 'tfrecords'
  model_location = '/path/to/model'
  with tf.io.gfile.GFile(os.path.join(output_path, 'eval_config.json'),
                         'w') as f:
    f.write(
        model_eval_lib._serialize_eval_run(eval_config, data_location,
                                           file_format,
                                           {'': model_location}))
  got_eval_config, got_data_location, got_file_format, got_model_locations = (
      model_eval_lib._load_eval_run(output_path))
  self.assertEqual(eval_config, got_eval_config)
  self.assertEqual(data_location, got_data_location)
  self.assertEqual(file_format, got_file_format)
  self.assertEqual({'': model_location}, got_model_locations)
def testMergeAccumulators(self):
  options = config.Options()
  options.desired_batch_size.value = 2
  computation = tf_metric_wrapper.tf_metric_computations(
      [tf.keras.metrics.MeanSquaredError(name='mse')],
      config.EvalConfig(options=options))[0]

  example1 = {'labels': [0.0], 'predictions': [0.0], 'example_weights': [1.0]}
  example2 = {'labels': [0.0], 'predictions': [0.5], 'example_weights': [1.0]}
  example3 = {'labels': [1.0], 'predictions': [0.3], 'example_weights': [1.0]}
  example4 = {'labels': [1.0], 'predictions': [0.9], 'example_weights': [1.0]}
  example5 = {'labels': [1.0], 'predictions': [0.5], 'example_weights': [0.0]}

  combiner_inputs = []
  for e in (example1, example2, example3, example4, example5):
    combiner_inputs.append(metric_util.to_standard_metric_inputs(e))

  acc1 = computation.combiner.create_accumulator()
  acc1 = computation.combiner.add_input(acc1, combiner_inputs[0])
  acc1 = computation.combiner.add_input(acc1, combiner_inputs[1])
  acc1 = computation.combiner.add_input(acc1, combiner_inputs[2])
  acc2 = computation.combiner.create_accumulator()
  acc2 = computation.combiner.add_input(acc2, combiner_inputs[3])
  acc2 = computation.combiner.add_input(acc2, combiner_inputs[4])
  acc = computation.combiner.merge_accumulators([acc1, acc2])

  got_metrics = computation.combiner.extract_output(acc)
  mse_key = metric_types.MetricKey(name='mse')
  self.assertDictElementsAlmostEqual(got_metrics, {mse_key: 0.1875})
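# Sketch (not part of the test above): how the expected MSE of 0.1875 follows
# from the five examples. The fifth example has weight 0.0, so it drops out
# of the weighted average.
squared_errors = [
    (0.0 - 0.0)**2,  # example1 -> 0.00
    (0.5 - 0.0)**2,  # example2 -> 0.25
    (0.3 - 1.0)**2,  # example3 -> 0.49
    (0.9 - 1.0)**2,  # example4 -> 0.01
    (0.5 - 1.0)**2,  # example5 -> 0.25, multiplied by weight 0.0 below
]
weights = [1.0, 1.0, 1.0, 1.0, 0.0]
expected_mse = sum(
    w * e for w, e in zip(weights, squared_errors)) / sum(weights)
assert abs(expected_mse - 0.1875) < 1e-9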
def _load_eval_run(
    output_path: Text
) -> Tuple[config.EvalConfig, Text, Text, Dict[Text, Text]]:
  """Returns eval config, data location, file format, and model locations."""
  path = os.path.join(output_path, _EVAL_CONFIG_FILE)
  if tf.io.gfile.exists(path):
    with tf.io.gfile.GFile(path, 'r') as f:
      pb = json_format.Parse(f.read(), config_pb2.EvalRun())
      _check_version(pb.version, output_path)
      return (pb.eval_config, pb.data_location, pb.file_format,
              pb.model_locations)
  else:
    # Legacy support (to be removed in future).
    # The previous version did not include a file extension.
    path = os.path.splitext(path)[0]
    serialized_record = six.next(
        tf.compat.v1.python_io.tf_record_iterator(path))
    final_dict = pickle.loads(serialized_record)
    _check_version(final_dict, output_path)
    old_config = final_dict['eval_config']
    slicing_specs = None
    if old_config.slice_spec:
      slicing_specs = [s.to_proto() for s in old_config.slice_spec]
    options = config.Options()
    options.compute_confidence_intervals.value = (
        old_config.compute_confidence_intervals)
    options.k_anonymization_count.value = old_config.k_anonymization_count
    return (config.EvalConfig(slicing_specs=slicing_specs, options=options),
            old_config.data_location, '', {'': old_config.model_location})
def load_eval_config(output_path: Text) -> config.EvalConfig:
  """Loads eval config."""
  path = os.path.join(output_path, _EVAL_CONFIG_FILE)
  if tf.io.gfile.exists(path):
    with tf.io.gfile.GFile(path, 'r') as f:
      pb = json_format.Parse(f.read(), config_pb2.EvalConfigAndVersion())
      _check_version(pb.version, output_path)
      return pb.eval_config
  else:
    # Legacy support (to be removed in future).
    # The previous version did not include a file extension.
    path = os.path.splitext(path)[0]
    serialized_record = six.next(
        tf.compat.v1.python_io.tf_record_iterator(path))
    final_dict = pickle.loads(serialized_record)
    _check_version(final_dict, output_path)
    old_config = final_dict['eval_config']
    slicing_specs = None
    if old_config.slice_spec:
      slicing_specs = [s.to_proto() for s in old_config.slice_spec]
    options = config.Options()
    options.compute_confidence_intervals.value = (
        old_config.compute_confidence_intervals)
    options.k_anonymization_count.value = old_config.k_anonymization_count
    return config.EvalConfig(
        input_data_specs=[
            config.InputDataSpec(location=old_config.data_location)
        ],
        model_specs=[config.ModelSpec(location=old_config.model_location)],
        output_data_specs=[
            config.OutputDataSpec(default_location=output_path)
        ],
        slicing_specs=slicing_specs,
        options=options)
def load_eval_run(
    output_path: Text,
    output_file_format: Text = EVAL_CONFIG_FILE_FORMAT,
    filename: Optional[Text] = None
) -> Tuple[Optional[config.EvalConfig], Text, Text, Dict[Text, Text]]:
  """Returns eval config, data location, file format, and model locations.

  Args:
    output_path: Directory containing config file.
    output_file_format: Format of output file. Currently only 'json' is
      supported.
    filename: Name of output file (including extension if any).

  Returns:
    Tuple of (EvalConfig, data location, file format, model locations). If an
    EvalConfig is not found at the given path, None will be returned.
  """
  if filename is None:
    filename = EVAL_CONFIG_FILE + '.' + output_file_format
  path = os.path.join(output_path, filename)
  if tf.io.gfile.exists(path):
    with tf.io.gfile.GFile(path, 'r') as f:
      pb = json_format.Parse(f.read(), config_pb2.EvalRun())
      _check_version(pb.version, output_path)
      return (pb.eval_config, pb.data_location, pb.file_format,
              pb.model_locations)

  # Legacy support (to be removed in future).
  # The previous version did not include a file extension.
  path = os.path.splitext(path)[0]
  if tf.io.gfile.exists(path):
    serialized_record = six.next(
        tf.compat.v1.python_io.tf_record_iterator(path))
    final_dict = pickle.loads(serialized_record)
    _check_version(final_dict, output_path)
    old_config = final_dict['eval_config']
    slicing_specs = None
    if old_config.slice_spec:
      slicing_specs = [s.to_proto() for s in old_config.slice_spec]
    options = config.Options()
    options.compute_confidence_intervals.value = (
        old_config.compute_confidence_intervals)
    options.min_slice_size.value = old_config.k_anonymization_count
    return (config.EvalConfig(slicing_specs=slicing_specs, options=options),
            old_config.data_location, '', {'': old_config.model_location})

  # No config found
  return (None, '', '', {})
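# Sketch (not part of the library code above) of reading back a previous run
# with load_eval_run; the output directory path here is a placeholder.
eval_config, data_location, file_format, model_locations = load_eval_run(
    '/tmp/eval_output')
if eval_config is None:
  print('No eval config found under /tmp/eval_output.')
else:
  print('data: %s (%s)' % (data_location, file_format))
  print('models: %s' % (model_locations,))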
def testSerializeDeserializeLegacyEvalConfig(self):
  output_path = self._getTempDir()
  old_config = LegacyConfig(
      model_location='/path/to/model',
      data_location='/path/to/data',
      slice_spec=[
          slicer.SingleSliceSpec(
              columns=['country'], features=[('age', 5), ('gender', 'f')]),
          slicer.SingleSliceSpec(
              columns=['interest'], features=[('age', 6), ('gender', 'm')])
      ],
      example_count_metric_key=None,
      example_weight_metric_key='key',
      compute_confidence_intervals=False,
      k_anonymization_count=1)
  final_dict = {}
  final_dict['tfma_version'] = tfma_version.VERSION_STRING
  final_dict['eval_config'] = old_config
  with tf.io.TFRecordWriter(os.path.join(output_path, 'eval_config')) as w:
    w.write(pickle.dumps(final_dict))
  got_eval_config = model_eval_lib.load_eval_config(output_path)
  options = config.Options()
  options.compute_confidence_intervals.value = (
      old_config.compute_confidence_intervals)
  options.k_anonymization_count.value = old_config.k_anonymization_count
  eval_config = config.EvalConfig(
      input_data_specs=[
          config.InputDataSpec(location=old_config.data_location)
      ],
      model_specs=[config.ModelSpec(location=old_config.model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=output_path)
      ],
      slicing_specs=[
          config.SlicingSpec(
              feature_keys=['country'],
              feature_values={
                  'age': '5',
                  'gender': 'f'
              }),
          config.SlicingSpec(
              feature_keys=['interest'],
              feature_values={
                  'age': '6',
                  'gender': 'm'
              })
      ],
      options=options)
  self.assertEqual(eval_config, got_eval_config)
def testBatching(self):
  options = config.Options()
  options.desired_batch_size.value = 2
  computation = tf_metric_wrapper.tf_metric_computations(
      [_CustomMetric(), tf.keras.metrics.MeanSquaredError(name='mse')],
      config.EvalConfig(options=options))[0]

  example1 = {'labels': [0.0], 'predictions': [0.0], 'example_weights': [1.0]}
  example2 = {'labels': [0.0], 'predictions': [0.5], 'example_weights': [1.0]}
  example3 = {'labels': [1.0], 'predictions': [0.3], 'example_weights': [1.0]}
  example4 = {'labels': [1.0], 'predictions': [0.9], 'example_weights': [1.0]}
  example5 = {'labels': [1.0], 'predictions': [0.5], 'example_weights': [0.0]}

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create(
            [example1, example2, example3, example4, example5])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'Combine' >> beam.CombinePerKey(computation.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertEqual(1, len(got), 'got: %s' % got)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        custom_key = metric_types.MetricKey(name='custom')
        mse_key = metric_types.MetricKey(name='mse')
        self.assertDictElementsAlmostEqual(
            got_metrics, {
                custom_key: (0.0 + 0.5 + 0.3 + 0.9 + 0.0) /
                            (1.0 + 1.0 + 1.0 + 1.0 + 0.0),
                mse_key: 0.1875,
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def testRunModelAnalysisWithUncertainty(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=3.0, language='chinese', label=0.0),
      self._makeExample(age=4.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=1.0),
      self._makeExample(age=5.0, language='hindi', label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
  options = config.Options()
  options.compute_confidence_intervals.value = True
  options.k_anonymization_count.value = 2
  eval_config = config.EvalConfig(
      input_data_specs=[config.InputDataSpec(location=data_location)],
      model_specs=[config.ModelSpec(location=model_location)],
      output_data_specs=[
          config.OutputDataSpec(default_location=self._getTempDir())
      ],
      slicing_specs=slicing_specs,
      options=options)
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_models=[
          model_eval_lib.default_eval_shared_model(
              eval_saved_model_path=model_location, example_weight_key='age')
      ])
  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected = {
      (('language', 'hindi'),): {
          u'__ERROR__': {
              'debugMessage':
                  u'Example count for this slice key is lower than the '
                  u'minimum required value: 2. No data is aggregated for '
                  u'this slice.'
          },
      },
      (('language', 'chinese'),): {
          metric_keys.EXAMPLE_WEIGHT: {'doubleValue': 8.0},
          metric_keys.EXAMPLE_COUNT: {'doubleValue': 2.0},
      },
      (('language', 'english'),): {
          'accuracy': {
              'boundedValue': {
                  'value': 1.0,
                  'lowerBound': 1.0,
                  'upperBound': 1.0,
                  'methodology': 'POISSON_BOOTSTRAP'
              }
          },
          'my_mean_label': {
              'boundedValue': {
                  'value': 1.0,
                  'lowerBound': 1.0,
                  'upperBound': 1.0,
                  'methodology': 'POISSON_BOOTSTRAP'
              }
          },
          metric_keys.EXAMPLE_WEIGHT: {'doubleValue': 7.0},
          metric_keys.EXAMPLE_COUNT: {'doubleValue': 2.0},
      }
  }
  self.assertEqual(eval_result.config.model_specs[0].location,
                   model_location.decode())
  self.assertEqual(eval_result.config.input_data_specs[0].location,
                   data_location)
  self.assertEqual(eval_result.config.slicing_specs[0],
                   config.SlicingSpec(feature_keys=['language']))
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
  self.assertFalse(eval_result.plots)
def testRunModelAnalysisWithDeterministicConfidenceIntervals(self):
  model_location = self._exportEvalSavedModel(
      linear_classifier.simple_linear_classifier)
  examples = [
      self._makeExample(age=3.0, language='english', label=1.0),
      self._makeExample(age=3.0, language='chinese', label=0.0),
      self._makeExample(age=4.0, language='english', label=1.0),
      self._makeExample(age=5.0, language='chinese', label=1.0),
      self._makeExample(age=5.0, language='hindi', label=1.0)
  ]
  data_location = self._writeTFExamplesToTFRecords(examples)
  slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
  options = config.Options()
  options.compute_confidence_intervals.value = True
  options.k_anonymization_count.value = 2
  eval_config = config.EvalConfig(slicing_specs=slicing_specs, options=options)
  eval_result = model_eval_lib.run_model_analysis(
      eval_config=eval_config,
      eval_shared_model=model_eval_lib.default_eval_shared_model(
          eval_saved_model_path=model_location, example_weight_key='age'),
      data_location=data_location,
      output_path=self._getTempDir(),
      random_seed_for_testing=_TEST_SEED)
  # We only check some of the metrics to ensure that the end-to-end
  # pipeline works.
  expected = {
      (('language', 'hindi'),): {
          u'__ERROR__': {
              'debugMessage':
                  u'Example count for this slice key is lower than the '
                  u'minimum required value: 2. No data is aggregated for '
                  u'this slice.'
          },
      },
      (('language', 'chinese'),): {
          metric_keys.EXAMPLE_WEIGHT: {'doubleValue': 8.0},
          metric_keys.EXAMPLE_COUNT: {'doubleValue': 2.0},
      },
      (('language', 'english'),): {
          'accuracy': {
              'boundedValue': {
                  'value': 1.0,
                  'lowerBound': 1.0,
                  'upperBound': 1.0,
                  'methodology': 'POISSON_BOOTSTRAP'
              }
          },
          'my_mean_label': {
              'boundedValue': {
                  'value': 1.0,
                  'lowerBound': 1.0,
                  'upperBound': 1.0,
                  'methodology': 'POISSON_BOOTSTRAP'
              }
          },
          metric_keys.EXAMPLE_WEIGHT: {'doubleValue': 7.0},
          metric_keys.EXAMPLE_COUNT: {'doubleValue': 2.0},
      }
  }
  self.assertEqual(eval_result.model_location, model_location.decode())
  self.assertEqual(eval_result.data_location, data_location)
  self.assertEqual(eval_result.config.slicing_specs[0],
                   config.SlicingSpec(feature_keys=['language']))
  self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
  for key, value in eval_result.slicing_metrics:
    if (('language', 'english'),) == key:
      metric = value['']['']['average_loss']
      self.assertAlmostEqual(
          0.171768754720, metric['boundedValue']['value'], delta=0.1)
      metric = value['']['']['auc_precision_recall']
      self.assertAlmostEqual(
          0.99999940395, metric['boundedValue']['value'], delta=0.1)
  self.assertFalse(eval_result.plots)
def testWriteValidationResults(self, output_file_format):
  model_dir, baseline_dir = self._getExportDir(), self._getBaselineDir()
  eval_shared_model = self._build_keras_model(model_dir, mul=0)
  baseline_eval_shared_model = self._build_keras_model(baseline_dir, mul=1)
  validations_file = os.path.join(self._getTempDir(),
                                  constants.VALIDATIONS_KEY)
  schema = text_format.Parse(
      """
      tensor_representation_group {
        key: ""
        value {
          tensor_representation {
            key: "input"
            value {
              dense_tensor {
                column_name: "input"
                shape { dim { size: 1 } }
              }
            }
          }
        }
      }
      feature { name: "input" type: FLOAT }
      feature { name: "label" type: FLOAT }
      feature { name: "example_weight" type: FLOAT }
      feature { name: "extra_feature" type: BYTES }
      """, schema_pb2.Schema())
  tfx_io = test_util.InMemoryTFExampleRecord(
      schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN)
  tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
      arrow_schema=tfx_io.ArrowSchema(),
      tensor_representations=tfx_io.TensorRepresentations())
  examples = [
      self._makeExample(
          input=0.0,
          label=1.0,
          example_weight=1.0,
          extra_feature='non_model_feature'),
      self._makeExample(
          input=1.0,
          label=0.0,
          example_weight=0.5,
          extra_feature='non_model_feature'),
  ]

  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(
              name='candidate',
              label_key='label',
              example_weight_key='example_weight'),
          config.ModelSpec(
              name='baseline',
              label_key='label',
              example_weight_key='example_weight',
              is_baseline=True)
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 1.5 < 1, NOT OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              upper_bound={'value': 1}))),
                  config.MetricConfig(
                      class_name='ExampleCount',
                      # 2 > 10, NOT OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              lower_bound={'value': 10}))),
                  config.MetricConfig(
                      class_name='MeanLabel',
                      # 0 > 0 and 0 > 0%?: NOT OK.
                      threshold=config.MetricThreshold(
                          change_threshold=config.GenericChangeThreshold(
                              direction=config.MetricDirection
                              .HIGHER_IS_BETTER,
                              relative={'value': 0},
                              absolute={'value': 0}))),
                  config.MetricConfig(
                      # MeanPrediction = (0+0)/(1+0.5) = 0
                      class_name='MeanPrediction',
                      # -.01 < 0 < .01, OK.
                      # Diff% = -.333/.333 = -100% < -99%, OK.
                      # Diff = 0 - .333 = -.333 < 0, OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              upper_bound={'value': .01},
                              lower_bound={'value': -.01}),
                          change_threshold=config.GenericChangeThreshold(
                              direction=config.MetricDirection
                              .LOWER_IS_BETTER,
                              relative={'value': -.99},
                              absolute={'value': 0})))
              ],
              model_names=['candidate', 'baseline']),
      ],
      options=config.Options(
          disabled_outputs={'values': ['eval_config.json']}),
  )
  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  eval_shared_models = {
      'candidate': eval_shared_model,
      'baseline': baseline_eval_shared_model
  }
  extractors = [
      batched_input_extractor.BatchedInputExtractor(eval_config),
      batched_predict_extractor_v2.BatchedPredictExtractor(
          eval_shared_model=eval_shared_models,
          eval_config=eval_config,
          tensor_adapter_config=tensor_adapter_config),
      unbatch_extractor.UnbatchExtractor(),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_model=eval_shared_models)
  ]
  output_paths = {
      constants.VALIDATIONS_KEY: validations_file,
  }
  writers = [
      metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
          output_paths,
          add_metrics_callbacks=[],
          output_file_format=output_file_format)
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    _ = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'BatchExamples' >> tfx_io.BeamSource()
        | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
        | 'ExtractEvaluate' >> model_eval_lib.ExtractAndEvaluate(
            extractors=extractors, evaluators=evaluators)
        | 'WriteResults' >> model_eval_lib.WriteResults(writers=writers))
    # pylint: enable=no-value-for-parameter

  validation_result = (
      metrics_plots_and_validations_writer
      .load_and_deserialize_validation_result(
          os.path.dirname(validations_file)))

  expected_validations = [
      text_format.Parse(
          """
          metric_key {
            name: "weighted_example_count"
            model_name: "candidate"
          }
          metric_threshold {
            value_threshold { upper_bound { value: 1.0 } }
          }
          metric_value { double_value { value: 1.5 } }
          """, validation_result_pb2.ValidationFailure()),
      text_format.Parse(
          """
          metric_key {
            name: "example_count"
            model_name: "candidate"
          }
          metric_threshold {
            value_threshold { lower_bound { value: 10.0 } }
          }
          metric_value { double_value { value: 2.0 } }
          """, validation_result_pb2.ValidationFailure()),
      text_format.Parse(
          """
          metric_key {
            name: "mean_label"
            model_name: "candidate"
            is_diff: true
          }
          metric_threshold {
            change_threshold {
              absolute { value: 0.0 }
              relative { value: 0.0 }
              direction: HIGHER_IS_BETTER
            }
          }
          metric_value { double_value { value: 0.0 } }
          """, validation_result_pb2.ValidationFailure()),
  ]

  self.assertFalse(validation_result.validation_ok)
  self.assertLen(validation_result.metric_validations_per_slice, 1)
  self.assertCountEqual(
      expected_validations,
      validation_result.metric_validations_per_slice[0].failures)
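# Sketch (not part of the test above): the arithmetic behind the threshold
# comments in the metrics_specs. It assumes the test models behave so that
# the candidate (mul=0) predicts 0.0 for both examples while the baseline
# (mul=1) predicts the input values 0.0 and 1.0; the example weights are
# 1.0 and 0.5. Both models share the same labels, so the MeanLabel diff is 0,
# which fails a HIGHER_IS_BETTER threshold requiring a diff above 0.
weights = [1.0, 0.5]
candidate_preds = [0.0, 0.0]
baseline_preds = [0.0, 1.0]


def weighted_mean(values, value_weights):
  return sum(w * v for w, v in zip(value_weights, values)) / sum(value_weights)


weighted_example_count = sum(weights)  # 1.5 > upper bound 1.0 -> fails.
example_count = len(weights)  # 2 < lower bound 10 -> fails.
diff = (weighted_mean(candidate_preds, weights) -
        weighted_mean(baseline_preds, weights))  # 0.0 - 1/3 = -1/3 < 0 -> OK.
relative = diff / weighted_mean(baseline_preds, weights)  # -1.0 < -0.99 -> OK.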
def ExtractEvaluateAndWriteResults(  # pylint: disable=invalid-name
    examples: beam.pvalue.PCollection,
    eval_shared_model: Optional[Union[types.EvalSharedModel,
                                      Dict[Text,
                                           types.EvalSharedModel]]] = None,
    eval_config: config.EvalConfig = None,
    extractors: Optional[List[extractor.Extractor]] = None,
    evaluators: Optional[List[evaluator.Evaluator]] = None,
    writers: Optional[List[writer.Writer]] = None,
    output_path: Optional[Text] = None,
    display_only_data_location: Optional[Text] = None,
    display_only_file_format: Optional[Text] = None,
    slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
    write_config: Optional[bool] = True,
    compute_confidence_intervals: Optional[bool] = False,
    k_anonymization_count: int = 1,
    desired_batch_size: Optional[int] = None,
    random_seed_for_testing: Optional[int] = None) -> beam.pvalue.PDone:
  """PTransform for performing extraction, evaluation, and writing results.

  Users who want to construct their own Beam pipelines instead of using the
  lightweight run_model_analysis functions should use this PTransform.

  Example usage:

    eval_config = tfma.EvalConfig(slicing_specs=[...], metrics_specs=[...])
    eval_shared_model = tfma.default_eval_shared_model(
        eval_saved_model_path=model_location, eval_config=eval_config)
    with beam.Pipeline(runner=...) as p:
      _ = (p
           | 'ReadData' >> beam.io.ReadFromTFRecord(data_location)
           | 'ExtractEvaluateAndWriteResults' >>
           tfma.ExtractEvaluateAndWriteResults(
               eval_shared_model=eval_shared_model,
               eval_config=eval_config,
               ...))
    result = tfma.load_eval_result(output_path=output_path)
    tfma.view.render_slicing_metrics(result)

  Note that the exact serialization format is an internal implementation
  detail and subject to change. Users should only use the TFMA functions to
  write and read the results.

  Args:
    examples: PCollection of input examples. Can be any format the model
      accepts (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_shared_model: Optional shared model (single-model evaluation) or dict
      of shared models keyed by model name (multi-model evaluation). Only
      required if needed by default extractors, evaluators, or writers and for
      display purposes of the model path.
    eval_config: Eval config.
    extractors: Optional list of Extractors to apply to Extracts. Typically
      these will be added by calling the default_extractors function. If no
      extractors are provided, default_extractors (non-materialized) will be
      used.
    evaluators: Optional list of Evaluators for evaluating Extracts. Typically
      these will be added by calling the default_evaluators function. If no
      evaluators are provided, default_evaluators will be used.
    writers: Optional list of Writers for writing Evaluation output. Typically
      these will be added by calling the default_writers function. If no
      writers are provided, default_writers will be used.
    output_path: Path to output metrics and plots results.
    display_only_data_location: Optional path indicating where the examples
      were read from. This is used only for display purposes - data will not
      actually be read from this path.
    display_only_file_format: Optional format of the examples. This is used
      only for display purposes.
    slice_spec: Deprecated (use EvalConfig).
    write_config: Deprecated (use EvalConfig).
    compute_confidence_intervals: Deprecated (use EvalConfig).
    k_anonymization_count: Deprecated (use EvalConfig).
    desired_batch_size: Optional batch size for batching in Predict.
    random_seed_for_testing: Provide for deterministic tests only.

  Raises:
    ValueError: If EvalConfig invalid or matching Extractor not found for an
      Evaluator.

  Returns:
    PDone.
  """
  eval_shared_models = eval_shared_model
  if not isinstance(eval_shared_model, dict):
    eval_shared_models = {'': eval_shared_model}

  if eval_config is None:
    model_specs = []
    for model_name, shared_model in eval_shared_models.items():
      example_weight_key = shared_model.example_weight_key
      example_weight_keys = {}
      if example_weight_key and isinstance(example_weight_key, dict):
        example_weight_keys = example_weight_key
        example_weight_key = ''
      model_specs.append(
          config.ModelSpec(
              name=model_name,
              example_weight_key=example_weight_key,
              example_weight_keys=example_weight_keys))
    slicing_specs = None
    if slice_spec:
      slicing_specs = [s.to_proto() for s in slice_spec]
    options = config.Options()
    options.compute_confidence_intervals.value = compute_confidence_intervals
    options.k_anonymization_count.value = k_anonymization_count
    if not write_config:
      options.disabled_outputs.values.append(_EVAL_CONFIG_FILE)
    eval_config = config.EvalConfig(
        model_specs=model_specs,
        slicing_specs=slicing_specs,
        options=options)
  else:
    eval_config = config.update_eval_config_with_defaults(eval_config)

  config.verify_eval_config(eval_config)

  if not extractors:
    extractors = default_extractors(
        eval_config=eval_config,
        eval_shared_model=eval_shared_model,
        materialize=False,
        desired_batch_size=desired_batch_size)

  if not evaluators:
    evaluators = default_evaluators(
        eval_config=eval_config,
        eval_shared_model=eval_shared_model,
        random_seed_for_testing=random_seed_for_testing)

  for v in evaluators:
    evaluator.verify_evaluator(v, extractors)

  if not writers:
    writers = default_writers(
        output_path=output_path, eval_shared_model=eval_shared_model)

  # pylint: disable=no-value-for-parameter
  _ = (
      examples
      | 'InputsToExtracts' >> InputsToExtracts()
      | 'ExtractAndEvaluate' >> ExtractAndEvaluate(
          extractors=extractors, evaluators=evaluators)
      | 'WriteResults' >> WriteResults(writers=writers))

  if _EVAL_CONFIG_FILE not in eval_config.options.disabled_outputs.values:
    data_location = '<user provided PCollection>'
    if display_only_data_location is not None:
      data_location = display_only_data_location
    file_format = '<unknown>'
    if display_only_file_format is not None:
      file_format = display_only_file_format
    model_locations = {}
    for k, v in eval_shared_models.items():
      model_locations[k] = ('<unknown>'
                            if v is None or v.model_path is None else
                            v.model_path)
    _ = (
        examples.pipeline
        | WriteEvalConfig(eval_config, output_path, data_location,
                          file_format, model_locations))
  # pylint: enable=no-value-for-parameter

  return beam.pvalue.PDone(examples.pipeline)
def testEvaluateWithConfidenceIntervals(self):
  # NOTE: This test does not actually test that confidence intervals are
  # accurate; it only tests that the proto output by the test is well formed.
  # This test would pass if the confidence interval implementation did
  # nothing at all except compute the unsampled value.
  temp_export_dir = self._getExportDir()
  _, export_dir = (
      fixed_prediction_estimator_extra_fields
      .simple_fixed_prediction_estimator_extra_fields(None, temp_export_dir))

  options = config.Options()
  options.compute_confidence_intervals.value = True
  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(
              location=export_dir,
              label_key='label',
              example_weight_key='fixed_float')
      ],
      slicing_specs=[
          config.SlicingSpec(),
          config.SlicingSpec(feature_keys=['fixed_string']),
      ],
      metrics_specs=metric_specs.specs_from_metrics([
          calibration.MeanLabel('mean_label'),
          calibration.MeanPrediction('mean_prediction')
      ]),
      options=options)
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  extractors = [
      input_extractor.InputExtractor(eval_config=eval_config),
      predict_extractor_v2.PredictExtractor(
          eval_config=eval_config, eval_shared_models=[eval_shared_model]),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_models=[eval_shared_model])
  ]

  # fixed_float used as example_weight key
  examples = [
      self._makeExample(
          prediction=0.2,
          label=1.0,
          fixed_int=1,
          fixed_float=1.0,
          fixed_string='fixed_string1'),
      self._makeExample(
          prediction=0.8,
          label=0.0,
          fixed_int=1,
          fixed_float=1.0,
          fixed_string='fixed_string1'),
      self._makeExample(
          prediction=0.5,
          label=0.0,
          fixed_int=2,
          fixed_float=2.0,
          fixed_string='fixed_string2')
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    metrics = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
        | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
            extractors=extractors, evaluators=evaluators))
    # pylint: enable=no-value-for-parameter

    def check_metrics(got):
      try:
        self.assertLen(got, 3)
        slices = {}
        for slice_key, value in got:
          slices[slice_key] = value
        overall_slice = ()
        fixed_string1_slice = (('fixed_string', b'fixed_string1'),)
        fixed_string2_slice = (('fixed_string', b'fixed_string2'),)
        self.assertCountEqual(
            list(slices.keys()),
            [overall_slice, fixed_string1_slice, fixed_string2_slice])
        example_count_key = metric_types.MetricKey(name='example_count')
        weighted_example_count_key = metric_types.MetricKey(
            name='weighted_example_count')
        label_key = metric_types.MetricKey(name='mean_label')
        pred_key = metric_types.MetricKey(name='mean_prediction')
        self.assertDictElementsWithTDistributionAlmostEqual(
            slices[overall_slice], {
                example_count_key: 3,
                weighted_example_count_key: 4.0,
                label_key: (1.0 + 0.0 + 2 * 0.0) / (1.0 + 1.0 + 2.0),
                pred_key: (0.2 + 0.8 + 2 * 0.5) / (1.0 + 1.0 + 2.0),
            })
        self.assertDictElementsWithTDistributionAlmostEqual(
            slices[fixed_string1_slice], {
                example_count_key: 2,
                weighted_example_count_key: 2.0,
                label_key: (1.0 + 0.0) / (1.0 + 1.0),
                pred_key: (0.2 + 0.8) / (1.0 + 1.0),
            })
        self.assertDictElementsWithTDistributionAlmostEqual(
            slices[fixed_string2_slice], {
                example_count_key: 1,
                weighted_example_count_key: 2.0,
                label_key: (2 * 0.0) / 2.0,
                pred_key: (2 * 0.5) / 2.0,
            })
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(
        metrics[constants.METRICS_KEY], check_metrics, label='metrics')
def run_model_analysis(
    eval_shared_model: Optional[types.EvalSharedModel] = None,
    eval_shared_models: Optional[List[types.EvalSharedModel]] = None,
    eval_config: config.EvalConfig = None,
    extractors: Optional[List[extractor.Extractor]] = None,
    evaluators: Optional[List[evaluator.Evaluator]] = None,
    writers: Optional[List[writer.Writer]] = None,
    pipeline_options: Optional[Any] = None,
    data_location: Optional[Text] = None,
    file_format: Optional[Text] = 'tfrecords',
    slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
    output_path: Optional[Text] = None,
    write_config: Optional[bool] = True,
    desired_batch_size: Optional[int] = None,
    compute_confidence_intervals: Optional[bool] = False,
    k_anonymization_count: int = 1) -> EvalResult:
  """Runs TensorFlow model analysis.

  It runs a Beam pipeline to compute the slicing metrics exported in the
  TensorFlow Eval SavedModel and returns the results.

  This is a simplified API for users who want to quickly get something running
  locally. Users who wish to create their own Beam pipelines can use the
  Evaluate PTransform instead.

  Args:
    eval_shared_model: Shared model (single-model evaluation).
    eval_shared_models: Shared models (multi-model evaluation).
    eval_config: Eval config.
    extractors: Optional list of Extractors to apply to Extracts. Typically
      these will be added by calling the default_extractors function. If no
      extractors are provided, default_extractors (non-materialized) will be
      used.
    evaluators: Optional list of Evaluators for evaluating Extracts. Typically
      these will be added by calling the default_evaluators function. If no
      evaluators are provided, default_evaluators will be used.
    writers: Optional list of Writers for writing Evaluation output. Typically
      these will be added by calling the default_writers function. If no
      writers are provided, default_writers will be used.
    pipeline_options: Optional arguments to run the Pipeline, for instance
      whether to run directly.
    data_location: Deprecated (use EvalConfig).
    file_format: Deprecated (use EvalConfig).
    slice_spec: Deprecated (use EvalConfig).
    output_path: Deprecated (use EvalConfig).
    write_config: Deprecated (use EvalConfig).
    desired_batch_size: Deprecated (use EvalConfig).
    compute_confidence_intervals: Deprecated (use EvalConfig).
    k_anonymization_count: Deprecated (use EvalConfig).

  Returns:
    An EvalResult that can be used with the TFMA visualization functions.

  Raises:
    ValueError: If the file_format is unknown to us.
  """
  _assert_tensorflow_version()

  if eval_shared_model is not None:
    eval_shared_models = [eval_shared_model]

  if eval_config is None:
    if output_path is None:
      output_path = tempfile.mkdtemp()
    if not tf.io.gfile.exists(output_path):
      tf.io.gfile.makedirs(output_path)
    disabled_outputs = None
    if not write_config:
      disabled_outputs = [_EVAL_CONFIG_FILE]
    model_specs = []
    for m in eval_shared_models:
      example_weight_key = m.example_weight_key
      example_weight_keys = {}
      if example_weight_key and isinstance(example_weight_key, dict):
        example_weight_keys = example_weight_key
        example_weight_key = ''
      model_specs.append(
          config.ModelSpec(
              location=m.model_path,
              example_weight_key=example_weight_key,
              example_weight_keys=example_weight_keys))
    slicing_specs = None
    if slice_spec:
      slicing_specs = [s.to_proto() for s in slice_spec]
    options = config.Options()
    options.compute_confidence_intervals.value = compute_confidence_intervals
    options.k_anonymization_count.value = k_anonymization_count
    if desired_batch_size:
      options.desired_batch_size.value = desired_batch_size
    eval_config = config.EvalConfig(
        input_data_specs=[
            config.InputDataSpec(
                location=data_location, file_format=file_format)
        ],
        model_specs=model_specs,
        output_data_specs=[
            config.OutputDataSpec(
                default_location=output_path,
                disabled_outputs=disabled_outputs)
        ],
        slicing_specs=slicing_specs,
        options=options)

  if len(eval_config.input_data_specs) != 1:
    raise NotImplementedError(
        'multiple input_data_specs are not yet supported.')
  if len(eval_config.model_specs) != 1:
    raise NotImplementedError('multiple model_specs are not yet supported.')
  if len(eval_config.output_data_specs) != 1:
    raise NotImplementedError(
        'multiple output_data_specs are not yet supported.')

  with beam.Pipeline(options=pipeline_options) as p:
    if (not eval_config.input_data_specs[0].file_format or
        eval_config.input_data_specs[0].file_format == 'tfrecords'):
      data = p | 'ReadFromTFRecord' >> beam.io.ReadFromTFRecord(
          file_pattern=eval_config.input_data_specs[0].location,
          compression_type=beam.io.filesystem.CompressionTypes.AUTO)
    elif eval_config.input_data_specs[0].file_format == 'text':
      data = p | 'ReadFromText' >> beam.io.textio.ReadFromText(
          eval_config.input_data_specs[0].location)
    else:
      raise ValueError('unknown file_format: {}'.format(
          eval_config.input_data_specs[0].file_format))

    # pylint: disable=no-value-for-parameter
    _ = (
        data
        | 'ExtractEvaluateAndWriteResults' >> ExtractEvaluateAndWriteResults(
            eval_config=eval_config,
            eval_shared_models=eval_shared_models,
            extractors=extractors,
            evaluators=evaluators,
            writers=writers))
    # pylint: enable=no-value-for-parameter

  # TODO(b/141016373): Add support for multiple models.
  return load_eval_result(eval_config.output_data_specs[0].default_location)
def ExtractEvaluateAndWriteResults(  # pylint: disable=invalid-name
    examples: beam.pvalue.PCollection,
    eval_shared_model: Optional[types.EvalSharedModel] = None,
    eval_shared_models: Optional[List[types.EvalSharedModel]] = None,
    eval_config: config.EvalConfig = None,
    extractors: Optional[List[extractor.Extractor]] = None,
    evaluators: Optional[List[evaluator.Evaluator]] = None,
    writers: Optional[List[writer.Writer]] = None,
    output_path: Optional[Text] = None,
    display_only_data_location: Optional[Text] = None,
    slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
    desired_batch_size: Optional[int] = None,
    write_config: Optional[bool] = True,
    compute_confidence_intervals: Optional[bool] = False,
    k_anonymization_count: int = 1) -> beam.pvalue.PDone:
  """PTransform for performing extraction, evaluation, and writing results.

  Users who want to construct their own Beam pipelines instead of using the
  lightweight run_model_analysis functions should use this PTransform.

  Example usage:

    eval_config = tfma.EvalConfig(
        input_data_specs=[tfma.InputDataSpec(location=data_location)],
        model_specs=[tfma.ModelSpec(location=model_location)],
        output_data_specs=[tfma.OutputDataSpec(default_location=output_path)],
        slicing_specs=[...],
        metrics_specs=[...])
    eval_shared_model = tfma.default_eval_shared_model(
        eval_saved_model_path=model_location, add_metrics_callbacks=[...])
    with beam.Pipeline(runner=...) as p:
      _ = (p
           | 'ReadData' >> beam.io.ReadFromTFRecord(data_location)
           | 'ExtractEvaluateAndWriteResults' >>
           tfma.ExtractEvaluateAndWriteResults(
               eval_config=eval_config,
               eval_shared_models=[eval_shared_model],
               ...))
    result = tfma.load_eval_result(output_path=output_path)
    tfma.view.render_slicing_metrics(result)

  Note that the exact serialization format is an internal implementation
  detail and subject to change. Users should only use the TFMA functions to
  write and read the results.

  Args:
    examples: PCollection of input examples. Can be any format the model
      accepts (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_shared_model: Shared model (single-model evaluation).
    eval_shared_models: Shared models (multi-model evaluation).
    eval_config: Eval config.
    extractors: Optional list of Extractors to apply to Extracts. Typically
      these will be added by calling the default_extractors function. If no
      extractors are provided, default_extractors (non-materialized) will be
      used.
    evaluators: Optional list of Evaluators for evaluating Extracts. Typically
      these will be added by calling the default_evaluators function. If no
      evaluators are provided, default_evaluators will be used.
    writers: Optional list of Writers for writing Evaluation output. Typically
      these will be added by calling the default_writers function. If no
      writers are provided, default_writers will be used.
    output_path: Deprecated (use EvalConfig).
    display_only_data_location: Deprecated (use EvalConfig).
    slice_spec: Deprecated (use EvalConfig).
    desired_batch_size: Deprecated (use EvalConfig).
    write_config: Deprecated (use EvalConfig).
    compute_confidence_intervals: Deprecated (use EvalConfig).
    k_anonymization_count: Deprecated (use EvalConfig).

  Raises:
    ValueError: If matching Extractor not found for an Evaluator.

  Returns:
    PDone.
  """
  if eval_shared_model is not None:
    eval_shared_models = [eval_shared_model]

  if eval_config is None:
    data_location = '<user provided PCollection>'
    if display_only_data_location is not None:
      data_location = display_only_data_location
    disabled_outputs = None
    if not write_config:
      disabled_outputs = [_EVAL_CONFIG_FILE]
    model_specs = []
    for m in eval_shared_models:
      example_weight_key = m.example_weight_key
      example_weight_keys = {}
      if example_weight_key and isinstance(example_weight_key, dict):
        example_weight_keys = example_weight_key
        example_weight_key = ''
      model_specs.append(
          config.ModelSpec(
              location=m.model_path,
              example_weight_key=example_weight_key,
              example_weight_keys=example_weight_keys))
    slicing_specs = None
    if slice_spec:
      slicing_specs = [s.to_proto() for s in slice_spec]
    options = config.Options()
    options.compute_confidence_intervals.value = compute_confidence_intervals
    options.k_anonymization_count.value = k_anonymization_count
    if desired_batch_size:
      options.desired_batch_size.value = desired_batch_size
    eval_config = config.EvalConfig(
        input_data_specs=[config.InputDataSpec(location=data_location)],
        model_specs=model_specs,
        output_data_specs=[
            config.OutputDataSpec(
                default_location=output_path,
                disabled_outputs=disabled_outputs)
        ],
        slicing_specs=slicing_specs,
        options=options)

  if not extractors:
    extractors = default_extractors(
        eval_config=eval_config,
        eval_shared_models=eval_shared_models,
        materialize=False)

  if not evaluators:
    evaluators = default_evaluators(
        eval_config=eval_config, eval_shared_models=eval_shared_models)

  for v in evaluators:
    evaluator.verify_evaluator(v, extractors)

  if not writers:
    writers = default_writers(
        eval_config=eval_config, eval_shared_models=eval_shared_models)

  # pylint: disable=no-value-for-parameter
  _ = (
      examples
      | 'InputsToExtracts' >> InputsToExtracts()
      | 'ExtractAndEvaluate' >> ExtractAndEvaluate(
          extractors=extractors, evaluators=evaluators)
      | 'WriteResults' >> WriteResults(writers=writers))

  # TODO(b/141016373): Add support for multiple models.
  if (_EVAL_CONFIG_FILE
      not in eval_config.output_data_specs[0].disabled_outputs):
    _ = examples.pipeline | WriteEvalConfig(eval_config)
  # pylint: enable=no-value-for-parameter

  return beam.pvalue.PDone(examples.pipeline)
def testWriteMetricsAndPlots(self):
  metrics_file = os.path.join(self._getTempDir(), 'metrics')
  plots_file = os.path.join(self._getTempDir(), 'plots')
  temp_eval_export_dir = os.path.join(self._getTempDir(), 'eval_export_dir')

  _, eval_export_dir = (
      fixed_prediction_estimator.simple_fixed_prediction_estimator(
          None, temp_eval_export_dir))
  eval_config = config.EvalConfig(
      model_specs=[config.ModelSpec()],
      options=config.Options(
          disabled_outputs={'values': ['eval_config.json']}))
  eval_shared_model = self.createTestEvalSharedModel(
      eval_saved_model_path=eval_export_dir,
      add_metrics_callbacks=[
          post_export_metrics.example_count(),
          post_export_metrics.calibration_plot_and_prediction_histogram(
              num_buckets=2)
      ])
  extractors = [
      predict_extractor.PredictExtractor(eval_shared_model),
      slice_key_extractor.SliceKeyExtractor()
  ]
  evaluators = [
      metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(eval_shared_model)
  ]
  output_paths = {
      constants.METRICS_KEY: metrics_file,
      constants.PLOTS_KEY: plots_file
  }
  writers = [
      metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
          output_paths, eval_shared_model.add_metrics_callbacks)
  ]

  with beam.Pipeline() as pipeline:
    example1 = self._makeExample(prediction=0.0, label=1.0)
    example2 = self._makeExample(prediction=1.0, label=1.0)

    # pylint: disable=no-value-for-parameter
    _ = (
        pipeline
        | 'Create' >> beam.Create([
            example1.SerializeToString(),
            example2.SerializeToString(),
        ])
        | 'ExtractEvaluateAndWriteResults' >>
        model_eval_lib.ExtractEvaluateAndWriteResults(
            eval_config=eval_config,
            eval_shared_model=eval_shared_model,
            extractors=extractors,
            evaluators=evaluators,
            writers=writers))
    # pylint: enable=no-value-for-parameter

  expected_metrics_for_slice = text_format.Parse(
      """
      slice_key {}
      metrics {
        key: "average_loss"
        value { double_value { value: 0.5 } }
      }
      metrics {
        key: "post_export_metrics/example_count"
        value { double_value { value: 2.0 } }
      }
      """, metrics_for_slice_pb2.MetricsForSlice())

  metric_records = []
  for record in tf.compat.v1.python_io.tf_record_iterator(metrics_file):
    metric_records.append(
        metrics_for_slice_pb2.MetricsForSlice.FromString(record))
  self.assertEqual(1, len(metric_records), 'metrics: %s' % metric_records)
  self.assertProtoEquals(expected_metrics_for_slice, metric_records[0])

  expected_plots_for_slice = text_format.Parse(
      """
      slice_key {}
      plots {
        key: "post_export_metrics"
        value {
          calibration_histogram_buckets {
            buckets {
              lower_threshold_inclusive: -inf
              num_weighted_examples {}
              total_weighted_label {}
              total_weighted_refined_prediction {}
            }
            buckets {
              upper_threshold_exclusive: 0.5
              num_weighted_examples { value: 1.0 }
              total_weighted_label { value: 1.0 }
              total_weighted_refined_prediction {}
            }
            buckets {
              lower_threshold_inclusive: 0.5
              upper_threshold_exclusive: 1.0
              num_weighted_examples {}
              total_weighted_label {}
              total_weighted_refined_prediction {}
            }
            buckets {
              lower_threshold_inclusive: 1.0
              upper_threshold_exclusive: inf
              num_weighted_examples { value: 1.0 }
              total_weighted_label { value: 1.0 }
              total_weighted_refined_prediction { value: 1.0 }
            }
          }
        }
      }
      """, metrics_for_slice_pb2.PlotsForSlice())

  plot_records = []
  for record in tf.compat.v1.python_io.tf_record_iterator(plots_file):
    plot_records.append(
        metrics_for_slice_pb2.PlotsForSlice.FromString(record))
  self.assertEqual(1, len(plot_records), 'plots: %s' % plot_records)
  self.assertProtoEquals(expected_plots_for_slice, plot_records[0])
def testWriteValidationResults(self):
  model_dir, baseline_dir = self._getExportDir(), self._getBaselineDir()
  eval_shared_model = self._build_keras_model(model_dir, mul=0)
  baseline_eval_shared_model = self._build_keras_model(baseline_dir, mul=1)
  validations_file = os.path.join(self._getTempDir(),
                                  constants.VALIDATIONS_KEY)
  examples = [
      self._makeExample(
          input=0.0,
          label=1.0,
          example_weight=1.0,
          extra_feature='non_model_feature'),
      self._makeExample(
          input=1.0,
          label=0.0,
          example_weight=0.5,
          extra_feature='non_model_feature'),
  ]

  eval_config = config.EvalConfig(
      model_specs=[
          config.ModelSpec(
              name='candidate',
              label_key='label',
              example_weight_key='example_weight'),
          config.ModelSpec(
              name='baseline',
              label_key='label',
              example_weight_key='example_weight',
              is_baseline=True)
      ],
      slicing_specs=[config.SlicingSpec()],
      metrics_specs=[
          config.MetricsSpec(
              metrics=[
                  config.MetricConfig(
                      class_name='WeightedExampleCount',
                      # 1.5 < 1, NOT OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              upper_bound={'value': 1}))),
                  config.MetricConfig(
                      class_name='ExampleCount',
                      # 2 > 10, NOT OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              lower_bound={'value': 10}))),
                  config.MetricConfig(
                      class_name='MeanLabel',
                      # 0 > 0 and 0 > 0%?: NOT OK.
                      threshold=config.MetricThreshold(
                          change_threshold=config.GenericChangeThreshold(
                              direction=config.MetricDirection
                              .HIGHER_IS_BETTER,
                              relative={'value': 0},
                              absolute={'value': 0}))),
                  config.MetricConfig(
                      # MeanPrediction = (0+0)/(1+0.5) = 0
                      class_name='MeanPrediction',
                      # -.01 < 0 < .01, OK.
                      # Diff% = -.333/.333 = -100% < -99%, OK.
                      # Diff = 0 - .333 = -.333 < 0, OK.
                      threshold=config.MetricThreshold(
                          value_threshold=config.GenericValueThreshold(
                              upper_bound={'value': .01},
                              lower_bound={'value': -.01}),
                          change_threshold=config.GenericChangeThreshold(
                              direction=config.MetricDirection
                              .LOWER_IS_BETTER,
                              relative={'value': -.99},
                              absolute={'value': 0})))
              ],
              model_names=['candidate', 'baseline']),
      ],
      options=config.Options(
          disabled_outputs={'values': ['eval_config.json']}),
  )
  slice_spec = [
      slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
  ]
  eval_shared_models = {
      'candidate': eval_shared_model,
      'baseline': baseline_eval_shared_model
  }
  extractors = [
      input_extractor.InputExtractor(eval_config),
      predict_extractor_v2.PredictExtractor(
          eval_shared_model=eval_shared_models, eval_config=eval_config),
      slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
  ]
  evaluators = [
      metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
          eval_config=eval_config, eval_shared_model=eval_shared_models)
  ]
  output_paths = {
      constants.VALIDATIONS_KEY: validations_file,
  }
  writers = [
      metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
          output_paths, add_metrics_callbacks=[])
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    _ = (
        pipeline
        | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
        | 'ExtractEvaluateAndWriteResults' >>
        model_eval_lib.ExtractEvaluateAndWriteResults(
            eval_config=eval_config,
            eval_shared_model=eval_shared_model,
            extractors=extractors,
            evaluators=evaluators,
            writers=writers))
    # pylint: enable=no-value-for-parameter

  validation_result = model_eval_lib.load_validation_result(
      os.path.dirname(validations_file))

  expected_validations = [
      text_format.Parse(
          """
          metric_key {
            name: "weighted_example_count"
            model_name: "candidate"
          }
          metric_threshold {
            value_threshold { upper_bound { value: 1.0 } }
          }
          metric_value { double_value { value: 1.5 } }
          """, validation_result_pb2.ValidationFailure()),
      text_format.Parse(
          """
          metric_key {
            name: "example_count"
          }
          metric_threshold {
            value_threshold { lower_bound { value: 10.0 } }
          }
          metric_value { double_value { value: 2.0 } }
          """, validation_result_pb2.ValidationFailure()),
      text_format.Parse(
          """
          metric_key {
            name: "mean_label"
            model_name: "candidate"
            is_diff: true
          }
          metric_threshold {
            change_threshold {
              absolute { value: 0.0 }
              relative { value: 0.0 }
              direction: HIGHER_IS_BETTER
            }
          }
          metric_value { double_value { value: 0.0 } }
          """, validation_result_pb2.ValidationFailure()),
  ]

  self.assertFalse(validation_result.validation_ok)
  self.assertLen(validation_result.metric_validations_per_slice, 1)
  self.assertCountEqual(
      expected_validations,
      validation_result.metric_validations_per_slice[0].failures)
def run_model_analysis(
    eval_shared_model: Optional[Union[types.EvalSharedModel,
                                      Dict[Text,
                                           types.EvalSharedModel]]] = None,
    eval_config: config.EvalConfig = None,
    data_location: Text = '',
    file_format: Text = 'tfrecords',
    output_path: Optional[Text] = None,
    extractors: Optional[List[extractor.Extractor]] = None,
    evaluators: Optional[List[evaluator.Evaluator]] = None,
    writers: Optional[List[writer.Writer]] = None,
    pipeline_options: Optional[Any] = None,
    slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
    write_config: Optional[bool] = True,
    compute_confidence_intervals: Optional[bool] = False,
    k_anonymization_count: int = 1,
    desired_batch_size: Optional[int] = None,
    random_seed_for_testing: Optional[int] = None
) -> Union[EvalResult, EvalResults]:
  """Runs TensorFlow model analysis.

  It runs a Beam pipeline to compute the slicing metrics exported in the
  TensorFlow Eval SavedModel and returns the results.

  This is a simplified API for users who want to quickly get something running
  locally. Users who wish to create their own Beam pipelines can use the
  Evaluate PTransform instead.

  Args:
    eval_shared_model: Optional shared model (single-model evaluation) or dict
      of shared models keyed by model name (multi-model evaluation). Only
      required if needed by default extractors, evaluators, or writers.
    eval_config: Eval config.
    data_location: The location of the data files.
    file_format: The file format of the data, can be either 'text' or
      'tfrecords' for now. By default, 'tfrecords' will be used.
    output_path: The directory to output metrics and results to. If None, we
      use a temporary directory.
    extractors: Optional list of Extractors to apply to Extracts. Typically
      these will be added by calling the default_extractors function. If no
      extractors are provided, default_extractors (non-materialized) will be
      used.
    evaluators: Optional list of Evaluators for evaluating Extracts. Typically
      these will be added by calling the default_evaluators function. If no
      evaluators are provided, default_evaluators will be used.
    writers: Optional list of Writers for writing Evaluation output. Typically
      these will be added by calling the default_writers function. If no
      writers are provided, default_writers will be used.
    pipeline_options: Optional arguments to run the Pipeline, for instance
      whether to run directly.
    slice_spec: Deprecated (use EvalConfig).
    write_config: Deprecated (use EvalConfig).
    compute_confidence_intervals: Deprecated (use EvalConfig).
    k_anonymization_count: Deprecated (use EvalConfig).
    desired_batch_size: Optional batch size for batching in Predict.
    random_seed_for_testing: Provide for deterministic tests only.

  Returns:
    An EvalResult that can be used with the TFMA visualization functions.

  Raises:
    ValueError: If the file_format is unknown to us.
  """
  _assert_tensorflow_version()

  if output_path is None:
    output_path = tempfile.mkdtemp()
  if not tf.io.gfile.exists(output_path):
    tf.io.gfile.makedirs(output_path)

  if eval_config is None:
    model_specs = []
    eval_shared_models = eval_shared_model
    if not isinstance(eval_shared_model, dict):
      eval_shared_models = {'': eval_shared_model}
    for model_name, shared_model in eval_shared_models.items():
      example_weight_key = shared_model.example_weight_key
      example_weight_keys = {}
      if example_weight_key and isinstance(example_weight_key, dict):
        example_weight_keys = example_weight_key
        example_weight_key = ''
      model_specs.append(
          config.ModelSpec(
              name=model_name,
              example_weight_key=example_weight_key,
              example_weight_keys=example_weight_keys))
    slicing_specs = None
    if slice_spec:
      slicing_specs = [s.to_proto() for s in slice_spec]
    options = config.Options()
    options.compute_confidence_intervals.value = compute_confidence_intervals
    options.k_anonymization_count.value = k_anonymization_count
    if not write_config:
      options.disabled_outputs.values.append(_EVAL_CONFIG_FILE)
    eval_config = config.EvalConfig(
        model_specs=model_specs,
        slicing_specs=slicing_specs,
        options=options)

  with beam.Pipeline(options=pipeline_options) as p:
    if file_format == 'tfrecords':
      data = p | 'ReadFromTFRecord' >> beam.io.ReadFromTFRecord(
          file_pattern=data_location,
          compression_type=beam.io.filesystem.CompressionTypes.AUTO)
    elif file_format == 'text':
      data = p | 'ReadFromText' >> beam.io.textio.ReadFromText(data_location)
    else:
      raise ValueError('unknown file_format: {}'.format(file_format))

    # pylint: disable=no-value-for-parameter
    _ = (
        data
        | 'ExtractEvaluateAndWriteResults' >> ExtractEvaluateAndWriteResults(
            eval_config=eval_config,
            eval_shared_model=eval_shared_model,
            display_only_data_location=data_location,
            display_only_file_format=file_format,
            output_path=output_path,
            extractors=extractors,
            evaluators=evaluators,
            writers=writers,
            desired_batch_size=desired_batch_size,
            random_seed_for_testing=random_seed_for_testing))
    # pylint: enable=no-value-for-parameter

  if len(eval_config.model_specs) <= 1:
    return load_eval_result(output_path)
  else:
    results = []
    for spec in eval_config.model_specs:
      results.append(load_eval_result(output_path, model_name=spec.name))
    return EvalResults(results, constants.MODEL_CENTRIC_MODE)