Example #1
 def testSerializeDeserializeEvalConfig(self):
     output_path = self._getTempDir()
     options = config.Options()
     options.compute_confidence_intervals.value = False
     options.k_anonymization_count.value = 1
     eval_config = config.EvalConfig(
         input_data_specs=[config.InputDataSpec(location='/path/to/data')],
         model_specs=[config.ModelSpec(location='/path/to/model')],
         output_data_specs=[
             config.OutputDataSpec(default_location=output_path)
         ],
         slicing_specs=[
             config.SlicingSpec(feature_keys=['country'],
                                feature_values={
                                    'age': '5',
                                    'gender': 'f'
                                }),
             config.SlicingSpec(feature_keys=['interest'],
                                feature_values={
                                    'age': '6',
                                    'gender': 'm'
                                })
         ],
         options=options)
     with tf.io.gfile.GFile(os.path.join(output_path, 'eval_config.json'),
                            'w') as f:
         f.write(model_eval_lib._serialize_eval_config(eval_config))
     got_eval_config = model_eval_lib.load_eval_config(output_path)
     self.assertEqual(eval_config, got_eval_config)
 def testSerializeDeserializeEvalConfig(self):
     output_path = self._getTempDir()
     options = config.Options()
     options.compute_confidence_intervals.value = False
     options.k_anonymization_count.value = 1
     eval_config = config.EvalConfig(
         slicing_specs=[
             config.SlicingSpec(
                 feature_keys=['country'],
                 feature_values={'age': '5', 'gender': 'f'}),
             config.SlicingSpec(
                 feature_keys=['interest'],
                 feature_values={'age': '6', 'gender': 'm'})
         ],
         options=options)
     data_location = '/path/to/data'
     file_format = 'tfrecords'
     model_location = '/path/to/model'
     with tf.io.gfile.GFile(os.path.join(output_path, 'eval_config.json'),
                            'w') as f:
         f.write(
             model_eval_lib._serialize_eval_run(eval_config, data_location,
                                                file_format,
                                                {'': model_location}))
     got_eval_config, got_data_location, got_file_format, got_model_locations = (
         model_eval_lib._load_eval_run(output_path))
     self.assertEqual(eval_config, got_eval_config)
     self.assertEqual(data_location, got_data_location)
     self.assertEqual(file_format, got_file_format)
     self.assertEqual({'': model_location}, got_model_locations)
Example #3
  def testMergeAccumulators(self):
    options = config.Options()
    options.desired_batch_size.value = 2
    computation = tf_metric_wrapper.tf_metric_computations(
        [tf.keras.metrics.MeanSquaredError(name='mse')],
        config.EvalConfig(options=options))[0]

    example1 = {'labels': [0.0], 'predictions': [0.0], 'example_weights': [1.0]}
    example2 = {'labels': [0.0], 'predictions': [0.5], 'example_weights': [1.0]}
    example3 = {'labels': [1.0], 'predictions': [0.3], 'example_weights': [1.0]}
    example4 = {'labels': [1.0], 'predictions': [0.9], 'example_weights': [1.0]}
    example5 = {'labels': [1.0], 'predictions': [0.5], 'example_weights': [0.0]}

    combiner_inputs = []
    for e in (example1, example2, example3, example4, example5):
      combiner_inputs.append(metric_util.to_standard_metric_inputs(e))
    acc1 = computation.combiner.create_accumulator()
    acc1 = computation.combiner.add_input(acc1, combiner_inputs[0])
    acc1 = computation.combiner.add_input(acc1, combiner_inputs[1])
    acc1 = computation.combiner.add_input(acc1, combiner_inputs[2])
    acc2 = computation.combiner.create_accumulator()
    acc2 = computation.combiner.add_input(acc2, combiner_inputs[3])
    acc2 = computation.combiner.add_input(acc2, combiner_inputs[4])
    acc = computation.combiner.merge_accumulators([acc1, acc2])

    got_metrics = computation.combiner.extract_output(acc)
    mse_key = metric_types.MetricKey(name='mse')
    self.assertDictElementsAlmostEqual(got_metrics, {mse_key: 0.1875})
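
The expected value of 0.1875 is simply the weighted mean squared error over the five examples fed to the combiner (example5 carries a weight of 0.0 and so contributes nothing). A minimal standalone sketch of that arithmetic in plain Python, independent of TFMA:

labels = [0.0, 0.0, 1.0, 1.0, 1.0]
predictions = [0.0, 0.5, 0.3, 0.9, 0.5]
weights = [1.0, 1.0, 1.0, 1.0, 0.0]

# Weighted squared errors: 0.0 + 0.25 + 0.49 + 0.01 + 0.0 = 0.75
numerator = sum(w * (p - l) ** 2
                for l, p, w in zip(labels, predictions, weights))
denominator = sum(weights)  # 4.0
print(numerator / denominator)  # 0.1875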
Example #4
def _load_eval_run(
    output_path: Text
) -> Tuple[config.EvalConfig, Text, Text, Dict[Text, Text]]:
  """Returns eval config, data location, file format, and model locations."""
  path = os.path.join(output_path, _EVAL_CONFIG_FILE)
  if tf.io.gfile.exists(path):
    with tf.io.gfile.GFile(path, 'r') as f:
      pb = json_format.Parse(f.read(), config_pb2.EvalRun())
      _check_version(pb.version, output_path)
      return (pb.eval_config, pb.data_location, pb.file_format,
              pb.model_locations)
  else:
    # Legacy support (to be removed in the future).
    # The previous version did not include a file extension.
    path = os.path.splitext(path)[0]
    serialized_record = six.next(
        tf.compat.v1.python_io.tf_record_iterator(path))
    final_dict = pickle.loads(serialized_record)
    _check_version(final_dict, output_path)
    old_config = final_dict['eval_config']
    slicing_specs = None
    if old_config.slice_spec:
      slicing_specs = [s.to_proto() for s in old_config.slice_spec]
    options = config.Options()
    options.compute_confidence_intervals.value = (
        old_config.compute_confidence_intervals)
    options.k_anonymization_count.value = old_config.k_anonymization_count
    return (config.EvalConfig(slicing_specs=slicing_specs, options=options),
            old_config.data_location, '', {'': old_config.model_location})
def load_eval_config(output_path: Text) -> config.EvalConfig:
    """Loads eval config."""
    path = os.path.join(output_path, _EVAL_CONFIG_FILE)
    if tf.io.gfile.exists(path):
        with tf.io.gfile.GFile(path, 'r') as f:
            pb = json_format.Parse(f.read(), config_pb2.EvalConfigAndVersion())
            _check_version(pb.version, output_path)
            return pb.eval_config
    else:
        # Legacy support (to be removed in the future).
        # The previous version did not include a file extension.
        path = os.path.splitext(path)[0]
        serialized_record = six.next(
            tf.compat.v1.python_io.tf_record_iterator(path))
        final_dict = pickle.loads(serialized_record)
        _check_version(final_dict, output_path)
        old_config = final_dict['eval_config']
        slicing_specs = None
        if old_config.slice_spec:
            slicing_specs = [s.to_proto() for s in old_config.slice_spec]
        options = config.Options()
        options.compute_confidence_intervals.value = (
            old_config.compute_confidence_intervals)
        options.k_anonymization_count.value = old_config.k_anonymization_count
        return config.EvalConfig(
            input_data_specs=[
                config.InputDataSpec(location=old_config.data_location)
            ],
            model_specs=[config.ModelSpec(location=old_config.model_location)],
            output_data_specs=[
                config.OutputDataSpec(default_location=output_path)
            ],
            slicing_specs=slicing_specs,
            options=options)
def load_eval_run(
    output_path: Text,
    output_file_format: Text = EVAL_CONFIG_FILE_FORMAT,
    filename: Optional[Text] = None
) -> Tuple[Optional[config.EvalConfig], Text, Text, Dict[Text, Text]]:
    """Returns eval config, data location, file format, and model locations.

  Args:
    output_path: Directory containing config file.
    output_file_format: Format of output file. Currently only 'json' is
      supported.
    filename: Name of output file (including extension if any).

  Returns:
    Tuple of (EvalConfig, data location, file format, model locations). If an
    EvalConfig is not found at the given path, None will be returned.
  """
    if filename is None:
        filename = EVAL_CONFIG_FILE + '.' + output_file_format
    path = os.path.join(output_path, filename)
    if tf.io.gfile.exists(path):
        with tf.io.gfile.GFile(path, 'r') as f:
            pb = json_format.Parse(f.read(), config_pb2.EvalRun())
            _check_version(pb.version, output_path)
            return (pb.eval_config, pb.data_location, pb.file_format,
                    pb.model_locations)

    # Legacy support (to be removed in the future).
    # The previous version did not include a file extension.
    path = os.path.splitext(path)[0]
    if tf.io.gfile.exists(path):
        serialized_record = six.next(
            tf.compat.v1.python_io.tf_record_iterator(path))
        final_dict = pickle.loads(serialized_record)
        _check_version(final_dict, output_path)
        old_config = final_dict['eval_config']
        slicing_specs = None
        if old_config.slice_spec:
            slicing_specs = [s.to_proto() for s in old_config.slice_spec]
        options = config.Options()
        options.compute_confidence_intervals.value = (
            old_config.compute_confidence_intervals)
        options.min_slice_size.value = old_config.k_anonymization_count
        return (config.EvalConfig(slicing_specs=slicing_specs, options=options),
                old_config.data_location, '',
                {'': old_config.model_location})

    # No config found
    return (None, '', '', {})
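
A minimal usage sketch for load_eval_run; the output directory below is a hypothetical placeholder for a directory previously written by TFMA (containing eval_config.json or a legacy eval_config record):

output_dir = '/tmp/tfma_output'  # hypothetical path
eval_config, data_location, file_format, model_locations = load_eval_run(
    output_dir)
if eval_config is None:
    print('no eval config found under %s' % output_dir)
else:
    print('model locations: %s' % model_locations)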
Example #7
 def testSerializeDeserializeLegacyEvalConfig(self):
     output_path = self._getTempDir()
     old_config = LegacyConfig(
         model_location='/path/to/model',
         data_location='/path/to/data',
         slice_spec=[
             slicer.SingleSliceSpec(columns=['country'],
                                    features=[('age', 5), ('gender', 'f')]),
             slicer.SingleSliceSpec(columns=['interest'],
                                    features=[('age', 6), ('gender', 'm')])
         ],
         example_count_metric_key=None,
         example_weight_metric_key='key',
         compute_confidence_intervals=False,
         k_anonymization_count=1)
     final_dict = {}
     final_dict['tfma_version'] = tfma_version.VERSION_STRING
     final_dict['eval_config'] = old_config
     with tf.io.TFRecordWriter(os.path.join(output_path,
                                            'eval_config')) as w:
         w.write(pickle.dumps(final_dict))
     got_eval_config = model_eval_lib.load_eval_config(output_path)
     options = config.Options()
     options.compute_confidence_intervals.value = (
         old_config.compute_confidence_intervals)
     options.k_anonymization_count.value = old_config.k_anonymization_count
     eval_config = config.EvalConfig(
         input_data_specs=[
             config.InputDataSpec(location=old_config.data_location)
         ],
         model_specs=[config.ModelSpec(location=old_config.model_location)],
         output_data_specs=[
             config.OutputDataSpec(default_location=output_path)
         ],
         slicing_specs=[
             config.SlicingSpec(feature_keys=['country'],
                                feature_values={
                                    'age': '5',
                                    'gender': 'f'
                                }),
             config.SlicingSpec(feature_keys=['interest'],
                                feature_values={
                                    'age': '6',
                                    'gender': 'm'
                                })
         ],
         options=options)
     self.assertEqual(eval_config, got_eval_config)
Example #8
  def testBatching(self):
    options = config.Options()
    options.desired_batch_size.value = 2
    computation = tf_metric_wrapper.tf_metric_computations(
        [_CustomMetric(),
         tf.keras.metrics.MeanSquaredError(name='mse')],
        config.EvalConfig(options=options))[0]

    example1 = {'labels': [0.0], 'predictions': [0.0], 'example_weights': [1.0]}
    example2 = {'labels': [0.0], 'predictions': [0.5], 'example_weights': [1.0]}
    example3 = {'labels': [1.0], 'predictions': [0.3], 'example_weights': [1.0]}
    example4 = {'labels': [1.0], 'predictions': [0.9], 'example_weights': [1.0]}
    example5 = {'labels': [1.0], 'predictions': [0.5], 'example_weights': [0.0]}

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      result = (
          pipeline
          | 'Create' >> beam.Create(
              [example1, example2, example3, example4, example5])
          | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
          | 'AddSlice' >> beam.Map(lambda x: ((), x))
          | 'Combine' >> beam.CombinePerKey(computation.combiner))

      # pylint: enable=no-value-for-parameter

      def check_result(got):
        try:
          self.assertEqual(1, len(got), 'got: %s' % got)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())

          custom_key = metric_types.MetricKey(name='custom')
          mse_key = metric_types.MetricKey(name='mse')
          self.assertDictElementsAlmostEqual(
              got_metrics, {
                  custom_key: (0.0 + 0.5 + 0.3 + 0.9 + 0.0) /
                              (1.0 + 1.0 + 1.0 + 1.0 + 0.0),
                  mse_key:
                      0.1875,
              })

        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result, label='result')
Example #9
 def testRunModelAnalysisWithUncertainty(self):
     model_location = self._exportEvalSavedModel(
         linear_classifier.simple_linear_classifier)
     examples = [
         self._makeExample(age=3.0, language='english', label=1.0),
         self._makeExample(age=3.0, language='chinese', label=0.0),
         self._makeExample(age=4.0, language='english', label=1.0),
         self._makeExample(age=5.0, language='chinese', label=1.0),
         self._makeExample(age=5.0, language='hindi', label=1.0)
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)
     slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
     options = config.Options()
     options.compute_confidence_intervals.value = True
     options.k_anonymization_count.value = 2
     eval_config = config.EvalConfig(
         input_data_specs=[config.InputDataSpec(location=data_location)],
         model_specs=[config.ModelSpec(location=model_location)],
         output_data_specs=[
             config.OutputDataSpec(default_location=self._getTempDir())
         ],
         slicing_specs=slicing_specs,
         options=options)
     eval_result = model_eval_lib.run_model_analysis(
         eval_config=eval_config,
         eval_shared_models=[
             model_eval_lib.default_eval_shared_model(
                 eval_saved_model_path=model_location,
                 example_weight_key='age')
         ])
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     expected = {
         (('language', 'hindi'), ): {
             u'__ERROR__': {
                 'debugMessage':
                 u'Example count for this slice key is lower than the '
                 u'minimum required value: 2. No data is aggregated for '
                 u'this slice.'
             },
         },
         (('language', 'chinese'), ): {
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 8.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 2.0
             },
         },
         (('language', 'english'), ): {
             'accuracy': {
                 'boundedValue': {
                     'value': 1.0,
                     'lowerBound': 1.0,
                     'upperBound': 1.0,
                     'methodology': 'POISSON_BOOTSTRAP'
                 }
             },
             'my_mean_label': {
                 'boundedValue': {
                     'value': 1.0,
                     'lowerBound': 1.0,
                     'upperBound': 1.0,
                     'methodology': 'POISSON_BOOTSTRAP'
                 }
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 7.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 2.0
             },
         }
     }
     self.assertEqual(eval_result.config.model_specs[0].location,
                      model_location.decode())
     self.assertEqual(eval_result.config.input_data_specs[0].location,
                      data_location)
     self.assertEqual(eval_result.config.slicing_specs[0],
                      config.SlicingSpec(feature_keys=['language']))
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
     self.assertFalse(eval_result.plots)
    def testRunModelAnalysisWithDeterministicConfidenceIntervals(self):
        model_location = self._exportEvalSavedModel(
            linear_classifier.simple_linear_classifier)
        examples = [
            self._makeExample(age=3.0, language='english', label=1.0),
            self._makeExample(age=3.0, language='chinese', label=0.0),
            self._makeExample(age=4.0, language='english', label=1.0),
            self._makeExample(age=5.0, language='chinese', label=1.0),
            self._makeExample(age=5.0, language='hindi', label=1.0)
        ]
        data_location = self._writeTFExamplesToTFRecords(examples)
        slicing_specs = [config.SlicingSpec(feature_keys=['language'])]
        options = config.Options()
        options.compute_confidence_intervals.value = True
        options.k_anonymization_count.value = 2
        eval_config = config.EvalConfig(slicing_specs=slicing_specs,
                                        options=options)
        eval_result = model_eval_lib.run_model_analysis(
            eval_config=eval_config,
            eval_shared_model=model_eval_lib.default_eval_shared_model(
                eval_saved_model_path=model_location,
                example_weight_key='age'),
            data_location=data_location,
            output_path=self._getTempDir(),
            random_seed_for_testing=_TEST_SEED)
        # We only check some of the metrics to ensure that the end-to-end
        # pipeline works.
        expected = {
            (('language', 'hindi'), ): {
                u'__ERROR__': {
                    'debugMessage':
                    u'Example count for this slice key is lower than the '
                    u'minimum required value: 2. No data is aggregated for '
                    u'this slice.'
                },
            },
            (('language', 'chinese'), ): {
                metric_keys.EXAMPLE_WEIGHT: {
                    'doubleValue': 8.0
                },
                metric_keys.EXAMPLE_COUNT: {
                    'doubleValue': 2.0
                },
            },
            (('language', 'english'), ): {
                'accuracy': {
                    'boundedValue': {
                        'value': 1.0,
                        'lowerBound': 1.0,
                        'upperBound': 1.0,
                        'methodology': 'POISSON_BOOTSTRAP'
                    }
                },
                'my_mean_label': {
                    'boundedValue': {
                        'value': 1.0,
                        'lowerBound': 1.0,
                        'upperBound': 1.0,
                        'methodology': 'POISSON_BOOTSTRAP'
                    }
                },
                metric_keys.EXAMPLE_WEIGHT: {
                    'doubleValue': 7.0
                },
                metric_keys.EXAMPLE_COUNT: {
                    'doubleValue': 2.0
                },
            }
        }
        self.assertEqual(eval_result.model_location, model_location.decode())
        self.assertEqual(eval_result.data_location, data_location)
        self.assertEqual(eval_result.config.slicing_specs[0],
                         config.SlicingSpec(feature_keys=['language']))
        self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)

        for key, value in eval_result.slicing_metrics:
            if (('language', 'english'), ) == key:
                metric = value['']['']['average_loss']
                self.assertAlmostEqual(0.171768754720,
                                       metric['boundedValue']['value'],
                                       delta=0.1)

                metric = value['']['']['auc_precision_recall']
                self.assertAlmostEqual(0.99999940395,
                                       metric['boundedValue']['value'],
                                       delta=0.1)

        self.assertFalse(eval_result.plots)
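
In both tests above the EXAMPLE_WEIGHT values come from the age feature, which is passed as example_weight_key, so each slice's weight is just the sum of the ages of the examples in that slice. A quick sketch of that arithmetic in plain Python:

# Ages of the test examples grouped by their 'language' slice.
ages_by_language = {
    'english': [3.0, 4.0],  # EXAMPLE_WEIGHT 7.0, EXAMPLE_COUNT 2
    'chinese': [3.0, 5.0],  # EXAMPLE_WEIGHT 8.0, EXAMPLE_COUNT 2
    'hindi': [5.0],         # only 1 example, below k_anonymization_count=2,
                            # hence the __ERROR__ entry instead of metrics
}
for language, ages in ages_by_language.items():
    print(language, sum(ages), len(ages))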
Example #11
  def testWriteValidationResults(self, output_file_format):
    model_dir, baseline_dir = self._getExportDir(), self._getBaselineDir()
    eval_shared_model = self._build_keras_model(model_dir, mul=0)
    baseline_eval_shared_model = self._build_keras_model(baseline_dir, mul=1)
    validations_file = os.path.join(self._getTempDir(),
                                    constants.VALIDATIONS_KEY)
    schema = text_format.Parse(
        """
        tensor_representation_group {
          key: ""
          value {
            tensor_representation {
              key: "input"
              value {
                dense_tensor {
                  column_name: "input"
                  shape { dim { size: 1 } }
                }
              }
            }
          }
        }
        feature {
          name: "input"
          type: FLOAT
        }
        feature {
          name: "label"
          type: FLOAT
        }
        feature {
          name: "example_weight"
          type: FLOAT
        }
        feature {
          name: "extra_feature"
          type: BYTES
        }
        """, schema_pb2.Schema())
    tfx_io = test_util.InMemoryTFExampleRecord(
        schema=schema, raw_record_column_name=constants.ARROW_INPUT_COLUMN)
    tensor_adapter_config = tensor_adapter.TensorAdapterConfig(
        arrow_schema=tfx_io.ArrowSchema(),
        tensor_representations=tfx_io.TensorRepresentations())
    examples = [
        self._makeExample(
            input=0.0,
            label=1.0,
            example_weight=1.0,
            extra_feature='non_model_feature'),
        self._makeExample(
            input=1.0,
            label=0.0,
            example_weight=0.5,
            extra_feature='non_model_feature'),
    ]

    eval_config = config.EvalConfig(
        model_specs=[
            config.ModelSpec(
                name='candidate',
                label_key='label',
                example_weight_key='example_weight'),
            config.ModelSpec(
                name='baseline',
                label_key='label',
                example_weight_key='example_weight',
                is_baseline=True)
        ],
        slicing_specs=[config.SlicingSpec()],
        metrics_specs=[
            config.MetricsSpec(
                metrics=[
                    config.MetricConfig(
                        class_name='WeightedExampleCount',
                        # 1.5 < 1, NOT OK.
                        threshold=config.MetricThreshold(
                            value_threshold=config.GenericValueThreshold(
                                upper_bound={'value': 1}))),
                    config.MetricConfig(
                        class_name='ExampleCount',
                        # 2 > 10, NOT OK.
                        threshold=config.MetricThreshold(
                            value_threshold=config.GenericValueThreshold(
                                lower_bound={'value': 10}))),
                    config.MetricConfig(
                        class_name='MeanLabel',
                        # 0 > 0 and 0 > 0%?: NOT OK.
                        threshold=config.MetricThreshold(
                            change_threshold=config.GenericChangeThreshold(
                                direction=config.MetricDirection
                                .HIGHER_IS_BETTER,
                                relative={'value': 0},
                                absolute={'value': 0}))),
                    config.MetricConfig(
                        # MeanPrediction = (0+0)/(1+0.5) = 0
                        class_name='MeanPrediction',
                        # -.01 < 0 < .01, OK.
                        # Diff% = -.333/.333 = -100% < -99%, OK.
                        # Diff = 0 - .333 = -.333 < 0, OK.
                        threshold=config.MetricThreshold(
                            value_threshold=config.GenericValueThreshold(
                                upper_bound={'value': .01},
                                lower_bound={'value': -.01}),
                            change_threshold=config.GenericChangeThreshold(
                                direction=config.MetricDirection
                                .LOWER_IS_BETTER,
                                relative={'value': -.99},
                                absolute={'value': 0})))
                ],
                model_names=['candidate', 'baseline']),
        ],
        options=config.Options(
            disabled_outputs={'values': ['eval_config.json']}),
    )
    slice_spec = [
        slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
    ]
    eval_shared_models = {
        'candidate': eval_shared_model,
        'baseline': baseline_eval_shared_model
    }
    extractors = [
        batched_input_extractor.BatchedInputExtractor(eval_config),
        batched_predict_extractor_v2.BatchedPredictExtractor(
            eval_shared_model=eval_shared_models,
            eval_config=eval_config,
            tensor_adapter_config=tensor_adapter_config),
        unbatch_extractor.UnbatchExtractor(),
        slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
    ]
    evaluators = [
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=eval_config, eval_shared_model=eval_shared_models)
    ]
    output_paths = {
        constants.VALIDATIONS_KEY: validations_file,
    }
    writers = [
        metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
            output_paths,
            add_metrics_callbacks=[],
            output_file_format=output_file_format)
    ]

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      _ = (
          pipeline
          | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
          | 'BatchExamples' >> tfx_io.BeamSource()
          | 'InputsToExtracts' >> model_eval_lib.BatchedInputsToExtracts()
          | 'ExtractEvaluate' >> model_eval_lib.ExtractAndEvaluate(
              extractors=extractors, evaluators=evaluators)
          | 'WriteResults' >> model_eval_lib.WriteResults(writers=writers))
      # pylint: enable=no-value-for-parameter

    validation_result = (
        metrics_plots_and_validations_writer
        .load_and_deserialize_validation_result(
            os.path.dirname(validations_file)))

    expected_validations = [
        text_format.Parse(
            """
            metric_key {
              name: "weighted_example_count"
              model_name: "candidate"
            }
            metric_threshold {
              value_threshold {
                upper_bound {
                  value: 1.0
                }
              }
            }
            metric_value {
              double_value {
                value: 1.5
              }
            }
            """, validation_result_pb2.ValidationFailure()),
        text_format.Parse(
            """
            metric_key {
              name: "example_count"
              model_name: "candidate"
            }
            metric_threshold {
              value_threshold {
                lower_bound {
                  value: 10.0
                }
              }
            }
            metric_value {
              double_value {
                value: 2.0
              }
            }
            """, validation_result_pb2.ValidationFailure()),
        text_format.Parse(
            """
            metric_key {
              name: "mean_label"
              model_name: "candidate"
              is_diff: true
            }
            metric_threshold {
              change_threshold {
                absolute {
                  value: 0.0
                }
                relative {
                  value: 0.0
                }
                direction: HIGHER_IS_BETTER
              }
            }
            metric_value {
              double_value {
                value: 0.0
              }
            }
            """, validation_result_pb2.ValidationFailure()),
    ]
    self.assertFalse(validation_result.validation_ok)
    self.assertLen(validation_result.metric_validations_per_slice, 1)
    self.assertCountEqual(
        expected_validations,
        validation_result.metric_validations_per_slice[0].failures)
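
The NOT OK / OK annotations inside the metric thresholds above follow from the two test examples (example weights 1.0 and 0.5) together with an assumption suggested by the mul= arguments and inline comments: the candidate model predicts 0.0 for every example while the baseline echoes the input feature. A rough sketch of that arithmetic in plain Python (the model behaviour is an assumption, not read out of _build_keras_model):

weights = [1.0, 0.5]
inputs = [0.0, 1.0]

weighted_example_count = sum(weights)  # 1.5 > upper_bound 1  -> fails
example_count = len(weights)           # 2 < lower_bound 10   -> fails
candidate_mean_prediction = 0.0        # assumed: candidate predicts 0.0
baseline_mean_prediction = (
    sum(i * w for i, w in zip(inputs, weights)) / sum(weights))  # ~0.333
diff = candidate_mean_prediction - baseline_mean_prediction      # ~-0.333
# diff < 0 and diff / baseline ~ -100% < -99%, so MeanPrediction passes,
# while the MeanLabel change of exactly 0.0 fails the HIGHER_IS_BETTER check.
print(weighted_example_count, example_count, diff)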
Example #12
def ExtractEvaluateAndWriteResults(  # pylint: disable=invalid-name
    examples: beam.pvalue.PCollection,
    eval_shared_model: Optional[Union[types.EvalSharedModel,
                                      Dict[Text,
                                           types.EvalSharedModel]]] = None,
    eval_config: Optional[config.EvalConfig] = None,
    extractors: Optional[List[extractor.Extractor]] = None,
    evaluators: Optional[List[evaluator.Evaluator]] = None,
    writers: Optional[List[writer.Writer]] = None,
    output_path: Optional[Text] = None,
    display_only_data_location: Optional[Text] = None,
    display_only_file_format: Optional[Text] = None,
    slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
    write_config: Optional[bool] = True,
    compute_confidence_intervals: Optional[bool] = False,
    k_anonymization_count: int = 1,
    desired_batch_size: Optional[int] = None,
    random_seed_for_testing: Optional[int] = None) -> beam.pvalue.PDone:
  """PTransform for performing extraction, evaluation, and writing results.

  Users who want to construct their own Beam pipelines instead of using the
  lightweight run_model_analysis functions should use this PTransform.

  Example usage:
    eval_config = tfma.EvalConfig(slicing_specs=[...], metrics_specs=[...])
    eval_shared_model = tfma.default_eval_shared_model(
        eval_saved_model_path=model_location, eval_config=eval_config)
    with beam.Pipeline(runner=...) as p:
      _ = (p
           | 'ReadData' >> beam.io.ReadFromTFRecord(data_location)
           | 'ExtractEvaluateAndWriteResults' >>
           tfma.ExtractEvaluateAndWriteResults(
               eval_shared_model=eval_shared_model,
               eval_config=eval_config,
               ...))
    result = tfma.load_eval_result(output_path=output_path)
    tfma.view.render_slicing_metrics(result)

  Note that the exact serialization format is an internal implementation detail
  and subject to change. Users should only use the TFMA functions to write and
  read the results.

  Args:
    examples: PCollection of input examples. Can be any format the model accepts
      (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_shared_model: Optional shared model (single-model evaluation) or dict
      of shared models keyed by model name (multi-model evaluation). Only
      required if needed by default extractors, evaluators, or writers and for
      display purposes of the model path.
    eval_config: Eval config.
    extractors: Optional list of Extractors to apply to Extracts. Typically
      these will be added by calling the default_extractors function. If no
      extractors are provided, default_extractors (non-materialized) will be
      used.
    evaluators: Optional list of Evaluators for evaluating Extracts. Typically
      these will be added by calling the default_evaluators function. If no
      evaluators are provided, default_evaluators will be used.
    writers: Optional list of Writers for writing Evaluation output. Typically
      these will be added by calling the default_writers function. If no writers
      are provided, default_writers will be used.
    output_path: Path to output metrics and plots results.
    display_only_data_location: Optional path indicating where the examples were
      read from. This is used only for display purposes - data will not actually
      be read from this path.
    display_only_file_format: Optional format of the examples. This is used only
      for display purposes.
    slice_spec: Deprecated (use EvalConfig).
    write_config: Deprecated (use EvalConfig).
    compute_confidence_intervals: Deprecated (use EvalConfig).
    k_anonymization_count: Deprecated (use EvalConfig).
    desired_batch_size: Optional batch size for batching in Predict.
    random_seed_for_testing: Provide for deterministic tests only.

  Raises:
    ValueError: If EvalConfig invalid or matching Extractor not found for an
      Evaluator.

  Returns:
    PDone.
  """
  eval_shared_models = eval_shared_model
  if not isinstance(eval_shared_model, dict):
    eval_shared_models = {'': eval_shared_model}

  if eval_config is None:
    model_specs = []
    for model_name, shared_model in eval_shared_models.items():
      example_weight_key = shared_model.example_weight_key
      example_weight_keys = {}
      if example_weight_key and isinstance(example_weight_key, dict):
        example_weight_keys = example_weight_key
        example_weight_key = ''
      model_specs.append(
          config.ModelSpec(
              name=model_name,
              example_weight_key=example_weight_key,
              example_weight_keys=example_weight_keys))
    slicing_specs = None
    if slice_spec:
      slicing_specs = [s.to_proto() for s in slice_spec]
    options = config.Options()
    options.compute_confidence_intervals.value = compute_confidence_intervals
    options.k_anonymization_count.value = k_anonymization_count
    if not write_config:
      options.disabled_outputs.values.append(_EVAL_CONFIG_FILE)
    eval_config = config.EvalConfig(
        model_specs=model_specs, slicing_specs=slicing_specs, options=options)
  else:
    eval_config = config.update_eval_config_with_defaults(eval_config)

  config.verify_eval_config(eval_config)

  if not extractors:
    extractors = default_extractors(
        eval_config=eval_config,
        eval_shared_model=eval_shared_model,
        materialize=False,
        desired_batch_size=desired_batch_size)

  if not evaluators:
    evaluators = default_evaluators(
        eval_config=eval_config,
        eval_shared_model=eval_shared_model,
        random_seed_for_testing=random_seed_for_testing)

  for v in evaluators:
    evaluator.verify_evaluator(v, extractors)

  if not writers:
    writers = default_writers(
        output_path=output_path, eval_shared_model=eval_shared_model)

  # pylint: disable=no-value-for-parameter
  _ = (
      examples
      | 'InputsToExtracts' >> InputsToExtracts()
      | 'ExtractAndEvaluate' >> ExtractAndEvaluate(
          extractors=extractors, evaluators=evaluators)
      | 'WriteResults' >> WriteResults(writers=writers))

  if _EVAL_CONFIG_FILE not in eval_config.options.disabled_outputs.values:
    data_location = '<user provided PCollection>'
    if display_only_data_location is not None:
      data_location = display_only_data_location
    file_format = '<unknown>'
    if display_only_file_format is not None:
      file_format = display_only_file_format
    model_locations = {}
    for k, v in eval_shared_models.items():
      model_locations[k] = ('<unknown>' if v is None or v.model_path is None
                            else v.model_path)
    _ = (
        examples.pipeline
        | WriteEvalConfig(eval_config, output_path, data_location, file_format,
                          model_locations))
  # pylint: enable=no-value-for-parameter

  return beam.pvalue.PDone(examples.pipeline)
    def testEvaluateWithConfidenceIntervals(self):
        # NOTE: This test does not actually test that confidence intervals are
        #   accurate; it only tests that the proto output by the test is well formed.
        #   This test would pass if the confidence interval implementation did
        #   nothing at all except compute the unsampled value.
        temp_export_dir = self._getExportDir()
        _, export_dir = (fixed_prediction_estimator_extra_fields.
                         simple_fixed_prediction_estimator_extra_fields(
                             None, temp_export_dir))
        options = config.Options()
        options.compute_confidence_intervals.value = True
        eval_config = config.EvalConfig(
            model_specs=[
                config.ModelSpec(location=export_dir,
                                 label_key='label',
                                 example_weight_key='fixed_float')
            ],
            slicing_specs=[
                config.SlicingSpec(),
                config.SlicingSpec(feature_keys=['fixed_string']),
            ],
            metrics_specs=metric_specs.specs_from_metrics([
                calibration.MeanLabel('mean_label'),
                calibration.MeanPrediction('mean_prediction')
            ]),
            options=options)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

        slice_spec = [
            slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
        ]
        extractors = [
            input_extractor.InputExtractor(eval_config=eval_config),
            predict_extractor_v2.PredictExtractor(
                eval_config=eval_config,
                eval_shared_models=[eval_shared_model]),
            slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
        ]
        evaluators = [
            metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
                eval_config=eval_config,
                eval_shared_models=[eval_shared_model])
        ]

        # fixed_float used as example_weight key
        examples = [
            self._makeExample(prediction=0.2,
                              label=1.0,
                              fixed_int=1,
                              fixed_float=1.0,
                              fixed_string='fixed_string1'),
            self._makeExample(prediction=0.8,
                              label=0.0,
                              fixed_int=1,
                              fixed_float=1.0,
                              fixed_string='fixed_string1'),
            self._makeExample(prediction=0.5,
                              label=0.0,
                              fixed_int=2,
                              fixed_float=2.0,
                              fixed_string='fixed_string2')
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            metrics = (
                pipeline
                | 'Create' >> beam.Create(
                    [e.SerializeToString() for e in examples])
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'ExtractAndEvaluate' >> model_eval_lib.ExtractAndEvaluate(
                    extractors=extractors, evaluators=evaluators))

            # pylint: enable=no-value-for-parameter

            def check_metrics(got):
                try:
                    self.assertLen(got, 3)
                    slices = {}
                    for slice_key, value in got:
                        slices[slice_key] = value
                    overall_slice = ()
                    fixed_string1_slice = (('fixed_string',
                                            b'fixed_string1'), )
                    fixed_string2_slice = (('fixed_string',
                                            b'fixed_string2'), )
                    self.assertCountEqual(list(slices.keys()), [
                        overall_slice, fixed_string1_slice, fixed_string2_slice
                    ])
                    example_count_key = metric_types.MetricKey(
                        name='example_count')
                    weighted_example_count_key = metric_types.MetricKey(
                        name='weighted_example_count')
                    label_key = metric_types.MetricKey(name='mean_label')
                    pred_key = metric_types.MetricKey(name='mean_prediction')
                    self.assertDictElementsWithTDistributionAlmostEqual(
                        slices[overall_slice], {
                            example_count_key: 3,
                            weighted_example_count_key: 4.0,
                            label_key:
                            (1.0 + 0.0 + 2 * 0.0) / (1.0 + 1.0 + 2.0),
                            pred_key:
                            (0.2 + 0.8 + 2 * 0.5) / (1.0 + 1.0 + 2.0),
                        })
                    self.assertDictElementsWithTDistributionAlmostEqual(
                        slices[fixed_string1_slice], {
                            example_count_key: 2,
                            weighted_example_count_key: 2.0,
                            label_key: (1.0 + 0.0) / (1.0 + 1.0),
                            pred_key: (0.2 + 0.8) / (1.0 + 1.0),
                        })
                    self.assertDictElementsWithTDistributionAlmostEqual(
                        slices[fixed_string2_slice], {
                            example_count_key: 1,
                            weighted_example_count_key: 2.0,
                            label_key: (2 * 0.0) / 2.0,
                            pred_key: (2 * 0.5) / 2.0,
                        })

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics[constants.METRICS_KEY],
                             check_metrics,
                             label='metrics')
def run_model_analysis(
        eval_shared_model: Optional[types.EvalSharedModel] = None,
        eval_shared_models: Optional[List[types.EvalSharedModel]] = None,
        eval_config: Optional[config.EvalConfig] = None,
        extractors: Optional[List[extractor.Extractor]] = None,
        evaluators: Optional[List[evaluator.Evaluator]] = None,
        writers: Optional[List[writer.Writer]] = None,
        pipeline_options: Optional[Any] = None,
        data_location: Optional[Text] = None,
        file_format: Optional[Text] = 'tfrecords',
        slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
        output_path: Optional[Text] = None,
        write_config: Optional[bool] = True,
        desired_batch_size: Optional[int] = None,
        compute_confidence_intervals: Optional[bool] = False,
        k_anonymization_count: int = 1) -> EvalResult:
    """Runs TensorFlow model analysis.

  It runs a Beam pipeline to compute the slicing metrics exported in the
  TensorFlow Eval SavedModel and returns the results.

  This is a simplified API for users who want to quickly get something running
  locally. Users who wish to create their own Beam pipelines can use the
  Evaluate PTransform instead.

  Args:
    eval_shared_model: Shared model (single-model evaluation).
    eval_shared_models: Shared models (multi-model evaluation).
    eval_config: Eval config.
    extractors: Optional list of Extractors to apply to Extracts. Typically
      these will be added by calling the default_extractors function. If no
      extractors are provided, default_extractors (non-materialized) will be
      used.
    evaluators: Optional list of Evaluators for evaluating Extracts. Typically
      these will be added by calling the default_evaluators function. If no
      evaluators are provided, default_evaluators will be used.
    writers: Optional list of Writers for writing Evaluation output. Typically
      these will be added by calling the default_writers function. If no writers
      are provided, default_writers will be used.
    pipeline_options: Optional arguments to run the Pipeline, for instance
      whether to run directly.
    data_location: Deprecated (use EvalConfig).
    file_format: Deprecated (use EvalConfig).
    slice_spec: Deprecated (use EvalConfig).
    output_path: Deprecated (use EvalConfig).
    write_config: Deprecated (use EvalConfig).
    desired_batch_size: Deprecated (use EvalConfig).
    compute_confidence_intervals: Deprecated (use EvalConfig).
    k_anonymization_count: Deprecated (use EvalConfig).

  Returns:
    An EvalResult that can be used with the TFMA visualization functions.

  Raises:
    ValueError: If the file_format is unknown to us.
  """
    _assert_tensorflow_version()

    if eval_shared_model is not None:
        eval_shared_models = [eval_shared_model]

    if eval_config is None:
        if output_path is None:
            output_path = tempfile.mkdtemp()
        if not tf.io.gfile.exists(output_path):
            tf.io.gfile.makedirs(output_path)
        disabled_outputs = None
        if not write_config:
            disabled_outputs = [_EVAL_CONFIG_FILE]
        model_specs = []
        for m in eval_shared_models:
            example_weight_key = m.example_weight_key
            example_weight_keys = {}
            if example_weight_key and isinstance(example_weight_key, dict):
                example_weight_keys = example_weight_key
                example_weight_key = ''
            model_specs.append(
                config.ModelSpec(location=m.model_path,
                                 example_weight_key=example_weight_key,
                                 example_weight_keys=example_weight_keys))
        slicing_specs = None
        if slice_spec:
            slicing_specs = [s.to_proto() for s in slice_spec]
        options = config.Options()
        options.compute_confidence_intervals.value = compute_confidence_intervals
        options.k_anonymization_count.value = k_anonymization_count
        if desired_batch_size:
            options.desired_batch_size.value = desired_batch_size
        eval_config = config.EvalConfig(
            input_data_specs=[
                config.InputDataSpec(location=data_location,
                                     file_format=file_format)
            ],
            model_specs=model_specs,
            output_data_specs=[
                config.OutputDataSpec(default_location=output_path,
                                      disabled_outputs=disabled_outputs)
            ],
            slicing_specs=slicing_specs,
            options=options)

    if len(eval_config.input_data_specs) != 1:
        raise NotImplementedError(
            'multiple input_data_specs are not yet supported.')
    if len(eval_config.model_specs) != 1:
        raise NotImplementedError(
            'multiple model_specs are not yet supported.')
    if len(eval_config.output_data_specs) != 1:
        raise NotImplementedError(
            'multiple output_data_specs are not yet supported.')

    with beam.Pipeline(options=pipeline_options) as p:
        if (not eval_config.input_data_specs[0].file_format
                or eval_config.input_data_specs[0].file_format == 'tfrecords'):
            data = p | 'ReadFromTFRecord' >> beam.io.ReadFromTFRecord(
                file_pattern=eval_config.input_data_specs[0].location,
                compression_type=beam.io.filesystem.CompressionTypes.AUTO)
        elif eval_config.input_data_specs[0].file_format == 'text':
            data = p | 'ReadFromText' >> beam.io.textio.ReadFromText(
                eval_config.input_data_specs[0].location)
        else:
            raise ValueError('unknown file_format: {}'.format(
                eval_config.input_data_specs[0].file_format))

        # pylint: disable=no-value-for-parameter
        _ = (
            data
            | 'ExtractEvaluateAndWriteResults' >>
            ExtractEvaluateAndWriteResults(
                eval_config=eval_config,
                eval_shared_models=eval_shared_models,
                extractors=extractors,
                evaluators=evaluators,
                writers=writers))
        # pylint: enable=no-value-for-parameter

    # TODO(b/141016373): Add support for multiple models.
    return load_eval_result(eval_config.output_data_specs[0].default_location)
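
A minimal usage sketch for this version of run_model_analysis, mirroring the tests above; the paths are hypothetical placeholders, and default_eval_shared_model is assumed to be available from the same module:

# Hypothetical locations of an EvalSavedModel export and a TFRecord of
# tf.Examples; replace with real paths.
eval_config = config.EvalConfig(
    input_data_specs=[config.InputDataSpec(location='/path/to/data')],
    model_specs=[config.ModelSpec(location='/path/to/model')],
    output_data_specs=[
        config.OutputDataSpec(default_location='/tmp/tfma_output')
    ],
    slicing_specs=[config.SlicingSpec(feature_keys=['language'])])
eval_result = run_model_analysis(
    eval_config=eval_config,
    eval_shared_models=[
        default_eval_shared_model(eval_saved_model_path='/path/to/model')
    ])
print(eval_result.slicing_metrics)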
def ExtractEvaluateAndWriteResults(  # pylint: disable=invalid-name
        examples: beam.pvalue.PCollection,
        eval_shared_model: Optional[types.EvalSharedModel] = None,
        eval_shared_models: Optional[List[types.EvalSharedModel]] = None,
        eval_config: Optional[config.EvalConfig] = None,
        extractors: Optional[List[extractor.Extractor]] = None,
        evaluators: Optional[List[evaluator.Evaluator]] = None,
        writers: Optional[List[writer.Writer]] = None,
        output_path: Optional[Text] = None,
        display_only_data_location: Optional[Text] = None,
        slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
        desired_batch_size: Optional[int] = None,
        write_config: Optional[bool] = True,
        compute_confidence_intervals: Optional[bool] = False,
        k_anonymization_count: int = 1) -> beam.pvalue.PDone:
    """PTransform for performing extraction, evaluation, and writing results.

  Users who want to construct their own Beam pipelines instead of using the
  lightweight run_model_analysis functions should use this PTransform.

  Example usage:
    eval_config = tfma.EvalConfig(
        input_data_specs=[tfma.InputDataSpec(location=data_location)],
        model_specs=[tfma.ModelSpec(location=model_location)],
        output_data_specs=[tfma.OutputDataSpec(default_location=output_path)],
        slicing_specs=[...],
        metrics_specs=[...])
    eval_shared_model = tfma.default_eval_shared_model(
        eval_saved_model_path=model_location,
        add_metrics_callbacks=[...])
    with beam.Pipeline(runner=...) as p:
      _ = (p
           | 'ReadData' >> beam.io.ReadFromTFRecord(data_location)
           | 'ExtractEvaluateAndWriteResults' >>
           tfma.ExtractEvaluateAndWriteResults(
               eval_config=eval_config,
               eval_shared_models=[eval_shared_model],
               ...))
    result = tfma.load_eval_result(output_path=output_path)
    tfma.view.render_slicing_metrics(result)

  Note that the exact serialization format is an internal implementation detail
  and subject to change. Users should only use the TFMA functions to write and
  read the results.

  Args:
    examples: PCollection of input examples. Can be any format the model accepts
      (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_shared_model: Shared model (single-model evaluation).
    eval_shared_models: Shared models (multi-model evaluation).
    eval_config: Eval config.
    extractors: Optional list of Extractors to apply to Extracts. Typically
      these will be added by calling the default_extractors function. If no
      extractors are provided, default_extractors (non-materialized) will be
      used.
    evaluators: Optional list of Evaluators for evaluating Extracts. Typically
      these will be added by calling the default_evaluators function. If no
      evaluators are provided, default_evaluators will be used.
    writers: Optional list of Writers for writing Evaluation output. Typically
      these will be added by calling the default_writers function. If no writers
      are provided, default_writers will be used.
    output_path: Deprecated (use EvalConfig).
    display_only_data_location: Deprecated (use EvalConfig).
    slice_spec: Deprecated (use EvalConfig).
    desired_batch_size: Deprecated (use EvalConfig).
    write_config: Deprecated (use EvalConfig).
    compute_confidence_intervals: Deprecated (use EvalConfig).
    k_anonymization_count: Deprecated (use EvalConfig).

  Raises:
    ValueError: If matching Extractor not found for an Evaluator.

  Returns:
    PDone.
  """
    if eval_shared_model is not None:
        eval_shared_models = [eval_shared_model]

    if eval_config is None:
        data_location = '<user provided PCollection>'
        if display_only_data_location is not None:
            data_location = display_only_data_location
        disabled_outputs = None
        if not write_config:
            disabled_outputs = [_EVAL_CONFIG_FILE]
        model_specs = []
        for m in eval_shared_models:
            example_weight_key = m.example_weight_key
            example_weight_keys = {}
            if example_weight_key and isinstance(example_weight_key, dict):
                example_weight_keys = example_weight_key
                example_weight_key = ''
            model_specs.append(
                config.ModelSpec(location=m.model_path,
                                 example_weight_key=example_weight_key,
                                 example_weight_keys=example_weight_keys))
        slicing_specs = None
        if slice_spec:
            slicing_specs = [s.to_proto() for s in slice_spec]
        options = config.Options()
        options.compute_confidence_intervals.value = compute_confidence_intervals
        options.k_anonymization_count.value = k_anonymization_count
        if desired_batch_size:
            options.desired_batch_size.value = desired_batch_size
        eval_config = config.EvalConfig(
            input_data_specs=[config.InputDataSpec(location=data_location)],
            model_specs=model_specs,
            output_data_specs=[
                config.OutputDataSpec(default_location=output_path,
                                      disabled_outputs=disabled_outputs)
            ],
            slicing_specs=slicing_specs,
            options=options)

    if not extractors:
        extractors = default_extractors(eval_config=eval_config,
                                        eval_shared_models=eval_shared_models,
                                        materialize=False)

    if not evaluators:
        evaluators = default_evaluators(eval_config=eval_config,
                                        eval_shared_models=eval_shared_models)

    for v in evaluators:
        evaluator.verify_evaluator(v, extractors)

    if not writers:
        writers = default_writers(eval_config=eval_config,
                                  eval_shared_models=eval_shared_models)

    # pylint: disable=no-value-for-parameter
    _ = (examples
         | 'InputsToExtracts' >> InputsToExtracts()
         | 'ExtractAndEvaluate' >> ExtractAndEvaluate(extractors=extractors,
                                                      evaluators=evaluators)
         | 'WriteResults' >> WriteResults(writers=writers))

    # TODO(b/141016373): Add support for multiple models.
    if (_EVAL_CONFIG_FILE
            not in eval_config.output_data_specs[0].disabled_outputs):
        _ = examples.pipeline | WriteEvalConfig(eval_config)
    # pylint: enable=no-value-for-parameter

    return beam.pvalue.PDone(examples.pipeline)
  def testWriteMetricsAndPlots(self):
    metrics_file = os.path.join(self._getTempDir(), 'metrics')
    plots_file = os.path.join(self._getTempDir(), 'plots')
    temp_eval_export_dir = os.path.join(self._getTempDir(), 'eval_export_dir')

    _, eval_export_dir = (
        fixed_prediction_estimator.simple_fixed_prediction_estimator(
            None, temp_eval_export_dir))
    eval_config = config.EvalConfig(
        model_specs=[config.ModelSpec()],
        options=config.Options(
            disabled_outputs={'values': ['eval_config.json']}))
    eval_shared_model = self.createTestEvalSharedModel(
        eval_saved_model_path=eval_export_dir,
        add_metrics_callbacks=[
            post_export_metrics.example_count(),
            post_export_metrics.calibration_plot_and_prediction_histogram(
                num_buckets=2)
        ])
    extractors = [
        predict_extractor.PredictExtractor(eval_shared_model),
        slice_key_extractor.SliceKeyExtractor()
    ]
    evaluators = [
        metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(eval_shared_model)
    ]
    output_paths = {
        constants.METRICS_KEY: metrics_file,
        constants.PLOTS_KEY: plots_file
    }
    writers = [
        metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
            output_paths, eval_shared_model.add_metrics_callbacks)
    ]

    with beam.Pipeline() as pipeline:
      example1 = self._makeExample(prediction=0.0, label=1.0)
      example2 = self._makeExample(prediction=1.0, label=1.0)

      # pylint: disable=no-value-for-parameter
      _ = (
          pipeline
          | 'Create' >> beam.Create([
              example1.SerializeToString(),
              example2.SerializeToString(),
          ])
          | 'ExtractEvaluateAndWriteResults' >>
          model_eval_lib.ExtractEvaluateAndWriteResults(
              eval_config=eval_config,
              eval_shared_model=eval_shared_model,
              extractors=extractors,
              evaluators=evaluators,
              writers=writers))
      # pylint: enable=no-value-for-parameter

    expected_metrics_for_slice = text_format.Parse(
        """
        slice_key {}
        metrics {
          key: "average_loss"
          value {
            double_value {
              value: 0.5
            }
          }
        }
        metrics {
          key: "post_export_metrics/example_count"
          value {
            double_value {
              value: 2.0
            }
          }
        }
        """, metrics_for_slice_pb2.MetricsForSlice())

    metric_records = []
    for record in tf.compat.v1.python_io.tf_record_iterator(metrics_file):
      metric_records.append(
          metrics_for_slice_pb2.MetricsForSlice.FromString(record))
    self.assertEqual(1, len(metric_records), 'metrics: %s' % metric_records)
    self.assertProtoEquals(expected_metrics_for_slice, metric_records[0])

    expected_plots_for_slice = text_format.Parse(
        """
      slice_key {}
      plots {
        key: "post_export_metrics"
        value {
          calibration_histogram_buckets {
            buckets {
              lower_threshold_inclusive: -inf
              num_weighted_examples {}
              total_weighted_label {}
              total_weighted_refined_prediction {}
            }
            buckets {
              upper_threshold_exclusive: 0.5
              num_weighted_examples {
                value: 1.0
              }
              total_weighted_label {
                value: 1.0
              }
              total_weighted_refined_prediction {}
            }
            buckets {
              lower_threshold_inclusive: 0.5
              upper_threshold_exclusive: 1.0
              num_weighted_examples {
              }
              total_weighted_label {}
              total_weighted_refined_prediction {}
            }
            buckets {
              lower_threshold_inclusive: 1.0
              upper_threshold_exclusive: inf
              num_weighted_examples {
                value: 1.0
              }
              total_weighted_label {
                value: 1.0
              }
              total_weighted_refined_prediction {
                value: 1.0
              }
            }
         }
        }
      }
    """, metrics_for_slice_pb2.PlotsForSlice())

    plot_records = []
    for record in tf.compat.v1.python_io.tf_record_iterator(plots_file):
      plot_records.append(
          metrics_for_slice_pb2.PlotsForSlice.FromString(record))
    self.assertEqual(1, len(plot_records), 'plots: %s' % plot_records)
    self.assertProtoEquals(expected_plots_for_slice, plot_records[0])
  def testWriteValidationResults(self):
    model_dir, baseline_dir = self._getExportDir(), self._getBaselineDir()
    eval_shared_model = self._build_keras_model(model_dir, mul=0)
    baseline_eval_shared_model = self._build_keras_model(baseline_dir, mul=1)
    validations_file = os.path.join(self._getTempDir(),
                                    constants.VALIDATIONS_KEY)
    examples = [
        self._makeExample(
            input=0.0,
            label=1.0,
            example_weight=1.0,
            extra_feature='non_model_feature'),
        self._makeExample(
            input=1.0,
            label=0.0,
            example_weight=0.5,
            extra_feature='non_model_feature'),
    ]

    eval_config = config.EvalConfig(
        model_specs=[
            config.ModelSpec(
                name='candidate',
                label_key='label',
                example_weight_key='example_weight'),
            config.ModelSpec(
                name='baseline',
                label_key='label',
                example_weight_key='example_weight',
                is_baseline=True)
        ],
        slicing_specs=[config.SlicingSpec()],
        metrics_specs=[
            config.MetricsSpec(
                metrics=[
                    config.MetricConfig(
                        class_name='WeightedExampleCount',
                        # Weighted example count is 1.5, violating the upper bound of 1: NOT OK.
                        threshold=config.MetricThreshold(
                            value_threshold=config.GenericValueThreshold(
                                upper_bound={'value': 1}))),
                    config.MetricConfig(
                        class_name='ExampleCount',
                        # Example count is 2, violating the lower bound of 10: NOT OK.
                        threshold=config.MetricThreshold(
                            value_threshold=config.GenericValueThreshold(
                                lower_bound={'value': 10}))),
                    config.MetricConfig(
                        class_name='MeanLabel',
                        # Mean label diff is 0; requires > 0 absolute and > 0% relative change: NOT OK.
                        threshold=config.MetricThreshold(
                            change_threshold=config.GenericChangeThreshold(
                                direction=config.MetricDirection
                                .HIGHER_IS_BETTER,
                                relative={'value': 0},
                                absolute={'value': 0}))),
                    config.MetricConfig(
                        # MeanPrediction = (0+0)/(1+0.5) = 0
                        class_name='MeanPrediction',
                        # -.01 < 0 < .01, OK.
                        # Diff% = -.333/.333 = -100% < -99%, OK.
                        # Diff = 0 - .333 = -.333 < 0, OK.
                        threshold=config.MetricThreshold(
                            value_threshold=config.GenericValueThreshold(
                                upper_bound={'value': .01},
                                lower_bound={'value': -.01}),
                            change_threshold=config.GenericChangeThreshold(
                                direction=config.MetricDirection
                                .LOWER_IS_BETTER,
                                relative={'value': -.99},
                                absolute={'value': 0})))
                ],
                model_names=['candidate', 'baseline']),
        ],
        options=config.Options(
            disabled_outputs={'values': ['eval_config.json']}),
    )
    slice_spec = [
        slicer.SingleSliceSpec(spec=s) for s in eval_config.slicing_specs
    ]
    eval_shared_models = {
        'candidate': eval_shared_model,
        'baseline': baseline_eval_shared_model
    }
    extractors = [
        input_extractor.InputExtractor(eval_config),
        predict_extractor_v2.PredictExtractor(
            eval_shared_model=eval_shared_models, eval_config=eval_config),
        slice_key_extractor.SliceKeyExtractor(slice_spec=slice_spec)
    ]
    evaluators = [
        metrics_and_plots_evaluator_v2.MetricsAndPlotsEvaluator(
            eval_config=eval_config, eval_shared_model=eval_shared_models)
    ]
    output_paths = {
        constants.VALIDATIONS_KEY: validations_file,
    }
    writers = [
        metrics_plots_and_validations_writer.MetricsPlotsAndValidationsWriter(
            output_paths, add_metrics_callbacks=[])
    ]

    with beam.Pipeline() as pipeline:

      # pylint: disable=no-value-for-parameter
      _ = (
          pipeline
          | 'Create' >> beam.Create([e.SerializeToString() for e in examples])
          | 'ExtractEvaluateAndWriteResults' >>
          model_eval_lib.ExtractEvaluateAndWriteResults(
              eval_config=eval_config,
              eval_shared_model=eval_shared_model,
              extractors=extractors,
              evaluators=evaluators,
              writers=writers))
      # pylint: enable=no-value-for-parameter

    validation_result = model_eval_lib.load_validation_result(
        os.path.dirname(validations_file))

    expected_validations = [
        text_format.Parse(
            """
            metric_key {
              name: "weighted_example_count"
              model_name: "candidate"
            }
            metric_threshold {
              value_threshold {
                upper_bound {
                  value: 1.0
                }
              }
            }
            metric_value {
              double_value {
                value: 1.5
              }
            }
            """, validation_result_pb2.ValidationFailure()),
        text_format.Parse(
            """
            metric_key {
              name: "example_count"
            }
            metric_threshold {
              value_threshold {
                lower_bound {
                  value: 10.0
                }
              }
            }
            metric_value {
              double_value {
                value: 2.0
              }
            }
            """, validation_result_pb2.ValidationFailure()),
        text_format.Parse(
            """
            metric_key {
              name: "mean_label"
              model_name: "candidate"
              is_diff: true
            }
            metric_threshold {
              change_threshold {
                absolute {
                  value: 0.0
                }
                relative {
                  value: 0.0
                }
                direction: HIGHER_IS_BETTER
              }
            }
            metric_value {
              double_value {
                value: 0.0
              }
            }
            """, validation_result_pb2.ValidationFailure()),
    ]
    self.assertFalse(validation_result.validation_ok)
    self.assertLen(validation_result.metric_validations_per_slice, 1)
    self.assertCountEqual(
        expected_validations,
        validation_result.metric_validations_per_slice[0].failures)
Example #18
0
def run_model_analysis(
    eval_shared_model: Optional[Union[types.EvalSharedModel,
                                      Dict[Text,
                                           types.EvalSharedModel]]] = None,
    eval_config: Optional[config.EvalConfig] = None,
    data_location: Text = '',
    file_format: Text = 'tfrecords',
    output_path: Optional[Text] = None,
    extractors: Optional[List[extractor.Extractor]] = None,
    evaluators: Optional[List[evaluator.Evaluator]] = None,
    writers: Optional[List[writer.Writer]] = None,
    pipeline_options: Optional[Any] = None,
    slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
    write_config: Optional[bool] = True,
    compute_confidence_intervals: Optional[bool] = False,
    k_anonymization_count: int = 1,
    desired_batch_size: Optional[int] = None,
    random_seed_for_testing: Optional[int] = None
) -> Union[EvalResult, EvalResults]:
  """Runs TensorFlow model analysis.

  It runs a Beam pipeline to compute the slicing metrics exported in the
  TensorFlow Eval SavedModel and returns the results.

  This is a simplified API for users who want to quickly get something running
  locally. Users who wish to create their own Beam pipelines can use the
  Evaluate PTransform instead.

  Args:
    eval_shared_model: Optional shared model (single-model evaluation) or dict
      of shared models keyed by model name (multi-model evaluation). Only
      required if needed by default extractors, evaluators, or writers.
    eval_config: Eval config.
    data_location: The location of the data files.
    file_format: The file format of the data; can be either 'text' or
      'tfrecords' for now. Defaults to 'tfrecords'.
    output_path: The directory to output metrics and results to. If None, we use
      a temporary directory.
    extractors: Optional list of Extractors to apply to Extracts. Typically
      these will be added by calling the default_extractors function. If no
      extractors are provided, default_extractors (non-materialized) will be
      used.
    evaluators: Optional list of Evaluators for evaluating Extracts. Typically
      these will be added by calling the default_evaluators function. If no
      evaluators are provided, default_evaluators will be used.
    writers: Optional list of Writers for writing Evaluation output. Typically
      these will be added by calling the default_writers function. If no writers
      are provided, default_writers will be used.
    pipeline_options: Optional arguments to run the Pipeline, for instance
      whether to run directly.
    slice_spec: Deprecated (use EvalConfig).
    write_config: Deprecated (use EvalConfig).
    compute_confidence_intervals: Deprecated (use EvalConfig).
    k_anonymization_count: Deprecated (use EvalConfig).
    desired_batch_size: Optional batch size for batching in Predict.
    random_seed_for_testing: Provide for deterministic tests only.

  Returns:
    An EvalResult that can be used with the TFMA visualization functions, or an
    EvalResults (one EvalResult per model) when multiple models are evaluated.

  Raises:
    ValueError: If the file_format is unknown to us.
  """
  _assert_tensorflow_version()

  if output_path is None:
    output_path = tempfile.mkdtemp()
  if not tf.io.gfile.exists(output_path):
    tf.io.gfile.makedirs(output_path)

  if eval_config is None:
    model_specs = []
    eval_shared_models = eval_shared_model
    if not isinstance(eval_shared_model, dict):
      eval_shared_models = {'': eval_shared_model}
    for model_name, shared_model in eval_shared_models.items():
      example_weight_key = shared_model.example_weight_key
      example_weight_keys = {}
      if example_weight_key and isinstance(example_weight_key, dict):
        example_weight_keys = example_weight_key
        example_weight_key = ''
      model_specs.append(
          config.ModelSpec(
              name=model_name,
              example_weight_key=example_weight_key,
              example_weight_keys=example_weight_keys))
    slicing_specs = None
    if slice_spec:
      slicing_specs = [s.to_proto() for s in slice_spec]
    options = config.Options()
    options.compute_confidence_intervals.value = compute_confidence_intervals
    options.k_anonymization_count.value = k_anonymization_count
    if not write_config:
      options.disabled_outputs.values.append(_EVAL_CONFIG_FILE)
    eval_config = config.EvalConfig(
        model_specs=model_specs, slicing_specs=slicing_specs, options=options)

  with beam.Pipeline(options=pipeline_options) as p:
    if file_format == 'tfrecords':
      data = p | 'ReadFromTFRecord' >> beam.io.ReadFromTFRecord(
          file_pattern=data_location,
          compression_type=beam.io.filesystem.CompressionTypes.AUTO)
    elif file_format == 'text':
      data = p | 'ReadFromText' >> beam.io.textio.ReadFromText(data_location)
    else:
      raise ValueError('unknown file_format: {}'.format(file_format))

    # pylint: disable=no-value-for-parameter
    _ = (
        data
        | 'ExtractEvaluateAndWriteResults' >> ExtractEvaluateAndWriteResults(
            eval_config=eval_config,
            eval_shared_model=eval_shared_model,
            display_only_data_location=data_location,
            display_only_file_format=file_format,
            output_path=output_path,
            extractors=extractors,
            evaluators=evaluators,
            writers=writers,
            desired_batch_size=desired_batch_size,
            random_seed_for_testing=random_seed_for_testing))
    # pylint: enable=no-value-for-parameter

  if len(eval_config.model_specs) <= 1:
    return load_eval_result(output_path)
  else:
    results = []
    for spec in eval_config.model_specs:
      results.append(load_eval_result(output_path, model_name=spec.name))
    return EvalResults(results, constants.MODEL_CENTRIC_MODE)
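
For reference, here is a minimal usage sketch of run_model_analysis. It is not part of the snippet above: the paths and the 'country' feature key are placeholders, and it assumes the public tfma aliases (default_eval_shared_model, run_model_analysis) and an Eval SavedModel already exported to the given directory.

import tensorflow_model_analysis as tfma
from tensorflow_model_analysis import config

# Handle to an exported Eval SavedModel (placeholder path).
eval_shared_model = tfma.default_eval_shared_model(
    eval_saved_model_path='/path/to/eval_saved_model')

# Compute metrics overall and sliced by a hypothetical 'country' feature.
eval_config = config.EvalConfig(slicing_specs=[
    config.SlicingSpec(),
    config.SlicingSpec(feature_keys=['country']),
])

# Runs the Beam pipeline and returns an EvalResult for visualization.
eval_result = tfma.run_model_analysis(
    eval_shared_model=eval_shared_model,
    eval_config=eval_config,
    data_location='/path/to/examples*.tfrecord',
    file_format='tfrecords',
    output_path='/path/to/output')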