Example #1
    def testSliceOnMetaFeature(self):
        # We want to make sure that slicing on the newly added feature works, so
        # pulling in slice here.
        with beam.Pipeline() as pipeline:
            fpls = create_fpls()
            metrics = (
                pipeline
                | 'CreateTestInput' >> beam.Create(fpls)
                | 'WrapFpls' >> beam.Map(wrap_fpl)
                | 'ExtractInterestsNum' >>
                meta_feature_extractor.ExtractMetaFeature(get_num_interests)
                | 'ExtractSlices' >> slice_key_extractor._ExtractSliceKeys([
                    slicer.SingleSliceSpec(),
                    slicer.SingleSliceSpec(columns=['num_interests'])
                ])
                | 'FanoutSlices' >> slicer.FanoutSlices())

            def check_result(got):
                try:
                    self.assertEqual(4, len(got), 'got: %s' % got)
                    expected_slice_keys = [
                        (),
                        (),
                        (('num_interests', 1), ),
                        (('num_interests', 2), ),
                    ]
                    self.assertEqual(sorted(slice_key for slice_key, _ in got),
                                     sorted(expected_slice_keys))
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics, check_result)
Example #2
    def testSliceOneSlice(self):
        with beam.Pipeline() as pipeline:
            fpls = create_fpls()
            metrics = (
                pipeline
                | 'CreateTestInput' >> beam.Create(fpls)
                | 'WrapFpls' >> beam.Map(wrap_fpl)
                | 'ExtractSlices' >> slice_key_extractor._ExtractSliceKeys([
                    slicer.SingleSliceSpec(),
                    slicer.SingleSliceSpec(columns=['gender'])
                ])
                | 'FanoutSlices' >> slicer.FanoutSlices())

            def check_result(got):
                try:
                    self.assertEqual(4, len(got), 'got: %s' % got)
                    expected_result = [
                        ((), wrap_fpl(fpls[0])),
                        ((), wrap_fpl(fpls[1])),
                        ((('gender', 'f'), ), wrap_fpl(fpls[0])),
                        ((('gender', 'm'), ), wrap_fpl(fpls[1])),
                    ]
                    self.assertEqual(
                        sorted(got, key=lambda x: x[0]),
                        sorted(expected_result, key=lambda x: x[0]))
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics, check_result)
Example #3
  def testMaterializedSliceKeys(self):
    with beam.Pipeline() as pipeline:
      fpls = create_fpls()
      slice_keys_extracts = (
          pipeline
          | 'CreateTestInput' >> beam.Create(fpls)
          | 'WrapFpls' >> beam.Map(wrap_fpl)
          | 'ExtractSlices' >> slice_key_extractor._ExtractSliceKeys(
              [
                  slicer.SingleSliceSpec(),
                  slicer.SingleSliceSpec(columns=['gender'])
              ],
              materialize=True))

      def check_result(got):
        try:
          self.assertEqual(2, len(got), 'got: %s' % got)
          expected_results = sorted([
              types.MaterializedColumn(
                  name=constants.SLICE_KEYS_KEY,
                  value=[b'Overall', b'gender:f']),
              types.MaterializedColumn(
                  name=constants.SLICE_KEYS_KEY,
                  value=[b'Overall', b'gender:m'])
          ])
          got_results = []
          for item in got:
            self.assertTrue(constants.SLICE_KEYS_KEY in item)
            got_results.append(item[constants.SLICE_KEYS_KEY])
          self.assertEqual(sorted(got_results), sorted(expected_results))
        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(slice_keys_extracts, check_result)
Example #4
  def testSliceKeys(self):
    with beam.Pipeline() as pipeline:
      fpls = create_fpls()
      slice_keys_extracts = (
          pipeline
          | 'CreateTestInput' >> beam.Create(fpls)
          | 'WrapFpls' >> beam.Map(wrap_fpl)
          | 'ExtractSlices' >> slice_key_extractor._ExtractSliceKeys([
              slicer.SingleSliceSpec(),
              slicer.SingleSliceSpec(columns=['gender'])
          ]))

      def check_result(got):
        try:
          self.assertEqual(2, len(got), 'got: %s' % got)
          expected_results = sorted([[(), (('gender', 'f'),)],
                                     [(), (('gender', 'm'),)]])
          got_results = []
          for item in got:
            self.assertTrue(constants.SLICE_KEY_TYPES_KEY in item)
            got_results.append(sorted(item[constants.SLICE_KEY_TYPES_KEY]))
          self.assertEqual(sorted(got_results), sorted(expected_results))
        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(slice_keys_extracts, check_result)
Example #5
    def testSerializeDeserializeToFile(self):
        metrics_slice_key1 = (('fruit', 'pear'), ('animal', 'duck'))
        metrics1 = {
            'alpha': np.array([0.1]),
            'bravo': np.array([0.2]),
            'charlie': np.float32(0.3)
        }
        expected_metrics1 = {
            'alpha': [0.1],
            'bravo': [0.2],
            'charlie': 0.3,
        }
        plots_slice_key1 = (('fruit', 'peach'), ('animal', 'cow'))
        plots1 = {
            'alpha': np.array([0.5, 0.6, 0.7]),
            'bravo': np.array([0.6, 0.7, 0.8]),
            'charlie': np.float32(0.7)
        }
        expected_plots1 = {
            'alpha': [0.5, 0.6, 0.7],
            'bravo': [0.6, 0.7, 0.8],
            'charlie': 0.7,
        }
        eval_config = api_types.EvalConfig(
            model_location='/path/to/model',
            data_location='/path/to/data',
            slice_spec=[
                slicer.SingleSliceSpec(features=[('age', 5), ('gender', 'f')],
                                       columns=['country']),
                slicer.SingleSliceSpec(features=[('age', 6), ('gender', 'm')],
                                       columns=['interest'])
            ],
            example_weight_metric_key='key')

        output_path = self._getTempDir()
        with beam.Pipeline() as pipeline:
            metrics = (pipeline
                       | 'CreateMetrics' >> beam.Create(
                           [(metrics_slice_key1, metrics1)]))
            plots = (
                pipeline
                | 'CreatePlots' >> beam.Create([(plots_slice_key1, plots1)]))

            _ = ((metrics, plots)
                 | 'WriteMetricsPlotsAndConfig' >>
                 serialization.WriteMetricsPlotsAndConfig(
                     output_path=output_path, eval_config=eval_config))

        metrics, plots = serialization.load_plots_and_metrics(output_path)
        self.assertSliceMetricsListEqual(
            [(metrics_slice_key1, expected_metrics1)], metrics)
        self.assertSliceMetricsListEqual([(plots_slice_key1, expected_plots1)],
                                         plots)

        got_eval_config = serialization.load_eval_config(output_path)
        self.assertEqual(eval_config, got_eval_config)
Example #6
def run_analysis(output_dir, model_dir, eval_path, schema, project, mode,
                 slice_columns):
    if mode == 'local':
        pipeline_options = None
        runner = 'DirectRunner'
    elif mode == 'cloud':
        tmp_location = os.path.join(output_dir, 'tmp')
        options = {
            'job_name': ('pipeline-tfma-' +
                         datetime.datetime.now().strftime('%y%m%d-%H%M%S')),
            'setup_file': './analysis/setup.py',
            'project': project,
            'temp_location': tmp_location,
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        runner = 'DataflowRunner'
    else:
        raise ValueError("Invalid mode %s." % mode)

    column_names = [x['name'] for x in schema]
    for slice_column in slice_columns:
        if slice_column not in column_names:
            raise ValueError("Unknown slice column: %s" % slice_column)

    slice_spec = [
        # An empty spec is required for the 'Overall' slice.
        slicer.SingleSliceSpec(),
        slicer.SingleSliceSpec(columns=slice_columns)
    ]

    with beam.Pipeline(runner=runner, options=pipeline_options) as pipeline:
        raw_feature_spec = get_raw_feature_spec(schema)
        raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
        example_coder = tft_coders.example_proto_coder.ExampleProtoCoder(
            raw_schema)
        csv_coder = tft_coders.CsvCoder(column_names, raw_schema)

        raw_data = (
            pipeline
            | 'ReadFromText' >> beam.io.ReadFromText(eval_path)
            | 'ParseCSV' >> beam.Map(csv_coder.decode)
            | 'CleanData' >> beam.Map(clean_raw_data_dict(raw_feature_spec))
            | 'ToSerializedTFExample' >> beam.Map(example_coder.encode)
            | 'EvaluateAndWriteResults' >> tfma.EvaluateAndWriteResults(
                eval_saved_model_path=model_dir,
                slice_spec=slice_spec,
                output_path=output_dir))
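
A hypothetical invocation of run_analysis in 'local' mode is sketched below; every path is a placeholder, and the minimal schema (a list of dicts with a 'name' key, which is all the column-name check above relies on) is illustrative rather than taken from a real project.

# Hypothetical local run; replace paths, schema, and slice columns as needed.
run_analysis(
    output_dir='/tmp/tfma_output',
    model_dir='/path/to/eval_saved_model',
    eval_path='/path/to/eval_data.csv',
    schema=[{'name': 'age'}, {'name': 'language'}, {'name': 'label'}],
    project=None,  # Only used when mode='cloud'.
    mode='local',
    slice_columns=['language'])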
Example #7
    def testBuildAnalysisTableWithSlices(self):
        model_location = self._exportEvalSavedModel(
            linear_classifier.simple_linear_classifier)
        eval_shared_model = model_eval_lib.default_eval_shared_model(
            eval_saved_model_path=model_location)

        example1 = self._makeExample(age=3.0,
                                     language='english',
                                     label=1.0,
                                     slice_key='first_slice')
        slice_spec = [
            slicer.SingleSliceSpec(columns=['age']),
            slicer.SingleSliceSpec(features=[('age', 3)]),
            slicer.SingleSliceSpec(columns=['age'],
                                   features=[('language', 'english')])
        ]

        with beam.Pipeline() as pipeline:
            result = (
                pipeline
                | 'CreateInput' >> beam.Create([example1.SerializeToString()])
                | 'BuildTable' >> contrib.BuildAnalysisTable(
                    eval_shared_model, slice_spec))

            def check_result(got):
                self.assertEqual(1, len(got), 'got: %s' % got)
                extracts = got[0]

                # Values of type MaterializedColumn are emitted to signal to
                # downstream sink components to output the data to file.
                materialized_dict = dict(
                    (k, v) for k, v in extracts.items()
                    if isinstance(v, types.MaterializedColumn))
                self._assertMaterializedColumns(
                    materialized_dict, {
                        constants.SLICE_KEYS_KEY:
                        types.MaterializedColumn(
                            name=constants.SLICE_KEYS_KEY,
                            value=[
                                b'age:3.0', b'age:3',
                                b'age_X_language:3.0_X_english'
                            ])
                    })
                self._assertMaterializedColumnsExist(materialized_dict, [
                    'predictions__logits', 'predictions__probabilities',
                    'predictions__classes', 'predictions__logistic',
                    'predictions__class_ids'
                ])

            util.assert_that(result[constants.ANALYSIS_KEY], check_result)
Example #8
 def testSerializeDeserializeEvalConfig(self):
   eval_config = model_eval_lib.EvalConfig(
       model_location='/path/to/model',
       data_location='/path/to/data',
       slice_spec=[
           slicer.SingleSliceSpec(
               features=[('age', 5), ('gender', 'f')], columns=['country']),
           slicer.SingleSliceSpec(
               features=[('age', 6), ('gender', 'm')], columns=['interest'])
       ],
       example_weight_metric_key='key')
   serialized = model_eval_lib._serialize_eval_config(eval_config)
   deserialized = pickle.loads(serialized)
   got_eval_config = deserialized[model_eval_lib._EVAL_CONFIG_KEY]
   self.assertEqual(eval_config, got_eval_config)
Example #9
def BuildDiagnosticTable(  # pylint: disable=invalid-name
    examples,
    eval_saved_model_path,
    slice_spec = None,
    desired_batch_size = None):
  """Public API version of evaluate.BuildDiagnosticTable.

  Use this function to build an example-oriented PCollection containing, for
  each example, an ExampleAndExtracts, useful for debugging models.

  Args:
    examples: PCollection of input examples. Can be any format the model accepts
      (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_saved_model_path: Path to EvalSavedModel. This directory should contain
      the saved_model.pb file.
    slice_spec: Optional list of SingleSliceSpec specifying the slices to slice
      the data into. If None, defaults to the overall slice.
    desired_batch_size: Optional batch size for batching in Predict and
      Aggregate.

  Returns:
    beam.PCollection of ExampleAndExtracts. The caller is responsible for
    committing to file for now.
  """
  if slice_spec is None:
    slice_spec = [slicer.SingleSliceSpec()]
  return (examples
          | 'BuildDiagnosticTable' >> evaluate.BuildDiagnosticTable(
              eval_saved_model_path, slice_spec, desired_batch_size))
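
A minimal usage sketch follows, assuming serialized tf.Examples on disk and the same imports as the surrounding examples; all paths are placeholders.

import apache_beam as beam

with beam.Pipeline() as pipeline:
  # Read serialized tf.Examples and build the diagnostic table for them.
  examples = (
      pipeline
      | 'ReadExamples' >> beam.io.ReadFromTFRecord(
          '/path/to/eval_examples.tfrecord'))
  _ = BuildDiagnosticTable(
      examples,
      eval_saved_model_path='/path/to/eval_saved_model',
      slice_spec=[slicer.SingleSliceSpec(columns=['language'])])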
Example #10
  def testSliceDefaultSlice(self):
    with beam.Pipeline() as pipeline:
      fpls = create_fpls()

      metrics = (
          pipeline
          | 'CreateTestInput' >> beam.Create(fpls)
          | 'WrapFpls' >> beam.Map(wrap_fpl)
          | 'ExtractSlices' >> slice_key_extractor._ExtractSliceKeys(
              [slicer.SingleSliceSpec()])
          | 'FanoutSlices' >> slicer.FanoutSlices())

      def check_result(got):
        try:
          self.assertEqual(2, len(got), 'got: %s' % got)
          expected_result = [
              ((), wrap_fpl(fpls[0])),
              ((), wrap_fpl(fpls[1])),
          ]
          self.assertEqual(len(got), len(expected_result))
          self.assertTrue(
              got[0] == expected_result[0] and got[1] == expected_result[1] or
              got[1] == expected_result[0] and got[0] == expected_result[1])
        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(metrics, check_result)
Example #11
def default_extractors(  # pylint: disable=invalid-name
        eval_shared_model: Optional[types.EvalSharedModel] = None,
        eval_shared_models: Optional[List[types.EvalSharedModel]] = None,
        eval_config: config.EvalConfig = None,
        slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
        desired_batch_size: Optional[int] = None,
        materialize: Optional[bool] = True) -> List[extractor.Extractor]:
    """Returns the default extractors for use in ExtractAndEvaluate.

  Args:
    eval_shared_model: Shared model (single-model evaluation).
    eval_shared_models: Shared models (multi-model evaluation).
    eval_config: Eval config.
    slice_spec: Deprecated (use EvalConfig).
    desired_batch_size: Deprecated (use EvalConfig).
    materialize: True to have extractors create materialized output.
  """
    # TODO(b/141016373): Add support for multiple models.
    if eval_config is not None:
        slice_spec = [
            slicer.SingleSliceSpec(spec=spec)
            for spec in eval_config.slicing_specs
        ]
        desired_batch_size = eval_config.desired_batch_size
    if eval_shared_model is not None:
        eval_shared_models = [eval_shared_model]
    return [
        predict_extractor.PredictExtractor(eval_shared_models[0],
                                           desired_batch_size,
                                           materialize=materialize),
        slice_key_extractor.SliceKeyExtractor(slice_spec,
                                              materialize=materialize)
    ]
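
For context, a sketch of how these defaults are typically requested; the model path is a placeholder and model_eval_lib.default_eval_shared_model is assumed to be available, as in the other examples in this listing.

# Build a shared model handle, then ask for the default extractor set.
eval_shared_model = model_eval_lib.default_eval_shared_model(
    eval_saved_model_path='/path/to/eval_saved_model')
extractors = default_extractors(
    eval_shared_model=eval_shared_model,
    slice_spec=[slicer.SingleSliceSpec(),
                slicer.SingleSliceSpec(columns=['gender'])],
    materialize=False)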
Example #12
 def testMultipleDataAnalysis(self):
   model_location = self._exportEvalSavedModel(
       linear_classifier.simple_linear_classifier)
   data_location_1 = self._writeTFExamplesToTFRecords([
       self._makeExample(age=3.0, language='english', label=1.0),
       self._makeExample(age=3.0, language='english', label=0.0),
       self._makeExample(age=5.0, language='chinese', label=1.0)
   ])
   data_location_2 = self._writeTFExamplesToTFRecords(
       [self._makeExample(age=4.0, language='english', label=1.0)])
   eval_results = model_eval_lib.multiple_data_analysis(
       model_location, [data_location_1, data_location_2],
       slice_spec=[slicer.SingleSliceSpec(features=[('language', 'english')])])
   self.assertEqual(2, len(eval_results._results))
   # We only check some of the metrics to ensure that the end-to-end
   # pipeline works.
   expected_result_1 = {
       (('language', 'english'),): {
           metric_keys.EXAMPLE_COUNT: {
               'doubleValue': 2.0
           },
       }
   }
   expected_result_2 = {
       (('language', 'english'),): {
           metric_keys.EXAMPLE_COUNT: {
               'doubleValue': 1.0
           },
       }
   }
   self.assertMetricsAlmostEqual(eval_results._results[0].slicing_metrics,
                                 expected_result_1)
   self.assertMetricsAlmostEqual(eval_results._results[1].slicing_metrics,
                                 expected_result_2)
Example #13
  def testIsSliceApplicable(self):
    test_cases = [
        ('applicable', ['column1'],
         [('column3', 'value3'), ('column4', 'value4')],
         (('column1', 'value1'), ('column3', 'value3'), ('column4', 'value4')),
         True),
        ('wrongcolumns', ['column1', 'column2'],
         [('column3', 'value3'), ('column4', 'value4')],
         (('column1', 'value1'), ('column3', 'value3'), ('column4', 'value4')),
         False),
        ('wrongfeatures', ['column1'], [('column3', 'value3')],
         (('column1', 'value1'), ('column3', 'value3'), ('column4', 'value4')),
         False),
        ('nocolumns', [], [('column3', 'value3')],
         (('column1', 'value1'), ('column3', 'value3'), ('column4', 'value4')),
         False),
        ('nofeatures', ['column1'], [], (('column1', 'value1'),), True),
        ('empty slice key', ['column1'], [('column2', 'value1')], (), False),
        ('overall', [], [], (), True)
    ]  # pyformat: disable

    for (name, columns, features, slice_key, result) in test_cases:
      slice_spec = slicer.SingleSliceSpec(columns=columns, features=features)
      self.assertEqual(
          slice_spec.is_slice_applicable(slice_key), result, msg=name)
Example #14
 def assertSliceResult(self, name, features_dict, columns, features, expected):
   spec = slicer.SingleSliceSpec(columns=columns, features=features)
   msg = 'Test case %s: slice on columns %s, features %s' % (name, columns,
                                                             features)
   six.assertCountEqual(
       self, expected,
       slicer.get_slices_for_features_dict(features_dict, [spec]), msg)
Example #15
def SliceKeyExtractor(
        slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
        materialize: Optional[bool] = True) -> extractor.Extractor:
    """Creates an extractor for extracting slice keys.

  The incoming Extracts must contain a FeaturesPredictionsLabels extract keyed
  by tfma.FEATURES_PREDICTIONS_LABELS_KEY. Typically this will be obtained by
  calling the PredictExtractor.

  The extractor's PTransform yields a copy of the Extracts input with an
  additional extract pointing at the list of SliceKeyType values keyed by
  tfma.SLICE_KEY_TYPES_KEY. If materialize is True then a materialized version
  of the slice keys will be added under the key tfma.MATERIALIZED_SLICE_KEYS_KEY.

  Args:
    slice_spec: Optional list of SingleSliceSpec specifying the slices to slice
      the data into. If None, defaults to the overall slice.
    materialize: True to add MaterializedColumn entries for the slice keys.

  Returns:
    Extractor for slice keys.
  """
    if slice_spec is None:
        slice_spec = [slicer.SingleSliceSpec()]
    return extractor.Extractor(stage_name=SLICE_KEY_EXTRACTOR_STAGE_NAME,
                               ptransform=_ExtractSliceKeys(
                                   slice_spec, materialize))
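
As the docstring notes, this extractor is normally paired with a predict extractor so the FeaturesPredictionsLabels extract exists before slicing. A short sketch (eval_shared_model is assumed to come from default_eval_shared_model, as in the tests elsewhere in this listing):

# Predict first, then attach slice keys for the overall and 'language' slices.
extractors = [
    predict_extractor.PredictExtractor(eval_shared_model),
    SliceKeyExtractor(
        slice_spec=[slicer.SingleSliceSpec(),
                    slicer.SingleSliceSpec(columns=['language'])],
        materialize=True),
]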
Example #16
 def testRunModelAnalysisWithUncertainty(self):
     model_location = self._exportEvalSavedModel(
         linear_classifier.simple_linear_classifier)
     examples = [
         self._makeExample(age=3.0, language='english', label=1.0),
         self._makeExample(age=3.0, language='chinese', label=0.0),
         self._makeExample(age=4.0, language='english', label=1.0),
         self._makeExample(age=5.0, language='chinese', label=1.0)
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)
     slice_spec = [slicer.SingleSliceSpec(columns=['language'])]
     eval_result = model_eval_lib.run_model_analysis(
         model_eval_lib.default_eval_shared_model(
             eval_saved_model_path=model_location,
             example_weight_key='age'),
         data_location,
         slice_spec=slice_spec,
         num_bootstrap_samples=20)
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     expected = {
         ((b'language', b'chinese'), ): {
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 8.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 2.0
             },
         },
         ((b'language', b'english'), ): {
             'accuracy': {
                 'boundedValue': {
                     'value': 1.0,
                     'lowerBound': 1.0,
                     'upperBound': 1.0,
                     'methodology': 'POISSON_BOOTSTRAP'
                 }
             },
             'my_mean_label': {
                 'boundedValue': {
                     'value': 1.0,
                     'lowerBound': 1.0,
                     'upperBound': 1.0,
                     'methodology': 'POISSON_BOOTSTRAP'
                 }
             },
             metric_keys.EXAMPLE_WEIGHT: {
                 'doubleValue': 7.0
             },
             metric_keys.EXAMPLE_COUNT: {
                 'doubleValue': 2.0
             },
         }
     }
     self.assertEqual(eval_result.config.model_location, model_location)
     self.assertEqual(eval_result.config.data_location, data_location)
     self.assertEqual(eval_result.config.slice_spec, slice_spec)
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
     self.assertFalse(eval_result.plots)
Example #17
def get_slicing_metrics(
    results,
    slicing_column = None,
    slicing_spec = None,
):
  """Util function that extracts slicing metrics from the results.

  If neither slicing_column nor slicing_spec is provided, get Overall. If
  slicing_column is set, use it to filter metrics from results. Otherwise, use
  slicing_spec for filtering.

  Args:
    results: A list of records. Each record is a tuple of (slice_name,
      {metric_name, metric_value}).
    slicing_column: The column to filter the results with.
    slicing_spec: The slicer.SingleSliceSpec to filter the results with.

  Returns:
    A list of {slice, metrics}

  Raises:
    ValueError: The provided slicing_column does not exist in results, or more
    than one overall result is found.
  """

  if slicing_column:
    data = find_all_slices(results,
                           slicer.SingleSliceSpec(columns=[slicing_column]))
  elif not slicing_spec:
    data = find_all_slices(results, slicer.SingleSliceSpec())
  else:
    data = find_all_slices(results, slicing_spec)

  slice_count = len(data)
  if not slice_count:
    if slicing_spec is None:
      if not slicing_column:
        slicing_column = slicer.OVERALL_SLICE_NAME
      raise ValueError('No slices found for %s' % slicing_column)
    else:
      raise ValueError('No slices found for %s' % slicing_spec)
  elif not slicing_column and not slicing_spec and slice_count > 1:
    raise ValueError(
        'More than one slice found for %s' % slicer.OVERALL_SLICE_NAME)
  else:
    return data
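
An illustrative call with made-up records in the (slice_key, metrics) shape the docstring describes:

# Hypothetical results: one overall slice and two 'gender' slices.
results = [
    ((), {'accuracy': 0.92}),
    ((('gender', 'f'),), {'accuracy': 0.88}),
    ((('gender', 'm'),), {'accuracy': 0.95}),
]
overall = get_slicing_metrics(results)  # The single 'Overall' slice.
by_gender = get_slicing_metrics(results, slicing_column='gender')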
Example #18
 def testSerializeDeserializeLegacyEvalConfig(self):
     output_path = self._getTempDir()
     old_config = LegacyConfig(
         model_location='/path/to/model',
         data_location='/path/to/data',
         slice_spec=[
             slicer.SingleSliceSpec(columns=['country'],
                                    features=[('age', 5), ('gender', 'f')]),
             slicer.SingleSliceSpec(columns=['interest'],
                                    features=[('age', 6), ('gender', 'm')])
         ],
         example_count_metric_key=None,
         example_weight_metric_key='key',
         compute_confidence_intervals=False,
         k_anonymization_count=1)
     final_dict = {}
     final_dict['tfma_version'] = tfma_version.VERSION_STRING
     final_dict['eval_config'] = old_config
     with tf.io.TFRecordWriter(os.path.join(output_path,
                                            'eval_config')) as w:
         w.write(pickle.dumps(final_dict))
     got_eval_config = model_eval_lib.load_eval_config(output_path)
     eval_config = config.EvalConfig(
         input_data_specs=[
             config.InputDataSpec(location=old_config.data_location)
         ],
         model_specs=[config.ModelSpec(location=old_config.model_location)],
         output_data_specs=[
             config.OutputDataSpec(default_location=output_path)
         ],
         slicing_specs=[
             config.SlicingSpec(feature_keys=['country'],
                                feature_values={
                                    'age': '5',
                                    'gender': 'f'
                                }),
             config.SlicingSpec(feature_keys=['interest'],
                                feature_values={
                                    'age': '6',
                                    'gender': 'm'
                                })
         ],
         compute_confidence_intervals=old_config.
         compute_confidence_intervals,
         k_anonymization_count=old_config.k_anonymization_count)
     self.assertEqual(eval_config, got_eval_config)
Example #19
  def testGetSlicesForFeaturesDictMultipleSingleSliceSpecs(self):
    features_dict = self._makeFeaturesDict({
        'gender': ['f'],
        'age': [5],
        'interest': ['cars']
    })

    spec_overall = slicer.SingleSliceSpec()
    spec_age = slicer.SingleSliceSpec(columns=['age'])
    spec_age4 = slicer.SingleSliceSpec(features=[('age', 4)])
    spec_age5_gender = slicer.SingleSliceSpec(
        columns=['gender'], features=[('age', 5)])

    slice_spec = [spec_overall, spec_age, spec_age4, spec_age5_gender]
    expected = [(), (('age', 5),), (('age', 5), ('gender', 'f'))]
    self.assertItemsEqual(
        expected, slicer.get_slices_for_features_dict(features_dict,
                                                      slice_spec))
Example #20
def BuildAnalysisTable(  # pylint: disable=invalid-name
    examples: beam.pvalue.PCollection,
    eval_shared_model: types.EvalSharedModel,
    slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
    desired_batch_size: Optional[int] = None,
    extractors: Optional[List[extractor.Extractor]] = None,
    evaluators: Optional[List[evaluator.Evaluator]] = None
) -> beam.pvalue.PCollection:
    """Builds an analysis table from data extracted from the input.

  Use this function to build an example-oriented PCollection of output data
  useful for debugging models.

  Args:
    examples: PCollection of input examples. Can be any format the model accepts
      (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_shared_model: Shared model parameters for EvalSavedModel.
    slice_spec: Optional list of SingleSliceSpec specifying the slices to slice
      the data into. If None, defaults to the overall slice.
    desired_batch_size: Optional batch size for batching in Predict and
      Aggregate.
    extractors: Optional list of Extractors to execute prior to slicing and
      aggregating the metrics. If not provided, a default set will be run.
    evaluators: Optional list of Evaluators for evaluating Extracts. If not
      provided, a default set will be used.

  Returns:
    beam.pvalue.PCollection of Extracts. The caller is responsible for
    committing to file for now.
  """
    if not slice_spec:
        slice_spec = [slicer.SingleSliceSpec()]

    if not extractors:
        extractors = [
            predict_extractor.PredictExtractor(eval_shared_model,
                                               desired_batch_size),
            feature_extractor.FeatureExtractor(),
            slice_key_extractor.SliceKeyExtractor(slice_spec)
        ]
    if not evaluators:
        evaluators = [analysis_table_evaluator.AnalysisTableEvaluator()]

    # pylint: disable=no-value-for-parameter
    return (examples
            | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
            | model_eval_lib.ExtractAndEvaluate(extractors=extractors,
                                                evaluators=evaluators))
Example #21
def default_extractors(  # pylint: disable=invalid-name
        eval_shared_model: Optional[types.EvalSharedModel] = None,
        eval_shared_models: Optional[List[types.EvalSharedModel]] = None,
        eval_config: config.EvalConfig = None,
        slice_spec: Optional[List[slicer.SingleSliceSpec]] = None,
        desired_batch_size: Optional[int] = None,
        materialize: Optional[bool] = True) -> List[extractor.Extractor]:
    """Returns the default extractors for use in ExtractAndEvaluate.

  Args:
    eval_shared_model: Shared model (single-model evaluation).
    eval_shared_models: Shared models (multi-model evaluation).
    eval_config: Eval config.
    slice_spec: Deprecated (use EvalConfig).
    desired_batch_size: Deprecated (use EvalConfig).
    materialize: True to have extractors create materialized output.
  """
    # TODO(b/141016373): Add support for multiple models.
    if eval_config is not None:
        slice_spec = [
            slicer.SingleSliceSpec(spec=spec)
            for spec in eval_config.slicing_specs
        ]
        desired_batch_size = eval_config.desired_batch_size
    if eval_shared_model is not None:
        eval_shared_models = [eval_shared_model]
    if (not eval_shared_models[0].model_loader.tags
            or eval_shared_models[0].model_loader.tags
            == [eval_constants.EVAL_TAG]):
        # Backwards compatibility for previous EvalSavedModel implementation.
        return [
            predict_extractor.PredictExtractor(eval_shared_models[0],
                                               desired_batch_size,
                                               materialize=materialize),
            slice_key_extractor.SliceKeyExtractor(slice_spec,
                                                  materialize=materialize)
        ]
    else:
        raise NotImplementedError(
            'keras and serving models not implemented yet.')
Example #22
  def testSliceDefaultSlice(self):
    with beam.Pipeline() as pipeline:
      fpls = create_fpls()

      metrics = (
          pipeline
          | 'CreateTestInput' >> beam.Create(fpls)
          | 'WrapFpls' >> beam.Map(wrap_fpl)
          | 'Slice' >> slice_api.Slice([slicer.SingleSliceSpec()]))

      def check_result(got):
        try:
          self.assertEqual(2, len(got), 'got: %s' % got)
          expected_result = [
              ((), fpls[0]),
              ((), fpls[1]),
          ]
          self.assertEqual(sorted(got), sorted(expected_result))
        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(metrics, check_result)
Example #23
 def testRunModelAnalysis(self):
     model_location = self._exportEvalSavedModel(
         linear_classifier.simple_linear_classifier)
     examples = [
         self._makeExample(age=3.0, language='english', label=1.0),
         self._makeExample(age=3.0, language='chinese', label=0.0),
         self._makeExample(age=4.0, language='english', label=1.0),
         self._makeExample(age=5.0, language='chinese', label=1.0)
     ]
     data_location = self._writeTFExamplesToTFRecords(examples)
     slice_spec = [slicer.SingleSliceSpec(columns=['language'])]
     eval_result = model_eval_lib.run_model_analysis(
         model_location,
         data_location,
         slice_spec=slice_spec,
         example_weight_key='age')
     # We only check some of the metrics to ensure that the end-to-end
     # pipeline works.
     expected = {
         (('language', 'chinese'), ): {
             'accuracy': 0.5,
             'my_mean_label': 0.5,
             metric_keys.EXAMPLE_WEIGHT: 8.0,
             metric_keys.EXAMPLE_COUNT: 2.0,
         },
         (('language', 'english'), ): {
             'accuracy': 1.0,
             'my_mean_label': 1.0,
             metric_keys.EXAMPLE_WEIGHT: 7.0,
             metric_keys.EXAMPLE_COUNT: 2.0,
         }
     }
     self.assertEqual(eval_result.config.model_location, model_location)
     self.assertEqual(eval_result.config.data_location, data_location)
     self.assertEqual(eval_result.config.slice_spec, slice_spec)
     self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
     self.assertFalse(eval_result.plots)
Example #24
  def testSliceEquality(self):
    overall = slicer.SingleSliceSpec()
    age_column = slicer.SingleSliceSpec(columns=['age'])
    age_feature = slicer.SingleSliceSpec(features=[('age', 5)])
    age_and_gender = slicer.SingleSliceSpec(
        columns=['age'], features=[('gender', 'f')])

    # Note that we construct new instances of the slices to ensure that we
    # aren't just checking object identity.
    def check_equality_and_hash_equality(left, right):
      self.assertEqual(left, right)
      self.assertEqual(hash(left), hash(right))

    check_equality_and_hash_equality(overall, slicer.SingleSliceSpec())
    check_equality_and_hash_equality(age_column,
                                     slicer.SingleSliceSpec(columns=['age']))
    check_equality_and_hash_equality(
        age_feature, slicer.SingleSliceSpec(features=[('age', 5)]))
    check_equality_and_hash_equality(
        age_and_gender,
        slicer.SingleSliceSpec(columns=['age'], features=[('gender', 'f')]))

    self.assertNotEqual(overall, age_column)
    self.assertNotEqual(age_column, age_feature)
    self.assertNotEqual(age_column, age_and_gender)
    self.assertNotEqual(age_feature, age_and_gender)

    self.assertItemsEqual([slicer.SingleSliceSpec()], [overall])
    self.assertItemsEqual([
        slicer.SingleSliceSpec(columns=['age']),
        slicer.SingleSliceSpec(),
        slicer.SingleSliceSpec(features=[('age', 5)]),
        slicer.SingleSliceSpec(columns=['age'], features=[('gender', 'f')])
    ], [age_and_gender, age_feature, overall, age_column])
Example #25
    def testModelAgnosticConstructFn(self):
        # End to end test for the entire flow going from tf.Examples -> metrics
        # with slicing.
        with beam.Pipeline() as pipeline:
            # Set up the inputs. All we need are tf.Examples and an example parsing
            # spec with explicit mapping for key to (Features, Predictions, Labels).
            examples = [
                self._makeExample(age=3.0,
                                  language='english',
                                  probabilities=1.0,
                                  labels=1.0),
                self._makeExample(age=3.0,
                                  language='chinese',
                                  probabilities=3.0,
                                  labels=0.0),
                self._makeExample(age=4.0,
                                  language='english',
                                  probabilities=2.0,
                                  labels=1.0),
                self._makeExample(age=5.0,
                                  language='chinese',
                                  probabilities=3.0,
                                  labels=0.0),
                # Add some examples with no language.
                self._makeExample(age=5.0, probabilities=2.0, labels=10.0),
                self._makeExample(age=6.0, probabilities=1.0, labels=0.0)
            ]
            serialized_examples = [e.SerializeToString() for e in examples]

            # Set up a config to bucket our example keys.
            feature_map = {
                'age': tf.FixedLenFeature([], tf.float32),
                'language': tf.VarLenFeature(tf.string),
                'probabilities': tf.FixedLenFeature([], tf.float32),
                'labels': tf.FixedLenFeature([], tf.float32)
            }

            model_agnostic_config = agnostic_predict.ModelAgnosticConfig(
                label_keys=['labels'],
                prediction_keys=['probabilities'],
                feature_spec=feature_map)

            # Set up the Model Agnostic Extractor
            extractors = [
                model_agnostic_extractor.ModelAgnosticExtractor(
                    model_agnostic_config=model_agnostic_config,
                    desired_batch_size=3),
                slice_key_extractor.SliceKeyExtractor([
                    slicer.SingleSliceSpec(),
                    slicer.SingleSliceSpec(columns=['language'])
                ])
            ]

            # Set up the metrics we wish to calculate via a metric callback. In
            # particular, this metric calculates the mean and sum of all labels.
            eval_shared_model = types.EvalSharedModel(
                add_metrics_callbacks=[add_mean_callback],
                construct_fn=model_agnostic_evaluate_graph.make_construct_fn(
                    add_metrics_callbacks=[add_mean_callback],
                    fpl_feed_config=model_agnostic_extractor.
                    ModelAgnosticGetFPLFeedConfig(model_agnostic_config)))

            # Run our pipeline doing Extract -> Slice -> Fanout -> Calculate Metrics.
            metrics, _ = (
                pipeline
                | 'Create Examples' >> beam.Create(serialized_examples)
                | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
                | 'ComputeMetricsAndPlots' >> metrics_and_plots_evaluator.
                ComputeMetricsAndPlots(eval_shared_model=eval_shared_model))

            # Verify our metrics are properly generated per slice.
            def check_result(got):
                self.assertEqual(3, len(got), 'got: %s' % got)
                slices = {}
                for slice_key, metrics in got:
                    slices[slice_key] = metrics
                overall_slice = ()
                english_slice = (('language', b'english'), )
                chinese_slice = (('language', b'chinese'), )

                self.assertItemsEqual(
                    list(slices.keys()),
                    [overall_slice, english_slice, chinese_slice])
                # Overall slice has label/predictions sum = 24 and 12 elements.
                self.assertDictElementsAlmostEqual(slices[overall_slice], {
                    'tf_metric_mean': 2.0,
                    'py_func_total_label': 24.0,
                })
                # English slice has label/predictions sum = 5 and 4 elements.
                self.assertDictElementsAlmostEqual(slices[english_slice], {
                    'tf_metric_mean': 1.25,
                    'py_func_total_label': 5.0,
                })
                # Chinese slice has label/predictions sum = 6 and 4 elements.
                self.assertDictElementsAlmostEqual(slices[chinese_slice], {
                    'tf_metric_mean': 1.5,
                    'py_func_total_label': 6.0,
                })

            util.assert_that(metrics, check_result)
Example #26
    def testEvaluateWithSlicingAndDifferentBatchSizes(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = linear_classifier.simple_linear_classifier(
            None, temp_eval_export_dir)
        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=eval_export_dir,
            add_metrics_callbacks=[_addExampleCountMetricCallback])
        extractors = [
            predict_extractor.PredictExtractor(eval_shared_model),
            slice_key_extractor.SliceKeyExtractor([
                slicer.SingleSliceSpec(),
                slicer.SingleSliceSpec(columns=['slice_key'])
            ])
        ]

        for batch_size in [1, 2, 4, 8]:

            with beam.Pipeline() as pipeline:
                example1 = self._makeExample(age=3.0,
                                             language='english',
                                             label=1.0,
                                             slice_key='first_slice')
                example2 = self._makeExample(age=3.0,
                                             language='chinese',
                                             label=0.0,
                                             slice_key='first_slice')
                example3 = self._makeExample(age=4.0,
                                             language='english',
                                             label=0.0,
                                             slice_key='second_slice')
                example4 = self._makeExample(age=5.0,
                                             language='chinese',
                                             label=1.0,
                                             slice_key='second_slice')
                example5 = self._makeExample(age=5.0,
                                             language='chinese',
                                             label=1.0,
                                             slice_key='second_slice')

                metrics, plots = (
                    pipeline
                    | 'Create' >> beam.Create([
                        example1.SerializeToString(),
                        example2.SerializeToString(),
                        example3.SerializeToString(),
                        example4.SerializeToString(),
                        example5.SerializeToString(),
                    ])
                    | 'InputsToExtracts' >> model_eval_lib.InputsToExtracts()
                    | 'Extract' >> tfma_unit.Extract(extractors=extractors)  # pylint: disable=no-value-for-parameter
                    | 'ComputeMetricsAndPlots' >>
                    metrics_and_plots_evaluator.ComputeMetricsAndPlots(
                        eval_shared_model=eval_shared_model,
                        desired_batch_size=batch_size))

                def check_result(got):
                    try:
                        self.assertEqual(3, len(got), 'got: %s' % got)
                        slices = {}
                        for slice_key, value in got:
                            slices[slice_key] = value
                        overall_slice = ()
                        first_slice = (('slice_key', b'first_slice'), )
                        second_slice = (('slice_key', b'second_slice'), )
                        self.assertItemsEqual(
                            list(slices.keys()),
                            [overall_slice, first_slice, second_slice])
                        self.assertDictElementsAlmostEqual(
                            slices[overall_slice], {
                                'accuracy': 0.4,
                                'label/mean': 0.6,
                                'my_mean_age': 4.0,
                                'my_mean_age_times_label': 2.6,
                                'added_example_count': 5.0
                            })
                        self.assertDictElementsAlmostEqual(
                            slices[first_slice], {
                                'accuracy': 1.0,
                                'label/mean': 0.5,
                                'my_mean_age': 3.0,
                                'my_mean_age_times_label': 1.5,
                                'added_example_count': 2.0
                            })
                        self.assertDictElementsAlmostEqual(
                            slices[second_slice], {
                                'accuracy': 0.0,
                                'label/mean': 2.0 / 3.0,
                                'my_mean_age': 14.0 / 3.0,
                                'my_mean_age_times_label': 10.0 / 3.0,
                                'added_example_count': 3.0
                            })

                    except AssertionError as err:
                        # This function is redefined every iteration, so it will have the
                        # right value of batch_size.
                        raise util.BeamAssertException(
                            'batch_size = %d, error: %s' % (batch_size, err))  # pylint: disable=cell-var-from-loop

                util.assert_that(metrics, check_result, label='metrics')
                util.assert_that(plots, util.is_empty(), label='plots')
Example #27
    def testAssertGeneralMetricsComputedWithBeamAre(self):
        temp_eval_export_dir = self._getEvalExportDir()
        _, eval_export_dir = (fixed_prediction_estimator_extra_fields.
                              simple_fixed_prediction_estimator_extra_fields(
                                  None, temp_eval_export_dir))
        examples = [
            self.makeExample(prediction=0.0,
                             label=0.0,
                             fixed_string='negative_slice',
                             fixed_float=0.0,
                             fixed_int=0),
            self.makeExample(prediction=0.2,
                             label=0.0,
                             fixed_string='negative_slice',
                             fixed_float=0.0,
                             fixed_int=0),
            self.makeExample(prediction=0.4,
                             label=0.0,
                             fixed_string='negative_slice',
                             fixed_float=0.0,
                             fixed_int=0),
            self.makeExample(prediction=0.8,
                             label=1.0,
                             fixed_string='positive_slice',
                             fixed_float=0.0,
                             fixed_int=0),
            self.makeExample(prediction=0.9,
                             label=1.0,
                             fixed_string='positive_slice',
                             fixed_float=0.0,
                             fixed_int=0),
            self.makeExample(prediction=1.0,
                             label=1.0,
                             fixed_string='positive_slice',
                             fixed_float=0.0,
                             fixed_int=0),
        ]
        expected_slice_metrics = {}
        expected_slice_metrics[()] = {
            'average_loss': (0.00 + 0.04 + 0.16 + 0.04 + 0.01 + 0.00) / 6.0,
            'mae': 0.15,
            # Note that we don't check the exact value because of numerical errors.
            metric_keys.AUC: tfma_unit.BoundedValue(0.98, 1.00),
        }
        # We don't check AUC for the positive / negative only slices because
        # it's not clear what the value should be.
        expected_slice_metrics[(('fixed_string', b'negative_slice'), )] = {
            'average_loss': (0.00 + 0.04 + 0.16) / 3.0,
            'mae': 0.2,
        }
        expected_slice_metrics[(('fixed_string', b'positive_slice'), )] = {
            'average_loss': (0.04 + 0.01 + 0.00) / 3.0,
            'mae': 0.1,
        }

        def add_metrics(features, predictions, labels):
            del features
            metric_ops = {
                'mae':
                tf.metrics.mean_absolute_error(labels,
                                               predictions['predictions']),
            }
            return metric_ops

        with beam.Pipeline() as pipeline:
            examples_pcollection = pipeline | 'Create' >> beam.Create(examples)
            self.assertGeneralMetricsComputedWithBeamAre(
                eval_saved_model_path=eval_export_dir,
                examples_pcollection=examples_pcollection,
                slice_spec=[
                    slicer.SingleSliceSpec(),
                    slicer.SingleSliceSpec(columns=['fixed_string'])
                ],
                add_metrics_callbacks=[add_metrics,
                                       post_export_metrics.auc()],
                expected_slice_metrics=expected_slice_metrics)
Example #28
def process_tfma(eval_result_dir,
                 schema_file,
                 input_csv=None,
                 big_query_table=None,
                 eval_model_dir=None,
                 max_eval_rows=None,
                 pipeline_args=None):
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    eval_result_dir: A directory where the evaluation result should be written
      to.
    schema_file: A file containing a text-serialized Schema that describes the
      eval data.
    input_csv: A path to a csv file which should be the input for evaluation.
      This can only be set if big_query_table is None.
    big_query_table: A BigQuery table name specified as DATASET.TABLE which
      should be the input for evaluation. This can only be set if input_csv is
      None.
    eval_model_dir: A directory where the eval model is located.
    max_eval_rows: Number of rows to query from BigQuery.
    pipeline_args: Additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.

  Raises:
    ValueError: if input_csv and big_query_table are not specified correctly.
  """

  if input_csv == big_query_table and input_csv is None:
    raise ValueError(
        'one of --input_csv or --big_query_table should be provided.')

  slice_spec = [
      slicer.SingleSliceSpec(),
      slicer.SingleSliceSpec(columns=['trip_start_hour'])
  ]

  schema = taxi.read_schema(schema_file)

  with beam.Pipeline(argv=pipeline_args) as pipeline:
    if input_csv:
      csv_coder = taxi.make_csv_coder(schema)
      raw_data = (
          pipeline
          | 'ReadFromText' >> beam.io.ReadFromText(
              input_csv, skip_header_lines=1)
          | 'ParseCSV' >> beam.Map(csv_coder.decode))
    else:
      assert big_query_table
      query = taxi.make_sql(big_query_table, max_eval_rows, for_eval=True)
      raw_feature_spec = taxi.get_raw_feature_spec(schema)
      raw_data = (
          pipeline
          | 'ReadBigQuery' >> beam.io.Read(
              beam.io.BigQuerySource(query=query, use_standard_sql=True))
          | 'CleanData' >>
          beam.Map(lambda x: (taxi.clean_raw_data_dict(x, raw_feature_spec))))

    # Examples must be in clean tf-example format.
    coder = taxi.make_proto_coder(schema)

    _ = (
        raw_data
        | 'ToSerializedTFExample' >> beam.Map(coder.encode)
        | 'EvaluateAndWriteResults' >> tfma.EvaluateAndWriteResults(
            eval_saved_model_path=eval_model_dir,
            slice_spec=slice_spec,
            add_metrics_callbacks=[
                post_export_metrics.calibration_plot_and_prediction_histogram(),
                post_export_metrics.auc_plots()
            ],
            output_path=eval_result_dir))
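
A hypothetical invocation with a CSV input; all paths are placeholders, and one of input_csv or big_query_table must be provided, as the check above enforces.

process_tfma(
    eval_result_dir='/tmp/tfma_eval_result',
    schema_file='/path/to/schema.pbtxt',
    input_csv='/path/to/eval_data.csv',
    eval_model_dir='/path/to/eval_saved_model',
    pipeline_args=['--runner=DirectRunner'])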
Example #29
def Evaluate(
    # pylint: disable=invalid-name
    examples,
    eval_saved_model_path,
    add_metrics_callbacks=None,
    slice_spec=None,
    desired_batch_size=None,
):
    """Evaluate the given EvalSavedModel on the given examples.

  This is for TFMA use only. Users should call tfma.EvaluateAndWriteResults
  instead of this function.

  Args:
    examples: PCollection of input examples. Can be any format the model accepts
      (e.g. string containing CSV row, TensorFlow.Example, etc).
    eval_saved_model_path: Path to EvalSavedModel. This directory should contain
      the saved_model.pb file.
    add_metrics_callbacks: Optional list of callbacks for adding additional
      metrics to the graph. The names of the metrics added by the callbacks
      should not conflict with existing metrics, or metrics added by other
      callbacks. See below for more details about what each callback should do.
    slice_spec: Optional list of SingleSliceSpec specifying the slices to slice
      the data into. If None, defaults to the overall slice.
    desired_batch_size: Optional batch size for batching in Predict and
      Aggregate.

  More details on add_metrics_callbacks:

    Each add_metrics_callback should have the following prototype:
      def add_metrics_callback(features_dict, predictions_dict, labels_dict):

    Note that features_dict, predictions_dict and labels_dict are not
    necessarily dictionaries - they might also be Tensors, depending on what the
    model's eval_input_receiver_fn returns.

    It should create and return a metric_ops dictionary, such that
    metric_ops['metric_name'] = (value_op, update_op), just as in the Trainer.

    Short example:

    def add_metrics_callback(features_dict, predictions_dict, labels):
      metric_ops = {}
      metric_ops['mean_label'] = tf.metrics.mean(labels)
      metric_ops['mean_probability'] = tf.metrics.mean(tf.slice(
        predictions_dict['probabilities'], [0, 1], [2, 1]))
      return metric_ops

  Returns:
    DoOutputsTuple. The tuple entries are
    PCollection of (slice key, metrics) and
    PCollection of (slice key, plot metrics).
  """
    if slice_spec is None:
        slice_spec = [slicer.SingleSliceSpec()]

    shared_handle = shared.Shared()

    # pylint: disable=no-value-for-parameter
    return (
        examples
        # Our diagnostic outputs, pass types.ExampleAndExtracts throughout,
        # however our aggregating functions do not use this interface.
        | 'ToExampleAndExtracts' >>
        beam.Map(lambda x: types.ExampleAndExtracts(example=x, extracts={}))

        # Map function which loads and runs the eval_saved_model against every
        # example, yielding an types.ExampleAndExtracts containing a
        # FeaturesPredictionsLabels value (where key is 'fpl').
        | 'Predict' >> predict_extractor.TFMAPredict(
            eval_saved_model_path=eval_saved_model_path,
            add_metrics_callbacks=add_metrics_callbacks,
            shared_handle=shared_handle,
            desired_batch_size=desired_batch_size)

        # Input: one example fpl at a time
        # Output: one fpl example per slice key (notice that the example turns
        #         into n, replicated once per applicable slice key)
        | 'Slice' >> slice_api.Slice(slice_spec)

        # Each slice key lands on one shard where metrics are computed for all
        # examples in that shard -- the "map" and "reduce" parts of the
        # computation happen within this shard.
        # Output: Tuple[slicer.SliceKeyType, MetricVariablesType]
        |
        'Aggregate' >> _Aggregate(eval_saved_model_path=eval_saved_model_path,
                                  add_metrics_callbacks=add_metrics_callbacks,
                                  shared_handle=shared_handle,
                                  desired_batch_size=desired_batch_size)

        # Different metrics for a given slice key are brought together.
        | 'ExtractOutput' >> _ExtractOutput(
            eval_saved_model_path=eval_saved_model_path,
            add_metrics_callbacks=add_metrics_callbacks,
            shared_handle=shared_handle))
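
A minimal sketch of direct use (TFMA-internal; per the docstring, typical callers should use tfma.EvaluateAndWriteResults instead). Paths are placeholders and the imports are assumed to match the surrounding examples.

import apache_beam as beam

with beam.Pipeline() as pipeline:
    examples = (
        pipeline
        | 'ReadExamples' >> beam.io.ReadFromTFRecord(
            '/path/to/eval_examples.tfrecord'))
    # metrics_and_plots is a DoOutputsTuple of (slice key, metrics) and
    # (slice key, plot metrics) PCollections, as described in the docstring.
    metrics_and_plots = Evaluate(
        examples,
        eval_saved_model_path='/path/to/eval_saved_model',
        slice_spec=[slicer.SingleSliceSpec(columns=['language'])])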
Example #30
 def testRunModelAnalysisWithQueryExtractor(self):
   model_location = self._exportEvalSavedModel(
       linear_classifier.simple_linear_classifier)
   examples = [
       self._makeExample(age=3.0, language='english', label=1.0),
       self._makeExample(age=3.0, language='chinese', label=0.0),
       self._makeExample(age=4.0, language='english', label=0.0),
       self._makeExample(age=5.0, language='chinese', label=1.0)
   ]
   data_location = self._writeTFExamplesToTFRecords(examples)
   slice_spec = [slicer.SingleSliceSpec()]
   eval_shared_model = model_eval_lib.default_eval_shared_model(
       eval_saved_model_path=model_location, example_weight_key='age')
   eval_result = model_eval_lib.run_model_analysis(
       eval_shared_model=eval_shared_model,
       data_location=data_location,
       slice_spec=slice_spec,
       evaluators=[
           metrics_and_plots_evaluator.MetricsAndPlotsEvaluator(
               eval_shared_model),
           query_based_metrics_evaluator.QueryBasedMetricsEvaluator(
               query_id='language',
               prediction_key='logistic',
               combine_fns=[
                   query_statistics.QueryStatisticsCombineFn(),
                   ndcg.NdcgMetricCombineFn(
                       at_vals=[1], gain_key='label', weight_key='')
               ]),
       ])
   # We only check some of the metrics to ensure that the end-to-end
   # pipeline works.
   expected = {
       (): {
           'post_export_metrics/total_queries': {
               'doubleValue': 2.0
           },
           'post_export_metrics/min_documents': {
               'doubleValue': 2.0
           },
           'post_export_metrics/max_documents': {
               'doubleValue': 2.0
           },
           'post_export_metrics/total_documents': {
               'doubleValue': 4.0
           },
           'post_export_metrics/ndcg@1': {
               'doubleValue': 0.5
           },
           'post_export_metrics/example_weight': {
               'doubleValue': 15.0
           },
           'post_export_metrics/example_count': {
               'doubleValue': 4.0
           },
       }
   }
   self.assertEqual(eval_result.config.model_location, model_location)
   self.assertEqual(eval_result.config.data_location, data_location)
   self.assertEqual(eval_result.config.slice_spec, slice_spec)
   self.assertMetricsAlmostEqual(eval_result.slicing_metrics, expected)
   self.assertFalse(eval_result.plots)