Example #1
  def testMetricSpecsFromKerasSequential(self):
    export_dir = os.path.join(self._getTempDir(), 'export_dir')
    model = tf.keras.models.Sequential([
        tf.keras.layers.InputLayer(input_shape=(1,), name='test'),
        tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
    ])
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        metrics=[tf.keras.metrics.MeanSquaredError(name='mse')])
    features = [[0.0], [1.0]]
    labels = [[1], [0]]
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    dataset = dataset.shuffle(buffer_size=1).repeat().batch(2)
    model.fit(dataset, steps_per_epoch=1)
    model.save(export_dir, save_format='tf')

    eval_shared_model = self.createTestEvalSharedModel(
        eval_saved_model_path=export_dir, tags=[tf.saved_model.SERVING])

    metrics_specs = (
        keras_util.metrics_specs_from_keras('', eval_shared_model.model_loader))

    # TODO(b/149995449): Keras does not support re-loading metrics with the new
    #   API. Re-enable after this is fixed.
    model = eval_shared_model.model_loader.construct_fn(lambda x: None)()
    if not hasattr(model, 'loss_functions'):
      return

    self.assertLen(metrics_specs, 1)
    self.assertProtoEquals(
        self._comparable_spec(metrics_specs[0]),
        config.MetricsSpec(
            metrics=[
                config.MetricConfig(
                    class_name='BinaryCrossentropy',
                    config=json.dumps(
                        {
                            'from_logits': False,
                            'label_smoothing': 0,
                            'reduction': 'auto',
                            'name': 'binary_crossentropy'
                        },
                        sort_keys=True)),
                config.MetricConfig(
                    class_name='MeanSquaredError',
                    config=json.dumps({
                        'name': 'mse',
                        'dtype': 'float32'
                    },
                                      sort_keys=True))
            ],
            model_names=['']))
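
The JSON stored in each MetricConfig above is simply the Keras object's get_config() output serialized with sort_keys=True. A minimal sketch (not part of the original test) of the get_config()/from_config round trip that this comparison relies on:

import json
import tensorflow as tf

# Sketch only: serialize a Keras metric's config the way MetricConfig stores it,
# then rebuild an equivalent metric from that JSON.
metric = tf.keras.metrics.MeanSquaredError(name='mse')
config_json = json.dumps(metric.get_config(), sort_keys=True)
# config_json is e.g. '{"dtype": "float32", "name": "mse"}'
restored = tf.keras.metrics.MeanSquaredError.from_config(json.loads(config_json))
assert restored.name == 'mse'
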
Example #2
def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
        extracts: beam.pvalue.PCollection,
        eval_config: config.EvalConfig,
        metrics_specs: List[config.MetricsSpec],
        eval_shared_models: Optional[Dict[Text, types.EvalSharedModel]] = None,
        metrics_key: Text = constants.METRICS_KEY,
        plots_key: Text = constants.PLOTS_KEY,
        schema: Optional[schema_pb2.Schema] = None,
        random_seed_for_testing: Optional[int] = None) -> evaluator.Evaluation:
    """Computes metrics and plots.

  Args:
    extracts: PCollection of Extracts. If a query_key was used then the
      PCollection will contain a list of extracts.
    eval_config: Eval config.
    metrics_specs: Subset of the metric specs to compute metrics for. If a
      query_key was used all of the metric specs will be for the same query_key.
    eval_shared_models: Optional dict of shared models keyed by model name. Only
      required if there are metrics to be computed in-graph using the model.
    metrics_key: Name to use for metrics key in Evaluation output.
    plots_key: Name to use for plots key in Evaluation output.
    schema: A schema to use for customizing metrics and plots.
    random_seed_for_testing: Seed to use for unit testing.

  Returns:
    Evaluation containing dict of PCollections of (slice_key, results_dict)
    tuples where the dict is keyed by either the metrics_key (e.g. 'metrics') or
    plots_key (e.g. 'plots') depending on what the results_dict contains.
    schema: A schema to use for customizing metrics and plots.
  """
    computations = []
    # Add default metric computations
    if eval_shared_models:
        for model_name, eval_shared_model in eval_shared_models.items():
            if not eval_shared_model.include_default_metrics:
                continue
            if eval_shared_model.model_type == constants.TF_KERAS:
                keras_specs = keras_util.metrics_specs_from_keras(
                    model_name, eval_shared_model.model_loader)
                metrics_specs = keras_specs + metrics_specs[:]
                # TODO(mdreves): Add support for calling keras.evaluate().
            elif (eval_shared_model.model_type == constants.TF_ESTIMATOR
                  and eval_constants.EVAL_TAG
                  in eval_shared_model.model_loader.tags):
                # Note that there is the possibility for metric naming collisions here
                # (e.g. 'auc' calculated within the EvalSavedModel as well as by AUC
                # metric computation performed outside the model). Currently all the
                # overlapping metrics such as AUC that are computed outside the model
                # are all derived metrics so they will override the metrics calculated
                # by the model which is the desired behavior.
                computations.extend(
                    eval_saved_model_util.
                    metric_computations_using_eval_saved_model(
                        model_name, eval_shared_model.model_loader))
    # Add metric computations from specs
    computations_from_specs, derived_computations = (
        _filter_and_separate_computations(
            metric_specs.to_computations(metrics_specs,
                                         eval_config=eval_config,
                                         schema=schema)))
    computations.extend(computations_from_specs)

    # Find out which model is baseline.
    baseline_spec = model_util.get_baseline_model_spec(eval_config)
    baseline_model_name = baseline_spec.name if baseline_spec else None

    # pylint: disable=no-value-for-parameter

    # Input: Single extract per example (or list of extracts if query_key used)
    #        where each item contains slice keys and other extracts from upstream
    #        extractors (e.g. labels, predictions, etc).
    # Output: Single extract (per example) containing slice keys and initial
    #         combiner state returned from preprocessor. Note that even if a
    #         query_key was used the output is still only a single extract
    #         (though, that extract may contain lists of values (predictions,
    #         labels, etc) in its keys).
    #
    # Note that the output of this step is extracts instead of just a tuple of
    # computation outputs because FanoutSlices takes extracts as input (and in
    # many cases a subset of the extracts themselves are what is fanned out).
    extracts = (extracts
                | 'Preprocess' >> beam.ParDo(_PreprocessorDoFn(computations)))

    # Input: Single extract containing slice keys and initial combiner inputs. If
    #        query_key is used the extract represents multiple examples with the
    #        same query_key, otherwise the extract represents a single example.
    # Output: Tuple (slice key, combiner inputs extracts). Notice that the per
    #         example (or list of examples if query_key used) input extract turns
    #         into n logical extracts, references to which are replicated once per
    #         applicable slice key.
    slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

    slices_count = (slices
                    | 'ExtractSliceKeys' >> beam.Keys()
                    | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

    _ = (extracts.pipeline
         | 'IncrementMetricsSpecsCounters' >>
         counter_util.IncrementMetricsSpecsCounters(metrics_specs),
         slices_count
         | 'IncrementSliceSpecCounters' >>
         counter_util.IncrementSliceSpecCounters())

    ci_params = _get_confidence_interval_params(eval_config, metrics_specs)

    cross_slice_specs = []
    if eval_config.cross_slicing_specs:
        cross_slice_specs = eval_config.cross_slicing_specs

    # TODO(b/151482616): Make bootstrap and jackknife confidence interval
    # implementations more parallel.

    # Input: Tuple of (slice key, combiner input extracts).
    # Output: Tuple of (slice key, dict of computed metrics/plots). The dicts will
    #         be keyed by MetricKey/PlotKey and the values will be the result
    #         of the associated computations. A given MetricComputation can
    #         perform computations for multiple keys, but the keys should be
    #         unique across computations.
    sliced_metrics_and_plots = (
        slices
        | 'ComputePerSlice' >> poisson_bootstrap.ComputeWithConfidenceIntervals(
            _ComputePerSlice,
            computations=computations,
            derived_computations=derived_computations,
            baseline_model_name=baseline_model_name,
            cross_slice_specs=cross_slice_specs,
            num_jackknife_samples=ci_params.num_jackknife_samples,
            num_bootstrap_samples=ci_params.num_bootstrap_samples,
            skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
            random_seed_for_testing=random_seed_for_testing))

    if eval_config.options.min_slice_size.value > 1:
        sliced_metrics_and_plots = (
            sliced_metrics_and_plots
            | 'FilterSmallSlices' >> slicer.FilterOutSlices(
                slices_count, eval_config.options.min_slice_size.value))

    sliced_metrics = (sliced_metrics_and_plots
                      | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                                      metric_types.MetricKey))
    sliced_plots = (
        sliced_metrics_and_plots
        | 'FilterByPlots' >> beam.Map(_filter_by_key_type,
                                      metric_types.PlotKey))

    # pylint: enable=no-value-for-parameter

    return {metrics_key: sliced_metrics, plots_key: sliced_plots}
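
A hedged sketch of how this helper might be wired into a pipeline, assuming it is called as a plain function on an existing PCollection of extracts (any decorators such as @beam.ptransform_fn are not shown in this snippet); example_extracts, eval_config and eval_shared_model are placeholders, not values from this file:

with beam.Pipeline() as pipeline:
    extracts = pipeline | 'CreateExtracts' >> beam.Create(example_extracts)
    evaluation = _ComputeMetricsAndPlots(
        extracts=extracts,
        eval_config=eval_config,
        metrics_specs=list(eval_config.metrics_specs),
        eval_shared_models={'': eval_shared_model})
    # The returned Evaluation is a dict of PCollections keyed by
    # metrics_key/plots_key, each holding (slice_key, results_dict) tuples.
    metrics = evaluation[constants.METRICS_KEY]
    plots = evaluation[constants.PLOTS_KEY]
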
Example #3
    def testMetricSpecsFromKerasWithMultipleOutputs(self):
        export_dir = os.path.join(self._getTempDir(), 'export_dir')
        input_layer = tf.keras.layers.Input(shape=(1, ))
        output_layer1 = tf.keras.layers.Dense(1, name='output_1')(input_layer)
        output_layer2 = tf.keras.layers.Dense(1, name='output_2')(input_layer)
        model = tf.keras.models.Model([input_layer],
                                      [output_layer1, output_layer2])
        model.compile(
            loss={
                'output_1': tf.keras.losses.BinaryCrossentropy(
                    name='binary_crossentropy'),
                'output_2': tf.keras.losses.BinaryCrossentropy(
                    name='binary_crossentropy')
            },
            metrics=[tf.keras.metrics.MeanSquaredError(name='mse')])
        features = [[0.0], [1.0]]
        labels = [[1], [0]]
        dataset = tf.data.Dataset.from_tensor_slices((features, {
            'output_1': labels,
            'output_2': labels
        }))
        dataset = dataset.shuffle(buffer_size=1).repeat().batch(2)
        model.fit(dataset, steps_per_epoch=1)
        model.save(export_dir, save_format='tf')

        eval_shared_model = self.createTestEvalSharedModel(
            eval_saved_model_path=export_dir)

        metrics_specs = (keras_util.metrics_specs_from_keras(
            '', eval_shared_model.model_loader))

        # TODO(b/149995449): Keras does not support re-loading metrics with the new
        #   API. Re-enable after this is fixed.
        model = eval_shared_model.model_loader.construct_fn(lambda x: None)()
        if not hasattr(model, 'loss_functions'):
            return

        self.assertLen(metrics_specs, 2)
        self.assertProtoEquals(
            self._comparable_spec(metrics_specs[0]),
            config.MetricsSpec(
                metrics=[
                    config.MetricConfig(
                        class_name='BinaryCrossentropy',
                        config=json.dumps(
                            {
                                'from_logits': False,
                                'label_smoothing': 0,
                                'reduction': 'auto',
                                'name': self._loss_name(
                                    model, 'binary_crossentropy', 'output_1')
                            },
                            sort_keys=True)),
                    config.MetricConfig(
                        class_name='MeanSquaredError',
                        config=json.dumps(
                            {
                                'name': 'output_1_mse',
                                'dtype': 'float32'
                            },
                            sort_keys=True))
                ],
                model_names=[''],
                output_names=['output_1']))
        self.assertProtoEquals(
            self._comparable_spec(metrics_specs[1]),
            config.MetricsSpec(
                metrics=[
                    config.MetricConfig(
                        class_name='BinaryCrossentropy',
                        config=json.dumps(
                            {
                                'from_logits': False,
                                'label_smoothing': 0,
                                'reduction': 'auto',
                                'name': self._loss_name(
                                    model, 'binary_crossentropy', 'output_2')
                            },
                            sort_keys=True)),
                    config.MetricConfig(
                        class_name='MeanSquaredError',
                        config=json.dumps(
                            {
                                'name': 'output_2_mse',
                                'dtype': 'float32'
                            },
                            sort_keys=True))
                ],
                model_names=[''],
                output_names=['output_2']))
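
For comparison, a sketch (mine, not part of the test) of writing equivalent per-output specs by hand rather than deriving them from the saved Keras model; the field values simply mirror the assertions above, and output_names is what scopes each spec to a single model output:

manual_specs = [
    config.MetricsSpec(
        metrics=[
            config.MetricConfig(
                class_name='MeanSquaredError',
                config=json.dumps({'dtype': 'float32', 'name': 'output_1_mse'},
                                  sort_keys=True))
        ],
        model_names=[''],
        output_names=['output_1']),
    config.MetricsSpec(
        metrics=[
            config.MetricConfig(
                class_name='MeanSquaredError',
                config=json.dumps({'dtype': 'float32', 'name': 'output_2_mse'},
                                  sort_keys=True))
        ],
        model_names=[''],
        output_names=['output_2']),
]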