Beispiel #1
0
  def testToComputations(self):
    """Checks the metric keys produced for binarized, macro-averaged specs.

    Two metrics on one output are binarized over class ids 0 and 1 and macro
    averaged. Each metric is expected once per class id and once without a
    sub key, next to the two example count keys.
    """
    specs = metric_specs.specs_from_metrics(
        {
            'output_name': [
                tf.keras.metrics.MeanSquaredError('mse'),
                calibration.MeanLabel('mean_label')
            ]
        },
        model_names=['model_name'],
        binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
        aggregate=config.AggregationOptions(macro_average=True))
    computations = metric_specs.to_computations(specs, config.EvalConfig())

    # Keys whose name starts with an underscore are left out of the check.
    keys = [
        key for computation in computations for key in computation.keys
        if not key.name.startswith('_')
    ]
    self.assertLen(keys, 8)

    self.assertIn(metric_types.MetricKey(name='example_count'), keys)
    self.assertIn(
        metric_types.MetricKey(
            name='weighted_example_count',
            model_name='model_name',
            output_name='output_name'), keys)

    for metric_name in ('mse', 'mean_label'):
      shared = dict(
          name=metric_name, model_name='model_name', output_name='output_name')
      # One key per binarized class id ...
      for class_id in (0, 1):
        self.assertIn(
            metric_types.MetricKey(
                sub_key=metric_types.SubKey(class_id=class_id), **shared),
            keys)
      # ... plus the key without a sub key.
      self.assertIn(metric_types.MetricKey(**shared), keys)
Beispiel #2
0
  def testToComputationsWithMixedAggregationAndNonAggregationMetrics(self):
    """Checks the computation count for mixed aggregated and plain specs."""
    plain_spec = config.MetricsSpec(
        metrics=[config.MetricConfig(class_name='CategoricalAccuracy')])
    aggregated_spec = config.MetricsSpec(
        metrics=[config.MetricConfig(class_name='BinaryCrossentropy')],
        binarize=config.BinarizationOptions(class_ids={'values': [1]}),
        aggregate=config.AggregationOptions(micro_average=True))

    computations = metric_specs.to_computations(
        [plain_spec, aggregated_spec], config.EvalConfig())

    # Three separate computations are expected: one for the aggregated
    # metrics, one for the non-aggregated metrics, and one for the metrics
    # associated with class 1.
    self.assertLen(computations, 3)
    def _runMetricsAndPlotsEvaluatorManualActuation(self,
                                                    with_confidence_intervals,
                                                    metrics_specs=None):
        """Benchmark MetricsAndPlotsEvaluatorV2 "manually".

        The preprocessor and the combine fn are driven directly from Python.
        Parsing and prediction happen before the timer starts; the timed
        section covers building the computations, preprocessing, accumulating
        and extracting the output.

        Args:
          with_confidence_intervals: Passed to the combine fn as
            `compute_with_sampling`. When truthy the example count check
            only requires the count to be within 10% of the number of records.
          metrics_specs: Metrics specs to benchmark. Falls back to
            `self._eval_config.metrics_specs` when empty or None.

        Raises:
          ValueError: If the example count is missing from the output or does
            not match the number of records read.
        """
        self._init_model()
        metrics_specs = metrics_specs or self._eval_config.metrics_specs

        records = self._readDatasetIntoExtracts()
        parsed_extracts = [
            input_extractor._ParseExample(record, self._eval_config)  # pylint: disable=protected-access
            for record in records
        ]

        predictor = predict_extractor_v2._PredictionDoFn(  # pylint: disable=protected-access
            eval_config=self._eval_config,
            eval_shared_models={"": self._eval_shared_model})
        predictor.setup()

        # Predictions are produced up front, before the timer is started.
        prediction_batch_size = 1000
        predictions = []
        for chunk in benchmark_utils.batched_iterator(parsed_extracts,
                                                      prediction_batch_size):
            predictions.extend(predictor.process(chunk))

        # Timed section starts here.
        inputs_per_accumulator = 1000
        start = time.time()

        computations, _ = (
            metrics_and_plots_evaluator_v2._filter_and_separate_computations(  # pylint: disable=protected-access
                metric_specs.to_computations(metrics_specs,
                                             eval_config=self._eval_config)))

        # A fresh preprocessor DoFn is built for every element.
        processed = [
            next(
                metrics_and_plots_evaluator_v2._PreprocessorDoFn(  # pylint: disable=protected-access
                    computations).process(prediction))
            for prediction in predictions
        ]

        combiner = metrics_and_plots_evaluator_v2._ComputationsCombineFn(  # pylint: disable=protected-access
            computations=computations,
            compute_with_sampling=with_confidence_intervals)

        # One accumulator per batch of preprocessed inputs.
        accumulators = []
        for chunk in benchmark_utils.batched_iterator(processed,
                                                      inputs_per_accumulator):
            accumulator = combiner.create_accumulator()
            for item in chunk:
                accumulator = combiner.add_input(accumulator, item)
            accumulators.append(accumulator)

        final_output = combiner.extract_output(
            combiner.merge_accumulators(accumulators))
        delta = time.time() - start

        # Sanity check the example count. This is not timed.
        num_records = len(records)
        example_count_key = metric_types.MetricKey(name="example_count")
        example_count = next(
            (metrics[example_count_key]
             for metrics in final_output
             if example_count_key in metrics), None)

        if example_count is None:
            raise ValueError(
                "example_count was not in the final list of metrics. "
                "metrics were: %s" % str(final_output))

        if with_confidence_intervals:
            # With confidence intervals the example count is not exact, so
            # only a +/-10% window is enforced.
            lower_bound = int(0.9 * num_records)
            upper_bound = int(1.1 * num_records)
            if example_count < lower_bound or example_count > upper_bound:
                raise ValueError("example count out of bounds: expecting "
                                 "%d < example_count < %d, but got %d" %
                                 (lower_bound, upper_bound, example_count))
        elif example_count != num_records:
            # Without confidence intervals the count has to be exact.
            raise ValueError(
                "example count mismatch: expecting %d got %d" %
                (num_records, example_count))

        self.report_benchmark(iters=1,
                              wall_time=delta,
                              extras={
                                  "inputs_per_accumulator":
                                  inputs_per_accumulator,
                                  "num_examples": num_records
                              })
def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_config: config_pb2.EvalConfig,
    metrics_specs: List[config_pb2.MetricsSpec],
    eval_shared_models: Optional[Dict[Text, types.EvalSharedModel]] = None,
    metrics_key: Text = constants.METRICS_KEY,
    plots_key: Text = constants.PLOTS_KEY,
    attributions_key: Text = constants.ATTRIBUTIONS_KEY,
    schema: Optional[schema_pb2.Schema] = None,
    random_seed_for_testing: Optional[int] = None,
    tensor_adapter_config: Optional[tensor_adapter.TensorAdapterConfig] = None
) -> evaluator.Evaluation:
  """Computes metrics, plots and attributions.

  Builds the Beam sub-graph that preprocesses the extracts, fans them out per
  slice key, combines them per slice (optionally with confidence intervals),
  and finally splits the combined results by key type.

  Args:
    extracts: PCollection of Extracts. If a query_key was used then the
      PCollection will contain a list of extracts.
    eval_config: Eval config.
    metrics_specs: Subset of the metric specs to compute metrics for. If a
      query_key was used all of the metric specs will be for the same query_key.
    eval_shared_models: Optional dict of shared models keyed by model name. Only
      required if there are metrics to be computed in-graph using the model.
    metrics_key: Name to use for metrics key in Evaluation output.
    plots_key: Name to use for plots key in Evaluation output.
    attributions_key: Name to use for attributions key in Evaluation output.
    schema: A schema to use for customizing metrics and plots.
    random_seed_for_testing: Seed to use for unit testing.
    tensor_adapter_config: Tensor adapter config which specifies how to obtain
      tensors from the Arrow RecordBatch. The model's signature will be invoked
      with those tensors (matched by names). If None, an attempt will be made to
      create an adapter based on the model's input signature otherwise the model
      will be invoked with raw examples (assuming a signature of a single 1-D
      string tensor).

  Returns:
    Evaluation containing dict of PCollections of (slice_key, results_dict)
    tuples where the dict is keyed by either the metrics_key (e.g. 'metrics'),
    plots_key (e.g. 'plots'), or attributions_key (e.g. 'attributions')
    depending on what the results_dict contains.
  """
  computations = []
  # Add default metric computations
  if eval_shared_models:
    # Note that there is the possibility for metric naming collisions here
    # (e.g. 'auc' calculated within the model as well as by AUC metric
    # computation performed outside the model). Currently all the overlapping
    # metrics such as AUC that are computed outside the model are all derived
    # metrics so they will override the metrics calculated by the model which is
    # the desired behavior.
    for model_name, eval_shared_model in eval_shared_models.items():
      # Models that opted out of default metrics contribute nothing here.
      if not eval_shared_model.include_default_metrics:
        continue
      if eval_shared_model.model_type == constants.TF_KERAS:
        computations.extend(
            keras_util.metric_computations_using_keras_saved_model(
                model_name, eval_shared_model.model_loader, eval_config,
                tensor_adapter_config))
      # Estimator models only contribute when their loader carries the eval
      # tag; any other model type adds no default computations.
      elif (eval_shared_model.model_type == constants.TF_ESTIMATOR and
            eval_constants.EVAL_TAG in eval_shared_model.model_loader.tags):
        computations.extend(
            eval_saved_model_util.metric_computations_using_eval_saved_model(
                model_name, eval_shared_model.model_loader))
  # Add metric computations from specs
  metric_computations = _filter_and_separate_computations(
      metric_specs.to_computations(
          metrics_specs, eval_config=eval_config, schema=schema))
  # Only the non-derived computations take part in preprocessing/combining;
  # the derived, cross-slice and CI-derived ones are applied further below.
  computations.extend(metric_computations.non_derived_computations)

  # Find out which model is baseline.
  baseline_spec = model_util.get_baseline_model_spec(eval_config)
  baseline_model_name = baseline_spec.name if baseline_spec else None

  # pylint: disable=no-value-for-parameter

  # Input: Single extract per example (or list of extracts if query_key used)
  #        where each item contains slice keys and other extracts from upstream
  #        extractors (e.g. labels, predictions, etc).
  # Output: Single extract (per example) containing slice keys and initial
  #         combiner state returned from preprocessor. Note that even if a
  #         query_key was used the output is still only a single extract
  #         (though, that extract may contain lists of values (predictions,
  #         labels, etc) in its keys).
  #
  # Note that the output of this step is extracts instead of just a tuple of
  # computation outputs because FanoutSlices takes extracts as input (and in
  # many cases a subset of the extracts themselves are what is fanned out).
  #
  # NOTE(review): the step label 'Preprocesss' is spelled with three s's. It
  # is a runtime step name, so it is left untouched here - confirm nothing
  # depends on the label before renaming it.
  extracts = (
      extracts
      | 'Preprocesss' >> beam.ParDo(_PreprocessorDoFn(computations)))

  # Input: Single extract containing slice keys and initial combiner inputs. If
  #        query_key is used the extract represents multiple examples with the
  #        same query_key, otherwise the extract represents a single example.
  # Output: Tuple (slice key, combiner inputs extracts). Notice that the per
  #         example (or list or examples if query_key used) input extract turns
  #         into n logical extracts, references to which are replicated once per
  #         applicable slice key.
  slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

  # Number of fanned-out elements per slice key; used for the slice counters
  # and for the optional small-slice filter below.
  slices_count = (
      slices
      | 'ExtractSliceKeys' >> beam.Keys()
      | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

  model_types = _get_model_types_for_logging(eval_shared_models)

  # The outputs of these two transforms are not consumed downstream, hence
  # the assignment to `_`.
  _ = (
      extracts.pipeline
      | 'IncrementMetricsSpecsCounters' >>
      counter_util.IncrementMetricsSpecsCounters(metrics_specs, model_types),
      slices_count
      |
      'IncrementSliceSpecCounters' >> counter_util.IncrementSliceSpecCounters())

  ci_params = _get_confidence_interval_params(eval_config, metrics_specs)

  cross_slice_specs = []
  if eval_config.cross_slicing_specs:
    cross_slice_specs = eval_config.cross_slicing_specs

  # The combine fn and the derived-metrics transform are shared by all three
  # branches below.
  computations_combine_fn = _ComputationsCombineFn(computations=computations)
  derived_metrics_ptransform = _AddDerivedCrossSliceAndDiffMetrics(
      metric_computations.derived_computations,
      metric_computations.cross_slice_computations, cross_slice_specs,
      baseline_model_name)

  # Input: Tuple of (slice key, combiner input extracts).
  # Output: Tuple of (slice key, dict of computed metrics/plots/attributions).
  #         The dicts will be keyed by MetricKey/PlotKey/AttributionsKey and the
  #         values will be the result of the associated computations. A given
  #         MetricComputation can perform computations for multiple keys, but
  #         the keys should be unique across computations.
  #
  # Bootstrap is checked first, so it wins when both sample counts are set.
  # When neither is set a plain per-key combine followed by the derived
  # metrics transform is used.
  if ci_params.num_bootstrap_samples:
    sliced_metrics_plots_and_attributions = (
        slices | 'PoissonBootstrapConfidenceIntervals' >>
        poisson_bootstrap.ComputeWithConfidenceIntervals(
            computations_combine_fn=computations_combine_fn,
            derived_metrics_ptransform=derived_metrics_ptransform,
            num_bootstrap_samples=ci_params.num_bootstrap_samples,
            hot_key_fanout=_COMBINE_PER_SLICE_KEY_HOT_KEY_FANOUT,
            skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
            random_seed_for_testing=random_seed_for_testing))
  elif ci_params.num_jackknife_samples:
    sliced_metrics_plots_and_attributions = (
        slices
        | 'JackknifeConfidenceIntervals' >>
        jackknife.ComputeWithConfidenceIntervals(
            computations_combine_fn=computations_combine_fn,
            derived_metrics_ptransform=derived_metrics_ptransform,
            num_jackknife_samples=ci_params.num_jackknife_samples,
            skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
            random_seed_for_testing=random_seed_for_testing))
  else:
    sliced_metrics_plots_and_attributions = (
        slices
        |
        'CombineMetricsPerSlice' >> beam.CombinePerKey(computations_combine_fn)
        .with_hot_key_fanout(_COMBINE_PER_SLICE_KEY_HOT_KEY_FANOUT)
        | 'AddDerivedCrossSliceAndDiffMetrics' >> derived_metrics_ptransform)

  # CI-derived computations are applied after whichever branch ran above.
  sliced_metrics_plots_and_attributions = (
      sliced_metrics_plots_and_attributions
      | 'AddCIDerivedMetrics' >> beam.Map(
          _add_ci_derived_metrics, metric_computations.ci_derived_computations))

  # Slice filtering is only added when min_slice_size is greater than 1.
  if eval_config.options.min_slice_size.value > 1:
    sliced_metrics_plots_and_attributions = (
        sliced_metrics_plots_and_attributions
        | 'FilterSmallSlices' >> slicer.FilterOutSlices(
            slices_count, eval_config.options.min_slice_size.value))

  # The same combined PCollection is mapped three times, once per key type.
  sliced_metrics = (
      sliced_metrics_plots_and_attributions
      | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                      metric_types.MetricKey))
  sliced_plots = (
      sliced_metrics_plots_and_attributions
      | 'FilterByPlots' >> beam.Map(_filter_by_key_type, metric_types.PlotKey))

  sliced_attributions = (
      sliced_metrics_plots_and_attributions
      | 'FilterByAttributions' >> beam.Map(_filter_by_key_type,
                                           metric_types.AttributionsKey))

  # pylint: enable=no-value-for-parameter

  return {
      metrics_key: sliced_metrics,
      plots_key: sliced_plots,
      attributions_key: sliced_attributions
  }
Beispiel #5
0
    def testToComputations(self):
        """Checks the metric keys for binarized, macro-averaged specs.

        A metric, a loss equal to that metric, and a calibration metric are
        binarized over class ids 0 and 1 and macro averaged with class
        weights. Each of the three is expected once per class id and once
        without a sub key, next to the two example count keys.
        """
        specs = metric_specs.specs_from_metrics(
            {
                'output_name': [
                    tf.keras.metrics.MeanSquaredError('mse'),
                    # Add a loss that is exactly the same as the metric
                    # (https://github.com/tensorflow/tfx/issues/1550)
                    tf.keras.losses.MeanSquaredError(name='loss'),
                    calibration.MeanLabel('mean_label')
                ]
            },
            model_names=['model_name'],
            binarize=config.BinarizationOptions(class_ids={'values': [0, 1]}),
            aggregate=config.AggregationOptions(
                macro_average=True, class_weights={
                    0: 1.0,
                    1: 1.0
                }))
        computations = metric_specs.to_computations(specs,
                                                    config.EvalConfig())

        # Keys whose name starts with an underscore are left out of the check.
        keys = [
            key for computation in computations for key in computation.keys
            if not key.name.startswith('_')
        ]
        self.assertLen(keys, 11)

        self.assertIn(
            metric_types.MetricKey(name='example_count',
                                   model_name='model_name'), keys)
        self.assertIn(
            metric_types.MetricKey(name='weighted_example_count',
                                   model_name='model_name',
                                   output_name='output_name'), keys)

        for metric_name in ('mse', 'loss', 'mean_label'):
            shared = dict(name=metric_name,
                          model_name='model_name',
                          output_name='output_name')
            # One key per binarized class id ...
            for class_id in (0, 1):
                self.assertIn(
                    metric_types.MetricKey(
                        sub_key=metric_types.SubKey(class_id=class_id),
                        **shared), keys)
            # ... plus the key without a sub key.
            self.assertIn(metric_types.MetricKey(**shared), keys)
Beispiel #6
0
    def _runMetricsPlotsAndValidationsEvaluatorManualActuation(
            self,
            with_confidence_intervals,
            multi_model,
            metrics_specs=None,
            validation=False):
        """Benchmark MetricsPlotsAndValidationsEvaluator "manually".

        Prediction, unbatching and slice key assignment happen before the
        timer starts. The timed section repeats `_ITERS` times: building the
        computations, preprocessing every extract, accumulating in batches and
        extracting the final output.

        Args:
          with_confidence_intervals: When truthy the combine fn is wrapped in
            `poisson_bootstrap._BootstrapCombineFn`, and the example count
            check only requires the count to be within 10% of the input size.
          multi_model: Forwarded to `self._init_model`; also selects the model
            name ("candidate" or "") used for the example count key.
          metrics_specs: Metrics specs to benchmark. Falls back to
            `self._eval_config.metrics_specs` when empty or None.
          validation: Forwarded to `self._init_model`.

        Raises:
          ValueError: If the example count key is missing from the output or
            the count does not match the number of input examples.
        """
        self._init_model(multi_model, validation)
        metrics_specs = metrics_specs or self._eval_config.metrics_specs

        batched_extracts = self._readDatasetIntoBatchedExtracts()
        num_examples = sum(
            batch[constants.ARROW_RECORD_BATCH_KEY].num_rows
            for batch in batched_extracts)
        feature_extracts = [
            self._extract_features_and_labels(batch)
            for batch in batched_extracts
        ]

        predictor = model_util.ModelSignaturesDoFn(
            eval_config=self._eval_config,
            eval_shared_models=self._eval_shared_models,
            signature_names={
                constants.PREDICTIONS_KEY:
                {name: [None]
                 for name in self._eval_shared_models}
            },
            prefer_dict_outputs=False)
        predictor.setup()

        # Predictions are produced up front, before the timer is started.
        predictions = []
        for batch in feature_extracts:
            predictions.extend(predictor.process(batch))

        # Turn the batched predictions into per-example extracts.
        unbatched_extracts = []
        for prediction in predictions:
            unbatched_extracts.extend(
                unbatch_extractor._extract_unbatched_inputs(prediction))  # pylint: disable=protected-access

        # Every extract is assigned the global (empty) slice key.
        for extract in unbatched_extracts:
            extract[tfma.SLICE_KEY_TYPES_KEY] = ()

        # Timed section starts here.
        inputs_per_accumulator = 1000
        start = time.time()
        for _ in range(_ITERS):
            computations, _, _, _ = (
                # pylint: disable=protected-access
                metrics_plots_and_validations_evaluator.
                _filter_and_separate_computations(
                    metric_specs_util.to_computations(
                        metrics_specs, eval_config=self._eval_config)))
            # pylint: enable=protected-access

            # A fresh preprocessor DoFn is built for every element.
            processed = [
                next(
                    metrics_plots_and_validations_evaluator.
                    _PreprocessorDoFn(  # pylint: disable=protected-access
                        computations).process(extract))
                for extract in unbatched_extracts
            ]

            combiner = metrics_plots_and_validations_evaluator._ComputationsCombineFn(  # pylint: disable=protected-access
                computations=computations)
            if with_confidence_intervals:
                combiner = poisson_bootstrap._BootstrapCombineFn(combiner)  # pylint: disable=protected-access
            combiner.setup()

            # One accumulator per batch of preprocessed inputs.
            accumulators = []
            for chunk in benchmark_utils.batched_iterator(
                    processed, inputs_per_accumulator):
                accumulator = combiner.create_accumulator()
                for item in chunk:
                    accumulator = combiner.add_input(accumulator, item)
                accumulators.append(accumulator)

            final_output = combiner.extract_output(
                combiner.merge_accumulators(accumulators))
        delta = time.time() - start

        # Sanity check the example count. This is not timed.
        example_count_key = metric_types.MetricKey(
            name="example_count",
            model_name="candidate" if multi_model else "")
        if example_count_key not in final_output:
            raise ValueError(
                "example_count_key ({}) was not in the final list of "
                "metrics. metrics were: {}".format(example_count_key,
                                                   final_output))
        example_count = final_output[example_count_key]

        if with_confidence_intervals:
            # With confidence intervals the example count is not exact, so
            # only a +/-10% window is enforced.
            lower_bound = int(0.9 * num_examples)
            upper_bound = int(1.1 * num_examples)
            if example_count < lower_bound or example_count > upper_bound:
                raise ValueError("example count out of bounds: expecting "
                                 "%d < example_count < %d, but got %d" %
                                 (lower_bound, upper_bound, example_count))
        elif example_count != num_examples:
            # Without confidence intervals the count has to be exact.
            raise ValueError(
                "example count mismatch: expecting %d got %d" %
                (num_examples, example_count))

        self.report_benchmark(iters=_ITERS,
                              wall_time=delta,
                              extras={
                                  "inputs_per_accumulator":
                                  inputs_per_accumulator,
                                  "num_examples": num_examples
                              })
def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
        extracts: beam.pvalue.PCollection,
        eval_config: config.EvalConfig,
        metrics_specs: List[config.MetricsSpec],
        eval_shared_models: Optional[Dict[Text, types.EvalSharedModel]] = None,
        metrics_key: Text = constants.METRICS_KEY,
        plots_key: Text = constants.PLOTS_KEY,
        schema: Optional[schema_pb2.Schema] = None,
        random_seed_for_testing: Optional[int] = None) -> evaluator.Evaluation:
    """Computes metrics and plots.

    Builds the Beam sub-graph that preprocesses the extracts, fans them out
    per slice key, computes the metrics and plots per slice, and finally
    splits the combined results by key type.

    Args:
      extracts: PCollection of Extracts. If a query_key was used then the
        PCollection will contain a list of extracts.
      eval_config: Eval config.
      metrics_specs: Subset of the metric specs to compute metrics for. If a
        query_key was used all of the metric specs will be for the same
        query_key.
      eval_shared_models: Optional dict of shared models keyed by model name.
        Only required if there are metrics to be computed in-graph using the
        model.
      metrics_key: Name to use for metrics key in Evaluation output.
      plots_key: Name to use for plots key in Evaluation output.
      schema: A schema to use for customizing metrics and plots.
      random_seed_for_testing: Seed to use for unit testing.

    Returns:
      Evaluation containing dict of PCollections of (slice_key, results_dict)
      tuples where the dict is keyed by either the metrics_key (e.g.
      'metrics') or plots_key (e.g. 'plots') depending on what the
      results_dict contains.
    """
    computations = []
    # Add default metric computations
    if eval_shared_models:
        for model_name, eval_shared_model in eval_shared_models.items():
            # Models that opted out of default metrics contribute nothing.
            if not eval_shared_model.include_default_metrics:
                continue
            if eval_shared_model.model_type == constants.TF_KERAS:
                # Keras defaults are expressed as extra specs placed ahead of
                # the caller's specs. The local name is rebound to the
                # concatenation (a new list if both operands are lists -
                # TODO confirm the type returned by metrics_specs_from_keras).
                keras_specs = keras_util.metrics_specs_from_keras(
                    model_name, eval_shared_model.model_loader)
                metrics_specs = keras_specs + metrics_specs[:]
                # TODO(mdreves): Add support for calling keras.evaluate().
            elif (eval_shared_model.model_type == constants.TF_ESTIMATOR
                  and eval_constants.EVAL_TAG
                  in eval_shared_model.model_loader.tags):
                # Note that there is the possibility for metric naming collisions here
                # (e.g. 'auc' calculated within the EvalSavedModel as well as by AUC
                # metric computation performed outside the model). Currently all the
                # overlapping metrics such as AUC that are computed outside the model
                # are all derived metrics so they will override the metrics calculated
                # by the model which is the desired behavior.
                computations.extend(
                    eval_saved_model_util.
                    metric_computations_using_eval_saved_model(
                        model_name, eval_shared_model.model_loader))
    # Add metric computations from specs
    computations_from_specs, derived_computations = (
        _filter_and_separate_computations(
            metric_specs.to_computations(metrics_specs,
                                         eval_config=eval_config,
                                         schema=schema)))
    computations.extend(computations_from_specs)

    # Find out which model is baseline.
    baseline_spec = model_util.get_baseline_model_spec(eval_config)
    baseline_model_name = baseline_spec.name if baseline_spec else None

    # pylint: disable=no-value-for-parameter

    # Input: Single extract per example (or list of extracts if query_key used)
    #        where each item contains slice keys and other extracts from upstream
    #        extractors (e.g. labels, predictions, etc).
    # Output: Single extract (per example) containing slice keys and initial
    #         combiner state returned from preprocessor. Note that even if a
    #         query_key was used the output is still only a single extract
    #         (though, that extract may contain lists of values (predictions,
    #         labels, etc) in its keys).
    #
    # Note that the output of this step is extracts instead of just a tuple of
    # computation outputs because FanoutSlices takes extracts as input (and in
    # many cases a subset of the extracts themselves are what is fanned out).
    #
    # NOTE(review): the step label 'Preprocesss' is spelled with three s's. It
    # is a runtime step name, so it is left untouched here - confirm nothing
    # depends on the label before renaming it.
    extracts = (extracts
                | 'Preprocesss' >> beam.ParDo(_PreprocessorDoFn(computations)))

    # Input: Single extract containing slice keys and initial combiner inputs. If
    #        query_key is used the extract represents multiple examples with the
    #        same query_key, otherwise the extract represents a single example.
    # Output: Tuple (slice key, combiner inputs extracts). Notice that the per
    #         example (or list or examples if query_key used) input extract turns
    #         into n logical extracts, references to which are replicated once per
    #         applicable slice key.
    slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

    # Number of fanned-out elements per slice key; used for the slice counters
    # and for the optional small-slice filter below.
    slices_count = (slices
                    | 'ExtractSliceKeys' >> beam.Keys()
                    | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

    # The outputs of these two transforms are not consumed downstream, hence
    # the assignment to `_`.
    _ = (extracts.pipeline
         | 'IncrementMetricsSpecsCounters' >>
         counter_util.IncrementMetricsSpecsCounters(metrics_specs),
         slices_count
         | 'IncrementSliceSpecCounters' >>
         counter_util.IncrementSliceSpecCounters())

    ci_params = _get_confidence_interval_params(eval_config, metrics_specs)

    cross_slice_specs = []
    if eval_config.cross_slicing_specs:
        cross_slice_specs = eval_config.cross_slicing_specs

    # TODO(b/151482616): Make bootstrap and jackknife confidence interval
    # implementations more parallel.

    # Input: Tuple of (slice key, combiner input extracts).
    # Output: Tuple of (slice key, dict of computed metrics/plots). The dicts will
    #         be keyed by MetricKey/PlotKey and the values will be the result
    #         of the associated computations. A given MetricComputation can
    #         perform computations for multiple keys, but the keys should be
    #         unique across computations.
    #
    # Both the jackknife and the bootstrap sample counts are forwarded to the
    # same transform, together with the per-slice compute transform.
    sliced_metrics_and_plots = (
        slices
        |
        'ComputePerSlice' >> poisson_bootstrap.ComputeWithConfidenceIntervals(
            _ComputePerSlice,
            computations=computations,
            derived_computations=derived_computations,
            baseline_model_name=baseline_model_name,
            cross_slice_specs=cross_slice_specs,
            num_jackknife_samples=ci_params.num_jackknife_samples,
            num_bootstrap_samples=ci_params.num_bootstrap_samples,
            skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
            random_seed_for_testing=random_seed_for_testing))

    # Slice filtering is only added when min_slice_size is greater than 1.
    if eval_config.options.min_slice_size.value > 1:
        sliced_metrics_and_plots = (
            sliced_metrics_and_plots
            | 'FilterSmallSlices' >> slicer.FilterOutSlices(
                slices_count, eval_config.options.min_slice_size.value))

    # The same combined PCollection is mapped twice, once per key type.
    sliced_metrics = (sliced_metrics_and_plots
                      | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                                      metric_types.MetricKey))
    sliced_plots = (
        sliced_metrics_and_plots
        |
        'FilterByPlots' >> beam.Map(_filter_by_key_type, metric_types.PlotKey))

    # pylint: enable=no-value-for-parameter

    return {metrics_key: sliced_metrics, plots_key: sliced_plots}
Beispiel #8
0
def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
        extracts: beam.pvalue.PCollection,
        eval_config: config.EvalConfig,
        metrics_specs: List[config.MetricsSpec],
        eval_shared_models: Optional[Dict[Text, types.EvalSharedModel]] = None,
        metrics_key: Text = constants.METRICS_KEY,
        plots_key: Text = constants.PLOTS_KEY) -> evaluator.Evaluation:
    """Wires up the Beam transforms that compute sliced metrics and plots.

    Args:
      extracts: PCollection of Extracts. If a query_key was used then the
        PCollection will contain a list of extracts.
      eval_config: Eval config.
      metrics_specs: Subset of the metric specs to compute metrics for. If a
        query_key was used all of the metric specs will be for the same
        query_key.
      eval_shared_models: Optional dict of shared models keyed by model name.
        Only required if there are metrics to be computed in-graph using the
        model.
      metrics_key: Name to use for metrics key in Evaluation output.
      plots_key: Name to use for plots key in Evaluation output.

    Returns:
      Evaluation containing dict of PCollections of (slice_key, results_dict)
      tuples where the dict is keyed by either the metrics_key (e.g.
      'metrics') or plots_key (e.g. 'plots') depending on what the
      results_dict contains.
    """
    # Only shared models that ask for default metrics contribute a loader.
    # Without any shared models the loaders stay None (not an empty dict).
    if eval_shared_models:
        model_loaders = {
            name: shared_model.model_loader
            for name, shared_model in eval_shared_models.items()
            if shared_model.include_default_metrics
        }
    else:
        model_loaders = None

    all_computations = metric_specs.to_computations(
        metrics_specs, eval_config=eval_config, model_loaders=model_loaders)
    computations, derived_computations = _filter_and_separate_computations(
        all_computations)

    # Add default metric computations. They are included unless the
    # include_default_metrics option is explicitly set to a false value.
    if model_loaders and eval_config:
        options = eval_config.options
        if (not options.HasField('include_default_metrics')
                or options.include_default_metrics.value):
            for model_name, model_loader in model_loaders.items():
                model_types = model_loader.construct_fn(lambda x: None)()
                # TODO(mdreves): Move handling of keras metrics to here.
                # Keras models are deliberately left alone for now; only
                # EvalSavedModels add computations.
                if (model_types.keras_model is None
                        and model_types.eval_saved_model is not None):
                    # Note that there is the possibility for metric naming
                    # collisions here (e.g. 'auc' calculated within the
                    # EvalSavedModel as well as by AUC metric computation
                    # performed outside the model). Currently all the
                    # overlapping metrics such as AUC that are computed
                    # outside the model are all derived metrics so they will
                    # override the metrics calculated by the model which is
                    # the desired behavior.
                    in_graph_computations = (
                        eval_saved_model_util.
                        metric_computations_using_eval_saved_model(
                            model_name, model_loader))
                    computations.extend(in_graph_computations)

    # pylint: disable=no-value-for-parameter

    # Preprocessing.
    # Input: Single extract per example (or list of extracts if query_key
    #        used) where each item contains slice keys and other extracts
    #        from upstream extractors (e.g. labels, predictions, etc).
    # Output: Single extract (per example) containing slice keys and initial
    #         combiner state returned from preprocessor. Even if a query_key
    #         was used the output is still only a single extract (though that
    #         extract may contain lists of values in its keys).
    #
    # The output stays in extract form (rather than a tuple of computation
    # outputs) because FanoutSlices takes extracts as input.
    preprocess = beam.ParDo(_PreprocessorDoFn(computations))
    preprocessed = extracts | 'Preprocesss' >> preprocess

    # Slice fanout.
    # Input: Single extract containing slice keys and initial combiner
    #        inputs.
    # Output: Tuple (slice key, combiner inputs extracts), replicated once
    #         per applicable slice key.
    slices = preprocessed | 'FanoutSlices' >> slicer.FanoutSlices()

    # Per-slice-key element counts, consumed by the small-slice filter below.
    slice_keys = slices | 'ExtractSliceKeys' >> beam.Keys()
    slices_count = (
        slice_keys
        | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

    # A single bootstrap sample is used when confidence intervals are off.
    if eval_config.options.compute_confidence_intervals.value:
        num_bootstrap_samples = poisson_bootstrap.DEFAULT_NUM_BOOTSTRAP_SAMPLES
    else:
        num_bootstrap_samples = 1

    # Per-slice computation.
    # Input: Tuple of (slice key, combiner input extracts).
    # Output: Tuple of (slice key, dict of computed metrics/plots). The dicts
    #         will be keyed by MetricKey/PlotKey and the values will be the
    #         result of the associated computations. A given
    #         MetricComputation can perform computations for multiple keys,
    #         but the keys should be unique across computations.
    compute_per_slice = poisson_bootstrap.ComputeWithConfidenceIntervals(
        _ComputePerSlice,
        computations=computations,
        derived_computations=derived_computations,
        num_bootstrap_samples=num_bootstrap_samples)
    sliced_metrics_and_plots = slices | 'ComputePerSlice' >> compute_per_slice

    # The small-slice filter is only applied when the configured
    # k-anonymization count is greater than one.
    k_anonymization_count = eval_config.options.k_anonymization_count.value
    if k_anonymization_count > 1:
        filter_small_slices = slicer.FilterOutSlices(slices_count,
                                                     k_anonymization_count)
        sliced_metrics_and_plots = (
            sliced_metrics_and_plots
            | 'FilteForSmallSlices' >> filter_small_slices)

    # Split the combined results by key type.
    sliced_metrics = (
        sliced_metrics_and_plots
        | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                        metric_types.MetricKey))
    sliced_plots = (
        sliced_metrics_and_plots
        | 'FilterByPlots' >> beam.Map(_filter_by_key_type,
                                      metric_types.PlotKey))

    # pylint: enable=no-value-for-parameter

    return {metrics_key: sliced_metrics, plots_key: sliced_plots}
Beispiel #9
0
    def _runMetricsAndPlotsEvaluatorManualActuation(self,
                                                    with_confidence_intervals,
                                                    metrics_specs=None):
        """Benchmark MetricsAndPlotsEvaluatorV2 by driving its pieces by hand.

        Inputs are extracted, predicted and unbatched first (untimed). The
        timed section then builds the computations, runs the preprocessor over
        every example and drives the combine fn through accumulate / merge /
        extract. Afterwards the resulting example count is sanity checked and
        the wall time is reported.

        Args:
          with_confidence_intervals: Passed to the combine fn as
            compute_with_sampling; also selects an approximate (within 10%)
            rather than exact example count check.
          metrics_specs: Metrics specs to benchmark. Falls back to the specs
            of self._eval_config when not given (or empty).

        Raises:
          ValueError: If example_count is missing from the output, or does not
            match the number of examples read.
        """
        self._init_model()
        if not metrics_specs:
            metrics_specs = self._eval_config.metrics_specs

        batched_extracts = self._readDatasetIntoBatchedExtracts()
        num_examples = sum(
            batch[constants.ARROW_RECORD_BATCH_KEY].num_rows
            for batch in batched_extracts)
        input_extracts = [
            batched_input_extractor._ExtractInputs(batch, self._eval_config)  # pylint: disable=protected-access
            for batch in batched_extracts
        ]

        prediction_do_fn = batched_predict_extractor_v2._BatchedPredictionDoFn(  # pylint: disable=protected-access
            eval_config=self._eval_config,
            eval_shared_models={"": self._eval_shared_model})
        prediction_do_fn.setup()

        # Predictions have to exist before anything can be evaluated.
        predicted_extracts = [
            prediction
            for batch in input_extracts
            for prediction in prediction_do_fn.process(batch)
        ]

        # Unbatch the predicted extracts.
        unbatched_extracts = [
            unbatched
            for batch in predicted_extracts
            for unbatched in unbatch_extractor._ExtractUnbatchedInputs(batch)  # pylint: disable=protected-access
        ]

        # Add global slice key.
        for extract in unbatched_extracts:
            extract[tfma.SLICE_KEY_TYPES_KEY] = ()

        # Now evaluate; everything between start and end is timed.
        inputs_per_accumulator = 1000
        start = time.time()

        # pylint: disable=protected-access
        all_computations = metric_specs.to_computations(
            metrics_specs, eval_config=self._eval_config)
        computations, _ = (
            metrics_plots_and_validations_evaluator.
            _filter_and_separate_computations(all_computations))
        # pylint: enable=protected-access

        # A fresh preprocessor DoFn is built for every element.
        processed = [
            next(
                metrics_plots_and_validations_evaluator._PreprocessorDoFn(  # pylint: disable=protected-access
                    computations).process(extract))
            for extract in unbatched_extracts
        ]

        combiner = metrics_plots_and_validations_evaluator._ComputationsCombineFn(  # pylint: disable=protected-access
            computations=computations,
            compute_with_sampling=with_confidence_intervals)

        # One accumulator per group of inputs_per_accumulator elements.
        accumulators = []
        for batch in benchmark_utils.batched_iterator(processed,
                                                      inputs_per_accumulator):
            batch_accumulator = combiner.create_accumulator()
            for item in batch:
                batch_accumulator = combiner.add_input(batch_accumulator, item)
            accumulators.append(batch_accumulator)

        merged_accumulator = combiner.merge_accumulators(accumulators)
        final_output = combiner.extract_output(merged_accumulator)
        end = time.time()

        # Sanity check the example count. This is not timed.
        example_count_key = metric_types.MetricKey(name="example_count")
        example_count = next(
            (metrics[example_count_key]
             for metrics in final_output
             if example_count_key in metrics), None)

        if example_count is None:
            raise ValueError(
                "example_count was not in the final list of metrics. "
                "metrics were: %s" % str(final_output))

        if with_confidence_intervals:
            # If we're computing using confidence intervals, the example count
            # will not be exact, so allow 10% either way.
            lower_bound = int(0.9 * num_examples)
            upper_bound = int(1.1 * num_examples)
            if example_count < lower_bound or example_count > upper_bound:
                raise ValueError("example count out of bounds: expecting "
                                 "%d < example_count < %d, but got %d" %
                                 (lower_bound, upper_bound, example_count))
        elif example_count != num_examples:
            # If we're not using confidence intervals, we expect the example
            # count to be exact.
            raise ValueError("example count mismatch: expecting %d got %d" %
                             (num_examples, example_count))

        benchmark_extras = {
            "inputs_per_accumulator": inputs_per_accumulator,
            "num_examples": num_examples
        }
        self.report_benchmark(iters=1,
                              wall_time=end - start,
                              extras=benchmark_extras)
def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
        extracts: beam.pvalue.PCollection,
        eval_config: config.EvalConfig,
        metrics_specs: List[config.MetricsSpec],
        eval_shared_models: List[types.EvalSharedModel],
        metrics_key: Text = constants.METRICS_KEY,
        plots_key: Text = constants.PLOTS_KEY) -> evaluator.Evaluation:
    """Wires up the Beam transforms that compute sliced metrics and plots.

    Args:
      extracts: PCollection of Extracts. If a query_key was used then the
        PCollection will contain a list of extracts.
      eval_config: Eval config.
      metrics_specs: Subset of the metric specs to compute metrics for. If a
        query_key was used all of the metric specs will be for the same
        query_key.
      eval_shared_models: Shared models. Their model loaders are handed to the
        metric computations keyed by each model's model_path.
      metrics_key: Name to use for metrics key in Evaluation output.
      plots_key: Name to use for plots key in Evaluation output.

    Returns:
      Evaluation containing dict of PCollections of (slice_key, results_dict)
      tuples where the dict is keyed by either the metrics_key (e.g.
      'metrics') or plots_key (e.g. 'plots') depending on what the
      results_dict contains.
    """
    # Later models win if two shared models report the same model_path.
    model_loaders = {}
    for shared_model in eval_shared_models:
        model_path = shared_model.model_path
        model_loaders[model_path] = shared_model.model_loader

    all_computations = metric_specs.to_computations(
        metrics_specs, eval_config=eval_config, model_loaders=model_loaders)
    computations, derived_computations = _filter_and_separate_computations(
        all_computations)

    # pylint: disable=no-value-for-parameter

    # Preprocessing.
    # Input: Single extract per example (or list of extracts if query_key
    #        used) where each item contains slice keys and other extracts
    #        from upstream extractors (e.g. labels, predictions, etc).
    # Output: Single extract (per example) containing slice keys and initial
    #         combiner state returned from preprocessor. Even if a query_key
    #         was used the output is still only a single extract (though that
    #         extract may contain lists of values in its keys).
    #
    # The output stays in extract form (rather than a tuple of computation
    # outputs) because FanoutSlices takes extracts as input.
    preprocess = beam.ParDo(_PreprocessorDoFn(computations))
    preprocessed = extracts | 'Preprocesss' >> preprocess

    # Slice fanout.
    # Input: Single extract containing slice keys and initial combiner
    #        inputs.
    # Output: Tuple (slice key, combiner inputs extracts), replicated once
    #         per applicable slice key.
    slices = preprocessed | 'FanoutSlices' >> slicer.FanoutSlices()

    # Per-slice-key element counts, consumed by the small-slice filter below.
    slice_keys = slices | 'ExtractSliceKeys' >> beam.Keys()
    slices_count = (
        slice_keys
        | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

    # A single bootstrap sample is used when confidence intervals are off.
    if eval_config.options.compute_confidence_intervals.value:
        num_bootstrap_samples = poisson_bootstrap.DEFAULT_NUM_BOOTSTRAP_SAMPLES
    else:
        num_bootstrap_samples = 1

    # Per-slice computation.
    # Input: Tuple of (slice key, combiner input extracts).
    # Output: Tuple of (slice key, dict of computed metrics/plots). The dicts
    #         will be keyed by MetricKey/PlotKey and the values will be the
    #         result of the associated computations. A given
    #         MetricComputation can perform computations for multiple keys,
    #         but the keys should be unique across computations.
    compute_per_slice = poisson_bootstrap.ComputeWithConfidenceIntervals(
        _ComputePerSlice,
        computations=computations,
        derived_computations=derived_computations,
        num_bootstrap_samples=num_bootstrap_samples)
    sliced_metrics_and_plots = slices | 'ComputePerSlice' >> compute_per_slice

    # The small-slice filter is only applied when the configured
    # k-anonymization count is greater than one.
    k_anonymization_count = eval_config.options.k_anonymization_count.value
    if k_anonymization_count > 1:
        filter_small_slices = slicer.FilterOutSlices(slices_count,
                                                     k_anonymization_count)
        sliced_metrics_and_plots = (
            sliced_metrics_and_plots
            | 'FilteForSmallSlices' >> filter_small_slices)

    # Split the combined results by key type.
    sliced_metrics = (
        sliced_metrics_and_plots
        | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                        metric_types.MetricKey))
    sliced_plots = (
        sliced_metrics_and_plots
        | 'FilterByPlots' >> beam.Map(_filter_by_key_type,
                                      metric_types.PlotKey))

    # pylint: enable=no-value-for-parameter

    return {metrics_key: sliced_metrics, plots_key: sliced_plots}