Code example #1
    def testSliceSpecBeamCounter(self):
        with beam.Pipeline() as pipeline:
            _ = (pipeline
                 | beam.Create([[[('slice_key', 'first_slice')]]])
                 | counter_util.IncrementSliceSpecCounters())

        result = pipeline.run()

        slice_spec_filter = beam.metrics.metric.MetricsFilter().with_namespace(
            constants.METRICS_NAMESPACE).with_name(
                'slice_computed_slice_key_first_slice')
        slice_count = result.metrics().query(
            filter=slice_spec_filter)['counters'][0].committed
        self.assertEqual(slice_count, 1)
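The test above exercises Beam's user metrics API: the transform increments a counter, and the test queries its committed value after the pipeline has run. Below is a minimal, generic sketch of that counter pattern, independent of counter_util; the namespace and counter name are illustrative placeholders.

# Minimal sketch of the Beam counter pattern used by the test above.
# The namespace and counter name are illustrative placeholders.
import apache_beam as beam
from apache_beam.metrics.metric import Metrics, MetricsFilter


def _count_element(element):
    Metrics.counter('example_namespace', 'elements_seen').inc()
    return element


pipeline = beam.Pipeline()
_ = pipeline | beam.Create([1, 2, 3]) | beam.Map(_count_element)
result = pipeline.run()
result.wait_until_finish()

counter_filter = MetricsFilter().with_namespace('example_namespace').with_name(
    'elements_seen')
committed = result.metrics().query(filter=counter_filter)['counters'][0].committed
print(committed)  # 3 on runners that report committed metrics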
Code example #2
def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_config: config_pb2.EvalConfig,
    metrics_specs: List[config_pb2.MetricsSpec],
    eval_shared_models: Optional[Dict[Text, types.EvalSharedModel]] = None,
    metrics_key: Text = constants.METRICS_KEY,
    plots_key: Text = constants.PLOTS_KEY,
    attributions_key: Text = constants.ATTRIBUTIONS_KEY,
    schema: Optional[schema_pb2.Schema] = None,
    random_seed_for_testing: Optional[int] = None,
    tensor_adapter_config: Optional[tensor_adapter.TensorAdapterConfig] = None
) -> evaluator.Evaluation:
  """Computes metrics and plots.

  Args:
    extracts: PCollection of Extracts. If a query_key was used then the
      PCollection will contain a list of extracts.
    eval_config: Eval config.
    metrics_specs: Subset of the metric specs to compute metrics for. If a
      query_key was used all of the metric specs will be for the same query_key.
    eval_shared_models: Optional dict of shared models keyed by model name. Only
      required if there are metrics to be computed in-graph using the model.
    metrics_key: Name to use for metrics key in Evaluation output.
    plots_key: Name to use for plots key in Evaluation output.
    attributions_key: Name to use for attributions key in Evaluation output.
    schema: A schema to use for customizing metrics and plots.
    random_seed_for_testing: Seed to use for unit testing.
    tensor_adapter_config: Tensor adapter config which specifies how to obtain
      tensors from the Arrow RecordBatch. The model's signature will be invoked
      with those tensors (matched by names). If None, an attempt will be made to
      create an adapter based on the model's input signature otherwise the model
      will be invoked with raw examples (assuming a signature of a single 1-D
      string tensor).

  Returns:
    Evaluation containing dict of PCollections of (slice_key, results_dict)
    tuples where the dict is keyed by either the metrics_key (e.g. 'metrics'),
    plots_key (e.g. 'plots'), or attributions_key (e.g. 'attributions')
    depending on what the results_dict contains.
  """
  computations = []
  # Add default metric computations
  if eval_shared_models:
    # Note that there is the possibility for metric naming collisions here
    # (e.g. 'auc' calculated within the model as well as by AUC metric
    # computation performed outside the model). Currently all the overlapping
    # metrics such as AUC that are computed outside the model are all derived
    # metrics so they will override the metrics calculated by the model which is
    # the desired behavior.
    for model_name, eval_shared_model in eval_shared_models.items():
      if not eval_shared_model.include_default_metrics:
        continue
      if eval_shared_model.model_type == constants.TF_KERAS:
        computations.extend(
            keras_util.metric_computations_using_keras_saved_model(
                model_name, eval_shared_model.model_loader, eval_config,
                tensor_adapter_config))
      elif (eval_shared_model.model_type == constants.TF_ESTIMATOR and
            eval_constants.EVAL_TAG in eval_shared_model.model_loader.tags):
        computations.extend(
            eval_saved_model_util.metric_computations_using_eval_saved_model(
                model_name, eval_shared_model.model_loader))
  # Add metric computations from specs
  metric_computations = _filter_and_separate_computations(
      metric_specs.to_computations(
          metrics_specs, eval_config=eval_config, schema=schema))
  computations.extend(metric_computations.non_derived_computations)

  # Find out which model is baseline.
  baseline_spec = model_util.get_baseline_model_spec(eval_config)
  baseline_model_name = baseline_spec.name if baseline_spec else None

  # pylint: disable=no-value-for-parameter

  # Input: Single extract per example (or list of extracts if query_key used)
  #        where each item contains slice keys and other extracts from upstream
  #        extractors (e.g. labels, predictions, etc).
  # Output: Single extract (per example) containing slice keys and initial
  #         combiner state returned from preprocessor. Note that even if a
  #         query_key was used the output is still only a single extract
  #         (though, that extract may contain lists of values (predictions,
  #         labels, etc) in its keys).
  #
  # Note that the output of this step is extracts instead of just a tuple of
  # computation outputs because FanoutSlices takes extracts as input (and in
  # many cases a subset of the extracts themselves are what is fanned out).
  extracts = (
      extracts
      | 'Preprocesss' >> beam.ParDo(_PreprocessorDoFn(computations)))

  # Input: Single extract containing slice keys and initial combiner inputs. If
  #        query_key is used the extract represents multiple examples with the
  #        same query_key, otherwise the extract represents a single example.
  # Output: Tuple (slice key, combiner inputs extracts). Notice that the per
  #         example (or list of examples if query_key used) input extract turns
  #         into n logical extracts, references to which are replicated once per
  #         applicable slice key.
  slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

  slices_count = (
      slices
      | 'ExtractSliceKeys' >> beam.Keys()
      | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

  model_types = _get_model_types_for_logging(eval_shared_models)

  _ = (
      extracts.pipeline
      | 'IncrementMetricsSpecsCounters' >>
      counter_util.IncrementMetricsSpecsCounters(metrics_specs, model_types),
      slices_count
      |
      'IncrementSliceSpecCounters' >> counter_util.IncrementSliceSpecCounters())

  ci_params = _get_confidence_interval_params(eval_config, metrics_specs)

  cross_slice_specs = []
  if eval_config.cross_slicing_specs:
    cross_slice_specs = eval_config.cross_slicing_specs

  computations_combine_fn = _ComputationsCombineFn(computations=computations)
  derived_metrics_ptransform = _AddDerivedCrossSliceAndDiffMetrics(
      metric_computations.derived_computations,
      metric_computations.cross_slice_computations, cross_slice_specs,
      baseline_model_name)

  # Input: Tuple of (slice key, combiner input extracts).
  # Output: Tuple of (slice key, dict of computed metrics/plots/attributions).
  #         The dicts will be keyed by MetricKey/PlotKey/AttributionsKey and the
  #         values will be the result of the associated computations. A given
  #         MetricComputation can perform computations for multiple keys, but
  #         the keys should be unique across computations.
  if ci_params.num_bootstrap_samples:
    sliced_metrics_plots_and_attributions = (
        slices | 'PoissonBootstrapConfidenceIntervals' >>
        poisson_bootstrap.ComputeWithConfidenceIntervals(
            computations_combine_fn=computations_combine_fn,
            derived_metrics_ptransform=derived_metrics_ptransform,
            num_bootstrap_samples=ci_params.num_bootstrap_samples,
            hot_key_fanout=_COMBINE_PER_SLICE_KEY_HOT_KEY_FANOUT,
            skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
            random_seed_for_testing=random_seed_for_testing))
  elif ci_params.num_jackknife_samples:
    sliced_metrics_plots_and_attributions = (
        slices
        | 'JackknifeConfidenceIntervals' >>
        jackknife.ComputeWithConfidenceIntervals(
            computations_combine_fn=computations_combine_fn,
            derived_metrics_ptransform=derived_metrics_ptransform,
            num_jackknife_samples=ci_params.num_jackknife_samples,
            skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
            random_seed_for_testing=random_seed_for_testing))
  else:
    sliced_metrics_plots_and_attributions = (
        slices
        |
        'CombineMetricsPerSlice' >> beam.CombinePerKey(computations_combine_fn)
        .with_hot_key_fanout(_COMBINE_PER_SLICE_KEY_HOT_KEY_FANOUT)
        | 'AddDerivedCrossSliceAndDiffMetrics' >> derived_metrics_ptransform)

  sliced_metrics_plots_and_attributions = (
      sliced_metrics_plots_and_attributions
      | 'AddCIDerivedMetrics' >> beam.Map(
          _add_ci_derived_metrics, metric_computations.ci_derived_computations))

  if eval_config.options.min_slice_size.value > 1:
    sliced_metrics_plots_and_attributions = (
        sliced_metrics_plots_and_attributions
        | 'FilterSmallSlices' >> slicer.FilterOutSlices(
            slices_count, eval_config.options.min_slice_size.value))

  sliced_metrics = (
      sliced_metrics_plots_and_attributions
      | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                      metric_types.MetricKey))
  sliced_plots = (
      sliced_metrics_plots_and_attributions
      | 'FilterByPlots' >> beam.Map(_filter_by_key_type, metric_types.PlotKey))

  sliced_attributions = (
      sliced_metrics_plots_and_attributions
      | 'FilterByAttributions' >> beam.Map(_filter_by_key_type,
                                           metric_types.AttributionsKey))

  # pylint: enable=no-value-for-parameter

  return {
      metrics_key: sliced_metrics,
      plots_key: sliced_plots,
      attributions_key: sliced_attributions
  }
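The three outputs above come from the same combined PCollection, split by the type of each result key. As a rough illustration only (this is an assumed sketch, not the TFMA implementation of _filter_by_key_type), such a helper could look like:

# Hypothetical sketch of a key-type filter: keep only the entries of a
# (slice_key, results) pair whose keys are instances of the given type.
def _filter_by_key_type_sketch(sliced_results, key_type):
    slice_key, results = sliced_results
    return slice_key, {k: v for k, v in results.items() if isinstance(k, key_type)}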
Code example #3
def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
        extracts: beam.pvalue.PCollection,
        eval_config: config.EvalConfig,
        metrics_specs: List[config.MetricsSpec],
        eval_shared_models: Optional[Dict[Text, types.EvalSharedModel]] = None,
        metrics_key: Text = constants.METRICS_KEY,
        plots_key: Text = constants.PLOTS_KEY,
        schema: Optional[schema_pb2.Schema] = None,
        random_seed_for_testing: Optional[int] = None) -> evaluator.Evaluation:
    """Computes metrics and plots.

  Args:
    extracts: PCollection of Extracts. If a query_key was used then the
      PCollection will contain a list of extracts.
    eval_config: Eval config.
    metrics_specs: Subset of the metric specs to compute metrics for. If a
      query_key was used all of the metric specs will be for the same query_key.
    eval_shared_models: Optional dict of shared models keyed by model name. Only
      required if there are metrics to be computed in-graph using the model.
    metrics_key: Name to use for metrics key in Evaluation output.
    plots_key: Name to use for plots key in Evaluation output.
    schema: A schema to use for customizing metrics and plots.
    random_seed_for_testing: Seed to use for unit testing.

  Returns:
    Evaluation containing dict of PCollections of (slice_key, results_dict)
    tuples where the dict is keyed by either the metrics_key (e.g. 'metrics') or
    plots_key (e.g. 'plots') depending on what the results_dict contains.
  """
    computations = []
    # Add default metric computations
    if eval_shared_models:
        for model_name, eval_shared_model in eval_shared_models.items():
            if not eval_shared_model.include_default_metrics:
                continue
            if eval_shared_model.model_type == constants.TF_KERAS:
                keras_specs = keras_util.metrics_specs_from_keras(
                    model_name, eval_shared_model.model_loader)
                metrics_specs = keras_specs + metrics_specs[:]
                # TODO(mdreves): Add support for calling keras.evaluate().
            elif (eval_shared_model.model_type == constants.TF_ESTIMATOR
                  and eval_constants.EVAL_TAG
                  in eval_shared_model.model_loader.tags):
                # Note that there is the possibility for metric naming collisions here
                # (e.g. 'auc' calculated within the EvalSavedModel as well as by AUC
                # metric computation performed outside the model). Currently all the
                # overlapping metrics such as AUC that are computed outside the model
                # are all derived metrics so they will override the metrics calculated
                # by the model which is the desired behavior.
                computations.extend(
                    eval_saved_model_util.
                    metric_computations_using_eval_saved_model(
                        model_name, eval_shared_model.model_loader))
    # Add metric computations from specs
    computations_from_specs, derived_computations = (
        _filter_and_separate_computations(
            metric_specs.to_computations(metrics_specs,
                                         eval_config=eval_config,
                                         schema=schema)))
    computations.extend(computations_from_specs)

    # Find out which model is baseline.
    baseline_spec = model_util.get_baseline_model_spec(eval_config)
    baseline_model_name = baseline_spec.name if baseline_spec else None

    # pylint: disable=no-value-for-parameter

    # Input: Single extract per example (or list of extracts if query_key used)
    #        where each item contains slice keys and other extracts from upstream
    #        extractors (e.g. labels, predictions, etc).
    # Output: Single extract (per example) containing slice keys and initial
    #         combiner state returned from preprocessor. Note that even if a
    #         query_key was used the output is still only a single extract
    #         (though, that extract may contain lists of values (predictions,
    #         labels, etc) in its keys).
    #
    # Note that the output of this step is extracts instead of just a tuple of
    # computation outputs because FanoutSlices takes extracts as input (and in
    # many cases a subset of the extracts themselves are what is fanned out).
    extracts = (extracts
                | 'Preprocesss' >> beam.ParDo(_PreprocessorDoFn(computations)))

    # Input: Single extract containing slice keys and initial combiner inputs. If
    #        query_key is used the extract represents multiple examples with the
    #        same query_key, otherwise the extract represents a single example.
    # Output: Tuple (slice key, combiner inputs extracts). Notice that the per
    #         example (or list of examples if query_key used) input extract turns
    #         into n logical extracts, references to which are replicated once per
    #         applicable slice key.
    slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

    slices_count = (slices
                    | 'ExtractSliceKeys' >> beam.Keys()
                    | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

    _ = (extracts.pipeline
         | 'IncrementMetricsSpecsCounters' >>
         counter_util.IncrementMetricsSpecsCounters(metrics_specs),
         slices_count
         | 'IncrementSliceSpecCounters' >>
         counter_util.IncrementSliceSpecCounters())

    ci_params = _get_confidence_interval_params(eval_config, metrics_specs)

    cross_slice_specs = []
    if eval_config.cross_slicing_specs:
        cross_slice_specs = eval_config.cross_slicing_specs

    # TODO(b/151482616): Make bootstrap and jackknife confidence interval
    # implementations more parallel.

    # Input: Tuple of (slice key, combiner input extracts).
    # Output: Tuple of (slice key, dict of computed metrics/plots). The dicts will
    #         be keyed by MetricKey/PlotKey and the values will be the result
    #         of the associated computations. A given MetricComputation can
    #         perform computations for multiple keys, but the keys should be
    #         unique across computations.
    sliced_metrics_and_plots = (
        slices
        |
        'ComputePerSlice' >> poisson_bootstrap.ComputeWithConfidenceIntervals(
            _ComputePerSlice,
            computations=computations,
            derived_computations=derived_computations,
            baseline_model_name=baseline_model_name,
            cross_slice_specs=cross_slice_specs,
            num_jackknife_samples=ci_params.num_jackknife_samples,
            num_bootstrap_samples=ci_params.num_bootstrap_samples,
            skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
            random_seed_for_testing=random_seed_for_testing))

    if eval_config.options.min_slice_size.value > 1:
        sliced_metrics_and_plots = (
            sliced_metrics_and_plots
            | 'FilterSmallSlices' >> slicer.FilterOutSlices(
                slices_count, eval_config.options.min_slice_size.value))

    sliced_metrics = (sliced_metrics_and_plots
                      | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                                      metric_types.MetricKey))
    sliced_plots = (
        sliced_metrics_and_plots
        |
        'FilterByPlots' >> beam.Map(_filter_by_key_type, metric_types.PlotKey))

    # pylint: enable=no-value-for-parameter

    return {metrics_key: sliced_metrics, plots_key: sliced_plots}
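The 'ComputePerSlice' step above is wrapped in poisson_bootstrap.ComputeWithConfidenceIntervals, which estimates metric uncertainty by re-weighting each slice's examples rather than materializing resampled datasets. The following toy sketch shows the Poisson-bootstrap idea in isolation; the data and the metric (a weighted mean) are made up for illustration.

# Toy Poisson bootstrap: each resample assigns every example an independent
# Poisson(1) weight, then recomputes the metric under those weights.
import numpy as np

rng = np.random.default_rng(seed=0)
values = np.array([0.2, 0.4, 0.6, 0.8, 1.0])  # per-example metric contributions

estimates = []
for _ in range(1000):
    weights = rng.poisson(lam=1.0, size=len(values))
    if weights.sum() == 0:
        continue  # skip degenerate resamples that drop every example
    estimates.append(np.average(values, weights=weights))

lower, upper = np.percentile(estimates, [2.5, 97.5])
print(f'point estimate={values.mean():.3f}, 95% CI=({lower:.3f}, {upper:.3f})')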
Code example #4
def ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_shared_model: types.EvalSharedModel,
    desired_batch_size: Optional[int] = None,
    compute_confidence_intervals: Optional[bool] = False,
    random_seed_for_testing: Optional[int] = None
) -> Tuple[beam.pvalue.DoOutputsTuple, beam.pvalue.PCollection]:
    """Computes metrics and plots using the EvalSavedModel.

  Args:
    extracts: PCollection of Extracts. The extracts MUST contain a
      FeaturesPredictionsLabels extract keyed by
      tfma.FEATURE_PREDICTIONS_LABELS_KEY and a list of SliceKeyType extracts
      keyed by tfma.SLICE_KEY_TYPES_KEY. Typically these will be added by
      calling the default_extractors function.
    eval_shared_model: Shared model parameters for EvalSavedModel including any
      additional metrics (see EvalSharedModel for more information on how to
      configure additional metrics).
    desired_batch_size: Optional batch size for batching in Aggregate.
    compute_confidence_intervals: Set to True to run metrics analysis over
      multiple bootstrap samples and compute uncertainty intervals.
    random_seed_for_testing: Provide for deterministic tests only.

  Returns:
    Tuple of Tuple[PCollection of (slice key, metrics),
    PCollection of (slice key, plot metrics)] and
    PCollection of (slice_key and its example count).
  """
    # pylint: disable=no-value-for-parameter

    slices = (
        extracts
        # Downstream computation only cares about FPLs, so we prune before fanout.
        # Note that fanout itself will prune the slice keys.
        # TODO(b/130032676, b/111353165): Prune FPLs to contain only the necessary
        # set for the calculation of post_export_metrics if possible.
        | 'PruneExtracts' >> extractor.Filter(include=[
            constants.FEATURES_PREDICTIONS_LABELS_KEY,
            constants.SLICE_KEY_TYPES_KEY,
            constants.INPUT_KEY,
        ])
        # Input: one example at a time, with slice keys in extracts.
        # Output: one fpl example per slice key (notice that the example turns
        #         into n logical examples, references to which are replicated once
        #         per applicable slice key).
        | 'FanoutSlices' >> slicer.FanoutSlices())

    slices_count = (slices
                    | 'ExtractSliceKeys' >> beam.Keys()
                    | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

    _ = (extracts.pipeline
         | 'IncrementMetricsCallbacksCounters' >>
         counter_util.IncrementMetricsCallbacksCounters(
             eval_shared_model.add_metrics_callbacks), slices_count
         | 'IncreamentSliceSpecCounters' >>
         counter_util.IncrementSliceSpecCounters())

    aggregated_metrics = (
        slices
        # Metrics are computed per slice key.
        # Output: Multi-outputs, a dict of slice key to computed metrics, and
        # plots if applicable.
        | 'ComputePerSliceMetrics' >>
        poisson_bootstrap.ComputeWithConfidenceIntervals(
            aggregate.ComputePerSliceMetrics,
            num_bootstrap_samples=(
                poisson_bootstrap.DEFAULT_NUM_BOOTSTRAP_SAMPLES
                if compute_confidence_intervals else 1),
            random_seed_for_testing=random_seed_for_testing,
            eval_shared_model=eval_shared_model,
            desired_batch_size=desired_batch_size)
        | 'SeparateMetricsAndPlots' >> beam.ParDo(
            _SeparateMetricsAndPlotsFn()).with_outputs(
                _SeparateMetricsAndPlotsFn.OUTPUT_TAG_PLOTS,
                main=_SeparateMetricsAndPlotsFn.OUTPUT_TAG_METRICS))

    return (aggregated_metrics, slices_count)
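The 'SeparateMetricsAndPlots' step splits a single PCollection into a main metrics output and a tagged plots output. A minimal, generic sketch of Beam's multi-output ParDo pattern follows; the tags and the routing condition are placeholders rather than what _SeparateMetricsAndPlotsFn actually does.

# Generic multi-output ParDo sketch: route each element either to the main
# output or to a tagged side output. Tags and routing logic are illustrative.
import apache_beam as beam
from apache_beam import pvalue


class _SplitFn(beam.DoFn):
    PLOTS_TAG = 'plots'

    def process(self, element):
        slice_key, results = element
        if results.get('is_plot'):
            yield pvalue.TaggedOutput(self.PLOTS_TAG, (slice_key, results))
        else:
            yield (slice_key, results)  # main output


with beam.Pipeline() as p:
    outputs = (
        p
        | beam.Create([('slice_a', {'is_plot': False}),
                       ('slice_b', {'is_plot': True})])
        | beam.ParDo(_SplitFn()).with_outputs(_SplitFn.PLOTS_TAG, main='metrics'))
    metrics, plots = outputs.metrics, outputs[_SplitFn.PLOTS_TAG]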