Example #1
    def testSliceOneSlice(self):
        with beam.Pipeline() as pipeline:
            fpls = create_fpls()
            metrics = (
                pipeline
                | 'CreateTestInput' >> beam.Create(fpls, reshuffle=False)
                | 'WrapFpls' >> beam.Map(wrap_fpl)
                | 'ExtractSlices' >> slice_key_extractor.ExtractSliceKeys([
                    slicer.SingleSliceSpec(),
                    slicer.SingleSliceSpec(columns=['gender'])
                ])
                | 'FanoutSlices' >> slicer.FanoutSlices())

            def check_result(got):
                try:
                    self.assertLen(got, 4)
                    expected_result = [
                        ((), wrap_fpl(fpls[0])),
                        ((), wrap_fpl(fpls[1])),
                        ((('gender', 'f'), ), wrap_fpl(fpls[0])),
                        ((('gender', 'm'), ), wrap_fpl(fpls[1])),
                    ]
                    self.assertCountEqual(got, expected_result)
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics, check_result)
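The test above exercises slicer.FanoutSlices(), which turns each extract carrying slice keys into one (slice_key, extract) pair per applicable key; the overall slice is represented by the empty tuple (). As a rough illustration of that fanout behavior only (not the real FanoutSlices transform, and using simplified (example_id, slice_keys) pairs instead of TFMA extracts), a plain-Beam sketch could look like this:

import apache_beam as beam
from apache_beam.testing import util


def _fan_out(element):
    # One output element per slice key carried by the input element.
    example_id, slice_keys = element
    for slice_key in slice_keys:
        yield (slice_key, example_id)


with beam.Pipeline() as pipeline:
    fanned_out = (
        pipeline
        | beam.Create([
            ('example_a', [(), (('gender', 'f'),)]),
            ('example_b', [(), (('gender', 'm'),)]),
        ])
        | beam.FlatMap(_fan_out))

    # Two inputs with two slice keys each yield four (slice_key, id) pairs.
    util.assert_that(
        fanned_out,
        util.equal_to([
            ((), 'example_a'),
            ((), 'example_b'),
            ((('gender', 'f'),), 'example_a'),
            ((('gender', 'm'),), 'example_b'),
        ]))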
Example #2
    def testMultidimOverallSlices(self):
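        # Two batched extracts whose rows carry only the overall slice key ().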
        data = [{
            constants.SLICE_KEY_TYPES_KEY: np.array([[()], [()]])
        }, {
            constants.SLICE_KEY_TYPES_KEY: np.array([[()], [()]])
        }]

        with beam.Pipeline() as pipeline:
            result = (pipeline
                      | 'CreateTestInput' >> beam.Create(data, reshuffle=False)
                      | 'FanoutSlices' >> slicer.FanoutSlices())

            def check_result(got):
                try:
                    self.assertLen(got, 2)
                    del data[0][constants.SLICE_KEY_TYPES_KEY]
                    del data[1][constants.SLICE_KEY_TYPES_KEY]
                    expected_result = [
                        ((), data[0]),
                        ((), data[1]),
                    ]
                    self.assertCountEqual(got, expected_result)
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result)
Example #3
    def testSliceOnMetaFeature(self):
        # We want to make sure that slicing on the newly added feature works,
        # so we pull in the slicing steps here.
        with beam.Pipeline() as pipeline:
            fpls = create_fpls()
            metrics = (
                pipeline
                | 'CreateTestInput' >> beam.Create(fpls)
                | 'WrapFpls' >> beam.Map(wrap_fpl)
                | 'ExtractInterestsNum' >>
                meta_feature_extractor.ExtractMetaFeature(get_num_interests)
                | 'ExtractSlices' >> slice_key_extractor._ExtractSliceKeys([
                    slicer.SingleSliceSpec(),
                    slicer.SingleSliceSpec(columns=['num_interests'])
                ])
                | 'FanoutSlices' >> slicer.FanoutSlices())

            def check_result(got):
                try:
                    self.assertEqual(4, len(got), 'got: %s' % got)
                    expected_slice_keys = [
                        (),
                        (),
                        (('num_interests', 1), ),
                        (('num_interests', 2), ),
                    ]
                    self.assertEqual(sorted(slice_key for slice_key, _ in got),
                                     sorted(expected_slice_keys))
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics, check_result)
Example #4
    def testSliceDefaultSlice(self):
        with beam.Pipeline() as pipeline:
            fpls = create_fpls()

            metrics = (pipeline
                       | 'CreateTestInput' >> beam.Create(fpls)
                       | 'WrapFpls' >> beam.Map(wrap_fpl)
                       |
                       'ExtractSlices' >> slice_key_extractor.ExtractSliceKeys(
                           [slicer.SingleSliceSpec()])
                       | 'FanoutSlices' >> slicer.FanoutSlices())

            def check_result(got):
                try:
                    self.assertLen(got, 2)
                    expected_result = [
                        ((), wrap_fpl(fpls[0])),
                        ((), wrap_fpl(fpls[1])),
                    ]
                    self.assertEqual(len(got), len(expected_result))
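                    # Output order is not guaranteed, so accept the two
                    # expected pairs in either order.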
                    self.assertTrue(got[0] == expected_result[0]
                                    and got[1] == expected_result[1]
                                    or got[1] == expected_result[0]
                                    and got[0] == expected_result[1])
                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(metrics, check_result)
Example #5
  def testMultidimSlices(self):
    data = [{
        'features': {
            'gender': [['f'], ['f']],
            'age': [[13], [13]],
            'interest': [['cars'], ['cars']]
        },
        'predictions': [[1], [1]],
        'labels': [[0], [0]],
        constants.SLICE_KEY_TYPES_KEY:
            np.array([
                slicer.slice_keys_to_numpy_array([(), (('gender', 'f'),)]),
                slicer.slice_keys_to_numpy_array([(), (('gender', 'f'),)])
            ])
    }, {
        'features': {
            'gender': [['f'], ['m']],
            'age': [[13], [10]],
            'interest': [['cars'], ['cars']]
        },
        'predictions': [[1], [1]],
        'labels': [[0], [0]],
        constants.SLICE_KEY_TYPES_KEY:
            np.array([
                slicer.slice_keys_to_numpy_array([(), (('gender', 'f'),)]),
                slicer.slice_keys_to_numpy_array([(), (('gender', 'm'),)])
            ])
    }]

    with beam.Pipeline() as pipeline:
      result = (
          pipeline
          | 'CreateTestInput' >> beam.Create(data, reshuffle=False)
          | 'FanoutSlices' >> slicer.FanoutSlices())

      def check_result(got):
        try:
          self.assertLen(got, 5)
          del data[0][constants.SLICE_KEY_TYPES_KEY]
          del data[1][constants.SLICE_KEY_TYPES_KEY]
          expected_result = [
              ((), data[0]),
              ((), data[1]),
              ((('gender', 'f'),), data[0]),
              ((('gender', 'f'),), data[1]),
              ((('gender', 'm'),), data[1]),
          ]
          self.assertCountEqual(got, expected_result)
        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result)
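In this batched (multidimensional) test each row of an extract carries its own array of slice keys, yet FanoutSlices emits each distinct key only once per extract, which is why two extracts with four row-level keys each produce five outputs rather than eight. A small plain-Python sketch of the slice-key format and of that per-extract deduplication (illustrative only; the real transform operates on TFMA extracts and numpy arrays):

# Slice keys are tuples of (column, value) pairs; the overall slice is ().
overall = ()
gender_f = (('gender', 'f'),)
gender_m = (('gender', 'm'),)

# Row-level slice keys for the two batched extracts used in the test above.
extract_0_rows = [[overall, gender_f], [overall, gender_f]]
extract_1_rows = [[overall, gender_f], [overall, gender_m]]


def distinct_keys(rows):
    # Deduplicate across rows; slice keys are tuples, so they hash as-is.
    return {key for row in rows for key in row}


# 2 distinct keys for the first extract plus 3 for the second: 5 fanned-out
# outputs in total, matching assertLen(got, 5) above.
assert distinct_keys(extract_0_rows) == {overall, gender_f}
assert distinct_keys(extract_1_rows) == {overall, gender_f, gender_m}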
Example #6
  def testMultidimOverallSlices(self):
    data = [
        {
            constants.SLICE_KEY_TYPES_KEY:  # variable length batch case
                types.VarLenTensorValue.from_dense_rows([
                    slicer.slice_keys_to_numpy_array([(('gender', 'f'),), ()]),
                    slicer.slice_keys_to_numpy_array([()])
                ])
        },
        {
            constants.SLICE_KEY_TYPES_KEY:  # fixed length batch case
                np.array([
                    slicer.slice_keys_to_numpy_array([()]),
                    slicer.slice_keys_to_numpy_array([()])
                ])
        }
    ]

    with beam.Pipeline() as pipeline:
      result = (
          pipeline
          | 'CreateTestInput' >> beam.Create(data, reshuffle=False)
          | 'FanoutSlices' >> slicer.FanoutSlices())

      def check_result(got):
        try:
          del data[0][constants.SLICE_KEY_TYPES_KEY]
          del data[1][constants.SLICE_KEY_TYPES_KEY]
          expected_result = [
              ((('gender', 'f'),), data[0]),
              ((), data[0]),
              ((), data[1]),
          ]
          self.assertCountEqual(got, expected_result)
        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result)
Example #7
def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_config: config_pb2.EvalConfig,
    metrics_specs: List[config_pb2.MetricsSpec],
    eval_shared_models: Optional[Dict[Text, types.EvalSharedModel]] = None,
    metrics_key: Text = constants.METRICS_KEY,
    plots_key: Text = constants.PLOTS_KEY,
    attributions_key: Text = constants.ATTRIBUTIONS_KEY,
    schema: Optional[schema_pb2.Schema] = None,
    random_seed_for_testing: Optional[int] = None,
    tensor_adapter_config: Optional[tensor_adapter.TensorAdapterConfig] = None
) -> evaluator.Evaluation:
  """Computes metrics and plots.

  Args:
    extracts: PCollection of Extracts. If a query_key was used then the
      PCollection will contain a list of extracts.
    eval_config: Eval config.
    metrics_specs: Subset of the metric specs to compute metrics for. If a
      query_key was used all of the metric specs will be for the same query_key.
    eval_shared_models: Optional dict of shared models keyed by model name. Only
      required if there are metrics to be computed in-graph using the model.
    metrics_key: Name to use for metrics key in Evaluation output.
    plots_key: Name to use for plots key in Evaluation output.
    attributions_key: Name to use for attributions key in Evaluation output.
    schema: A schema to use for customizing metrics and plots.
    random_seed_for_testing: Seed to use for unit testing.
    tensor_adapter_config: Tensor adapter config which specifies how to obtain
      tensors from the Arrow RecordBatch. The model's signature will be invoked
      with those tensors (matched by names). If None, an attempt will be made to
      create an adapter based on the model's input signature otherwise the model
      will be invoked with raw examples (assuming a signature of a single 1-D
      string tensor).

  Returns:
    Evaluation containing dict of PCollections of (slice_key, results_dict)
    tuples where the dict is keyed by either the metrics_key (e.g. 'metrics'),
    plots_key (e.g. 'plots'), or attributions_key (e.g. 'attributions')
    depending on what the results_dict contains.
  """
  computations = []
  # Add default metric computations
  if eval_shared_models:
    # Note that there is the possibility for metric naming collisions here
    # (e.g. 'auc' calculated within the model as well as by AUC metric
    # computation performed outside the model). Currently all the overlapping
    # metrics such as AUC that are computed outside the model are all derived
    # metrics so they will override the metrics calculated by the model which is
    # the desired behavior.
    for model_name, eval_shared_model in eval_shared_models.items():
      if not eval_shared_model.include_default_metrics:
        continue
      if eval_shared_model.model_type == constants.TF_KERAS:
        computations.extend(
            keras_util.metric_computations_using_keras_saved_model(
                model_name, eval_shared_model.model_loader, eval_config,
                tensor_adapter_config))
      elif (eval_shared_model.model_type == constants.TF_ESTIMATOR and
            eval_constants.EVAL_TAG in eval_shared_model.model_loader.tags):
        computations.extend(
            eval_saved_model_util.metric_computations_using_eval_saved_model(
                model_name, eval_shared_model.model_loader))
  # Add metric computations from specs
  metric_computations = _filter_and_separate_computations(
      metric_specs.to_computations(
          metrics_specs, eval_config=eval_config, schema=schema))
  computations.extend(metric_computations.non_derived_computations)

  # Find out which model is baseline.
  baseline_spec = model_util.get_baseline_model_spec(eval_config)
  baseline_model_name = baseline_spec.name if baseline_spec else None

  # pylint: disable=no-value-for-parameter

  # Input: Single extract per example (or list of extracts if query_key used)
  #        where each item contains slice keys and other extracts from upstream
  #        extractors (e.g. labels, predictions, etc).
  # Output: Single extract (per example) containing slice keys and initial
  #         combiner state returned from preprocessor. Note that even if a
  #         query_key was used the output is still only a single extract
  #         (though, that extract may contain lists of values (predictions,
  #         labels, etc) in its keys).
  #
  # Note that the output of this step is extracts instead of just a tuple of
  # computation outputs because FanoutSlices takes extracts as input (and in
  # many cases a subset of the extracts themselves are what is fanned out).
  extracts = (
      extracts
      | 'Preprocesss' >> beam.ParDo(_PreprocessorDoFn(computations)))

  # Input: Single extract containing slice keys and initial combiner inputs. If
  #        query_key is used the extract represents multiple examples with the
  #        same query_key, otherwise the extract represents a single example.
  # Output: Tuple (slice key, combiner inputs extracts). Notice that the per
  #         example (or list of examples if query_key used) input extract turns
  #         into n logical extracts, references to which are replicated once per
  #         applicable slice key.
  slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

  slices_count = (
      slices
      | 'ExtractSliceKeys' >> beam.Keys()
      | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

  model_types = _get_model_types_for_logging(eval_shared_models)

  _ = (
      extracts.pipeline
      | 'IncrementMetricsSpecsCounters' >>
      counter_util.IncrementMetricsSpecsCounters(metrics_specs, model_types),
      slices_count
      |
      'IncrementSliceSpecCounters' >> counter_util.IncrementSliceSpecCounters())

  ci_params = _get_confidence_interval_params(eval_config, metrics_specs)

  cross_slice_specs = []
  if eval_config.cross_slicing_specs:
    cross_slice_specs = eval_config.cross_slicing_specs

  computations_combine_fn = _ComputationsCombineFn(computations=computations)
  derived_metrics_ptransform = _AddDerivedCrossSliceAndDiffMetrics(
      metric_computations.derived_computations,
      metric_computations.cross_slice_computations, cross_slice_specs,
      baseline_model_name)

  # Input: Tuple of (slice key, combiner input extracts).
  # Output: Tuple of (slice key, dict of computed metrics/plots/attributions).
  #         The dicts will be keyed by MetricKey/PlotKey/AttributionsKey and the
  #         values will be the result of the associated computations. A given
  #         MetricComputation can perform computations for multiple keys, but
  #         the keys should be unique across computations.
  if ci_params.num_bootstrap_samples:
    sliced_metrics_plots_and_attributions = (
        slices | 'PoissonBootstrapConfidenceIntervals' >>
        poisson_bootstrap.ComputeWithConfidenceIntervals(
            computations_combine_fn=computations_combine_fn,
            derived_metrics_ptransform=derived_metrics_ptransform,
            num_bootstrap_samples=ci_params.num_bootstrap_samples,
            hot_key_fanout=_COMBINE_PER_SLICE_KEY_HOT_KEY_FANOUT,
            skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
            random_seed_for_testing=random_seed_for_testing))
  elif ci_params.num_jackknife_samples:
    sliced_metrics_plots_and_attributions = (
        slices
        | 'JackknifeConfidenceIntervals' >>
        jackknife.ComputeWithConfidenceIntervals(
            computations_combine_fn=computations_combine_fn,
            derived_metrics_ptransform=derived_metrics_ptransform,
            num_jackknife_samples=ci_params.num_jackknife_samples,
            skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
            random_seed_for_testing=random_seed_for_testing))
  else:
    sliced_metrics_plots_and_attributions = (
        slices
        |
        'CombineMetricsPerSlice' >> beam.CombinePerKey(computations_combine_fn)
        .with_hot_key_fanout(_COMBINE_PER_SLICE_KEY_HOT_KEY_FANOUT)
        | 'AddDerivedCrossSliceAndDiffMetrics' >> derived_metrics_ptransform)

  sliced_metrics_plots_and_attributions = (
      sliced_metrics_plots_and_attributions
      | 'AddCIDerivedMetrics' >> beam.Map(
          _add_ci_derived_metrics, metric_computations.ci_derived_computations))

  if eval_config.options.min_slice_size.value > 1:
    sliced_metrics_plots_and_attributions = (
        sliced_metrics_plots_and_attributions
        | 'FilterSmallSlices' >> slicer.FilterOutSlices(
            slices_count, eval_config.options.min_slice_size.value))

  sliced_metrics = (
      sliced_metrics_plots_and_attributions
      | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                      metric_types.MetricKey))
  sliced_plots = (
      sliced_metrics_plots_and_attributions
      | 'FilterByPlots' >> beam.Map(_filter_by_key_type, metric_types.PlotKey))

  sliced_attributions = (
      sliced_metrics_plots_and_attributions
      | 'FilterByAttributions' >> beam.Map(_filter_by_key_type,
                                           metric_types.AttributionsKey))

  # pylint: enable=no-value-for-parameter

  return {
      metrics_key: sliced_metrics,
      plots_key: sliced_plots,
      attributions_key: sliced_attributions
  }
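After FanoutSlices, and when no confidence intervals are requested, the pipeline above aggregates with 'CombineMetricsPerSlice', a per-slice-key combine that uses a hot-key fanout so heavily populated slices (notably the overall slice) do not bottleneck a single worker. The sketch below only mirrors the shape of that step: a plain per-slice count stands in for the real _ComputationsCombineFn, and the (slice_key, value) inputs are made up.

import apache_beam as beam
from apache_beam.testing import util

# Hypothetical stand-in inputs: (slice_key, combiner_input) pairs as produced
# by FanoutSlices.
slices = [
    ((), {'prediction': 0.9}),
    ((), {'prediction': 0.2}),
    ((('gender', 'f'),), {'prediction': 0.9}),
    ((('gender', 'm'),), {'prediction': 0.2}),
]

with beam.Pipeline() as pipeline:
    per_slice_counts = (
        pipeline
        | beam.Create(slices)
        # Combine everything that shares a slice key; with_hot_key_fanout
        # mirrors the hot-key fanout used by the real pipeline so that a
        # single heavy key is pre-aggregated across workers.
        | beam.CombinePerKey(beam.combiners.CountCombineFn())
        .with_hot_key_fanout(16))

    util.assert_that(
        per_slice_counts,
        util.equal_to([
            ((), 2),
            ((('gender', 'f'),), 1),
            ((('gender', 'm'),), 1),
        ]))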
Example #8
def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
        extracts: beam.pvalue.PCollection,
        eval_config: config.EvalConfig,
        metrics_specs: List[config.MetricsSpec],
        eval_shared_models: Optional[Dict[Text, types.EvalSharedModel]] = None,
        metrics_key: Text = constants.METRICS_KEY,
        plots_key: Text = constants.PLOTS_KEY,
        schema: Optional[schema_pb2.Schema] = None,
        random_seed_for_testing: Optional[int] = None) -> evaluator.Evaluation:
    """Computes metrics and plots.

  Args:
    extracts: PCollection of Extracts. If a query_key was used then the
      PCollection will contain a list of extracts.
    eval_config: Eval config.
    metrics_specs: Subset of the metric specs to compute metrics for. If a
      query_key was used all of the metric specs will be for the same query_key.
    eval_shared_models: Optional dict of shared models keyed by model name. Only
      required if there are metrics to be computed in-graph using the model.
    metrics_key: Name to use for metrics key in Evaluation output.
    plots_key: Name to use for plots key in Evaluation output.
    schema: A schema to use for customizing metrics and plots.
    random_seed_for_testing: Seed to use for unit testing.

  Returns:
    Evaluation containing dict of PCollections of (slice_key, results_dict)
    tuples where the dict is keyed by either the metrics_key (e.g. 'metrics') or
    plots_key (e.g. 'plots') depending on what the results_dict contains.
  """
    computations = []
    # Add default metric computations
    if eval_shared_models:
        for model_name, eval_shared_model in eval_shared_models.items():
            if not eval_shared_model.include_default_metrics:
                continue
            if eval_shared_model.model_type == constants.TF_KERAS:
                keras_specs = keras_util.metrics_specs_from_keras(
                    model_name, eval_shared_model.model_loader)
                metrics_specs = keras_specs + metrics_specs[:]
                # TODO(mdreves): Add support for calling keras.evaluate().
            elif (eval_shared_model.model_type == constants.TF_ESTIMATOR
                  and eval_constants.EVAL_TAG
                  in eval_shared_model.model_loader.tags):
                # Note that there is the possibility for metric naming collisions here
                # (e.g. 'auc' calculated within the EvalSavedModel as well as by AUC
                # metric computation performed outside the model). Currently all the
                # overlapping metrics such as AUC that are computed outside the model
                # are all derived metrics so they will override the metrics calculated
                # by the model which is the desired behavior.
                computations.extend(
                    eval_saved_model_util.
                    metric_computations_using_eval_saved_model(
                        model_name, eval_shared_model.model_loader))
    # Add metric computations from specs
    computations_from_specs, derived_computations = (
        _filter_and_separate_computations(
            metric_specs.to_computations(metrics_specs,
                                         eval_config=eval_config,
                                         schema=schema)))
    computations.extend(computations_from_specs)

    # Find out which model is baseline.
    baseline_spec = model_util.get_baseline_model_spec(eval_config)
    baseline_model_name = baseline_spec.name if baseline_spec else None

    # pylint: disable=no-value-for-parameter

    # Input: Single extract per example (or list of extracts if query_key used)
    #        where each item contains slice keys and other extracts from upstream
    #        extractors (e.g. labels, predictions, etc).
    # Output: Single extract (per example) containing slice keys and initial
    #         combiner state returned from preprocessor. Note that even if a
    #         query_key was used the output is still only a single extract
    #         (though, that extract may contain lists of values (predictions,
    #         labels, etc) in its keys).
    #
    # Note that the output of this step is extracts instead of just a tuple of
    # computation outputs because FanoutSlices takes extracts as input (and in
    # many cases a subset of the extracts themselves are what is fanned out).
    extracts = (extracts
                | 'Preprocesss' >> beam.ParDo(_PreprocessorDoFn(computations)))

    # Input: Single extract containing slice keys and initial combiner inputs. If
    #        query_key is used the extract represents multiple examples with the
    #        same query_key, otherwise the extract represents a single example.
    # Output: Tuple (slice key, combiner inputs extracts). Notice that the per
    #         example (or list of examples if query_key used) input extract turns
    #         into n logical extracts, references to which are replicated once per
    #         applicable slice key.
    slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

    slices_count = (slices
                    | 'ExtractSliceKeys' >> beam.Keys()
                    | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

    _ = (extracts.pipeline
         | 'IncrementMetricsSpecsCounters' >>
         counter_util.IncrementMetricsSpecsCounters(metrics_specs),
         slices_count
         | 'IncrementSliceSpecCounters' >>
         counter_util.IncrementSliceSpecCounters())

    ci_params = _get_confidence_interval_params(eval_config, metrics_specs)

    cross_slice_specs = []
    if eval_config.cross_slicing_specs:
        cross_slice_specs = eval_config.cross_slicing_specs

    # TODO(b/151482616): Make bootstrap and jackknife confidence interval
    # implementations more parallel.

    # Input: Tuple of (slice key, combiner input extracts).
    # Output: Tuple of (slice key, dict of computed metrics/plots). The dicts will
    #         be keyed by MetricKey/PlotKey and the values will be the result
    #         of the associated computations. A given MetricComputation can
    #         perform computations for multiple keys, but the keys should be
    #         unique across computations.
    sliced_metrics_and_plots = (
        slices
        |
        'ComputePerSlice' >> poisson_bootstrap.ComputeWithConfidenceIntervals(
            _ComputePerSlice,
            computations=computations,
            derived_computations=derived_computations,
            baseline_model_name=baseline_model_name,
            cross_slice_specs=cross_slice_specs,
            num_jackknife_samples=ci_params.num_jackknife_samples,
            num_bootstrap_samples=ci_params.num_bootstrap_samples,
            skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
            random_seed_for_testing=random_seed_for_testing))

    if eval_config.options.min_slice_size.value > 1:
        sliced_metrics_and_plots = (
            sliced_metrics_and_plots
            | 'FilterSmallSlices' >> slicer.FilterOutSlices(
                slices_count, eval_config.options.min_slice_size.value))

    sliced_metrics = (sliced_metrics_and_plots
                      | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                                      metric_types.MetricKey))
    sliced_plots = (
        sliced_metrics_and_plots
        |
        'FilterByPlots' >> beam.Map(_filter_by_key_type, metric_types.PlotKey))

    # pylint: enable=no-value-for-parameter

    return {metrics_key: sliced_metrics, plots_key: sliced_plots}
Example #9
def ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_shared_model: types.EvalSharedModel,
    desired_batch_size: Optional[int] = None,
    compute_confidence_intervals: Optional[bool] = False,
    random_seed_for_testing: Optional[int] = None
) -> Tuple[beam.pvalue.DoOutputsTuple, beam.pvalue.PCollection]:
    """Computes metrics and plots using the EvalSavedModel.

  Args:
    extracts: PCollection of Extracts. The extracts MUST contain a
      FeaturesPredictionsLabels extract keyed by
      tfma.FEATURE_PREDICTIONS_LABELS_KEY and a list of SliceKeyType extracts
      keyed by tfma.SLICE_KEY_TYPES_KEY. Typically these will be added by
      calling the default_extractors function.
    eval_shared_model: Shared model parameters for EvalSavedModel including any
      additional metrics (see EvalSharedModel for more information on how to
      configure additional metrics).
    desired_batch_size: Optional batch size for batching in Aggregate.
    compute_confidence_intervals: Set to True to run metrics analysis over
      multiple bootstrap samples and compute uncertainty intervals.
    random_seed_for_testing: Provide for deterministic tests only.

  Returns:
    Tuple of Tuple[PCollection of (slice key, metrics),
    PCollection of (slice key, plot metrics)] and
    PCollection of (slice_key and its example count).
  """
    # pylint: disable=no-value-for-parameter

    slices = (
        extracts
        # Downstream computation only cares about FPLs, so we prune before fanout.
        # Note that fanout itself will prune the slice keys.
        # TODO(b/130032676, b/111353165): Prune FPLs to contain only the necessary
        # set for the calculation of post_export_metrics if possible.
        | 'PruneExtracts' >> extractor.Filter(include=[
            constants.FEATURES_PREDICTIONS_LABELS_KEY,
            constants.SLICE_KEY_TYPES_KEY,
            constants.INPUT_KEY,
        ])
        # Input: one example at a time, with slice keys in extracts.
        # Output: one fpl example per slice key (notice that the example turns
        #         into n logical examples, references to which are replicated once
        #         per applicable slice key).
        | 'FanoutSlices' >> slicer.FanoutSlices())

    slices_count = (slices
                    | 'ExtractSliceKeys' >> beam.Keys()
                    | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

    _ = (extracts.pipeline
         | 'IncrementMetricsCallbacksCounters' >>
         counter_util.IncrementMetricsCallbacksCounters(
             eval_shared_model.add_metrics_callbacks), slices_count
         | 'IncreamentSliceSpecCounters' >>
         counter_util.IncrementSliceSpecCounters())

    aggregated_metrics = (
        slices
        # Metrics are computed per slice key.
        # Output: Multi-outputs, a dict of slice key to computed metrics, and
        # plots if applicable.
        | 'ComputePerSliceMetrics' >>
        poisson_bootstrap.ComputeWithConfidenceIntervals(
            aggregate.ComputePerSliceMetrics,
            num_bootstrap_samples=(
                poisson_bootstrap.DEFAULT_NUM_BOOTSTRAP_SAMPLES
                if compute_confidence_intervals else 1),
            random_seed_for_testing=random_seed_for_testing,
            eval_shared_model=eval_shared_model,
            desired_batch_size=desired_batch_size)
        | 'SeparateMetricsAndPlots' >> beam.ParDo(
            _SeparateMetricsAndPlotsFn()).with_outputs(
                _SeparateMetricsAndPlotsFn.OUTPUT_TAG_PLOTS,
                main=_SeparateMetricsAndPlotsFn.OUTPUT_TAG_METRICS))

    return (aggregated_metrics, slices_count)
Example #10
def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
        extracts: beam.pvalue.PCollection,
        eval_config: config.EvalConfig,
        metrics_specs: List[config.MetricsSpec],
        eval_shared_models: Optional[Dict[Text, types.EvalSharedModel]] = None,
        metrics_key: Text = constants.METRICS_KEY,
        plots_key: Text = constants.PLOTS_KEY) -> evaluator.Evaluation:
    """Computes metrics and plots.

  Args:
    extracts: PCollection of Extracts. If a query_key was used then the
      PCollection will contain a list of extracts.
    eval_config: Eval config.
    metrics_specs: Subset of the metric specs to compute metrics for. If a
      query_key was used all of the metric specs will be for the same query_key.
    eval_shared_models: Optional dict of shared models keyed by model name. Only
      required if there are metrics to be computed in-graph using the model.
    metrics_key: Name to use for metrics key in Evaluation output.
    plots_key: Name to use for plots key in Evaluation output.

  Returns:
    Evaluation containing dict of PCollections of (slice_key, results_dict)
    tuples where the dict is keyed by either the metrics_key (e.g. 'metrics') or
    plots_key (e.g. 'plots') depending on what the results_dict contains.
  """
    model_loaders = None
    if eval_shared_models:
        model_loaders = {}
        for k, v in eval_shared_models.items():
            if v.include_default_metrics:
                model_loaders[k] = v.model_loader
    computations, derived_computations = _filter_and_separate_computations(
        metric_specs.to_computations(metrics_specs,
                                     eval_config=eval_config,
                                     model_loaders=model_loaders))
    # Add default metric computations
    if (model_loaders and eval_config
            and (not eval_config.options.HasField('include_default_metrics')
                 or eval_config.options.include_default_metrics.value)):
        for model_name, model_loader in model_loaders.items():
            model_types = model_loader.construct_fn(lambda x: None)()
            if model_types.keras_model is not None:
                # TODO(mdreves): Move handling of keras metrics to here.
                pass
            elif model_types.eval_saved_model is not None:
                # Note that there is the possibility for metric naming collisions here
                # (e.g. 'auc' calculated within the EvalSavedModel as well as by AUC
                # metric computation performed outside the model). Currently all the
                # overlapping metrics such as AUC that are computed outside the model
                # are all derived metrics so they will override the metrics calculated
                # by the model which is the desired behavior.
                computations.extend(
                    eval_saved_model_util.
                    metric_computations_using_eval_saved_model(
                        model_name, model_loader))

    # pylint: disable=no-value-for-parameter

    # Input: Single extract per example (or list of extracts if query_key used)
    #        where each item contains slice keys and other extracts from upstream
    #        extractors (e.g. labels, predictions, etc).
    # Output: Single extract (per example) containing slice keys and initial
    #         combiner state returned from preprocessor. Note that even if a
    #         query_key was used the output is still only a single extract
    #         (though, that extract may contain lists of values (predictions,
    #         labels, etc) in its keys).
    #
    # Note that the output of this step is extracts instead of just a tuple of
    # computation outputs because FanoutSlices takes extracts as input (and in
    # many cases a subset of the extracts themselves are what is fanned out).
    extracts = (extracts
                | 'Preprocesss' >> beam.ParDo(_PreprocessorDoFn(computations)))

    # Input: Single extract containing slice keys and initial combiner inputs. If
    #        query_key is used the extract represents multiple examples with the
    #        same query_key, otherwise the extract represents a single example.
    # Output: Tuple (slice key, combiner inputs extracts). Notice that the per
    #         example (or list of examples if query_key used) input extract turns
    #         into n logical extracts, references to which are replicated once per
    #         applicable slice key.
    slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

    slices_count = (slices
                    | 'ExtractSliceKeys' >> beam.Keys()
                    | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

    # Input: Tuple of (slice key, combiner input extracts).
    # Output: Tuple of (slice key, dict of computed metrics/plots). The dicts will
    #         be keyed by MetricKey/PlotKey and the values will be the result
    #         of the associated computations. A given MetricComputation can
    #         perform computations for multiple keys, but the keys should be
    #         unique across computations.
    sliced_metrics_and_plots = (
        slices
        |
        'ComputePerSlice' >> poisson_bootstrap.ComputeWithConfidenceIntervals(
            _ComputePerSlice,
            computations=computations,
            derived_computations=derived_computations,
            num_bootstrap_samples=(
                poisson_bootstrap.DEFAULT_NUM_BOOTSTRAP_SAMPLES if
                eval_config.options.compute_confidence_intervals.value else 1))
    )

    if eval_config.options.k_anonymization_count.value > 1:
        sliced_metrics_and_plots = (
            sliced_metrics_and_plots
            | 'FilteForSmallSlices' >> slicer.FilterOutSlices(
                slices_count, eval_config.options.k_anonymization_count.value))

    sliced_metrics = (sliced_metrics_and_plots
                      | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                                      metric_types.MetricKey))
    sliced_plots = (
        sliced_metrics_and_plots
        |
        'FilterByPlots' >> beam.Map(_filter_by_key_type, metric_types.PlotKey))

    # pylint: enable=no-value-for-parameter

    return {metrics_key: sliced_metrics, plots_key: sliced_plots}
Example #11
def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
        extracts: beam.pvalue.PCollection,
        eval_config: config.EvalConfig,
        metrics_specs: List[config.MetricsSpec],
        eval_shared_models: List[types.EvalSharedModel],
        metrics_key: Text = constants.METRICS_KEY,
        plots_key: Text = constants.PLOTS_KEY) -> evaluator.Evaluation:
    """Computes metrics and plots.

  Args:
    extracts: PCollection of Extracts. If a query_key was used then the
      PCollection will contain a list of extracts.
    eval_config: Eval config.
    metrics_specs: Subset of the metric specs to compute metrics for. If a
      query_key was used all of the metric specs will be for the same query_key.
    eval_shared_models: Shared models.
    metrics_key: Name to use for metrics key in Evaluation output.
    plots_key: Name to use for plots key in Evaluation output.

  Returns:
    Evaluation containing dict of PCollections of (slice_key, results_dict)
    tuples where the dict is keyed by either the metrics_key (e.g. 'metrics') or
    plots_key (e.g. 'plots') depending on what the results_dict contains.
  """
    model_loaders = {m.model_path: m.model_loader for m in eval_shared_models}
    computations, derived_computations = _filter_and_separate_computations(
        metric_specs.to_computations(metrics_specs,
                                     eval_config=eval_config,
                                     model_loaders=model_loaders))

    # pylint: disable=no-value-for-parameter

    # Input: Single extract per example (or list of extracts if query_key used)
    #        where each item contains slice keys and other extracts from upstream
    #        extractors (e.g. labels, predictions, etc).
    # Output: Single extract (per example) containing slice keys and initial
    #         combiner state returned from preprocessor. Note that even if a
    #         query_key was used the output is still only a single extract
    #         (though, that extract may contain lists of values (predictions,
    #         labels, etc) in its keys).
    #
    # Note that the output of this step is extracts instead of just a tuple of
    # computation outputs because FanoutSlices takes extracts as input (and in
    # many cases a subset of the extracts themselves are what is fanned out).
    extracts = (extracts
                | 'Preprocesss' >> beam.ParDo(_PreprocessorDoFn(computations)))

    # Input: Single extract containing slice keys and initial combiner inputs. If
    #        query_key is used the extract represents multiple examples with the
    #        same query_key, otherwise the extract represents a single example.
    # Output: Tuple (slice key, combiner inputs extracts). Notice that the per
    #         example (or list of examples if query_key used) input extract turns
    #         into n logical extracts, references to which are replicated once per
    #         applicable slice key.
    slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

    slices_count = (slices
                    | 'ExtractSliceKeys' >> beam.Keys()
                    | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

    # Input: Tuple of (slice key, combiner input extracts).
    # Output: Tuple of (slice key, dict of computed metrics/plots). The dicts will
    #         be keyed by MetricKey/PlotKey and the values will be the result
    #         of the associated computations. A given MetricComputation can
    #         perform computations for multiple keys, but the keys should be
    #         unique across computations.
    sliced_metrics_and_plots = (
        slices
        |
        'ComputePerSlice' >> poisson_bootstrap.ComputeWithConfidenceIntervals(
            _ComputePerSlice,
            computations=computations,
            derived_computations=derived_computations,
            num_bootstrap_samples=(
                poisson_bootstrap.DEFAULT_NUM_BOOTSTRAP_SAMPLES if
                eval_config.options.compute_confidence_intervals.value else 1))
    )

    if eval_config.options.k_anonymization_count.value > 1:
        sliced_metrics_and_plots = (
            sliced_metrics_and_plots
            | 'FilteForSmallSlices' >> slicer.FilterOutSlices(
                slices_count, eval_config.options.k_anonymization_count.value))

    sliced_metrics = (sliced_metrics_and_plots
                      | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                                      metric_types.MetricKey))
    sliced_plots = (
        sliced_metrics_and_plots
        |
        'FilterByPlots' >> beam.Map(_filter_by_key_type, metric_types.PlotKey))

    # pylint: enable=no-value-for-parameter

    return {metrics_key: sliced_metrics, plots_key: sliced_plots}