def testSliceSpecBeamCounter(self):
  with beam.Pipeline() as pipeline:
    _ = (
        pipeline
        | beam.Create([[[('slice_key', 'first_slice')]]])
        | counter_util.IncrementSliceSpecCounters())

  result = pipeline.run()
  slice_spec_filter = beam.metrics.metric.MetricsFilter().with_namespace(
      constants.METRICS_NAMESPACE).with_name(
          'slice_computed_slice_key_first_slice')
  slice_count = result.metrics().query(
      filter=slice_spec_filter)['counters'][0].committed
  self.assertEqual(slice_count, 1)
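
# Hedged sketch (not part of the original test suite): the same
# MetricsFilter/query pattern generalizes to any Beam counter. The DoFn,
# the 'elements_seen' counter name, and the inputs below are hypothetical
# and exist only to illustrate querying committed counter values from a
# pipeline result.
def testGenericBeamCounterQuerySketch(self):

  class _CountElementsFn(beam.DoFn):
    """Increments a counter once per processed element."""

    def __init__(self):
      self._counter = beam.metrics.Metrics.counter(
          constants.METRICS_NAMESPACE, 'elements_seen')

    def process(self, element):
      self._counter.inc()
      yield element

  with beam.Pipeline() as pipeline:
    _ = (
        pipeline
        | beam.Create([1, 2, 3])
        | beam.ParDo(_CountElementsFn()))

  result = pipeline.run()
  elements_filter = beam.metrics.metric.MetricsFilter().with_namespace(
      constants.METRICS_NAMESPACE).with_name('elements_seen')
  elements_count = result.metrics().query(
      filter=elements_filter)['counters'][0].committed
  self.assertEqual(elements_count, 3)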

def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_config: config_pb2.EvalConfig,
    metrics_specs: List[config_pb2.MetricsSpec],
    eval_shared_models: Optional[Dict[Text, types.EvalSharedModel]] = None,
    metrics_key: Text = constants.METRICS_KEY,
    plots_key: Text = constants.PLOTS_KEY,
    attributions_key: Text = constants.ATTRIBUTIONS_KEY,
    schema: Optional[schema_pb2.Schema] = None,
    random_seed_for_testing: Optional[int] = None,
    tensor_adapter_config: Optional[tensor_adapter.TensorAdapterConfig] = None
) -> evaluator.Evaluation:
  """Computes metrics and plots.

  Args:
    extracts: PCollection of Extracts. If a query_key was used then the
      PCollection will contain a list of extracts.
    eval_config: Eval config.
    metrics_specs: Subset of the metric specs to compute metrics for. If a
      query_key was used all of the metric specs will be for the same
      query_key.
    eval_shared_models: Optional dict of shared models keyed by model name.
      Only required if there are metrics to be computed in-graph using the
      model.
    metrics_key: Name to use for metrics key in Evaluation output.
    plots_key: Name to use for plots key in Evaluation output.
    attributions_key: Name to use for attributions key in Evaluation output.
    schema: A schema to use for customizing metrics and plots.
    random_seed_for_testing: Seed to use for unit testing.
    tensor_adapter_config: Tensor adapter config which specifies how to obtain
      tensors from the Arrow RecordBatch. The model's signature will be
      invoked with those tensors (matched by names). If None, an attempt will
      be made to create an adapter based on the model's input signature,
      otherwise the model will be invoked with raw examples (assuming a
      signature of a single 1-D string tensor).

  Returns:
    Evaluation containing dict of PCollections of (slice_key, results_dict)
    tuples where the dict is keyed by either the metrics_key (e.g. 'metrics'),
    plots_key (e.g. 'plots'), or attributions_key (e.g. 'attributions')
    depending on what the results_dict contains.
  """
  computations = []
  # Add default metric computations
  if eval_shared_models:
    # Note that there is the possibility for metric naming collisions here
    # (e.g. 'auc' calculated within the model as well as by AUC metric
    # computation performed outside the model). Currently all the overlapping
    # metrics such as AUC that are computed outside the model are all derived
    # metrics so they will override the metrics calculated by the model which
    # is the desired behavior.
    for model_name, eval_shared_model in eval_shared_models.items():
      if not eval_shared_model.include_default_metrics:
        continue
      if eval_shared_model.model_type == constants.TF_KERAS:
        computations.extend(
            keras_util.metric_computations_using_keras_saved_model(
                model_name, eval_shared_model.model_loader, eval_config,
                tensor_adapter_config))
      elif (eval_shared_model.model_type == constants.TF_ESTIMATOR and
            eval_constants.EVAL_TAG in eval_shared_model.model_loader.tags):
        computations.extend(
            eval_saved_model_util.metric_computations_using_eval_saved_model(
                model_name, eval_shared_model.model_loader))
  # Add metric computations from specs
  metric_computations = _filter_and_separate_computations(
      metric_specs.to_computations(
          metrics_specs, eval_config=eval_config, schema=schema))
  computations.extend(metric_computations.non_derived_computations)

  # Find out which model is baseline.
  baseline_spec = model_util.get_baseline_model_spec(eval_config)
  baseline_model_name = baseline_spec.name if baseline_spec else None

  # pylint: disable=no-value-for-parameter

  # Input: Single extract per example (or list of extracts if query_key used)
  #        where each item contains slice keys and other extracts from
  #        upstream extractors (e.g. labels, predictions, etc).
  # Output: Single extract (per example) containing slice keys and initial
  #         combiner state returned from preprocessor. Note that even if a
  #         query_key was used the output is still only a single extract
  #         (though, that extract may contain lists of values (predictions,
  #         labels, etc) in its keys).
  #
  # Note that the output of this step is extracts instead of just a tuple of
  # computation outputs because FanoutSlices takes extracts as input (and in
  # many cases a subset of the extracts themselves are what is fanned out).
  extracts = (
      extracts
      | 'Preprocess' >> beam.ParDo(_PreprocessorDoFn(computations)))

  # Input: Single extract containing slice keys and initial combiner inputs.
  #        If query_key is used the extract represents multiple examples with
  #        the same query_key, otherwise the extract represents a single
  #        example.
  # Output: Tuple (slice key, combiner inputs extracts). Notice that the per
  #         example (or list of examples if query_key used) input extract
  #         turns into n logical extracts, references to which are replicated
  #         once per applicable slice key.
  slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

  slices_count = (
      slices
      | 'ExtractSliceKeys' >> beam.Keys()
      | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

  model_types = _get_model_types_for_logging(eval_shared_models)

  _ = (
      extracts.pipeline
      | 'IncrementMetricsSpecsCounters' >>
      counter_util.IncrementMetricsSpecsCounters(metrics_specs, model_types),
      slices_count
      | 'IncrementSliceSpecCounters' >>
      counter_util.IncrementSliceSpecCounters())

  ci_params = _get_confidence_interval_params(eval_config, metrics_specs)

  cross_slice_specs = []
  if eval_config.cross_slicing_specs:
    cross_slice_specs = eval_config.cross_slicing_specs

  computations_combine_fn = _ComputationsCombineFn(computations=computations)
  derived_metrics_ptransform = _AddDerivedCrossSliceAndDiffMetrics(
      metric_computations.derived_computations,
      metric_computations.cross_slice_computations, cross_slice_specs,
      baseline_model_name)

  # Input: Tuple of (slice key, combiner input extracts).
  # Output: Tuple of (slice key, dict of computed metrics/plots/attributions).
  #         The dicts will be keyed by MetricKey/PlotKey/AttributionsKey and
  #         the values will be the result of the associated computations. A
  #         given MetricComputation can perform computations for multiple
  #         keys, but the keys should be unique across computations.
  if ci_params.num_bootstrap_samples:
    sliced_metrics_plots_and_attributions = (
        slices
        | 'PoissonBootstrapConfidenceIntervals' >>
        poisson_bootstrap.ComputeWithConfidenceIntervals(
            computations_combine_fn=computations_combine_fn,
            derived_metrics_ptransform=derived_metrics_ptransform,
            num_bootstrap_samples=ci_params.num_bootstrap_samples,
            hot_key_fanout=_COMBINE_PER_SLICE_KEY_HOT_KEY_FANOUT,
            skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
            random_seed_for_testing=random_seed_for_testing))
  elif ci_params.num_jackknife_samples:
    sliced_metrics_plots_and_attributions = (
        slices
        | 'JackknifeConfidenceIntervals' >>
        jackknife.ComputeWithConfidenceIntervals(
            computations_combine_fn=computations_combine_fn,
            derived_metrics_ptransform=derived_metrics_ptransform,
            num_jackknife_samples=ci_params.num_jackknife_samples,
            skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
            random_seed_for_testing=random_seed_for_testing))
  else:
    sliced_metrics_plots_and_attributions = (
        slices
        | 'CombineMetricsPerSlice' >> beam.CombinePerKey(
            computations_combine_fn).with_hot_key_fanout(
                _COMBINE_PER_SLICE_KEY_HOT_KEY_FANOUT)
        | 'AddDerivedCrossSliceAndDiffMetrics' >> derived_metrics_ptransform)

  sliced_metrics_plots_and_attributions = (
      sliced_metrics_plots_and_attributions
      | 'AddCIDerivedMetrics' >> beam.Map(
          _add_ci_derived_metrics,
          metric_computations.ci_derived_computations))

  if eval_config.options.min_slice_size.value > 1:
    sliced_metrics_plots_and_attributions = (
        sliced_metrics_plots_and_attributions
        | 'FilterSmallSlices' >> slicer.FilterOutSlices(
            slices_count, eval_config.options.min_slice_size.value))

  sliced_metrics = (
      sliced_metrics_plots_and_attributions
      | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                      metric_types.MetricKey))
  sliced_plots = (
      sliced_metrics_plots_and_attributions
      | 'FilterByPlots' >> beam.Map(_filter_by_key_type, metric_types.PlotKey))
  sliced_attributions = (
      sliced_metrics_plots_and_attributions
      | 'FilterByAttributions' >> beam.Map(_filter_by_key_type,
                                           metric_types.AttributionsKey))

  # pylint: enable=no-value-for-parameter

  return {
      metrics_key: sliced_metrics,
      plots_key: sliced_plots,
      attributions_key: sliced_attributions
  }
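
# Hedged usage sketch (not part of the original module): how the Evaluation
# dict returned by _ComputeMetricsAndPlots above might be consumed. The
# helper name and the direct function call are illustrative assumptions; in
# the packaged library the function may instead be applied as a Beam
# PTransform to the extracts PCollection.
def _example_compute_metrics_and_plots_usage(
    extracts: beam.pvalue.PCollection,
    eval_config: config_pb2.EvalConfig,
    metrics_specs: List[config_pb2.MetricsSpec]) -> beam.pvalue.PCollection:
  """Returns only the sliced metrics from an evaluation (illustrative)."""
  evaluation = _ComputeMetricsAndPlots(
      extracts=extracts,
      eval_config=eval_config,
      metrics_specs=metrics_specs)
  # The Evaluation is a dict of PCollections keyed by the metrics, plots and
  # attributions keys; downstream writers pick out the entries they need.
  return evaluation[constants.METRICS_KEY]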

def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_config: config.EvalConfig,
    metrics_specs: List[config.MetricsSpec],
    eval_shared_models: Optional[Dict[Text, types.EvalSharedModel]] = None,
    metrics_key: Text = constants.METRICS_KEY,
    plots_key: Text = constants.PLOTS_KEY,
    schema: Optional[schema_pb2.Schema] = None,
    random_seed_for_testing: Optional[int] = None) -> evaluator.Evaluation:
  """Computes metrics and plots.

  Args:
    extracts: PCollection of Extracts. If a query_key was used then the
      PCollection will contain a list of extracts.
    eval_config: Eval config.
    metrics_specs: Subset of the metric specs to compute metrics for. If a
      query_key was used all of the metric specs will be for the same
      query_key.
    eval_shared_models: Optional dict of shared models keyed by model name.
      Only required if there are metrics to be computed in-graph using the
      model.
    metrics_key: Name to use for metrics key in Evaluation output.
    plots_key: Name to use for plots key in Evaluation output.
    schema: A schema to use for customizing metrics and plots.
    random_seed_for_testing: Seed to use for unit testing.

  Returns:
    Evaluation containing dict of PCollections of (slice_key, results_dict)
    tuples where the dict is keyed by either the metrics_key (e.g. 'metrics')
    or plots_key (e.g. 'plots') depending on what the results_dict contains.
  """
  computations = []
  # Add default metric computations
  if eval_shared_models:
    for model_name, eval_shared_model in eval_shared_models.items():
      if not eval_shared_model.include_default_metrics:
        continue
      if eval_shared_model.model_type == constants.TF_KERAS:
        keras_specs = keras_util.metrics_specs_from_keras(
            model_name, eval_shared_model.model_loader)
        metrics_specs = keras_specs + metrics_specs[:]
        # TODO(mdreves): Add support for calling keras.evaluate().
      elif (eval_shared_model.model_type == constants.TF_ESTIMATOR and
            eval_constants.EVAL_TAG in eval_shared_model.model_loader.tags):
        # Note that there is the possibility for metric naming collisions here
        # (e.g. 'auc' calculated within the EvalSavedModel as well as by AUC
        # metric computation performed outside the model). Currently all the
        # overlapping metrics such as AUC that are computed outside the model
        # are all derived metrics so they will override the metrics calculated
        # by the model which is the desired behavior.
        computations.extend(
            eval_saved_model_util.metric_computations_using_eval_saved_model(
                model_name, eval_shared_model.model_loader))
  # Add metric computations from specs
  computations_from_specs, derived_computations = (
      _filter_and_separate_computations(
          metric_specs.to_computations(
              metrics_specs, eval_config=eval_config, schema=schema)))
  computations.extend(computations_from_specs)

  # Find out which model is baseline.
  baseline_spec = model_util.get_baseline_model_spec(eval_config)
  baseline_model_name = baseline_spec.name if baseline_spec else None

  # pylint: disable=no-value-for-parameter

  # Input: Single extract per example (or list of extracts if query_key used)
  #        where each item contains slice keys and other extracts from
  #        upstream extractors (e.g. labels, predictions, etc).
  # Output: Single extract (per example) containing slice keys and initial
  #         combiner state returned from preprocessor. Note that even if a
  #         query_key was used the output is still only a single extract
  #         (though, that extract may contain lists of values (predictions,
  #         labels, etc) in its keys).
  #
  # Note that the output of this step is extracts instead of just a tuple of
  # computation outputs because FanoutSlices takes extracts as input (and in
  # many cases a subset of the extracts themselves are what is fanned out).
  extracts = (
      extracts
      | 'Preprocess' >> beam.ParDo(_PreprocessorDoFn(computations)))

  # Input: Single extract containing slice keys and initial combiner inputs.
  #        If query_key is used the extract represents multiple examples with
  #        the same query_key, otherwise the extract represents a single
  #        example.
  # Output: Tuple (slice key, combiner inputs extracts). Notice that the per
  #         example (or list of examples if query_key used) input extract
  #         turns into n logical extracts, references to which are replicated
  #         once per applicable slice key.
  slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

  slices_count = (
      slices
      | 'ExtractSliceKeys' >> beam.Keys()
      | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

  _ = (
      extracts.pipeline
      | 'IncrementMetricsSpecsCounters' >>
      counter_util.IncrementMetricsSpecsCounters(metrics_specs),
      slices_count
      | 'IncrementSliceSpecCounters' >>
      counter_util.IncrementSliceSpecCounters())

  ci_params = _get_confidence_interval_params(eval_config, metrics_specs)

  cross_slice_specs = []
  if eval_config.cross_slicing_specs:
    cross_slice_specs = eval_config.cross_slicing_specs

  # TODO(b/151482616): Make bootstrap and jackknife confidence interval
  # implementations more parallel.

  # Input: Tuple of (slice key, combiner input extracts).
  # Output: Tuple of (slice key, dict of computed metrics/plots). The dicts
  #         will be keyed by MetricKey/PlotKey and the values will be the
  #         result of the associated computations. A given MetricComputation
  #         can perform computations for multiple keys, but the keys should
  #         be unique across computations.
  sliced_metrics_and_plots = (
      slices
      | 'ComputePerSlice' >> poisson_bootstrap.ComputeWithConfidenceIntervals(
          _ComputePerSlice,
          computations=computations,
          derived_computations=derived_computations,
          baseline_model_name=baseline_model_name,
          cross_slice_specs=cross_slice_specs,
          num_jackknife_samples=ci_params.num_jackknife_samples,
          num_bootstrap_samples=ci_params.num_bootstrap_samples,
          skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
          random_seed_for_testing=random_seed_for_testing))

  if eval_config.options.min_slice_size.value > 1:
    sliced_metrics_and_plots = (
        sliced_metrics_and_plots
        | 'FilterSmallSlices' >> slicer.FilterOutSlices(
            slices_count, eval_config.options.min_slice_size.value))

  sliced_metrics = (
      sliced_metrics_and_plots
      | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                      metric_types.MetricKey))
  sliced_plots = (
      sliced_metrics_and_plots
      | 'FilterByPlots' >> beam.Map(_filter_by_key_type, metric_types.PlotKey))

  # pylint: enable=no-value-for-parameter

  return {metrics_key: sliced_metrics, plots_key: sliced_plots}
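
# Hedged example (not part of the original module): the FilterSmallSlices
# branch above only activates when options.min_slice_size is set on the
# EvalConfig. The helper name and the default of 10 are illustrative
# assumptions; only the min_slice_size field itself comes from the code above.
def _example_eval_config_with_min_slice_size(
    min_slice_size: int = 10) -> config.EvalConfig:
  """Builds an EvalConfig whose small slices will be filtered out."""
  eval_config = config.EvalConfig()
  eval_config.options.min_slice_size.value = min_slice_size
  return eval_config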

def ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_shared_model: types.EvalSharedModel,
    desired_batch_size: Optional[int] = None,
    compute_confidence_intervals: Optional[bool] = False,
    random_seed_for_testing: Optional[int] = None
) -> Tuple[beam.pvalue.DoOutputsTuple, beam.pvalue.PCollection]:
  """Computes metrics and plots using the EvalSavedModel.

  Args:
    extracts: PCollection of Extracts. The extracts MUST contain a
      FeaturesPredictionsLabels extract keyed by
      tfma.FEATURE_PREDICTIONS_LABELS_KEY and a list of SliceKeyType extracts
      keyed by tfma.SLICE_KEY_TYPES_KEY. Typically these will be added by
      calling the default_extractors function.
    eval_shared_model: Shared model parameters for EvalSavedModel including
      any additional metrics (see EvalSharedModel for more information on how
      to configure additional metrics).
    desired_batch_size: Optional batch size for batching in Aggregate.
    compute_confidence_intervals: Set to True to run metrics analysis over
      multiple bootstrap samples and compute uncertainty intervals.
    random_seed_for_testing: Provide for deterministic tests only.

  Returns:
    Tuple of Tuple[PCollection of (slice key, metrics),
    PCollection of (slice key, plot metrics)] and
    PCollection of (slice_key and its example count).
  """
  # pylint: disable=no-value-for-parameter

  slices = (
      extracts
      # Downstream computation only cares about FPLs, so we prune before
      # fanout. Note that fanout itself will prune the slice keys.
      # TODO(b/130032676, b/111353165): Prune FPLs to contain only the
      # necessary set for the calculation of post_export_metrics if possible.
      | 'PruneExtracts' >> extractor.Filter(include=[
          constants.FEATURES_PREDICTIONS_LABELS_KEY,
          constants.SLICE_KEY_TYPES_KEY,
          constants.INPUT_KEY,
      ])
      # Input: one example at a time, with slice keys in extracts.
      # Output: one fpl example per slice key (notice that the example turns
      #         into n logical examples, references to which are replicated
      #         once per applicable slice key).
      | 'FanoutSlices' >> slicer.FanoutSlices())

  slices_count = (
      slices
      | 'ExtractSliceKeys' >> beam.Keys()
      | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

  _ = (
      extracts.pipeline
      | 'IncrementMetricsCallbacksCounters' >>
      counter_util.IncrementMetricsCallbacksCounters(
          eval_shared_model.add_metrics_callbacks),
      slices_count
      | 'IncrementSliceSpecCounters' >>
      counter_util.IncrementSliceSpecCounters())

  aggregated_metrics = (
      slices
      # Metrics are computed per slice key.
      # Output: Multi-outputs, a dict of slice key to computed metrics, and
      #         plots if applicable.
      | 'ComputePerSliceMetrics' >>
      poisson_bootstrap.ComputeWithConfidenceIntervals(
          aggregate.ComputePerSliceMetrics,
          num_bootstrap_samples=(
              poisson_bootstrap.DEFAULT_NUM_BOOTSTRAP_SAMPLES
              if compute_confidence_intervals else 1),
          random_seed_for_testing=random_seed_for_testing,
          eval_shared_model=eval_shared_model,
          desired_batch_size=desired_batch_size)
      | 'SeparateMetricsAndPlots' >> beam.ParDo(
          _SeparateMetricsAndPlotsFn()).with_outputs(
              _SeparateMetricsAndPlotsFn.OUTPUT_TAG_PLOTS,
              main=_SeparateMetricsAndPlotsFn.OUTPUT_TAG_METRICS))

  return (aggregated_metrics, slices_count)
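
# Hedged usage sketch (not part of the original module): unpacking the legacy
# return value above. Only the return structure (a multi-output tuple of
# metrics/plots plus a slice-count PCollection) comes from the code above;
# the helper name, the direct function call (the packaged library may apply
# this as a Beam PTransform instead), and the downstream handling are
# assumptions.
def _example_legacy_compute_metrics_and_plots_usage(
    extracts: beam.pvalue.PCollection,
    eval_shared_model: types.EvalSharedModel):
  """Returns (metrics, plots, slice counts) PCollections (illustrative)."""
  aggregated_metrics, slices_count = ComputeMetricsAndPlots(
      extracts, eval_shared_model)
  # The DoOutputsTuple is indexed by output tag to separate metrics and plots.
  metrics = aggregated_metrics[_SeparateMetricsAndPlotsFn.OUTPUT_TAG_METRICS]
  plots = aggregated_metrics[_SeparateMetricsAndPlotsFn.OUTPUT_TAG_PLOTS]
  return metrics, plots, slices_count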