def get_missing_slices(
    slicing_details: Iterable[validation_result_pb2.SlicingDetails],
    eval_config: config.EvalConfig
) -> List[Union[config.SlicingSpec, config.CrossSlicingSpec]]:
  """Returns specs that are defined in the EvalConfig but not found in details.

  Args:
    slicing_details: Slicing details.
    eval_config: Eval config.

  Returns:
    List of missing slices or empty list if none are missing.
  """
  hashed_details = _hashed_slicing_details(slicing_details)
  thresholds = metric_specs.metric_thresholds_from_metrics_specs(
      eval_config.metrics_specs)
  baseline_spec = model_util.get_baseline_model_spec(eval_config)
  baseline_model_name = baseline_spec.name if baseline_spec else None
  missing_slices = []
  for metric_key, sliced_thresholds in thresholds.items():
    # Skip baseline.
    if metric_key.model_name == baseline_model_name:
      continue
    for slice_spec, _ in sliced_thresholds:
      if not slice_spec:
        slice_spec = config.SlicingSpec()
      slice_hash = slice_spec.SerializeToString()
      if slice_hash not in hashed_details:
        missing_slices.append(slice_spec)
        # Same slice may be used by other metrics/thresholds, only add once
        hashed_details[slice_hash] = validation_result_pb2.SlicingDetails()
  return missing_slices
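# A minimal usage sketch for get_missing_slices (hypothetical values; assumes
# the same `config` module and helpers imported by the surrounding code). An
# EvalConfig with a single thresholded metric is checked against an empty list
# of SlicingDetails, so any slicing spec the threshold applies to that has no
# matching SlicingDetails entry is reported as missing.
example_eval_config = config.EvalConfig(
    model_specs=[config.ModelSpec(name='candidate')],
    slicing_specs=[config.SlicingSpec()],
    metrics_specs=[
        config.MetricsSpec(
            model_names=['candidate'],
            metrics=[
                config.MetricConfig(
                    class_name='AUC',
                    threshold=config.MetricThreshold(
                        value_threshold=config.GenericValueThreshold(
                            lower_bound={'value': 0.7})))
            ])
    ])
# No validation results have been produced yet, so there are no SlicingDetails
# to match against; the thresholded slices come back as missing.
missing = get_missing_slices([], example_eval_config)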
def test_get_baseline_model(self, eval_config, expected_baseline_model_spec):
  self.assertEqual(expected_baseline_model_spec,
                   model_util.get_baseline_model_spec(eval_config))
def validate_metrics(
    sliced_metrics: Tuple[slicer.SliceKeyType, Dict[metric_types.MetricKey,
                                                    Any]],
    eval_config: config.EvalConfig) -> validation_result_pb2.ValidationResult:
  """Check the metrics and check whether they should be validated."""
  # Find out which model is baseline.
  baseline_spec = model_util.get_baseline_model_spec(eval_config)
  baseline_model_name = baseline_spec.name if baseline_spec else None
  sliced_key, metrics = sliced_metrics
  thresholds = metric_specs.metric_thresholds_from_metrics_specs(
      eval_config.metrics_specs)

  def _check_threshold(key: metric_types.MetricKey, metric: Any) -> bool:
    """Verify a metric given its metric key and metric value."""
    threshold = thresholds[key]
    if isinstance(threshold, config.GenericValueThreshold):
      lower_bound, upper_bound = -np.inf, np.inf
      if threshold.HasField('lower_bound'):
        lower_bound = threshold.lower_bound.value
      if threshold.HasField('upper_bound'):
        upper_bound = threshold.upper_bound.value
      return metric > lower_bound and metric < upper_bound
    elif isinstance(threshold, config.GenericChangeThreshold):
      diff = metric
      ratio = diff / metrics[key.make_baseline_key(baseline_model_name)]
      if threshold.direction == config.MetricDirection.LOWER_IS_BETTER:
        absolute, relative = np.inf, np.inf
      elif threshold.direction == config.MetricDirection.HIGHER_IS_BETTER:
        absolute, relative = -np.inf, -np.inf
      else:
        raise ValueError('"UNKNOWN" direction for change threshold.')
      if threshold.HasField('absolute'):
        absolute = threshold.absolute.value
      if threshold.HasField('relative'):
        relative = threshold.relative.value
      if threshold.direction == config.MetricDirection.LOWER_IS_BETTER:
        return diff < absolute and ratio < relative
      elif threshold.direction == config.MetricDirection.HIGHER_IS_BETTER:
        return diff > absolute and ratio > relative

  def _copy_metric(metric, to):
    # Will add more types when more MetricValue are supported.
    to.double_value.value = float(metric)

  def _copy_threshold(threshold, to):
    if isinstance(threshold, config.GenericValueThreshold):
      to.value_threshold.CopyFrom(threshold)
    if isinstance(threshold, config.GenericChangeThreshold):
      to.change_threshold.CopyFrom(threshold)

  # Empty metrics per slice is considered validated.
  result = validation_result_pb2.ValidationResult(validation_ok=True)
  validation_for_slice = validation_result_pb2.MetricsValidationForSlice()
  for metric_key, metric in metrics.items():
    # Not meaningful to check threshold for baseline model, thus always return
    # True if such threshold is configured. We also do not compare Message type
    # metrics.
    if (metric_key.model_name == baseline_model_name or
        metric_key not in thresholds):
      continue
    msg = ''
    # We try to convert to float values.
    try:
      metric = float(metric)
    except (TypeError, ValueError):
      msg = """
        Invalid threshold config: This metric is not comparable to the
        threshold. The type of the threshold is: {}, and the metric value is:
        \n{}""".format(type(metric), metric)
    if not _check_threshold(metric_key, metric):
      failure = validation_for_slice.failures.add()
      failure.metric_key.CopyFrom(metric_key.to_proto())
      _copy_metric(metric, failure.metric_value)
      _copy_threshold(thresholds[metric_key], failure.metric_threshold)
      failure.message = msg
  # Any failure leads to overall failure.
  if validation_for_slice.failures:
    validation_for_slice.slice_key.CopyFrom(
        slicer.serialize_slice_key(sliced_key))
    result.validation_ok = False
    result.metric_validations_per_slice.append(validation_for_slice)
  return result
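# Standalone sketch of the GenericValueThreshold check performed by
# _check_threshold above: a metric passes only if it lies strictly between the
# configured lower and upper bounds, with missing bounds defaulting to
# -inf/+inf. The helper below is illustrative only and does not use TFMA
# protos.
import numpy as np


def _value_threshold_ok(metric, lower_bound=None, upper_bound=None):
  lower = lower_bound if lower_bound is not None else -np.inf
  upper = upper_bound if upper_bound is not None else np.inf
  return metric > lower and metric < upper


assert _value_threshold_ok(0.85, lower_bound=0.7)        # passes
assert not _value_threshold_ok(0.65, lower_bound=0.7)    # fails lower bound
assert not _value_threshold_ok(0.99, upper_bound=0.95)   # fails upper bound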
def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_config: config.EvalConfig,
    metrics_specs: List[config.MetricsSpec],
    eval_shared_models: Optional[Dict[Text, types.EvalSharedModel]] = None,
    metrics_key: Text = constants.METRICS_KEY,
    plots_key: Text = constants.PLOTS_KEY,
    schema: Optional[schema_pb2.Schema] = None,
    random_seed_for_testing: Optional[int] = None) -> evaluator.Evaluation:
  """Computes metrics and plots.

  Args:
    extracts: PCollection of Extracts. If a query_key was used then the
      PCollection will contain a list of extracts.
    eval_config: Eval config.
    metrics_specs: Subset of the metric specs to compute metrics for. If a
      query_key was used all of the metric specs will be for the same
      query_key.
    eval_shared_models: Optional dict of shared models keyed by model name.
      Only required if there are metrics to be computed in-graph using the
      model.
    metrics_key: Name to use for metrics key in Evaluation output.
    plots_key: Name to use for plots key in Evaluation output.
    schema: A schema to use for customizing metrics and plots.
    random_seed_for_testing: Seed to use for unit testing.

  Returns:
    Evaluation containing dict of PCollections of (slice_key, results_dict)
    tuples where the dict is keyed by either the metrics_key (e.g. 'metrics')
    or plots_key (e.g. 'plots') depending on what the results_dict contains.
  """
  computations = []
  # Add default metric computations
  if eval_shared_models:
    for model_name, eval_shared_model in eval_shared_models.items():
      if not eval_shared_model.include_default_metrics:
        continue
      if eval_shared_model.model_type == constants.TF_KERAS:
        keras_specs = keras_util.metrics_specs_from_keras(
            model_name, eval_shared_model.model_loader)
        metrics_specs = keras_specs + metrics_specs[:]
        # TODO(mdreves): Add support for calling keras.evaluate().
      elif (eval_shared_model.model_type == constants.TF_ESTIMATOR and
            eval_constants.EVAL_TAG in eval_shared_model.model_loader.tags):
        # Note that there is the possibility for metric naming collisions here
        # (e.g. 'auc' calculated within the EvalSavedModel as well as by AUC
        # metric computation performed outside the model). Currently all the
        # overlapping metrics such as AUC that are computed outside the model
        # are all derived metrics so they will override the metrics calculated
        # by the model which is the desired behavior.
        computations.extend(
            eval_saved_model_util.metric_computations_using_eval_saved_model(
                model_name, eval_shared_model.model_loader))
  # Add metric computations from specs
  computations_from_specs, derived_computations = (
      _filter_and_separate_computations(
          metric_specs.to_computations(
              metrics_specs, eval_config=eval_config, schema=schema)))
  computations.extend(computations_from_specs)

  # Find out which model is baseline.
  baseline_spec = model_util.get_baseline_model_spec(eval_config)
  baseline_model_name = baseline_spec.name if baseline_spec else None

  # pylint: disable=no-value-for-parameter

  # Input: Single extract per example (or list of extracts if query_key used)
  #        where each item contains slice keys and other extracts from upstream
  #        extractors (e.g. labels, predictions, etc).
  # Output: Single extract (per example) containing slice keys and initial
  #         combiner state returned from preprocessor. Note that even if a
  #         query_key was used the output is still only a single extract
  #         (though, that extract may contain lists of values (predictions,
  #         labels, etc) in its keys).
  #
  # Note that the output of this step is extracts instead of just a tuple of
  # computation outputs because FanoutSlices takes extracts as input (and in
  # many cases a subset of the extracts themselves are what is fanned out).
  extracts = (
      extracts
      | 'Preprocesss' >> beam.ParDo(_PreprocessorDoFn(computations)))

  # Input: Single extract containing slice keys and initial combiner inputs. If
  #        query_key is used the extract represents multiple examples with the
  #        same query_key, otherwise the extract represents a single example.
  # Output: Tuple (slice key, combiner inputs extracts). Notice that the per
  #         example (or list or examples if query_key used) input extract turns
  #         into n logical extracts, references to which are replicated once
  #         per applicable slice key.
  slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

  slices_count = (
      slices
      | 'ExtractSliceKeys' >> beam.Keys()
      | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

  _ = (
      extracts.pipeline
      | 'IncrementMetricsSpecsCounters' >>
      counter_util.IncrementMetricsSpecsCounters(metrics_specs), slices_count
      | 'IncrementSliceSpecCounters' >>
      counter_util.IncrementSliceSpecCounters())

  ci_params = _get_confidence_interval_params(eval_config, metrics_specs)

  cross_slice_specs = []
  if eval_config.cross_slicing_specs:
    cross_slice_specs = eval_config.cross_slicing_specs

  # TODO(b/151482616): Make bootstrap and jackknife confidence interval
  # implementations more parallel.

  # Input: Tuple of (slice key, combiner input extracts).
  # Output: Tuple of (slice key, dict of computed metrics/plots). The dicts
  #         will be keyed by MetricKey/PlotKey and the values will be the
  #         result of the associated computations. A given MetricComputation
  #         can perform computations for multiple keys, but the keys should be
  #         unique across computations.
  sliced_metrics_and_plots = (
      slices
      | 'ComputePerSlice' >> poisson_bootstrap.ComputeWithConfidenceIntervals(
          _ComputePerSlice,
          computations=computations,
          derived_computations=derived_computations,
          baseline_model_name=baseline_model_name,
          cross_slice_specs=cross_slice_specs,
          num_jackknife_samples=ci_params.num_jackknife_samples,
          num_bootstrap_samples=ci_params.num_bootstrap_samples,
          skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
          random_seed_for_testing=random_seed_for_testing))

  if eval_config.options.min_slice_size.value > 1:
    sliced_metrics_and_plots = (
        sliced_metrics_and_plots
        | 'FilterSmallSlices' >> slicer.FilterOutSlices(
            slices_count, eval_config.options.min_slice_size.value))

  sliced_metrics = (
      sliced_metrics_and_plots
      | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                      metric_types.MetricKey))
  sliced_plots = (
      sliced_metrics_and_plots
      | 'FilterByPlots' >> beam.Map(_filter_by_key_type, metric_types.PlotKey))

  # pylint: enable=no-value-for-parameter

  return {metrics_key: sliced_metrics, plots_key: sliced_plots}
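# Illustrative sketch (not TFMA code) of the "filter by key type" idea behind
# the FilterByMetrics/FilterByPlots steps above: each per-slice result dict
# holds both metric and plot keys, and the final Evaluation exposes two views
# of it, one per key class. The toy key classes below are hypothetical
# stand-ins for metric_types.MetricKey and metric_types.PlotKey.
from typing import Any, Dict, Tuple, Type


class ToyMetricKey(str):
  pass


class ToyPlotKey(str):
  pass


def filter_by_key_type(sliced_result: Tuple[Any, Dict[Any, Any]],
                       key_type: Type[Any]) -> Tuple[Any, Dict[Any, Any]]:
  # Keep only the entries whose key is an instance of the requested key class.
  slice_key, results = sliced_result
  return slice_key, {k: v for k, v in results.items() if isinstance(k, key_type)}


sliced = (('country', 'US'),
          {ToyMetricKey('auc'): 0.91, ToyPlotKey('calibration'): object()})
metrics_only = filter_by_key_type(sliced, ToyMetricKey)  # only 'auc' remains
plots_only = filter_by_key_type(sliced, ToyPlotKey)      # only 'calibration'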
def validate_metrics(
    sliced_metrics: Tuple[Union[slicer.SliceKeyType, slicer.CrossSliceKeyType],
                          Dict['metric_types.MetricKey', Any]],
    eval_config: config.EvalConfig) -> validation_result_pb2.ValidationResult:
  """Check the metrics and check whether they should be validated."""
  # Find out which model is baseline.
  baseline_spec = model_util.get_baseline_model_spec(eval_config)
  baseline_model_name = baseline_spec.name if baseline_spec else None
  sliced_key, metrics = sliced_metrics
  thresholds = metric_specs.metric_thresholds_from_metrics_specs(
      eval_config.metrics_specs)  # pytype: disable=wrong-arg-types
  is_cross_slice = slicer.is_cross_slice_key(sliced_key)

  def _check_threshold(key: metric_types.MetricKey, threshold: _ThresholdType,
                       metric: Any) -> bool:
    """Verify a metric given its metric key and metric value."""
    if isinstance(threshold, config.GenericValueThreshold):
      lower_bound, upper_bound = -np.inf, np.inf
      if threshold.HasField('lower_bound'):
        lower_bound = threshold.lower_bound.value
      if threshold.HasField('upper_bound'):
        upper_bound = threshold.upper_bound.value
      return metric > lower_bound and metric < upper_bound
    elif isinstance(threshold, config.GenericChangeThreshold):
      diff = metric
      ratio = diff / metrics[key.make_baseline_key(baseline_model_name)]
      if threshold.direction == config.MetricDirection.LOWER_IS_BETTER:
        absolute, relative = np.inf, np.inf
      elif threshold.direction == config.MetricDirection.HIGHER_IS_BETTER:
        absolute, relative = -np.inf, -np.inf
      else:
        raise ValueError('"UNKNOWN" direction for change threshold.')
      if threshold.HasField('absolute'):
        absolute = threshold.absolute.value
      if threshold.HasField('relative'):
        relative = threshold.relative.value
      if threshold.direction == config.MetricDirection.LOWER_IS_BETTER:
        return diff < absolute and ratio < relative
      elif threshold.direction == config.MetricDirection.HIGHER_IS_BETTER:
        return diff > absolute and ratio > relative

  def _copy_metric(metric, to):
    # Will add more types when more MetricValue are supported.
    to.double_value.value = float(metric)

  def _copy_threshold(threshold, to):
    if isinstance(threshold, config.GenericValueThreshold):
      to.value_threshold.CopyFrom(threshold)
    if isinstance(threshold, config.GenericChangeThreshold):
      to.change_threshold.CopyFrom(threshold)

  def _add_to_set(s, v):
    """Adds value to set. Returns true if didn't exist."""
    if v in s:
      return False
    else:
      s.add(v)
      return True

  # Empty metrics per slice is considered validated.
  result = validation_result_pb2.ValidationResult(validation_ok=True)
  validation_for_slice = validation_result_pb2.MetricsValidationForSlice()
  unchecked_thresholds = dict(thresholds)
  for metric_key, metric in metrics.items():
    if metric_key not in thresholds:
      continue
    del unchecked_thresholds[metric_key]
    # Not meaningful to check threshold for baseline model, thus always return
    # True if such threshold is configured. We also do not compare Message type
    # metrics.
    if metric_key.model_name == baseline_model_name:
      continue
    msg = ''
    # We try to convert to float values.
    try:
      metric = float(metric)
    except (TypeError, ValueError):
      msg = """
        Invalid threshold config: This metric is not comparable to the
        threshold. The type of the threshold is: {}, and the metric value is:
        \n{}""".format(type(metric), metric)
    existing_failures = set()
    for slice_spec, threshold in thresholds[metric_key]:
      if (slice_spec is not None and
          isinstance(slice_spec, config.SlicingSpec) and
          (is_cross_slice or not slicer.SingleSliceSpec(
              spec=slice_spec).is_slice_applicable(sliced_key))):
        continue
      if (slice_spec is not None and
          isinstance(slice_spec, config.CrossSlicingSpec) and
          (not is_cross_slice or not slicer.is_cross_slice_applicable(
              cross_slice_key=sliced_key, cross_slicing_spec=slice_spec))):
        continue
      if not _check_threshold(metric_key, threshold, metric):
        # The same threshold values could be set for multiple matching slice
        # specs. Only store the first match.
        #
        # Note that hashing by SerializeToString() is only safe if used within
        # the same process.
        if not _add_to_set(existing_failures, threshold.SerializeToString()):
          continue
        failure = validation_for_slice.failures.add()
        failure.metric_key.CopyFrom(metric_key.to_proto())
        _copy_metric(metric, failure.metric_value)
        _copy_threshold(threshold, failure.metric_threshold)
        failure.message = msg
      # Track we have completed a validation check for slice spec and metric
      slicing_details = result.validation_details.slicing_details.add()
      if slice_spec is not None:
        if isinstance(slice_spec, config.SlicingSpec):
          slicing_details.slicing_spec.CopyFrom(slice_spec)
        else:
          slicing_details.cross_slicing_spec.CopyFrom(slice_spec)
      else:
        slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
      slicing_details.num_matching_slices = 1
  # All unchecked thresholds are considered failures.
  for metric_key, thresholds in unchecked_thresholds.items():
    if metric_key.model_name == baseline_model_name:
      continue
    existing_failures = set()
    for _, threshold in thresholds:
      # The same threshold values could be set for multiple matching slice
      # specs. Only store the first match.
      #
      # Note that hashing by SerializeToString() is only safe if used within
      # the same process.
      if not _add_to_set(existing_failures, threshold.SerializeToString()):
        continue
      failure = validation_for_slice.failures.add()
      failure.metric_key.CopyFrom(metric_key.to_proto())
      _copy_threshold(threshold, failure.metric_threshold)
      failure.message = 'Metric not found.'
  # Any failure leads to overall failure.
  if validation_for_slice.failures:
    if not is_cross_slice:
      validation_for_slice.slice_key.CopyFrom(
          slicer.serialize_slice_key(sliced_key))
    else:
      validation_for_slice.cross_slice_key.CopyFrom(
          slicer.serialize_cross_slice_key(sliced_key))
    result.validation_ok = False
    result.metric_validations_per_slice.append(validation_for_slice)
  return result
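# Standalone sketch of the GenericChangeThreshold logic in _check_threshold
# above: `diff` is the candidate-minus-baseline metric difference, `ratio` is
# diff divided by the baseline value, and both must beat the configured
# absolute/relative limits in the configured direction. Illustrative only; no
# TFMA protos are used and the helper name is hypothetical.
import numpy as np

LOWER_IS_BETTER = 'LOWER_IS_BETTER'
HIGHER_IS_BETTER = 'HIGHER_IS_BETTER'


def _change_threshold_ok(diff, baseline, direction,
                         absolute=None, relative=None):
  ratio = diff / baseline
  if direction == LOWER_IS_BETTER:
    absolute = absolute if absolute is not None else np.inf
    relative = relative if relative is not None else np.inf
    return diff < absolute and ratio < relative
  elif direction == HIGHER_IS_BETTER:
    absolute = absolute if absolute is not None else -np.inf
    relative = relative if relative is not None else -np.inf
    return diff > absolute and ratio > relative
  raise ValueError('"UNKNOWN" direction for change threshold.')


# Candidate loss dropped by 0.02 against a baseline loss of 0.20 (a 10%
# relative improvement), which satisfies a LOWER_IS_BETTER threshold requiring
# at least a 5% relative drop.
assert _change_threshold_ok(-0.02, 0.20, LOWER_IS_BETTER, relative=-0.05)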
def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_config: config.EvalConfig,
    metrics_specs: List[config.MetricsSpec],
    eval_shared_models: Optional[Dict[Text, types.EvalSharedModel]] = None,
    metrics_key: Text = constants.METRICS_KEY,
    plots_key: Text = constants.PLOTS_KEY) -> evaluator.Evaluation:
  """Computes metrics and plots.

  Args:
    extracts: PCollection of Extracts. If a query_key was used then the
      PCollection will contain a list of extracts.
    eval_config: Eval config.
    metrics_specs: Subset of the metric specs to compute metrics for. If a
      query_key was used all of the metric specs will be for the same
      query_key.
    eval_shared_models: Optional dict of shared models keyed by model name.
      Only required if there are metrics to be computed in-graph using the
      model.
    metrics_key: Name to use for metrics key in Evaluation output.
    plots_key: Name to use for plots key in Evaluation output.

  Returns:
    Evaluation containing dict of PCollections of (slice_key, results_dict)
    tuples where the dict is keyed by either the metrics_key (e.g. 'metrics')
    or plots_key (e.g. 'plots') depending on what the results_dict contains.
  """
  computations = []
  model_loaders = None
  # Add default metric computations
  if eval_shared_models:
    model_loaders = {}
    for model_name, eval_shared_model in eval_shared_models.items():
      if not eval_shared_model.include_default_metrics:
        continue
      model_loader = eval_shared_model.model_loader
      model_loaders[model_name] = model_loader
      model_types = model_loader.construct_fn(lambda x: None)()
      if model_types.keras_model is not None:
        # TODO(mdreves): Move handling of keras metrics to here.
        pass
      elif model_types.eval_saved_model is not None:
        # Note that there is the possibility for metric naming collisions here
        # (e.g. 'auc' calculated within the EvalSavedModel as well as by AUC
        # metric computation performed outside the model). Currently all the
        # overlapping metrics such as AUC that are computed outside the model
        # are all derived metrics so they will override the metrics calculated
        # by the model which is the desired behavior.
        computations.extend(
            eval_saved_model_util.metric_computations_using_eval_saved_model(
                model_name, model_loader))
  # Add metric computations from specs
  computations_from_specs, derived_computations = (
      _filter_and_separate_computations(
          metric_specs.to_computations(
              metrics_specs,
              eval_config=eval_config,
              model_loaders=model_loaders)))
  computations.extend(computations_from_specs)

  # Find out which model is baseline.
  baseline_spec = model_util.get_baseline_model_spec(eval_config)
  baseline_model_name = baseline_spec.name if baseline_spec else None

  # pylint: disable=no-value-for-parameter

  # Input: Single extract per example (or list of extracts if query_key used)
  #        where each item contains slice keys and other extracts from upstream
  #        extractors (e.g. labels, predictions, etc).
  # Output: Single extract (per example) containing slice keys and initial
  #         combiner state returned from preprocessor. Note that even if a
  #         query_key was used the output is still only a single extract
  #         (though, that extract may contain lists of values (predictions,
  #         labels, etc) in its keys).
  #
  # Note that the output of this step is extracts instead of just a tuple of
  # computation outputs because FanoutSlices takes extracts as input (and in
  # many cases a subset of the extracts themselves are what is fanned out).
  extracts = (
      extracts
      | 'Preprocesss' >> beam.ParDo(_PreprocessorDoFn(computations)))

  # Input: Single extract containing slice keys and initial combiner inputs. If
  #        query_key is used the extract represents multiple examples with the
  #        same query_key, otherwise the extract represents a single example.
  # Output: Tuple (slice key, combiner inputs extracts). Notice that the per
  #         example (or list or examples if query_key used) input extract turns
  #         into n logical extracts, references to which are replicated once
  #         per applicable slice key.
  slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

  slices_count = (
      slices
      | 'ExtractSliceKeys' >> beam.Keys()
      | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

  # Input: Tuple of (slice key, combiner input extracts).
  # Output: Tuple of (slice key, dict of computed metrics/plots). The dicts
  #         will be keyed by MetricKey/PlotKey and the values will be the
  #         result of the associated computations. A given MetricComputation
  #         can perform computations for multiple keys, but the keys should be
  #         unique across computations.
  sliced_metrics_and_plots = (
      slices
      | 'ComputePerSlice' >> poisson_bootstrap.ComputeWithConfidenceIntervals(
          _ComputePerSlice,
          computations=computations,
          derived_computations=derived_computations,
          baseline_model_name=baseline_model_name,
          num_bootstrap_samples=(
              poisson_bootstrap.DEFAULT_NUM_BOOTSTRAP_SAMPLES
              if eval_config.options.compute_confidence_intervals.value
              else 1)))

  if eval_config.options.k_anonymization_count.value > 1:
    sliced_metrics_and_plots = (
        sliced_metrics_and_plots
        | 'FilterForSmallSlices' >> slicer.FilterOutSlices(
            slices_count, eval_config.options.k_anonymization_count.value))

  sliced_metrics = (
      sliced_metrics_and_plots
      | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                      metric_types.MetricKey))
  sliced_plots = (
      sliced_metrics_and_plots
      | 'FilterByPlots' >> beam.Map(_filter_by_key_type, metric_types.PlotKey))

  # pylint: enable=no-value-for-parameter

  return {metrics_key: sliced_metrics, plots_key: sliced_plots}
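# Illustrative sketch (not the actual slicer.FilterOutSlices implementation) of
# the k-anonymization filtering step above: per-slice results are dropped when
# the slice contains fewer than `k_anonymization_count` examples, so small
# slices do not surface near-individual-level information in the output. The
# helper and data below are hypothetical.
from typing import Any, Dict, List, Tuple


def filter_small_slices(
    sliced_results: List[Tuple[Any, Dict[Any, Any]]],
    slice_counts: Dict[Any, int],
    min_count: int) -> List[Tuple[Any, Dict[Any, Any]]]:
  # Keep only slices whose example count meets the minimum threshold.
  return [(slice_key, results)
          for slice_key, results in sliced_results
          if slice_counts.get(slice_key, 0) >= min_count]


counts = {('country', 'US'): 1000, ('country', 'MC'): 3}
results = [(('country', 'US'), {'auc': 0.91}),
           (('country', 'MC'), {'auc': 1.0})]
# With k_anonymization_count = 10, only the US slice survives.
assert filter_small_slices(results, counts, 10) == [
    (('country', 'US'), {'auc': 0.91})]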