Example #1
def get_missing_slices(
    slicing_details: Iterable[validation_result_pb2.SlicingDetails],
    eval_config: config.EvalConfig
) -> List[Union[config.SlicingSpec, config.CrossSlicingSpec]]:
    """Returns specs that are defined in the EvalConfig but not found in details.

    Args:
      slicing_details: Slicing details.
      eval_config: Eval config.

    Returns:
      List of missing slices or empty list if none are missing.
    """
    hashed_details = _hashed_slicing_details(slicing_details)
    thresholds = metric_specs.metric_thresholds_from_metrics_specs(
        eval_config.metrics_specs)
    baseline_spec = model_util.get_baseline_model_spec(eval_config)
    baseline_model_name = baseline_spec.name if baseline_spec else None
    missing_slices = []
    for metric_key, sliced_thresholds in thresholds.items():
        # Skip baseline.
        if metric_key.model_name == baseline_model_name:
            continue
        for slice_spec, _ in sliced_thresholds:
            if not slice_spec:
                slice_spec = config.SlicingSpec()
            slice_hash = slice_spec.SerializeToString()
            if slice_hash not in hashed_details:
                missing_slices.append(slice_spec)
                # Same slice may be used by other metrics/thresholds, only add once
                hashed_details[
                    slice_hash] = validation_result_pb2.SlicingDetails()
    return missing_slices
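For orientation, here is a minimal usage sketch of get_missing_slices. It is illustrative only: the config fields follow the TFMA config proto as used above, while the 'candidate' model name and the expectation in the final comment are assumptions.

# Illustrative sketch (assumed field values; 'candidate' is a made-up model).
from google.protobuf import wrappers_pb2

eval_config = config.EvalConfig(
    model_specs=[config.ModelSpec(name='candidate')],
    metrics_specs=[
        config.MetricsSpec(
            model_names=['candidate'],
            metrics=[
                config.MetricConfig(
                    class_name='ExampleCount',
                    threshold=config.MetricThreshold(
                        value_threshold=config.GenericValueThreshold(
                            lower_bound=wrappers_pb2.DoubleValue(value=1.0))))
            ])
    ])
# With no slicing details recorded yet, the threshold's (default, overall)
# slice spec is expected to come back as missing.
missing = get_missing_slices([], eval_config)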
def test_get_baseline_model(self, eval_config, expected_baseline_model_spec):
    self.assertEqual(expected_baseline_model_spec,
                     model_util.get_baseline_model_spec(eval_config))
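The test above exercises model_util.get_baseline_model_spec. A short sketch of the configuration it inspects (is_baseline is a ModelSpec field in the TFMA config proto; the model names here are hypothetical):

# Sketch: one candidate and one baseline model; get_baseline_model_spec is
# expected to return the spec marked is_baseline=True (names are made up).
eval_config = config.EvalConfig(model_specs=[
    config.ModelSpec(name='candidate'),
    config.ModelSpec(name='baseline', is_baseline=True),
])
baseline_spec = model_util.get_baseline_model_spec(eval_config)
assert baseline_spec is not None and baseline_spec.name == 'baseline'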
def validate_metrics(
        sliced_metrics: Tuple[slicer.SliceKeyType,
                              Dict[metric_types.MetricKey,
                                   Any]], eval_config: config.EvalConfig
) -> validation_result_pb2.ValidationResult:
    """Check the metrics and check whether they should be validated."""
    # Find out which model is baseline.
    baseline_spec = model_util.get_baseline_model_spec(eval_config)
    baseline_model_name = baseline_spec.name if baseline_spec else None

    sliced_key, metrics = sliced_metrics
    thresholds = metric_specs.metric_thresholds_from_metrics_specs(
        eval_config.metrics_specs)

    def _check_threshold(key: metric_types.MetricKey, metric: Any) -> bool:
        """Verify a metric given its metric key and metric value."""
        threshold = thresholds[key]
        if isinstance(threshold, config.GenericValueThreshold):
            lower_bound, upper_bound = -np.inf, np.inf
            if threshold.HasField('lower_bound'):
                lower_bound = threshold.lower_bound.value
            if threshold.HasField('upper_bound'):
                upper_bound = threshold.upper_bound.value
            return metric > lower_bound and metric < upper_bound
        elif isinstance(threshold, config.GenericChangeThreshold):
            diff = metric
            ratio = diff / metrics[key.make_baseline_key(baseline_model_name)]
            if threshold.direction == config.MetricDirection.LOWER_IS_BETTER:
                absolute, relative = np.inf, np.inf
            elif threshold.direction == config.MetricDirection.HIGHER_IS_BETTER:
                absolute, relative = -np.inf, -np.inf
            else:
                raise ValueError('"UNKNOWN" direction for change threshold.')
            if threshold.HasField('absolute'):
                absolute = threshold.absolute.value
            if threshold.HasField('relative'):
                relative = threshold.relative.value
            if threshold.direction == config.MetricDirection.LOWER_IS_BETTER:
                return diff < absolute and ratio < relative
            elif threshold.direction == config.MetricDirection.HIGHER_IS_BETTER:
                return diff > absolute and ratio > relative

    def _copy_metric(metric, to):
        # Will add more types when more MetricValue are supported.
        to.double_value.value = float(metric)

    def _copy_threshold(threshold, to):
        if isinstance(threshold, config.GenericValueThreshold):
            to.value_threshold.CopyFrom(threshold)
        if isinstance(threshold, config.GenericChangeThreshold):
            to.change_threshold.CopyFrom(threshold)

    # A slice with no metrics is considered validated.
    result = validation_result_pb2.ValidationResult(validation_ok=True)
    validation_for_slice = validation_result_pb2.MetricsValidationForSlice()
    for metric_key, metric in metrics.items():
        # Not meaningful to check a threshold for the baseline model, so such
        # metrics are skipped (treated as passing). Message type metrics are
        # also not compared.
        if (metric_key.model_name == baseline_model_name
                or metric_key not in thresholds):
            continue
        msg = ''
        # We try to convert to float values.
        try:
            metric = float(metric)
        except (TypeError, ValueError):
            msg = """
        Invalid threshold config: This metric is not comparable to the
        threshold. The type of the metric is: {}, and the metric value is:
        \n{}""".format(type(metric), metric)
        if not _check_threshold(metric_key, metric):
            failure = validation_for_slice.failures.add()
            failure.metric_key.CopyFrom(metric_key.to_proto())
            _copy_metric(metric, failure.metric_value)
            _copy_threshold(thresholds[metric_key], failure.metric_threshold)
            failure.message = msg
    # Any failure leads to overall failure.
    if validation_for_slice.failures:
        validation_for_slice.slice_key.CopyFrom(
            slicer.serialize_slice_key(sliced_key))
        result.validation_ok = False
        result.metric_validations_per_slice.append(validation_for_slice)
    return result
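To make the GenericChangeThreshold branch above easier to follow, here is a plain-Python restatement of its arithmetic. This is an illustration of the logic, not TFMA code: diff stands for the change-metric value and baseline for the baseline metric value used in the ratio.

# Illustrative restatement of the change-threshold check: for LOWER_IS_BETTER
# both the absolute change and the relative change (diff / baseline) must stay
# below the configured bounds; for HIGHER_IS_BETTER they must exceed them.
# Unset bounds default to +/-inf so they can never cause a failure.
import numpy as np

def change_threshold_ok(diff, baseline, lower_is_better,
                        absolute=None, relative=None):
    ratio = diff / baseline
    if lower_is_better:
        absolute = np.inf if absolute is None else absolute
        relative = np.inf if relative is None else relative
        return diff < absolute and ratio < relative
    absolute = -np.inf if absolute is None else absolute
    relative = -np.inf if relative is None else relative
    return diff > absolute and ratio > relative

# Example: the candidate is 0.01 worse than a 0.90 baseline. With
# HIGHER_IS_BETTER and absolute=-0.02 (allowing a drop of up to 0.02),
# the check still passes.
assert change_threshold_ok(-0.01, 0.90, lower_is_better=False, absolute=-0.02)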
def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
        extracts: beam.pvalue.PCollection,
        eval_config: config.EvalConfig,
        metrics_specs: List[config.MetricsSpec],
        eval_shared_models: Optional[Dict[Text, types.EvalSharedModel]] = None,
        metrics_key: Text = constants.METRICS_KEY,
        plots_key: Text = constants.PLOTS_KEY,
        schema: Optional[schema_pb2.Schema] = None,
        random_seed_for_testing: Optional[int] = None) -> evaluator.Evaluation:
    """Computes metrics and plots.

    Args:
      extracts: PCollection of Extracts. If a query_key was used then the
        PCollection will contain a list of extracts.
      eval_config: Eval config.
      metrics_specs: Subset of the metric specs to compute metrics for. If a
        query_key was used all of the metric specs will be for the same
        query_key.
      eval_shared_models: Optional dict of shared models keyed by model name.
        Only required if there are metrics to be computed in-graph using the
        model.
      metrics_key: Name to use for metrics key in Evaluation output.
      plots_key: Name to use for plots key in Evaluation output.
      schema: A schema to use for customizing metrics and plots.
      random_seed_for_testing: Seed to use for unit testing.

    Returns:
      Evaluation containing dict of PCollections of (slice_key, results_dict)
      tuples where the dict is keyed by either the metrics_key (e.g. 'metrics')
      or plots_key (e.g. 'plots') depending on what the results_dict contains.
    """
    computations = []
    # Add default metric computations
    if eval_shared_models:
        for model_name, eval_shared_model in eval_shared_models.items():
            if not eval_shared_model.include_default_metrics:
                continue
            if eval_shared_model.model_type == constants.TF_KERAS:
                keras_specs = keras_util.metrics_specs_from_keras(
                    model_name, eval_shared_model.model_loader)
                metrics_specs = keras_specs + metrics_specs[:]
                # TODO(mdreves): Add support for calling keras.evaluate().
            elif (eval_shared_model.model_type == constants.TF_ESTIMATOR
                  and eval_constants.EVAL_TAG
                  in eval_shared_model.model_loader.tags):
                # Note that there is the possibility for metric naming collisions here
                # (e.g. 'auc' calculated within the EvalSavedModel as well as by AUC
                # metric computation performed outside the model). Currently the
                # overlapping metrics such as AUC that are computed outside the model
                # are all derived metrics so they will override the metrics calculated
                # by the model which is the desired behavior.
                computations.extend(
                    eval_saved_model_util.
                    metric_computations_using_eval_saved_model(
                        model_name, eval_shared_model.model_loader))
    # Add metric computations from specs
    computations_from_specs, derived_computations = (
        _filter_and_separate_computations(
            metric_specs.to_computations(metrics_specs,
                                         eval_config=eval_config,
                                         schema=schema)))
    computations.extend(computations_from_specs)

    # Find out which model is baseline.
    baseline_spec = model_util.get_baseline_model_spec(eval_config)
    baseline_model_name = baseline_spec.name if baseline_spec else None

    # pylint: disable=no-value-for-parameter

    # Input: Single extract per example (or list of extracts if query_key used)
    #        where each item contains slice keys and other extracts from upstream
    #        extractors (e.g. labels, predictions, etc).
    # Output: Single extract (per example) containing slice keys and initial
    #         combiner state returned from preprocessor. Note that even if a
    #         query_key was used the output is still only a single extract
    #         (though, that extract may contain lists of values (predictions,
    #         labels, etc) in its keys).
    #
    # Note that the output of this step is extracts instead of just a tuple of
    # computation outputs because FanoutSlices takes extracts as input (and in
    # many cases a subset of the extracts themselves are what is fanned out).
    extracts = (extracts
                | 'Preprocess' >> beam.ParDo(_PreprocessorDoFn(computations)))

    # Input: Single extract containing slice keys and initial combiner inputs. If
    #        query_key is used the extract represents multiple examples with the
    #        same query_key, otherwise the extract represents a single example.
    # Output: Tuple (slice key, combiner inputs extracts). Notice that the per
    #         example (or list of examples if query_key used) input extract turns
    #         into n logical extracts, references to which are replicated once per
    #         applicable slice key.
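    #
    # For example (illustrative values): an extract whose slice keys are
    # [(), (('country', 'US'),)] is replicated into two logical outputs, one
    # keyed by the overall slice () and one keyed by (('country', 'US'),).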
    slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

    slices_count = (slices
                    | 'ExtractSliceKeys' >> beam.Keys()
                    | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

    _ = (extracts.pipeline
         | 'IncrementMetricsSpecsCounters' >>
         counter_util.IncrementMetricsSpecsCounters(metrics_specs),
         slices_count
         | 'IncrementSliceSpecCounters' >>
         counter_util.IncrementSliceSpecCounters())

    ci_params = _get_confidence_interval_params(eval_config, metrics_specs)

    cross_slice_specs = []
    if eval_config.cross_slicing_specs:
        cross_slice_specs = eval_config.cross_slicing_specs

    # TODO(b/151482616): Make bootstrap and jackknife confidence interval
    # implementations more parallel.

    # Input: Tuple of (slice key, combiner input extracts).
    # Output: Tuple of (slice key, dict of computed metrics/plots). The dicts will
    #         be keyed by MetricKey/PlotKey and the values will be the result
    #         of the associated computations. A given MetricComputation can
    #         perform computations for multiple keys, but the keys should be
    #         unique across computations.
    sliced_metrics_and_plots = (
        slices
        |
        'ComputePerSlice' >> poisson_bootstrap.ComputeWithConfidenceIntervals(
            _ComputePerSlice,
            computations=computations,
            derived_computations=derived_computations,
            baseline_model_name=baseline_model_name,
            cross_slice_specs=cross_slice_specs,
            num_jackknife_samples=ci_params.num_jackknife_samples,
            num_bootstrap_samples=ci_params.num_bootstrap_samples,
            skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
            random_seed_for_testing=random_seed_for_testing))

    if eval_config.options.min_slice_size.value > 1:
        sliced_metrics_and_plots = (
            sliced_metrics_and_plots
            | 'FilterSmallSlices' >> slicer.FilterOutSlices(
                slices_count, eval_config.options.min_slice_size.value))

    sliced_metrics = (sliced_metrics_and_plots
                      | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                                      metric_types.MetricKey))
    sliced_plots = (
        sliced_metrics_and_plots
        |
        'FilterByPlots' >> beam.Map(_filter_by_key_type, metric_types.PlotKey))

    # pylint: enable=no-value-for-parameter

    return {metrics_key: sliced_metrics, plots_key: sliced_plots}
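_filter_by_key_type is defined elsewhere in this module; purely to illustrate what the FilterByMetrics/FilterByPlots steps above need per element, a hypothetical reimplementation might look like the following (not TFMA's actual helper).

# Hypothetical sketch of the per-element filtering behind 'FilterByMetrics'
# and 'FilterByPlots': keep only entries of the results dict whose key is an
# instance of the requested key type (MetricKey or PlotKey).
def _filter_by_key_type_sketch(sliced_results, key_type):
    slice_key, results = sliced_results
    return slice_key, {k: v for k, v in results.items()
                       if isinstance(k, key_type)}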
Example #5
def validate_metrics(
        sliced_metrics: Tuple[Union[slicer.SliceKeyType,
                                    slicer.CrossSliceKeyType],
                              Dict['metric_types.MetricKey',
                                   Any]], eval_config: config.EvalConfig
) -> validation_result_pb2.ValidationResult:
    """Check the metrics and check whether they should be validated."""
    # Find out which model is baseline.
    baseline_spec = model_util.get_baseline_model_spec(eval_config)
    baseline_model_name = baseline_spec.name if baseline_spec else None

    sliced_key, metrics = sliced_metrics
    thresholds = metric_specs.metric_thresholds_from_metrics_specs(
        eval_config.metrics_specs)  # pytype: disable=wrong-arg-types
    is_cross_slice = slicer.is_cross_slice_key(sliced_key)

    def _check_threshold(key: metric_types.MetricKey,
                         threshold: _ThresholdType, metric: Any) -> bool:
        """Verify a metric given its metric key and metric value."""
        if isinstance(threshold, config.GenericValueThreshold):
            lower_bound, upper_bound = -np.inf, np.inf
            if threshold.HasField('lower_bound'):
                lower_bound = threshold.lower_bound.value
            if threshold.HasField('upper_bound'):
                upper_bound = threshold.upper_bound.value
            return metric > lower_bound and metric < upper_bound
        elif isinstance(threshold, config.GenericChangeThreshold):
            diff = metric
            ratio = diff / metrics[key.make_baseline_key(baseline_model_name)]
            if threshold.direction == config.MetricDirection.LOWER_IS_BETTER:
                absolute, relative = np.inf, np.inf
            elif threshold.direction == config.MetricDirection.HIGHER_IS_BETTER:
                absolute, relative = -np.inf, -np.inf
            else:
                raise ValueError('"UNKNOWN" direction for change threshold.')
            if threshold.HasField('absolute'):
                absolute = threshold.absolute.value
            if threshold.HasField('relative'):
                relative = threshold.relative.value
            if threshold.direction == config.MetricDirection.LOWER_IS_BETTER:
                return diff < absolute and ratio < relative
            elif threshold.direction == config.MetricDirection.HIGHER_IS_BETTER:
                return diff > absolute and ratio > relative

    def _copy_metric(metric, to):
        # Will add more types when more MetricValue are supported.
        to.double_value.value = float(metric)

    def _copy_threshold(threshold, to):
        if isinstance(threshold, config.GenericValueThreshold):
            to.value_threshold.CopyFrom(threshold)
        if isinstance(threshold, config.GenericChangeThreshold):
            to.change_threshold.CopyFrom(threshold)

    def _add_to_set(s, v):
        """Adds value to set. Returns true if didn't exist."""
        if v in s:
            return False
        else:
            s.add(v)
            return True

    # A slice with no metrics is considered validated.
    result = validation_result_pb2.ValidationResult(validation_ok=True)
    validation_for_slice = validation_result_pb2.MetricsValidationForSlice()
    unchecked_thresholds = dict(thresholds)
    for metric_key, metric in metrics.items():
        if metric_key not in thresholds:
            continue
        del unchecked_thresholds[metric_key]
        # Not meaningful to check a threshold for the baseline model, so such
        # metrics are skipped (treated as passing). Message type metrics are
        # also not compared.
        if metric_key.model_name == baseline_model_name:
            continue
        msg = ''
        # We try to convert to float values.
        try:
            metric = float(metric)
        except (TypeError, ValueError):
            msg = """
        Invalid threshold config: This metric is not comparable to the
        threshold. The type of the metric is: {}, and the metric value is:
        \n{}""".format(type(metric), metric)
        existing_failures = set()
        for slice_spec, threshold in thresholds[metric_key]:
            if (slice_spec is not None
                    and isinstance(slice_spec, config.SlicingSpec)
                    and (is_cross_slice or not slicer.SingleSliceSpec(
                        spec=slice_spec).is_slice_applicable(sliced_key))):
                continue
            if (slice_spec is not None
                    and isinstance(slice_spec, config.CrossSlicingSpec) and
                (not is_cross_slice or not slicer.is_cross_slice_applicable(
                    cross_slice_key=sliced_key, cross_slicing_spec=slice_spec))
                ):
                continue
            if not _check_threshold(metric_key, threshold, metric):
                # The same threshold values could be set for multiple matching slice
                # specs. Only store the first match.
                #
                # Note that hashing by SerializeToString() is only safe if used within
                # the same process.
                if not _add_to_set(existing_failures,
                                   threshold.SerializeToString()):
                    continue
                failure = validation_for_slice.failures.add()
                failure.metric_key.CopyFrom(metric_key.to_proto())
                _copy_metric(metric, failure.metric_value)
                _copy_threshold(threshold, failure.metric_threshold)
                failure.message = msg
            # Record that a validation check was performed for this slice spec
            # and metric.
            slicing_details = result.validation_details.slicing_details.add()
            if slice_spec is not None:
                if isinstance(slice_spec, config.SlicingSpec):
                    slicing_details.slicing_spec.CopyFrom(slice_spec)
                else:
                    slicing_details.cross_slicing_spec.CopyFrom(slice_spec)
            else:
                slicing_details.slicing_spec.CopyFrom(config.SlicingSpec())
            slicing_details.num_matching_slices = 1
    # All unchecked thresholds are considered failures.
    for metric_key, sliced_thresholds in unchecked_thresholds.items():
        if metric_key.model_name == baseline_model_name:
            continue
        existing_failures = set()
        for _, threshold in sliced_thresholds:
            # The same threshold values could be set for multiple matching slice
            # specs. Only store the first match.
            #
            # Note that hashing by SerializeToString() is only safe if used within
            # the same process.
            if not _add_to_set(existing_failures,
                               threshold.SerializeToString()):
                continue
            failure = validation_for_slice.failures.add()
            failure.metric_key.CopyFrom(metric_key.to_proto())
            _copy_threshold(threshold, failure.metric_threshold)
            failure.message = 'Metric not found.'
    # Any failure leads to overall failure.
    if validation_for_slice.failures:
        if not is_cross_slice:
            validation_for_slice.slice_key.CopyFrom(
                slicer.serialize_slice_key(sliced_key))
        else:
            validation_for_slice.cross_slice_key.CopyFrom(
                slicer.serialize_cross_slice_key(sliced_key))
        result.validation_ok = False
        result.metric_validations_per_slice.append(validation_for_slice)
    return result
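This version also validates cross-slice thresholds. A minimal sketch of the cross-slicing configuration that produces cross-slice keys (the CrossSlicingSpec field names follow the TFMA config proto as I recall it; the feature name and values are made up):

# Sketch: compare each metric on the country='CA' slice against the
# country='US' baseline slice (assumed CrossSlicingSpec fields; hypothetical
# feature values).
eval_config = config.EvalConfig(
    model_specs=[config.ModelSpec(name='candidate')],
    cross_slicing_specs=[
        config.CrossSlicingSpec(
            baseline_spec=config.SlicingSpec(feature_values={'country': 'US'}),
            slicing_specs=[
                config.SlicingSpec(feature_values={'country': 'CA'})
            ])
    ])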
def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_config: config.EvalConfig,
    metrics_specs: List[config.MetricsSpec],
    eval_shared_models: Optional[Dict[Text, types.EvalSharedModel]] = None,
    metrics_key: Text = constants.METRICS_KEY,
    plots_key: Text = constants.PLOTS_KEY) -> evaluator.Evaluation:
  """Computes metrics and plots.

  Args:
    extracts: PCollection of Extracts. If a query_key was used then the
      PCollection will contain a list of extracts.
    eval_config: Eval config.
    metrics_specs: Subset of the metric specs to compute metrics for. If a
      query_key was used all of the metric specs will be for the same query_key.
    eval_shared_models: Optional dict of shared models keyed by model name. Only
      required if there are metrics to be computed in-graph using the model.
    metrics_key: Name to use for metrics key in Evaluation output.
    plots_key: Name to use for plots key in Evaluation output.

  Returns:
    Evaluation containing dict of PCollections of (slice_key, results_dict)
    tuples where the dict is keyed by either the metrics_key (e.g. 'metrics') or
    plots_key (e.g. 'plots') depending on what the results_dict contains.
  """
  computations = []
  model_loaders = None
  # Add default metric computations
  if eval_shared_models:
    model_loaders = {}
    for model_name, eval_shared_model in eval_shared_models.items():
      if not eval_shared_model.include_default_metrics:
        continue
      model_loader = eval_shared_model.model_loader
      model_loaders[model_name] = model_loader
      model_types = model_loader.construct_fn(lambda x: None)()
      if model_types.keras_model is not None:
        # TODO(mdreves): Move handling of keras metrics to here.
        pass
      elif model_types.eval_saved_model is not None:
        # Note that there is the possibility for metric naming collisions here
        # (e.g. 'auc' calculated within the EvalSavedModel as well as by AUC
        # metric computation performed outside the model). Currently the
        # overlapping metrics such as AUC that are computed outside the model
        # are all derived metrics so they will override the metrics calculated
        # by the model which is the desired behavior.
        computations.extend(
            eval_saved_model_util.metric_computations_using_eval_saved_model(
                model_name, model_loader))
  # Add metric computations from specs
  computations_from_specs, derived_computations = (
      _filter_and_separate_computations(
          metric_specs.to_computations(
              metrics_specs,
              eval_config=eval_config,
              model_loaders=model_loaders)))
  computations.extend(computations_from_specs)

  # Find out which model is baseline.
  baseline_spec = model_util.get_baseline_model_spec(eval_config)
  baseline_model_name = baseline_spec.name if baseline_spec else None

  # pylint: disable=no-value-for-parameter

  # Input: Single extract per example (or list of extracts if query_key used)
  #        where each item contains slice keys and other extracts from upstream
  #        extractors (e.g. labels, predictions, etc).
  # Output: Single extract (per example) containing slice keys and initial
  #         combiner state returned from preprocessor. Note that even if a
  #         query_key was used the output is still only a single extract
  #         (though, that extract may contain lists of values (predictions,
  #         labels, etc) in its keys).
  #
  # Note that the output of this step is extracts instead of just a tuple of
  # computation outputs because FanoutSlices takes extracts as input (and in
  # many cases a subset of the extracts themselves are what is fanned out).
  extracts = (
      extracts
      | 'Preprocess' >> beam.ParDo(_PreprocessorDoFn(computations)))

  # Input: Single extract containing slice keys and initial combiner inputs. If
  #        query_key is used the extract represents multiple examples with the
  #        same query_key, otherwise the extract represents a single example.
  # Output: Tuple (slice key, combiner inputs extracts). Notice that the per
  #         example (or list of examples if query_key used) input extract turns
  #         into n logical extracts, references to which are replicated once per
  #         applicable slice key.
  slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

  slices_count = (
      slices
      | 'ExtractSliceKeys' >> beam.Keys()
      | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

  # Input: Tuple of (slice key, combiner input extracts).
  # Output: Tuple of (slice key, dict of computed metrics/plots). The dicts will
  #         be keyed by MetricKey/PlotKey and the values will be the result
  #         of the associated computations. A given MetricComputation can
  #         perform computations for multiple keys, but the keys should be
  #         unique across computations.
  sliced_metrics_and_plots = (
      slices
      | 'ComputePerSlice' >> poisson_bootstrap.ComputeWithConfidenceIntervals(
          _ComputePerSlice,
          computations=computations,
          derived_computations=derived_computations,
          baseline_model_name=baseline_model_name,
          num_bootstrap_samples=(
              poisson_bootstrap.DEFAULT_NUM_BOOTSTRAP_SAMPLES if
              eval_config.options.compute_confidence_intervals.value else 1)))

  if eval_config.options.k_anonymization_count.value > 1:
    sliced_metrics_and_plots = (
        sliced_metrics_and_plots
        | 'FilterForSmallSlices' >> slicer.FilterOutSlices(
            slices_count, eval_config.options.k_anonymization_count.value))

  sliced_metrics = (
      sliced_metrics_and_plots
      | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                      metric_types.MetricKey))
  sliced_plots = (
      sliced_metrics_and_plots
      | 'FilterByPlots' >> beam.Map(_filter_by_key_type, metric_types.PlotKey))

  # pylint: enable=no-value-for-parameter

  return {metrics_key: sliced_metrics, plots_key: sliced_plots}