Example #1
def get_missing_slices(
    slicing_details: Iterable[validation_result_pb2.SlicingDetails],
    eval_config: config_pb2.EvalConfig
) -> List[Union[config_pb2.SlicingSpec, config_pb2.CrossSlicingSpec]]:
    """Returns specs that are defined in the EvalConfig but not found in details.

  Args:
    slicing_details: Slicing details.
    eval_config: Eval config.

  Returns:
    List of missing slices or empty list if none are missing.
  """
    hashed_details = _hashed_slicing_details(slicing_details)
    thresholds = metric_specs.metric_thresholds_from_metrics_specs(
        eval_config.metrics_specs)
    baseline_spec = model_util.get_baseline_model_spec(eval_config)
    baseline_model_name = baseline_spec.name if baseline_spec else None
    missing_slices = []
    for metric_key, sliced_thresholds in thresholds.items():
        # Skip baseline.
        if metric_key.model_name == baseline_model_name:
            continue
        for slice_spec, _ in sliced_thresholds:
            if not slice_spec:
                slice_spec = config_pb2.SlicingSpec()
            slice_hash = slice_spec.SerializeToString()
            if slice_hash not in hashed_details:
                missing_slices.append(slice_spec)
                # Same slice may be used by other metrics/thresholds, only add once
                hashed_details[
                    slice_hash] = validation_result_pb2.SlicingDetails()
    return missing_slices
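
The `_hashed_slicing_details` helper referenced above is not shown in this excerpt. A plausible sketch (import path and field usage are assumptions inferred from the surrounding code) would index each SlicingDetails by the serialized form of its slicing spec, matching the SerializeToString() lookups in the loop:

from typing import Dict, Iterable

from tensorflow_model_analysis.proto import validation_result_pb2


def _hashed_slicing_details(
    slicing_details: Iterable[validation_result_pb2.SlicingDetails]
) -> Dict[bytes, validation_result_pb2.SlicingDetails]:
    """Sketch: indexes slicing details by their serialized slicing spec."""
    hashed_details = {}
    for details in slicing_details:
        # Hashing by SerializeToString() is only stable within a single
        # process (the same caveat noted in validate_metrics below). Cross
        # slicing specs are omitted here for brevity.
        hashed_details[details.slicing_spec.SerializeToString()] = details
    return hashed_details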
Example #2
def symmetric_prediction_difference_computations(
        name: str = SYMMETRIC_PREDICITON_DIFFERENCE_NAME,
        eval_config: Optional[config_pb2.EvalConfig] = None,
        model_names: Optional[List[str]] = None,
        output_names: Optional[List[str]] = None,
        sub_keys: Optional[List[metric_types.SubKey]] = None,
        example_weighted: bool = False) -> metric_types.MetricComputations:
    """Returns metric computations for SymmetricPredictionDifference.

  This is not meant to be used with merge_per_key_computations because we
  don't want to create computations for the baseline model, and we want to
  provide the baseline model name to each Combiner.

  Args:
    name: The name of the metric returned by the computations.
    eval_config: The EvalConfig for this TFMA evaluation.
    model_names: The set of models for which to compute this metric.
    output_names: The set of output names for which to compute this metric.
    sub_keys: The set of sub_key settings for which to compute this metric.
    example_weighted: Whether to compute this metric using example weights.
  """
    computations = []
    baseline_spec = model_util.get_baseline_model_spec(eval_config)
    baseline_model_name = baseline_spec.name if baseline_spec else None
    for model_name in model_names or ['']:
        if model_name == baseline_model_name:
            continue
        for output_name in output_names or ['']:
            for sub_key in sub_keys or [None]:
                key = metric_types.MetricKey(name=name,
                                             model_name=model_name,
                                             output_name=output_name,
                                             sub_key=sub_key,
                                             example_weighted=example_weighted,
                                             is_diff=True)
                computations.append(
                    metric_types.MetricComputation(
                        keys=[key],
                        preprocessor=None,
                        combiner=_SymmetricPredictionDifferenceCombiner(
                            eval_config, baseline_model_name, model_name,
                            output_name, key, example_weighted)))
    return computations
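
A hypothetical direct call to the factory above; the model names and the EvalConfig wiring are illustrative assumptions, not taken from the original module:

from tensorflow_model_analysis.proto import config_pb2

# Two models, one of them flagged as the baseline via is_baseline.
eval_config = config_pb2.EvalConfig(model_specs=[
    config_pb2.ModelSpec(name='baseline', is_baseline=True),
    config_pb2.ModelSpec(name='candidate'),
])
computations = symmetric_prediction_difference_computations(
    eval_config=eval_config,
    model_names=['baseline', 'candidate'],
    output_names=[''])
# The baseline model is skipped, but its name is passed to each combiner so
# the symmetric difference can be computed against it.
assert all(c.keys[0].model_name == 'candidate' for c in computations)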
Example #3
def validate_metrics(
    sliced_metrics: Tuple[Union[slicer.SliceKeyType, slicer.CrossSliceKeyType],
                          Dict['metric_types.MetricKey',
                               Any]], eval_config: config_pb2.EvalConfig
) -> validation_result_pb2.ValidationResult:
    """Check the metrics and check whether they should be validated."""
    # Find out which model is baseline.
    baseline_spec = model_util.get_baseline_model_spec(eval_config)
    baseline_model_name = baseline_spec.name if baseline_spec else None

    sliced_key, metrics = sliced_metrics
    thresholds = metric_specs.metric_thresholds_from_metrics_specs(
        eval_config.metrics_specs)
    is_cross_slice = slicer.is_cross_slice_key(sliced_key)

    def _check_threshold(key: metric_types.MetricKey,
                         threshold: _ThresholdType, metric: Any) -> bool:
        """Verify a metric given its metric key and metric value."""
        metric = float(metric)
        if isinstance(threshold, config_pb2.GenericValueThreshold):
            lower_bound, upper_bound = -np.inf, np.inf
            if threshold.HasField('lower_bound'):
                lower_bound = threshold.lower_bound.value
            if threshold.HasField('upper_bound'):
                upper_bound = threshold.upper_bound.value
            return metric >= lower_bound and metric <= upper_bound
        elif isinstance(threshold, config_pb2.GenericChangeThreshold):
            diff = metric
            metric_baseline = float(
                metrics[key.make_baseline_key(baseline_model_name)])
            if math.isclose(metric_baseline, 0.0):
                ratio = float('nan')
            else:
                ratio = diff / metric_baseline
            if threshold.direction == config_pb2.MetricDirection.LOWER_IS_BETTER:
                absolute, relative = np.inf, np.inf
            elif threshold.direction == config_pb2.MetricDirection.HIGHER_IS_BETTER:
                absolute, relative = -np.inf, -np.inf
            else:
                raise ValueError(
                    '"UNKNOWN" direction for change threshold: {}.'.format(
                        threshold))
            if threshold.HasField('absolute'):
                absolute = threshold.absolute.value
            if threshold.HasField('relative'):
                relative = threshold.relative.value
            if threshold.direction == config_pb2.MetricDirection.LOWER_IS_BETTER:
                return diff <= absolute and ratio <= relative
            elif threshold.direction == config_pb2.MetricDirection.HIGHER_IS_BETTER:
                return diff >= absolute and ratio >= relative
        else:
            raise ValueError('Unknown threshold: {}'.format(threshold))

    def _copy_metric(metric, to):
        # Will add more types when more MetricValue are supported.
        to.double_value.value = float(metric)

    def _copy_threshold(threshold, to):
        if isinstance(threshold, config_pb2.GenericValueThreshold):
            to.value_threshold.CopyFrom(threshold)
        if isinstance(threshold, config_pb2.GenericChangeThreshold):
            to.change_threshold.CopyFrom(threshold)

    def _add_to_set(s, v):
        """Adds value to set. Returns true if didn't exist."""
        if v in s:
            return False
        else:
            s.add(v)
            return True

    # Empty metrics per slice is considered validated.
    result = validation_result_pb2.ValidationResult(validation_ok=True)
    validation_for_slice = validation_result_pb2.MetricsValidationForSlice()
    unchecked_thresholds = dict(thresholds)
    for metric_key, metric in metrics.items():
        if metric_key not in thresholds:
            continue
        del unchecked_thresholds[metric_key]
        # Checking a threshold against the baseline model itself is not
        # meaningful, so any threshold configured for the baseline is treated
        # as passing (skipped). Message-type metrics are also not compared.
        if metric_key.model_name == baseline_model_name:
            continue
        msg = ''
        existing_failures = set()
        for slice_spec, threshold in thresholds[metric_key]:
            if slice_spec is not None:
                if (isinstance(slice_spec, config_pb2.SlicingSpec)
                        and (is_cross_slice or not slicer.SingleSliceSpec(
                            spec=slice_spec).is_slice_applicable(sliced_key))):
                    continue
                if (isinstance(slice_spec, config_pb2.CrossSlicingSpec)
                        and (not is_cross_slice
                             or not slicer.is_cross_slice_applicable(
                                 cross_slice_key=sliced_key,
                                 cross_slicing_spec=slice_spec))):
                    continue
            elif is_cross_slice:
                continue
            try:
                check_result = _check_threshold(metric_key, threshold, metric)
            except ValueError:
                msg = """
          Invalid metrics or threshold for comparison: The type of the metric
          is: {}, the metric value is: {}, and the threshold is: {}.
          """.format(type(metric), metric, threshold)
                check_result = False
            else:
                msg = ''
            if not check_result:
                # The same threshold values could be set for multiple matching slice
                # specs. Only store the first match.
                #
                # Note that hashing by SerializeToString() is only safe if used within
                # the same process.
                if not _add_to_set(existing_failures,
                                   threshold.SerializeToString()):
                    continue
                failure = validation_for_slice.failures.add()
                failure.metric_key.CopyFrom(metric_key.to_proto())
                _copy_metric(metric, failure.metric_value)
                _copy_threshold(threshold, failure.metric_threshold)
                failure.message = msg
            # Track we have completed a validation check for slice spec and metric
            slicing_details = result.validation_details.slicing_details.add()
            if slice_spec is not None:
                if isinstance(slice_spec, config_pb2.SlicingSpec):
                    slicing_details.slicing_spec.CopyFrom(slice_spec)
                else:
                    slicing_details.cross_slicing_spec.CopyFrom(slice_spec)
            else:
                slicing_details.slicing_spec.CopyFrom(config_pb2.SlicingSpec())
            slicing_details.num_matching_slices = 1
    # All unchecked thresholds are considered failures.
    for metric_key, thresholds in unchecked_thresholds.items():
        if metric_key.model_name == baseline_model_name:
            continue
        existing_failures = set()
        for slice_spec, threshold in thresholds:
            if slice_spec is not None:
                if is_cross_slice != isinstance(slice_spec,
                                                config_pb2.CrossSlicingSpec):
                    continue
                if (is_cross_slice
                        and not slicer.is_cross_slice_applicable(
                            cross_slice_key=sliced_key,
                            cross_slicing_spec=slice_spec)):
                    continue
            elif is_cross_slice:
                continue
            # The same threshold values could be set for multiple matching slice
            # specs. Only store the first match.
            #
            # Note that hashing by SerializeToString() is only safe if used within
            # the same process.
            if not _add_to_set(existing_failures,
                               threshold.SerializeToString()):
                continue
            failure = validation_for_slice.failures.add()
            failure.metric_key.CopyFrom(metric_key.to_proto())
            _copy_threshold(threshold, failure.metric_threshold)
            failure.message = 'Metric not found.'
    # Any failure leads to overall failure.
    if validation_for_slice.failures:
        if not is_cross_slice:
            validation_for_slice.slice_key.CopyFrom(
                slicer.serialize_slice_key(sliced_key))
        else:
            validation_for_slice.cross_slice_key.CopyFrom(
                slicer.serialize_cross_slice_key(sliced_key))
        result.validation_ok = False
        result.metric_validations_per_slice.append(validation_for_slice)
    return result
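
A hypothetical invocation of validate_metrics for the overall (empty) slice; the metric names, model names, and values are made up for illustration, and eval_config is assumed to carry matching thresholds:

from tensorflow_model_analysis.metrics import metric_types  # import path assumed

overall_slice_key = ()  # the empty tuple denotes the overall slice
sliced_metrics = (
    overall_slice_key,
    {
        metric_types.MetricKey(name='auc', model_name='candidate'): 0.83,
        metric_types.MetricKey(name='auc', model_name='baseline'): 0.80,
    },
)
result = validate_metrics(sliced_metrics, eval_config)
# validation_ok is False if any configured threshold fails or was never checked.
print(result.validation_ok)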
Example #4
def _ComputeMetricsAndPlots(  # pylint: disable=invalid-name
    extracts: beam.pvalue.PCollection,
    eval_config: config_pb2.EvalConfig,
    metrics_specs: List[config_pb2.MetricsSpec],
    eval_shared_models: Optional[Dict[Text, types.EvalSharedModel]] = None,
    metrics_key: Text = constants.METRICS_KEY,
    plots_key: Text = constants.PLOTS_KEY,
    attributions_key: Text = constants.ATTRIBUTIONS_KEY,
    schema: Optional[schema_pb2.Schema] = None,
    random_seed_for_testing: Optional[int] = None,
    tensor_adapter_config: Optional[tensor_adapter.TensorAdapterConfig] = None
) -> evaluator.Evaluation:
  """Computes metrics and plots.

  Args:
    extracts: PCollection of Extracts. If a query_key was used then the
      PCollection will contain a list of extracts.
    eval_config: Eval config.
    metrics_specs: Subset of the metric specs to compute metrics for. If a
      query_key was used all of the metric specs will be for the same query_key.
    eval_shared_models: Optional dict of shared models keyed by model name. Only
      required if there are metrics to be computed in-graph using the model.
    metrics_key: Name to use for metrics key in Evaluation output.
    plots_key: Name to use for plots key in Evaluation output.
    attributions_key: Name to use for attributions key in Evaluation output.
    schema: A schema to use for customizing metrics and plots.
    random_seed_for_testing: Seed to use for unit testing.
    tensor_adapter_config: Tensor adapter config which specifies how to obtain
      tensors from the Arrow RecordBatch. The model's signature will be invoked
      with those tensors (matched by names). If None, an attempt will be made to
      create an adapter based on the model's input signature; otherwise the model
      will be invoked with raw examples (assuming a signature of a single 1-D
      string tensor).

  Returns:
    Evaluation containing dict of PCollections of (slice_key, results_dict)
    tuples where the dict is keyed by either the metrics_key (e.g. 'metrics'),
    plots_key (e.g. 'plots'), or attributions_key (e.g. 'attributions')
    depending on what the results_dict contains.
  """
  computations = []
  # Add default metric computations
  if eval_shared_models:
    # Note that there is the possibility for metric naming collisions here
    # (e.g. 'auc' calculated within the model as well as by the AUC metric
    # computation performed outside the model). Currently, the overlapping
    # metrics such as AUC that are computed outside the model are all derived
    # metrics, so they override the metrics calculated by the model, which is
    # the desired behavior.
    for model_name, eval_shared_model in eval_shared_models.items():
      if not eval_shared_model.include_default_metrics:
        continue
      if eval_shared_model.model_type == constants.TF_KERAS:
        computations.extend(
            keras_util.metric_computations_using_keras_saved_model(
                model_name, eval_shared_model.model_loader, eval_config,
                tensor_adapter_config))
      elif (eval_shared_model.model_type == constants.TF_ESTIMATOR and
            eval_constants.EVAL_TAG in eval_shared_model.model_loader.tags):
        computations.extend(
            eval_saved_model_util.metric_computations_using_eval_saved_model(
                model_name, eval_shared_model.model_loader))
  # Add metric computations from specs
  metric_computations = _filter_and_separate_computations(
      metric_specs.to_computations(
          metrics_specs, eval_config=eval_config, schema=schema))
  computations.extend(metric_computations.non_derived_computations)

  # Find out which model is baseline.
  baseline_spec = model_util.get_baseline_model_spec(eval_config)
  baseline_model_name = baseline_spec.name if baseline_spec else None

  # pylint: disable=no-value-for-parameter

  # Input: Single extract per example (or list of extracts if query_key used)
  #        where each item contains slice keys and other extracts from upstream
  #        extractors (e.g. labels, predictions, etc).
  # Output: Single extract (per example) containing slice keys and initial
  #         combiner state returned from preprocessor. Note that even if a
  #         query_key was used the output is still only a single extract
  #         (though, that extract may contain lists of values (predictions,
  #         labels, etc) in its keys).
  #
  # Note that the output of this step is extracts instead of just a tuple of
  # computation outputs because FanoutSlices takes extracts as input (and in
  # many cases a subset of the extracts themselves are what is fanned out).
  extracts = (
      extracts
      | 'Preprocesss' >> beam.ParDo(_PreprocessorDoFn(computations)))

  # Input: Single extract containing slice keys and initial combiner inputs. If
  #        query_key is used the extract represents multiple examples with the
  #        same query_key, otherwise the extract represents a single example.
  # Output: Tuple (slice key, combiner inputs extracts). Notice that the per
  #         example (or list of examples if query_key used) input extract turns
  #         into n logical extracts, references to which are replicated once per
  #         applicable slice key.
  slices = extracts | 'FanoutSlices' >> slicer.FanoutSlices()

  slices_count = (
      slices
      | 'ExtractSliceKeys' >> beam.Keys()
      | 'CountPerSliceKey' >> beam.combiners.Count.PerElement())

  model_types = _get_model_types_for_logging(eval_shared_models)

  _ = (
      extracts.pipeline
      | 'IncrementMetricsSpecsCounters' >>
      counter_util.IncrementMetricsSpecsCounters(metrics_specs, model_types),
      slices_count
      |
      'IncrementSliceSpecCounters' >> counter_util.IncrementSliceSpecCounters())

  ci_params = _get_confidence_interval_params(eval_config, metrics_specs)

  cross_slice_specs = []
  if eval_config.cross_slicing_specs:
    cross_slice_specs = eval_config.cross_slicing_specs

  computations_combine_fn = _ComputationsCombineFn(computations=computations)
  derived_metrics_ptransform = _AddDerivedCrossSliceAndDiffMetrics(
      metric_computations.derived_computations,
      metric_computations.cross_slice_computations, cross_slice_specs,
      baseline_model_name)

  # Input: Tuple of (slice key, combiner input extracts).
  # Output: Tuple of (slice key, dict of computed metrics/plots/attributions).
  #         The dicts will be keyed by MetricKey/PlotKey/AttributionsKey and the
  #         values will be the result of the associated computations. A given
  #         MetricComputation can perform computations for multiple keys, but
  #         the keys should be unique across computations.
  if ci_params.num_bootstrap_samples:
    sliced_metrics_plots_and_attributions = (
        slices | 'PoissonBootstrapConfidenceIntervals' >>
        poisson_bootstrap.ComputeWithConfidenceIntervals(
            computations_combine_fn=computations_combine_fn,
            derived_metrics_ptransform=derived_metrics_ptransform,
            num_bootstrap_samples=ci_params.num_bootstrap_samples,
            hot_key_fanout=_COMBINE_PER_SLICE_KEY_HOT_KEY_FANOUT,
            skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
            random_seed_for_testing=random_seed_for_testing))
  elif ci_params.num_jackknife_samples:
    sliced_metrics_plots_and_attributions = (
        slices
        | 'JackknifeConfidenceIntervals' >>
        jackknife.ComputeWithConfidenceIntervals(
            computations_combine_fn=computations_combine_fn,
            derived_metrics_ptransform=derived_metrics_ptransform,
            num_jackknife_samples=ci_params.num_jackknife_samples,
            skip_ci_metric_keys=ci_params.skip_ci_metric_keys,
            random_seed_for_testing=random_seed_for_testing))
  else:
    sliced_metrics_plots_and_attributions = (
        slices
        |
        'CombineMetricsPerSlice' >> beam.CombinePerKey(computations_combine_fn)
        .with_hot_key_fanout(_COMBINE_PER_SLICE_KEY_HOT_KEY_FANOUT)
        | 'AddDerivedCrossSliceAndDiffMetrics' >> derived_metrics_ptransform)

  sliced_metrics_plots_and_attributions = (
      sliced_metrics_plots_and_attributions
      | 'AddCIDerivedMetrics' >> beam.Map(
          _add_ci_derived_metrics, metric_computations.ci_derived_computations))

  if eval_config.options.min_slice_size.value > 1:
    sliced_metrics_plots_and_attributions = (
        sliced_metrics_plots_and_attributions
        | 'FilterSmallSlices' >> slicer.FilterOutSlices(
            slices_count, eval_config.options.min_slice_size.value))

  sliced_metrics = (
      sliced_metrics_plots_and_attributions
      | 'FilterByMetrics' >> beam.Map(_filter_by_key_type,
                                      metric_types.MetricKey))
  sliced_plots = (
      sliced_metrics_plots_and_attributions
      | 'FilterByPlots' >> beam.Map(_filter_by_key_type, metric_types.PlotKey))

  sliced_attributions = (
      sliced_metrics_plots_and_attributions
      | 'FilterByAttributions' >> beam.Map(_filter_by_key_type,
                                           metric_types.AttributionsKey))

  # pylint: enable=no-value-for-parameter

  return {
      metrics_key: sliced_metrics,
      plots_key: sliced_plots,
      attributions_key: sliced_attributions
  }
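
The pylint no-value-for-parameter hint above suggests this function is applied as a Beam PTransform (e.g. wrapped with beam.ptransform_fn in the surrounding module, which is not shown in this excerpt). A rough, hypothetical application could look like the following; extracts and eval_config are placeholders produced by earlier pipeline stages, and the final logging step is illustrative only:

import apache_beam as beam

# Sketch only: assumes _ComputeMetricsAndPlots is wrapped with
# @beam.ptransform_fn in the full module.
evaluation = (
    extracts
    | 'ComputeMetricsAndPlots' >> _ComputeMetricsAndPlots(
        eval_config=eval_config,
        metrics_specs=list(eval_config.metrics_specs)))
# The result is a dict of PCollections keyed by the metrics/plots/attributions
# keys; each element is a (slice_key, results_dict) tuple.
_ = evaluation[constants.METRICS_KEY] | 'LogMetrics' >> beam.Map(print)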
Example #5
    def extract_label_prediction_and_weight(
        inputs: metric_types.StandardMetricInputs,
        eval_config: Optional[config_pb2.EvalConfig] = None,
        model_name: str = '',
        output_name: str = '',
        sub_key: Optional[metric_types.SubKey] = None,
        aggregation_type: Optional[metric_types.AggregationType] = None,
        class_weights: Optional[Dict[int, float]] = None,
        example_weighted: bool = False,
        fractional_labels: bool = False,
        flatten: bool = True,
    ) -> Iterator[Tuple[np.ndarray, np.ndarray, np.ndarray]]:
        """Yields label, prediction, and example weights to be used in calculations.

    This is a customized version of metric_util.to_label_prediction_example_weight
    that yields the original prediction as the label and the counterfactual
    prediction as the prediction, so that flip count metrics can be derived from
    the false positives and false negatives of the binary confusion matrix.

    Args:
      inputs: Standard metric inputs.
      eval_config: Eval config
      model_name: Optional model name (if multi-model evaluation).
      output_name: Optional output name (if multi-output model type).
      sub_key: Optional sub key. (unused)
      aggregation_type: Optional aggregation type. (unused)
      class_weights: Optional class weights to apply to multi-class /
        multi-label labels and predictions. (unused)
      example_weighted: True if example weights should be applied.
      fractional_labels: If true, each incoming tuple of (label, prediction,
        example weight) will be split into two tuples as follows (where l, p, w
        represent the resulting label, prediction, and example weight values):
          (1) l = 0.0, p = prediction, and w = example_weight * (1.0 - label)
          (2) l = 1.0, p = prediction, and w = example_weight * label If
          enabled, an exception will be raised if labels are not within [0, 1].
          The implementation is such that tuples associated with a weight of
          zero are not yielded. This means it is safe to enable
          fractional_labels even when the labels only take on the values of 0.0
          or 1.0. (unused)
      flatten: True to flatten the final label and prediction outputs so that
        the yielded values are always arrays of size 1. For example, multi-class
        /multi-label outputs would be converted into label and prediction pairs
        that could then be processed by a binary classification metric in order
        to compute a micro average over all classes. (unused)

    Yields:
      Tuple of (label, prediction, example_weight).

    Raises:
      ValueError: If counterfactual prediction key is not found within either
        the features or predictions.
      ValueError: If predictions is None or empty.
    """
        del (sub_key, aggregation_type, class_weights, fractional_labels,
             flatten)  # unused

        # TODO(sokeefe): Look into removing the options to pass counterfactual
        # predictions in a feature and instead as a baseline model.
        if (counterfactual_prediction_key is not None
                and counterfactual_prediction_key in inputs.features):
            counterfactual_prediction = inputs.features[
                counterfactual_prediction_key]
        elif eval_config is not None:
            counterfactual_model_spec = model_util.get_baseline_model_spec(
                eval_config)
            if counterfactual_model_spec is not None:
                _, counterfactual_prediction, _ = next(
                    metric_util.to_label_prediction_example_weight(
                        inputs,
                        eval_config=eval_config,
                        model_name=counterfactual_model_spec.name,
                        output_name=output_name,
                        example_weighted=example_weighted,
                        fractional_labels=False,  # Labels are ignored for flip counts.
                        flatten=False,  # Flattened below
                        allow_none=True,  # Allow None labels
                        require_single_example_weight=True))
            else:
                raise ValueError(
                    'The Counterfactual model must be listed with '
                    f'`is_baseline` equal to `True`. Found: {eval_config}')
        else:
            raise ValueError(
                '`counterfactual_prediction` was not found within the provided '
                'inputs. It must be included as either a feature key or within the '
                'predictions. Found:\n'
                f'`counterfactual_prediction_key`: {counterfactual_prediction_key}\n'
                f'`inputs.prediction`:{inputs.prediction}')

        if counterfactual_prediction is None:
            raise ValueError(
                '%s feature key is None (required for FlipCount metric)' %
                counterfactual_prediction_key)

        def get_by_keys(value: Any, keys: List[str]) -> Any:
            if isinstance(value, dict):
                new_value = util.get_by_keys(value, keys, optional=True)
                if new_value is not None:
                    return new_value
            return value

        if model_name:
            counterfactual_prediction = get_by_keys(counterfactual_prediction,
                                                    [model_name])
        if output_name:
            counterfactual_prediction = get_by_keys(counterfactual_prediction,
                                                    [output_name])

        _, prediction, example_weight = next(
            metric_util.to_label_prediction_example_weight(
                inputs,
                eval_config=eval_config,
                model_name=model_name,
                output_name=output_name,
                example_weighted=example_weighted,
                fractional_labels=False,  # Labels are ignored for flip counts.
                flatten=False,  # Flattened below
                allow_none=True,  # Allow None labels
                require_single_example_weight=True))

        if prediction.size != counterfactual_prediction.size:
            raise ValueError(
                'prediction and counterfactual_prediction size should be same for '
                'FlipCount metric, %f != %f' %
                (prediction.size, counterfactual_prediction.size))

        if prediction.size == 0:
            raise ValueError(
                'prediction is empty (required for FlipCount metric)')
        else:  # Always flatten
            example_weight = np.array(
                [float(example_weight) for i in range(prediction.shape[-1])])
            for p, cfp, w in zip(prediction.flatten(),
                                 counterfactual_prediction.flatten(),
                                 example_weight.flatten()):
                yield np.array([p]), np.array([cfp]), np.array([w])
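
For the baseline-model branch above (where the counterfactual prediction is not passed as a feature), the counterfactual model has to be flagged as the baseline so that model_util.get_baseline_model_spec can find it. A minimal hypothetical config (model names are assumptions) is:

from tensorflow_model_analysis.proto import config_pb2

# Hypothetical: the counterfactual model is marked is_baseline=True.
eval_config = config_pb2.EvalConfig(model_specs=[
    config_pb2.ModelSpec(name='original'),
    config_pb2.ModelSpec(name='counterfactual', is_baseline=True),
])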
Example #6
    def test_get_baseline_model(self, eval_config,
                                expected_baseline_model_spec):
        self.assertEqual(expected_baseline_model_spec,
                         model_util.get_baseline_model_spec(eval_config))
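
The test above is parameterized; the original test data is not shown here, but a hypothetical parameterization (decorator, import paths, and spec values are assumptions) might look like:

from absl.testing import parameterized
from tensorflow_model_analysis.proto import config_pb2
from tensorflow_model_analysis import model_util  # import path assumed

_BASELINE_SPEC = config_pb2.ModelSpec(name='baseline', is_baseline=True)


class ModelUtilTest(parameterized.TestCase):

    @parameterized.named_parameters(
        ('has_baseline',
         config_pb2.EvalConfig(model_specs=[
             config_pb2.ModelSpec(name='candidate'), _BASELINE_SPEC
         ]), _BASELINE_SPEC),
        ('no_baseline',
         config_pb2.EvalConfig(
             model_specs=[config_pb2.ModelSpec(name='candidate')]), None),
    )
    def test_get_baseline_model(self, eval_config,
                                expected_baseline_model_spec):
        # With a baseline model the matching ModelSpec is returned; without
        # one, get_baseline_model_spec is expected to return None.
        self.assertEqual(expected_baseline_model_spec,
                         model_util.get_baseline_model_spec(eval_config))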