def convert_slicing_metrics_to_ui_input(
    slicing_metrics: List[Tuple[slicer.SliceKeyOrCrossSliceKeyType,
                                view_types.MetricsByOutputName]],
    slicing_column: Optional[str] = None,
    slicing_spec: Optional[slicer.SingleSliceSpec] = None,
    output_name: str = '',
    multi_class_key: str = '') -> Optional[List[Dict[str, Any]]]:
  """Converts slicing metrics into the list-of-dicts format used by the UI.

  Args:
    slicing_metrics: tfma.EvalResult.slicing_metrics.
    slicing_column: The slicing column to filter results. If both
      slicing_column and slicing_spec are None, show all eval results.
    slicing_spec: The slicing spec to filter results. If both slicing_column
      and slicing_spec are None, show all eval results.
    output_name: The output name associated with metric (for multi-output
      models).
    multi_class_key: The multi-class key associated with metric (for
      multi-class models).

  Returns:
    A list of dicts for each slice, where each dict contains keys
    'sliceValue', 'slice', and 'metrics'.

  Raises:
    ValueError: If no related eval result is found, or if both slicing_column
      and slicing_spec are set.
  """
  if slicing_column and slicing_spec:
    raise ValueError(
        'Only one of the "slicing_column" and "slicing_spec" parameters '
        'can be set.')
  if slicing_column:
    # A bare column name is shorthand for a single-column slice spec.
    slicing_spec = slicer.SingleSliceSpec(columns=[slicing_column])

  rows = []
  for slice_key, metric_value in slicing_metrics:
    # Skip entries that carry no metrics for the requested output / class key.
    if metric_value is None or output_name not in metric_value:
      continue
    if multi_class_key not in metric_value[output_name]:
      continue
    metrics = metric_value[output_name][multi_class_key]
    if slicer.is_cross_slice_key(slice_key):
      # Cross-slice comparison entries are expanded by a dedicated helper.
      _add_cross_slice_key_data(slice_key, metrics, rows)
    elif (slicing_spec is None or not slice_key or
          slicing_spec.is_slice_applicable(slice_key)):
      # Regular slices: include when unfiltered, the overall (empty) slice,
      # or when the slice matches the requested spec.
      rows.append({
          'sliceValue': stringify_slice_key_value(slice_key),
          'slice': slicer.stringify_slice_key(slice_key),
          'metrics': metrics,
      })

  if not rows:
    raise ValueError(
        'No eval result found for output_name:"%s" and '
        'multi_class_key:"%s" and slicing_column:"%s" and slicing_spec:"%s".' %
        (output_name, multi_class_key, slicing_column, slicing_spec))
  return rows
def convert_slice_attributions_to_proto(
    attributions: Tuple[slicer.SliceKeyOrCrossSliceKeyType,
                        Dict[Any, Dict[Text, Any]]]
) -> metrics_for_slice_pb2.AttributionsForSlice:
  """Converts the given slice attributions into serialized AtributionsForSlice.

  Args:
    attributions: The slice attributions.

  Returns:
    The AttributionsForSlice proto.

  Raises:
    TypeError: If the type of the feature value in slice key cannot be
      recognized.
  """
  result = metrics_for_slice_pb2.AttributionsForSlice()
  slice_key, slice_attributions = attributions
  if slicer.is_cross_slice_key(slice_key):
    result.cross_slice_key.CopyFrom(
        slicer.serialize_cross_slice_key(slice_key))
  else:
    result.slice_key.CopyFrom(slicer.serialize_slice_key(slice_key))

  # Copy so the caller's dict is never mutated by this conversion.
  slice_attributions = slice_attributions.copy()
  # Sort keys for a deterministic proto layout.
  for key in sorted(slice_attributions.keys()):
    key_and_value = result.attributions_keys_and_values.add()
    key_and_value.key.CopyFrom(key.to_proto())
    for feature, value in slice_attributions[key].items():
      attribution_value = metrics_for_slice_pb2.MetricValue()
      if isinstance(value, six.binary_type):
        # Convert textual types to string metrics.
        attribution_value.bytes_value = value
      elif isinstance(value, six.text_type):
        # Convert textual types to string metrics.
        attribution_value.bytes_value = value.encode('utf8')
      elif isinstance(value, np.ndarray) and value.size != 1:
        # Convert NumPy arrays to ArrayValue (size-1 arrays fall through to
        # the scalar float path below).
        attribution_value.array_value.CopyFrom(_convert_to_array_value(value))
      else:
        # We try to convert to float values.
        try:
          attribution_value.double_value.value = float(value)
        except (TypeError, ValueError) as e:
          attribution_value.unknown_type.value = str(value)
          # BUG FIX: exceptions have no `.message` attribute on Python 3
          # (it would raise AttributeError here); use str(e) instead.
          attribution_value.unknown_type.error = str(e)
      key_and_value.values[feature].CopyFrom(attribution_value)
  return result
def convert_slice_plots_to_proto(
    plots: Tuple[slicer.SliceKeyOrCrossSliceKeyType, Dict[Any, Any]],
    add_metrics_callbacks: List[types.AddMetricsCallbackType]
) -> metrics_for_slice_pb2.PlotsForSlice:
  """Converts the given slice plots into PlotsForSlice proto.

  Args:
    plots: The slice plots.
    add_metrics_callbacks: A list of metric callbacks. This should be the same
      list as the one passed to tfma.Evaluate().

  Returns:
    The PlotsForSlice proto.

  Raises:
    NotImplementedError: If some plot values could not be converted into the
      proto (i.e. they remain in the working dict after all known plot proto
      types have been handled).
  """
  result = metrics_for_slice_pb2.PlotsForSlice()
  slice_key, slice_plots = plots
  # Serialize the slice key into the appropriate oneof field.
  if slicer.is_cross_slice_key(slice_key):
    result.cross_slice_key.CopyFrom(
        slicer.serialize_cross_slice_key(slice_key))
  else:
    result.slice_key.CopyFrom(slicer.serialize_slice_key(slice_key))

  # Work on a copy: entries are popped below as they are converted.
  slice_plots = slice_plots.copy()

  # An error entry short-circuits the conversion: record the error message and
  # return immediately without converting any other plots.
  if metric_keys.ERROR_METRIC in slice_plots:
    logging.warning('Error for slice: %s with error message: %s ', slice_key,
                    slice_plots[metric_keys.ERROR_METRIC])
    error_metric = slice_plots.pop(metric_keys.ERROR_METRIC)
    result.plots[metric_keys.ERROR_METRIC].debug_message = error_metric
    return result

  # Legacy path: when no keys are structured MetricKeys, let callbacks that
  # implement populate_plots_and_pop consume (pop) their entries directly.
  if add_metrics_callbacks and (not any(
      isinstance(k, metric_types.MetricKey) for k in slice_plots.keys())):
    for add_metrics_callback in add_metrics_callbacks:
      if hasattr(add_metrics_callback, 'populate_plots_and_pop'):
        add_metrics_callback.populate_plots_and_pop(slice_plots, result.plots)

  # Multiple plots under the same parent key (key with the plot name stripped)
  # are merged into a single proto value; this maps parent key -> that value.
  plots_by_key = {}
  for key in sorted(slice_plots.keys()):
    value = slice_plots[key]
    # Remove plot name from key (multiple plots are combined into a single
    # proto).
    if isinstance(key, metric_types.MetricKey):
      parent_key = key._replace(name=None)
    else:
      # Non-MetricKey entries are left in slice_plots and will trigger the
      # NotImplementedError below unless a callback popped them earlier.
      continue
    if parent_key not in plots_by_key:
      key_and_value = result.plot_keys_and_values.add()
      key_and_value.key.CopyFrom(parent_key.to_proto())
      plots_by_key[parent_key] = key_and_value.value

    # Dispatch on the concrete plot proto type; each recognized entry is
    # popped so only unconverted values remain at the end.
    if isinstance(value, metrics_for_slice_pb2.CalibrationHistogramBuckets):
      plots_by_key[parent_key].calibration_histogram_buckets.CopyFrom(value)
      slice_plots.pop(key)
    elif isinstance(value, metrics_for_slice_pb2.ConfusionMatrixAtThresholds):
      plots_by_key[parent_key].confusion_matrix_at_thresholds.CopyFrom(value)
      slice_plots.pop(key)
    elif isinstance(
        value, metrics_for_slice_pb2.MultiClassConfusionMatrixAtThresholds):
      plots_by_key[
          parent_key].multi_class_confusion_matrix_at_thresholds.CopyFrom(
              value)
      slice_plots.pop(key)
    elif isinstance(
        value, metrics_for_slice_pb2.MultiLabelConfusionMatrixAtThresholds):
      plots_by_key[
          parent_key].multi_label_confusion_matrix_at_thresholds.CopyFrom(
              value)
      slice_plots.pop(key)

  # Anything left over was neither popped by a callback nor a recognized plot
  # proto type: surface it loudly rather than dropping it silently.
  if slice_plots:
    if add_metrics_callbacks is None:
      add_metrics_callbacks = []
    raise NotImplementedError(
        'some plots were not converted or popped. keys: %s. '
        'add_metrics_callbacks were: %s' % (
            slice_plots.keys(),
            [
                x.name  # pytype: disable=attribute-error
                for x in add_metrics_callbacks
            ]))

  return result
def convert_slice_metrics_to_proto(
    metrics: Tuple[slicer.SliceKeyOrCrossSliceKeyType, Dict[Any, Any]],
    add_metrics_callbacks: List[types.AddMetricsCallbackType]
) -> metrics_for_slice_pb2.MetricsForSlice:
  """Converts the given slice metrics into serialized proto MetricsForSlice.

  NOTE(review): this definition is shadowed by a later redefinition of the
  same name in this file; only the later one is effective at import time.

  Args:
    metrics: The slice metrics.
    add_metrics_callbacks: A list of metric callbacks. This should be the same
      list as the one passed to tfma.Evaluate().

  Returns:
    The MetricsForSlice proto.

  Raises:
    TypeError: If the type of the feature value in slice key cannot be
      recognized.
  """
  result = metrics_for_slice_pb2.MetricsForSlice()
  slice_key, slice_metrics = metrics
  if slicer.is_cross_slice_key(slice_key):
    result.cross_slice_key.CopyFrom(
        slicer.serialize_cross_slice_key(slice_key))
  else:
    result.slice_key.CopyFrom(slicer.serialize_slice_key(slice_key))

  # Copy so callbacks below may pop entries without mutating the caller's dict.
  slice_metrics = slice_metrics.copy()

  # An error entry short-circuits the conversion: record the message and stop.
  if metric_keys.ERROR_METRIC in slice_metrics:
    logging.warning('Error for slice: %s with error message: %s ', slice_key,
                    slice_metrics[metric_keys.ERROR_METRIC])
    result.metrics[metric_keys.ERROR_METRIC].debug_message = slice_metrics[
        metric_keys.ERROR_METRIC]
    return result

  # Convert the metrics from add_metrics_callbacks to the structured output if
  # defined.
  if add_metrics_callbacks and (not any(
      isinstance(k, metric_types.MetricKey) for k in slice_metrics.keys())):
    for add_metrics_callback in add_metrics_callbacks:
      if hasattr(add_metrics_callback, 'populate_stats_and_pop'):
        add_metrics_callback.populate_stats_and_pop(slice_key, slice_metrics,
                                                    result.metrics)

  # Sort keys for a deterministic proto layout.
  for key in sorted(slice_metrics.keys()):
    value = slice_metrics[key]
    metric_value = metrics_for_slice_pb2.MetricValue()
    if isinstance(value, metrics_for_slice_pb2.ConfusionMatrixAtThresholds):
      metric_value.confusion_matrix_at_thresholds.CopyFrom(value)
    elif isinstance(
        value, metrics_for_slice_pb2.MultiClassConfusionMatrixAtThresholds):
      metric_value.multi_class_confusion_matrix_at_thresholds.CopyFrom(value)
    elif isinstance(value, types.ValueWithTDistribution):
      # Currently we populate both bounded_value and confidence_interval.
      # Avoid populating bounded_value once the UI handles confidence_interval.
      # Convert to a bounded value. 95% confidence level is computed here.
      _, lower_bound, upper_bound = (
          math_util.calculate_confidence_interval(value))
      metric_value.bounded_value.value.value = value.unsampled_value
      metric_value.bounded_value.lower_bound.value = lower_bound
      metric_value.bounded_value.upper_bound.value = upper_bound
      metric_value.bounded_value.methodology = (
          metrics_for_slice_pb2.BoundedValue.POISSON_BOOTSTRAP)
      # Populate confidence_interval
      metric_value.confidence_interval.lower_bound.value = lower_bound
      metric_value.confidence_interval.upper_bound.value = upper_bound
      t_dist_value = metrics_for_slice_pb2.TDistributionValue()
      t_dist_value.sample_mean.value = value.sample_mean
      t_dist_value.sample_standard_deviation.value = (
          value.sample_standard_deviation)
      t_dist_value.sample_degrees_of_freedom.value = (
          value.sample_degrees_of_freedom)
      # Once the UI handles confidence interval, we will avoid setting this and
      # instead use the double_value.
      t_dist_value.unsampled_value.value = value.unsampled_value
      metric_value.confidence_interval.t_distribution_value.CopyFrom(
          t_dist_value)
    elif isinstance(value, six.binary_type):
      # Convert textual types to string metrics.
      metric_value.bytes_value = value
    elif isinstance(value, six.text_type):
      # Convert textual types to string metrics.
      metric_value.bytes_value = value.encode('utf8')
    elif isinstance(value, np.ndarray):
      # Convert NumPy arrays to ArrayValue.
      metric_value.array_value.CopyFrom(_convert_to_array_value(value))
    else:
      # We try to convert to float values.
      try:
        metric_value.double_value.value = float(value)
      except (TypeError, ValueError) as e:
        metric_value.unknown_type.value = str(value)
        # BUG FIX: exceptions have no `.message` attribute on Python 3
        # (it would raise AttributeError here); use str(e) instead.
        metric_value.unknown_type.error = str(e)

    # Structured MetricKeys go into metric_keys_and_values; plain string keys
    # go into the legacy metrics map.
    if isinstance(key, metric_types.MetricKey):
      key_and_value = result.metric_keys_and_values.add()
      key_and_value.key.CopyFrom(key.to_proto())
      key_and_value.value.CopyFrom(metric_value)
    else:
      result.metrics[key].CopyFrom(metric_value)
  return result
def validate_metrics(
    sliced_metrics: Tuple[Union[slicer.SliceKeyType, slicer.CrossSliceKeyType],
                          Dict['metric_types.MetricKey', Any]],
    eval_config: config_pb2.EvalConfig
) -> validation_result_pb2.ValidationResult:
  """Check the metrics and check whether they should be validated."""
  # Find out which model is baseline.
  baseline_spec = model_util.get_baseline_model_spec(eval_config)
  baseline_model_name = baseline_spec.name if baseline_spec else None
  sliced_key, metrics = sliced_metrics
  # All thresholds configured across the metrics specs, keyed by metric key.
  thresholds = metric_specs.metric_thresholds_from_metrics_specs(
      eval_config.metrics_specs)
  is_cross_slice = slicer.is_cross_slice_key(sliced_key)

  def _check_threshold(key: metric_types.MetricKey, threshold: _ThresholdType,
                       metric: Any) -> bool:
    """Verify a metric given its metric key and metric value."""
    metric = float(metric)
    if isinstance(threshold, config_pb2.GenericValueThreshold):
      # Unset bounds default to +/- infinity, i.e. unconstrained.
      lower_bound, upper_bound = -np.inf, np.inf
      if threshold.HasField('lower_bound'):
        lower_bound = threshold.lower_bound.value
      if threshold.HasField('upper_bound'):
        upper_bound = threshold.upper_bound.value
      return metric >= lower_bound and metric <= upper_bound
    elif isinstance(threshold, config_pb2.GenericChangeThreshold):
      # For change thresholds the metric value is already the diff vs baseline.
      diff = metric
      metric_baseline = float(
          metrics[key.make_baseline_key(baseline_model_name)])
      if math.isclose(metric_baseline, 0.0):
        # Relative change is undefined against a zero baseline.
        ratio = float('nan')
      else:
        ratio = diff / metric_baseline
      # Defaults chosen so that an unset absolute/relative bound always passes
      # given the direction's comparison below.
      if threshold.direction == config_pb2.MetricDirection.LOWER_IS_BETTER:
        absolute, relative = np.inf, np.inf
      elif threshold.direction == config_pb2.MetricDirection.HIGHER_IS_BETTER:
        absolute, relative = -np.inf, -np.inf
      else:
        raise ValueError('"UNKNOWN" direction for change threshold: {}.'.format(
            threshold))
      if threshold.HasField('absolute'):
        absolute = threshold.absolute.value
      if threshold.HasField('relative'):
        relative = threshold.relative.value
      if threshold.direction == config_pb2.MetricDirection.LOWER_IS_BETTER:
        return diff <= absolute and ratio <= relative
      elif threshold.direction == config_pb2.MetricDirection.HIGHER_IS_BETTER:
        return diff >= absolute and ratio >= relative
    else:
      raise ValueError('Unknown threshold: {}'.format(threshold))

  def _copy_metric(metric, to):
    # Will add more types when more MetricValue are supported.
    to.double_value.value = float(metric)

  def _copy_threshold(threshold, to):
    if isinstance(threshold, config_pb2.GenericValueThreshold):
      to.value_threshold.CopyFrom(threshold)
    if isinstance(threshold, config_pb2.GenericChangeThreshold):
      to.change_threshold.CopyFrom(threshold)

  def _add_to_set(s, v):
    """Adds value to set. Returns true if didn't exist."""
    if v in s:
      return False
    else:
      s.add(v)
      return True

  # Empty metrics per slice is considered validated.
  result = validation_result_pb2.ValidationResult(validation_ok=True)
  validation_for_slice = validation_result_pb2.MetricsValidationForSlice()
  # Thresholds whose metric never appears in `metrics` are tracked here and
  # reported as "Metric not found." failures below.
  unchecked_thresholds = dict(thresholds)
  for metric_key, metric in metrics.items():
    if metric_key not in thresholds:
      continue
    del unchecked_thresholds[metric_key]
    # Not meaningful to check threshold for baseline model, thus always return
    # True if such threshold is configured. We also do not compare Message type
    # metrics.
    if metric_key.model_name == baseline_model_name:
      continue
    msg = ''
    # Deduplicates identical thresholds matched via multiple slice specs.
    existing_failures = set()
    for slice_spec, threshold in thresholds[metric_key]:
      if slice_spec is not None:
        # Skip thresholds whose slice spec does not apply to this slice key
        # (regular specs never apply to cross slices and vice versa).
        if (isinstance(slice_spec, config_pb2.SlicingSpec) and
            (is_cross_slice or not slicer.SingleSliceSpec(
                spec=slice_spec).is_slice_applicable(sliced_key))):
          continue
        if (isinstance(slice_spec, config_pb2.CrossSlicingSpec) and
            (not is_cross_slice or not slicer.is_cross_slice_applicable(
                cross_slice_key=sliced_key, cross_slicing_spec=slice_spec))):
          continue
      elif is_cross_slice:
        # Thresholds without a slice spec only apply to regular slices.
        continue
      try:
        check_result = _check_threshold(metric_key, threshold, metric)
      except ValueError:
        msg = """ Invalid metrics or threshold for comparison: The type of the metric is: {}, the metric value is: {}, and the threshold is: {}.
          """.format(type(metric), metric, threshold)
        check_result = False
      else:
        msg = ''
      if not check_result:
        # The same threshold values could be set for multiple matching slice
        # specs. Only store the first match.
        #
        # Note that hashing by SerializeToString() is only safe if used within
        # the same process.
        if not _add_to_set(existing_failures, threshold.SerializeToString()):
          continue
        failure = validation_for_slice.failures.add()
        failure.metric_key.CopyFrom(metric_key.to_proto())
        _copy_metric(metric, failure.metric_value)
        _copy_threshold(threshold, failure.metric_threshold)
        failure.message = msg
      # Track we have completed a validation check for slice spec and metric
      slicing_details = result.validation_details.slicing_details.add()
      if slice_spec is not None:
        if isinstance(slice_spec, config_pb2.SlicingSpec):
          slicing_details.slicing_spec.CopyFrom(slice_spec)
        else:
          slicing_details.cross_slicing_spec.CopyFrom(slice_spec)
      else:
        slicing_details.slicing_spec.CopyFrom(config_pb2.SlicingSpec())
      slicing_details.num_matching_slices = 1
  # All unchecked thresholds are considered failures.
  # NOTE: `thresholds` is rebound as the loop variable here; the original
  # mapping is no longer needed at this point.
  for metric_key, thresholds in unchecked_thresholds.items():
    if metric_key.model_name == baseline_model_name:
      continue
    existing_failures = set()
    for slice_spec, threshold in thresholds:
      if slice_spec is not None:
        # Only report a missing metric for slice specs that actually apply to
        # this slice key.
        if is_cross_slice != isinstance(slice_spec,
                                        config_pb2.CrossSlicingSpec):
          continue
        if (is_cross_slice and not slicer.is_cross_slice_applicable(
            cross_slice_key=sliced_key, cross_slicing_spec=slice_spec)):
          continue
      elif is_cross_slice:
        continue
      # The same threshold values could be set for multiple matching slice
      # specs. Only store the first match.
      #
      # Note that hashing by SerializeToString() is only safe if used within
      # the same process.
      if not _add_to_set(existing_failures, threshold.SerializeToString()):
        continue
      failure = validation_for_slice.failures.add()
      failure.metric_key.CopyFrom(metric_key.to_proto())
      _copy_threshold(threshold, failure.metric_threshold)
      failure.message = 'Metric not found.'
  # Any failure leads to overall failure.
  if validation_for_slice.failures:
    if not is_cross_slice:
      validation_for_slice.slice_key.CopyFrom(
          slicer.serialize_slice_key(sliced_key))
    else:
      validation_for_slice.cross_slice_key.CopyFrom(
          slicer.serialize_cross_slice_key(sliced_key))
    result.validation_ok = False
    result.metric_validations_per_slice.append(validation_for_slice)
  return result
def convert_slice_metrics_to_proto(
    metrics: Tuple[slicer.SliceKeyOrCrossSliceKeyType, Dict[Any, Any]],
    add_metrics_callbacks: List[types.AddMetricsCallbackType]
) -> metrics_for_slice_pb2.MetricsForSlice:
  """Converts the given slice metrics into serialized proto MetricsForSlice.

  Args:
    metrics: The slice metrics.
    add_metrics_callbacks: A list of metric callbacks. This should be the same
      list as the one passed to tfma.Evaluate().

  Returns:
    The MetricsForSlice proto.

  Raises:
    TypeError: If the type of the feature value in slice key cannot be
      recognized.
  """
  result = metrics_for_slice_pb2.MetricsForSlice()
  slice_key, slice_metrics = metrics

  # Record the slice key in the matching oneof field.
  if slicer.is_cross_slice_key(slice_key):
    result.cross_slice_key.CopyFrom(
        slicer.serialize_cross_slice_key(slice_key))
  else:
    result.slice_key.CopyFrom(slicer.serialize_slice_key(slice_key))

  # Work on a copy so callbacks can pop entries without touching the caller's
  # dict.
  slice_metrics = slice_metrics.copy()

  # An error entry short-circuits the conversion: log it, record the debug
  # message, and return without converting anything else.
  if metric_keys.ERROR_METRIC in slice_metrics:
    logging.warning('Error for slice: %s with error message: %s ', slice_key,
                    slice_metrics[metric_keys.ERROR_METRIC])
    result.metrics[metric_keys.ERROR_METRIC].debug_message = slice_metrics[
        metric_keys.ERROR_METRIC]
    return result

  # Convert the metrics from add_metrics_callbacks to the structured output if
  # defined.
  has_metric_keys = any(
      isinstance(k, metric_types.MetricKey) for k in slice_metrics.keys())
  if add_metrics_callbacks and not has_metric_keys:
    for callback in add_metrics_callbacks:
      if hasattr(callback, 'populate_stats_and_pop'):
        callback.populate_stats_and_pop(slice_key, slice_metrics,
                                        result.metrics)

  # Deterministic output order.
  for key in sorted(slice_metrics):
    raw_value = slice_metrics[key]
    if not isinstance(raw_value, types.ValueWithTDistribution):
      proto_value = convert_metric_value_to_proto(raw_value)
      interval = None
    else:
      # Point estimate plus a 95% confidence interval derived from the
      # t-distribution statistics.
      point_estimate = raw_value.unsampled_value
      _, lo, hi = math_util.calculate_confidence_interval(raw_value)
      interval = metrics_for_slice_pb2.ConfidenceInterval(
          lower_bound=convert_metric_value_to_proto(lo),
          upper_bound=convert_metric_value_to_proto(hi),
          standard_error=convert_metric_value_to_proto(
              raw_value.sample_standard_deviation),
          degrees_of_freedom={'value': raw_value.sample_degrees_of_freedom})
      proto_value = convert_metric_value_to_proto(point_estimate)
      # If metric can be stored to double_value metrics, replace it with a
      # bounded_value for backwards compatibility.
      # TODO(b/188575688): remove this logic to stop populating bounded_value
      if proto_value.WhichOneof('type') == 'double_value':
        # setting bounded_value clears double_value in the same oneof scope.
        proto_value.bounded_value.value.value = point_estimate
        proto_value.bounded_value.lower_bound.value = lo
        proto_value.bounded_value.upper_bound.value = hi
        proto_value.bounded_value.methodology = (
            metrics_for_slice_pb2.BoundedValue.POISSON_BOOTSTRAP)

    # Structured MetricKeys go into metric_keys_and_values; anything else goes
    # into the legacy metrics map.
    if isinstance(key, metric_types.MetricKey):
      result.metric_keys_and_values.add(
          key=key.to_proto(),
          value=proto_value,
          confidence_interval=interval)
    else:
      result.metrics[key].CopyFrom(proto_value)
  return result