def _total_attributions(
    absolute: bool = True,
    name: Text = '',
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
) -> metric_types.MetricComputations:
    """Returns computations exposing total attributions under a public key.

    The totals come from `_total_attributions_computations`; this function only
    appends a derived computation that copies that result to an
    `AttributionsKey` built from the caller-supplied names.

    Args:
      absolute: Forwarded unchanged to `_total_attributions_computations`.
      name: Name used for the public attributions key.
      model_name: Model name used for the key and the shared computation.
      output_name: Output name used for the key and the shared computation.
      sub_key: Optional sub key used for the key and the shared computation.

    Returns:
      The shared computations followed by the derived re-keying computation.
    """
    public_key = metric_types.AttributionsKey(
        name=name,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key)

    # The last key of the last shared computation is where the totals live.
    pipeline = _total_attributions_computations(
        absolute=absolute,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
    )
    source_key = pipeline[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.AttributionsKey, Dict[Text, Union[float,
                                                             np.ndarray]]]:
        """Copies the privately keyed totals to the public key."""
        totals = metrics[source_key]
        return {public_key: totals}

    pipeline.append(
        metric_types.DerivedMetricComputation(keys=[public_key], result=result))
    return pipeline
def _confusion_matrix_at_thresholds(
    thresholds: List[float],
    name: Text = CONFUSION_MATRIX_AT_THRESHOLDS_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for confusion matrix at thresholds.

    Args:
      thresholds: Thresholds the binary confusion matrices are computed at; the
        same list is handed to `to_proto` together with the matrices.
      name: Name of the resulting metric key.
      eval_config: Eval config forwarded to the binary confusion matrices.
      model_name: Optional model name.
      output_name: Optional output name.
      sub_key: Optional sub key.

    Returns:
      The binary confusion matrix computations followed by a derived
      computation that converts the matrices with `to_proto`.
    """
    key = metric_types.MetricKey(
        name=name,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key)

    # Make sure matrices are calculated.
    computations = binary_confusion_matrices.binary_confusion_matrices(
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        thresholds=thresholds)
    matrices_key = computations[-1].keys[-1]

    # The hints used to be swapped (proto on the input side, Any on the output
    # side). The input dict holds whatever the matrices computation produced;
    # the output is assumed to be the proto built by `to_proto`, matching how
    # the sibling plot functions annotate the same conversion.
    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey,
              metrics_for_slice_pb2.ConfusionMatrixAtThresholds]:
        """Converts the computed matrices into their proto form."""
        return {key: to_proto(thresholds, metrics[matrices_key])}

    computations.append(
        metric_types.DerivedMetricComputation(keys=[key], result=result))
    return computations
def _calibration_plot(
    num_buckets: int = DEFAULT_NUM_BUCKETS,
    left: Optional[float] = None,
    right: Optional[float] = None,
    name: Text = CALIBRATION_PLOT_NAME,
    eval_config: Optional[config_pb2.EvalConfig] = None,
    schema: Optional[schema_pb2.Schema] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for the calibration plot.

    Args:
      num_buckets: Number of equal-width buckets between `left` and `right`.
      left: Lower bound of the plot range. When None it is taken from
        `_find_label_domain` (if an eval config and schema are given and a
        value is found), otherwise 0.0.
      right: Upper bound of the plot range. When None it is taken from
        `_find_label_domain` under the same conditions, otherwise 1.0.
      name: Name of the resulting plot key.
      eval_config: Eval config.
      schema: Schema passed to `_find_label_domain`.
      model_name: Optional model name.
      output_name: Optional output name.
      sub_key: Optional sub key.
      aggregation_type: Forwarded to the calibration histogram.
      class_weights: Forwarded to the calibration histogram.

    Returns:
      The calibration histogram computations followed by a derived computation
      that rebins the histogram and converts it with `_to_proto`.
    """
    plot_key = metric_types.PlotKey(
        name=name,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key)

    # Only consult the schema when a bound is actually missing.
    bound_missing = left is None or right is None
    if bound_missing and eval_config and schema:
        domain_left, domain_right = _find_label_domain(
            eval_config, schema, model_name, output_name)
    else:
        domain_left, domain_right = None, None
    if left is None:
        left = 0.0 if domain_left is None else domain_left
    if right is None:
        right = 1.0 if domain_right is None else domain_right

    # The histogram keeps its own default bucket count rather than
    # `num_buckets`, in case the computation is shared with other metrics and
    # plots that need higher precision. It is downsampled in `result`.
    pipeline = calibration_histogram.calibration_histogram(
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        left=left,
        right=right,
        aggregation_type=aggregation_type,
        class_weights=class_weights)
    histogram_key = pipeline[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Any]:
        """Rebins the shared histogram onto the plot's bucket boundaries."""
        boundaries = [float('-inf')]
        for i in range(num_buckets + 1):
            boundaries.append(left + i * (right - left) / num_buckets)
        rebinned = calibration_histogram.rebin(
            boundaries, metrics[histogram_key], left=left, right=right)
        return {plot_key: _to_proto(boundaries, rebinned)}

    pipeline.append(
        metric_types.DerivedMetricComputation(keys=[plot_key], result=result))
    return pipeline
def _confusion_matrix_plot(
    num_thresholds: int = DEFAULT_NUM_THRESHOLDS,
    name: Text = CONFUSION_MATRIX_PLOT_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for confusion matrix plots.

    Args:
      num_thresholds: Number of equal steps the thresholds are spread over;
        `num_thresholds + 2` thresholds are produced in total.
      name: Name of the resulting plot key; also used as a suffix of the
        binary confusion matrices name.
      eval_config: Eval config.
      model_name: Optional model name.
      output_name: Optional output name.
      sub_key: Optional sub key.
      aggregation_type: Forwarded to the binary confusion matrices.
      class_weights: Forwarded to the binary confusion matrices.

    Returns:
      The binary confusion matrix computations followed by a derived
      computation that converts the matrices to a proto.
    """
    plot_key = metric_types.PlotKey(
        name=name,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key)

    # The interpolation strategy used here matches how the legacy post export
    # metrics calculated its plots: a slightly negative threshold followed by
    # evenly spaced ones.
    thresholds = [-1e-6]
    for i in range(0, num_thresholds + 1):
        thresholds.append(i * 1.0 / num_thresholds)

    # A custom name is used because the custom interpolation strategy would
    # otherwise make the default binary confusion matrix naming very long.
    matrices_name = (
        binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME + '_' + name)
    pipeline = binary_confusion_matrices.binary_confusion_matrices(
        name=matrices_name,
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        aggregation_type=aggregation_type,
        class_weights=class_weights,
        thresholds=thresholds)
    matrices_key = pipeline[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey,
              metrics_for_slice_pb2.ConfusionMatrixAtThresholds]:
        """Converts the computed matrices into the plot proto."""
        matrices = metrics[matrices_key]
        return {plot_key: confusion_matrix_metrics.to_proto(thresholds, matrices)}

    pipeline.append(
        metric_types.DerivedMetricComputation(keys=[plot_key], result=result))
    return pipeline
def _mean_attributions(
    absolute: bool = True,
    name: str = MEAN_ATTRIBUTIONS_NAME,
    eval_config: Optional[config_pb2.EvalConfig] = None,
    model_name: str = '',
    output_name: str = '',
    sub_key: Optional[metric_types.SubKey] = None,
    example_weighted: bool = False,
) -> metric_types.MetricComputations:
    """Returns metric computations for mean attributions.

    The mean is the total attribution of each entry divided by the example
    count; both inputs come from shared computations added to the result.

    Args:
      absolute: Forwarded unchanged to `_total_attributions_computations`.
      name: Name of the resulting attributions key.
      eval_config: Eval config.
      model_name: Optional model name.
      output_name: Optional output name.
      sub_key: Optional sub key.
      example_weighted: Forwarded to both shared computations and the key.

    Returns:
      The total attributions computations, the example count computations and
      a derived computation producing the per-entry means.
    """
    key = metric_types.AttributionsKey(
        name=name,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        example_weighted=example_weighted)

    # Make sure total_attributions is calculated.
    computations = _total_attributions_computations(
        absolute=absolute,
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        example_weighted=example_weighted)
    total_attributions_key = computations[-1].keys[-1]

    # Make sure example_count is calculated.
    computations.extend(
        example_count.example_count(
            model_names=[model_name],
            output_names=[output_name],
            sub_keys=[sub_key],
            example_weighted=example_weighted))
    example_count_key = computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.AttributionsKey, Dict[str, Union[float,
                                                            np.ndarray]]]:
        """Returns mean attributions (NaN for every entry when count is ~0)."""
        total_attributions = metrics[total_attributions_key]
        count = metrics[example_count_key]
        # The zero-count test does not depend on the entry, so evaluate it once
        # instead of once per attribution.
        if np.isclose(count, 0.0):
            attributions = {k: float('nan') for k in total_attributions}
        else:
            attributions = {k: v / count for k, v in total_attributions.items()}
        return {key: attributions}

    computations.append(
        metric_types.DerivedMetricComputation(keys=[key], result=result))
    return computations
def _coefficient_of_discrimination(
    name: str = COEFFICIENT_OF_DISCRIMINATION_NAME,
    eval_config: Optional[config_pb2.EvalConfig] = None,
    model_name: str = '',
    output_name: str = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None,
    example_weighted: bool = False) -> metric_types.MetricComputations:
    """Returns metric computations for coefficient of discrimination.

    The value is the average positive prediction minus the average negative
    prediction, or NaN when either label total is zero.

    Args:
      name: Name of the resulting metric key.
      eval_config: Eval config.
      model_name: Optional model name.
      output_name: Optional output name.
      sub_key: Optional sub key.
      aggregation_type: Forwarded to the shared Tjur discrimination computation.
      class_weights: Forwarded to the shared Tjur discrimination computation.
      example_weighted: Forwarded to the shared computation and the key.

    Returns:
      The shared Tjur discrimination computations followed by the derived
      computation for this metric.
    """
    key = metric_types.MetricKey(
        name=name,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        example_weighted=example_weighted)

    # Compute shared Tjur discrimination metrics. `sub_key` must be forwarded:
    # the public key above is sub-keyed, so the underlying totals have to be
    # computed for the same sub key.
    # NOTE(review): assumes `_tjur_discrimination` accepts `sub_key`, as the
    # other shared helpers in this file do - confirm against its signature.
    computations = _tjur_discrimination(
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        aggregation_type=aggregation_type,
        class_weights=class_weights,
        example_weighted=example_weighted)
    # Shared metrics are based on a single computation and key.
    tjur_discrimination_key = computations[0].keys[0]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, float]:
        """Returns coefficient of discrimination."""
        metric = metrics[tjur_discrimination_key]
        if (metric.total_negative_weighted_labels == 0 or
                metric.total_positive_weighted_labels == 0):
            # Either average would divide by zero.
            value = float('nan')
        else:
            avg_pos_label = (
                metric.total_positive_weighted_predictions /
                metric.total_positive_weighted_labels)
            avg_neg_label = (
                metric.total_negative_weighted_predictions /
                metric.total_negative_weighted_labels)
            value = avg_pos_label - avg_neg_label
        return {key: value}

    computations.append(
        metric_types.DerivedMetricComputation(keys=[key], result=result))
    return computations
def macro_average(
    metric_name: Text,
    sub_keys: List[metric_types.SubKey],
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for the macro average of a metric.

    Args:
      metric_name: Name of the underlying metric being averaged.
      sub_keys: Sub keys whose metric values are averaged.
      eval_config: Eval config (unused).
      model_name: Optional model name.
      output_name: Optional output name.
      class_weights: Optional class weights to apply. A sub key contributes
        with weight 1.0 when its class_id is unset or not in the dictionary.

    Returns:
      A single derived computation producing the (weighted) macro average.
    """
    del eval_config  # Unused.
    average_key = metric_types.MetricKey(
        name=metric_name, model_name=model_name, output_name=output_name)

    def result(
        metrics: Dict[metric_types.MetricKey, float]
    ) -> Dict[metric_types.MetricKey, float]:
        """Averages the per-sub-key values; NaN if the weights sum to zero."""
        weighted_sum = 0.0
        weight_sum = 0.0
        for sub_key in sub_keys:
            child_key = metric_types.MetricKey(
                name=metric_name,
                model_name=model_name,
                output_name=output_name,
                sub_key=sub_key)
            weight = 1.0
            if class_weights:
                child_sub_key = child_key.sub_key
                class_id = (
                    None if child_sub_key is None else child_sub_key.class_id)
                if class_id is not None and class_id in class_weights:
                    weight = class_weights[class_id]
            weighted_sum += _to_float(metrics[child_key]) * weight
            weight_sum += weight
        if weight_sum:
            average = weighted_sum / weight_sum
        else:
            average = float('nan')
        return {average_key: average}

    return [
        metric_types.DerivedMetricComputation(keys=[average_key], result=result)
    ]
def _metric_computation(
    self,
    thresholds: Optional[List[float]] = None,
    name: Text = '',
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns computations deriving this metric from binary confusion matrices.

    The value at each threshold is `self.result(tp, tn, fp, fn)` applied to the
    confusion matrix entries at that threshold.

    Args:
      thresholds: Thresholds to evaluate at; defaults to [0.5] when empty or
        None.
      name: Name of the resulting metric key.
      eval_config: Eval config.
      model_name: Optional model name.
      output_name: Optional output name.
      sub_key: Optional sub key.
      aggregation_type: Forwarded to the binary confusion matrices.
      class_weights: Forwarded to the binary confusion matrices.

    Returns:
      The binary confusion matrix computations followed by the derived
      computation. The derived value is a scalar for a single threshold and a
      numpy array otherwise.
    """
    metric_key = metric_types.MetricKey(
        name=name,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key)
    if not thresholds:
        thresholds = [0.5]

    # Make sure matrices are calculated.
    pipeline = binary_confusion_matrices.binary_confusion_matrices(
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        aggregation_type=aggregation_type,
        class_weights=class_weights,
        thresholds=thresholds)
    matrices_key = pipeline[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Union[float, np.ndarray]]:
        """Evaluates `self.result` once per threshold."""
        matrices = metrics[matrices_key]
        per_threshold = [
            self.result(matrices.tp[i], matrices.tn[i], matrices.fp[i],
                        matrices.fn[i]) for i in range(len(thresholds))
        ]
        if len(thresholds) == 1:
            return {metric_key: per_threshold[0]}
        return {metric_key: np.array(per_threshold)}

    pipeline.append(
        metric_types.DerivedMetricComputation(keys=[metric_key], result=result))
    return pipeline
def _auc_plot(
    num_thresholds: int = DEFAULT_NUM_THRESHOLDS,
    name: Text = AUC_PLOT_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for AUC plots.

    Args:
      num_thresholds: Number of equal steps the thresholds are spread over;
        `num_thresholds + 2` thresholds are produced in total.
      name: Name of the resulting plot key.
      eval_config: Eval config.
      model_name: Optional model name.
      output_name: Optional output name.
      sub_key: Optional sub key.
      class_weights: Forwarded to the binary confusion matrices.

    Returns:
      The binary confusion matrix computations followed by a derived
      computation that converts the matrices to a proto.
    """
    plot_key = metric_types.PlotKey(
        name=name,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key)

    # The interpolation strategy used here matches how the legacy post export
    # metrics calculated its plots: a slightly negative threshold followed by
    # evenly spaced ones.
    thresholds = [-1e-6]
    thresholds.extend(
        i * 1.0 / num_thresholds for i in range(0, num_thresholds + 1))

    # Make sure matrices are calculated.
    pipeline = binary_confusion_matrices.binary_confusion_matrices(
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        class_weights=class_weights,
        thresholds=thresholds)
    matrices_key = pipeline[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey,
              metrics_for_slice_pb2.ConfusionMatrixAtThresholds]:
        """Converts the computed matrices into the plot proto."""
        matrices = metrics[matrices_key]
        return {
            plot_key: confusion_matrix_at_thresholds.to_proto(
                thresholds, matrices)
        }

    pipeline.append(
        metric_types.DerivedMetricComputation(keys=[plot_key], result=result))
    return pipeline
def output_average(
    metric_name: str,
    output_weights: Dict[str, float],
    eval_config: Optional[config_pb2.EvalConfig] = None,
    model_name: str = '',
    sub_key: Optional[metric_types.SubKey] = None,
    example_weighted: bool = False) -> metric_types.MetricComputations:
    """Returns metric computations for the output average of a metric.

    Args:
      metric_name: Name of the underlying metric being averaged.
      output_weights: Mapping from output name to the weight of that output.
      eval_config: Eval config (unused).
      model_name: Optional model name.
      sub_key: Optional sub key associated with the metric (e.g. top_k).
      example_weighted: True if example weights should be applied.

    Returns:
      A single derived computation producing the weighted output average.
    """
    del eval_config  # Unused.
    average_key = metric_types.MetricKey(
        name=metric_name,
        model_name=model_name,
        sub_key=sub_key,
        example_weighted=example_weighted)

    def result(
        metrics: Dict[metric_types.MetricKey, float]
    ) -> Dict[metric_types.MetricKey, float]:
        """Averages the per-output values; NaN if the weights sum to zero."""
        weighted_sum = 0.0
        weight_sum = 0.0
        for output_name in output_weights:
            output_weight = output_weights[output_name]
            child_key = metric_types.MetricKey(
                name=metric_name,
                model_name=model_name,
                output_name=output_name,
                sub_key=sub_key,
                example_weighted=example_weighted)
            weighted_sum += _to_float(metrics[child_key]) * output_weight
            weight_sum += output_weight
        if weight_sum:
            average = weighted_sum / weight_sum
        else:
            average = float('nan')
        return {average_key: average}

    return [
        metric_types.DerivedMetricComputation(keys=[average_key], result=result)
    ]
def _calibration_plot(
    num_buckets: int = DEFAULT_NUM_BUCKETS,
    left: float = 0.0,
    right: float = 1.0,
    name: Text = CALIBRATION_PLOT_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for the calibration plot.

    Args:
      num_buckets: Number of equal-width buckets between `left` and `right`.
      left: Lower bound of the plot range.
      right: Upper bound of the plot range.
      name: Name of the resulting plot key.
      eval_config: Eval config.
      model_name: Optional model name.
      output_name: Optional output name.
      sub_key: Optional sub key.

    Returns:
      The calibration histogram computations followed by a derived computation
      that rebins the histogram and converts it with `_to_proto`.
    """
    plot_key = metric_types.PlotKey(
        name=name,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key)

    # The histogram keeps its own default bucket count rather than
    # `num_buckets`, in case the computation is shared with other metrics and
    # plots that need higher precision. It is downsampled in `result`.
    pipeline = calibration_histogram.calibration_histogram(
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        left=left,
        right=right)
    histogram_key = pipeline[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Any]:
        """Rebins the shared histogram onto the plot's bucket boundaries."""
        boundaries = [float('-inf')]
        for i in range(num_buckets + 1):
            boundaries.append(left + i * (right - left) / num_buckets)
        rebinned = calibration_histogram.rebin(
            boundaries, metrics[histogram_key], left=left, right=right)
        return {plot_key: _to_proto(boundaries, rebinned)}

    pipeline.append(
        metric_types.DerivedMetricComputation(keys=[plot_key], result=result))
    return pipeline
def _multi_class_confusion_matrix_plot(
    thresholds: Optional[List[float]] = None,
    num_thresholds: Optional[int] = None,
    name: str = MULTI_CLASS_CONFUSION_MATRIX_PLOT_NAME,
    eval_config: Optional[config_pb2.EvalConfig] = None,
    model_name: str = '',
    output_name: str = '',
    example_weighted: bool = False) -> metric_types.MetricComputations:
    """Returns computations for the multi-class confusion matrix plot.

    Args:
      thresholds: Explicit thresholds. Defaults to [0.0] when neither
        `thresholds` nor `num_thresholds` is given.
      num_thresholds: Number of thresholds, forwarded to the matrices
        computation.
      name: Name of the resulting plot key.
      eval_config: Eval config.
      model_name: Optional model name.
      output_name: Optional output name.
      example_weighted: Forwarded to the matrices computation and the key.

    Returns:
      The multi-class confusion matrices computations followed by a derived
      computation that extracts the plot proto.
    """
    nothing_configured = thresholds is None and num_thresholds is None
    if nothing_configured:
        thresholds = [0.0]

    plot_key = metric_types.PlotKey(
        name=name,
        model_name=model_name,
        output_name=output_name,
        example_weighted=example_weighted)

    # Make sure matrices are calculated.
    pipeline = multi_class_confusion_matrix_metrics.multi_class_confusion_matrices(
        thresholds=thresholds,
        num_thresholds=num_thresholds,
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        example_weighted=example_weighted)
    matrices_key = pipeline[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey,
                      multi_class_confusion_matrix_metrics.Matrices]
    ) -> Dict[metric_types.PlotKey,
              metrics_for_slice_pb2.MultiClassConfusionMatrixAtThresholds]:
        """Pulls the at-thresholds field out of the matrices' proto form."""
        as_proto = metrics[matrices_key].to_proto()
        return {plot_key: as_proto.multi_class_confusion_matrix_at_thresholds}

    pipeline.append(
        metric_types.DerivedMetricComputation(keys=[plot_key], result=result))
    return pipeline
def _calibration(
    name: str = CALIBRATION_NAME,
    eval_config: Optional[config_pb2.EvalConfig] = None,
    model_name: str = '',
    output_name: str = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None,
    example_weighted: bool = False) -> metric_types.MetricComputations:
    """Returns metric computations for calibration.

    Calibration is total weighted predictions divided by total weighted labels,
    or NaN when the label total is close to zero.

    Args:
      name: Name of the resulting metric key.
      eval_config: Eval config.
      model_name: Optional model name.
      output_name: Optional output name.
      sub_key: Optional sub key.
      aggregation_type: Forwarded to the shared computation.
      class_weights: Forwarded to the shared computation.
      example_weighted: Forwarded to the shared computation and the key.

    Returns:
      The shared weighted labels/predictions/examples computations followed by
      the derived calibration computation.
    """
    metric_key = metric_types.MetricKey(
        name=name,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        example_weighted=example_weighted)

    # Make sure weighted_labels_predictions_examples are calculated.
    pipeline = _weighted_labels_predictions_examples(
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        aggregation_type=aggregation_type,
        class_weights=class_weights,
        example_weighted=example_weighted)
    totals_key = pipeline[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Any]:
        """Returns calibration."""
        totals = metrics[totals_key]
        if np.isclose(totals.total_weighted_labels, 0.0):
            return {metric_key: float('nan')}
        ratio = totals.total_weighted_predictions / totals.total_weighted_labels
        return {metric_key: ratio}

    pipeline.append(
        metric_types.DerivedMetricComputation(keys=[metric_key], result=result))
    return pipeline
def _relative_coefficient_of_discrimination(
    name: Text = RELATIVE_COEFFICIENT_OF_DISCRIMINATION_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for relative coefficient of discrimination.

    The value is the average positive prediction divided by the average
    negative prediction, or NaN when any of the involved denominators is zero.

    Args:
      name: Name of the resulting metric key.
      eval_config: Eval config.
      model_name: Optional model name.
      output_name: Optional output name.
      class_weights: Optional class weights, forwarded to the shared Tjur
        discrimination computation. Annotated as class id -> weight, matching
        the other metrics in this file (the hint was previously reversed).

    Returns:
      The shared Tjur discrimination computations followed by the derived
      computation for this metric.
    """
    key = metric_types.MetricKey(
        name=name, model_name=model_name, output_name=output_name)

    # Compute shared Tjur discrimination metrics.
    computations = _tjur_discrimination(
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        class_weights=class_weights)
    # Shared metrics are based on a single computation and key.
    tjur_discrimination_key = computations[0].keys[0]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, float]:
        """Returns relative coefficient of discrimination."""
        metric = metrics[tjur_discrimination_key]
        # The first two checks guard the per-class averages; the third guards
        # the final division by the negative average.
        if (metric.total_negative_weighted_labels == 0 or
                metric.total_positive_weighted_labels == 0 or
                metric.total_negative_weighted_predictions == 0):
            value = float('nan')
        else:
            avg_pos_label = (
                metric.total_positive_weighted_predictions /
                metric.total_positive_weighted_labels)
            avg_neg_label = (
                metric.total_negative_weighted_predictions /
                metric.total_negative_weighted_labels)
            value = avg_pos_label / avg_neg_label
        return {key: value}

    computations.append(
        metric_types.DerivedMetricComputation(keys=[key], result=result))
    return computations
def _mean_label(
    name: Text = MEAN_LABEL_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for mean label.

    Mean label is total weighted labels divided by total weighted examples, or
    NaN when the example total is close to zero.

    Args:
      name: Name of the resulting metric key.
      eval_config: Eval config.
      model_name: Optional model name.
      output_name: Optional output name.
      sub_key: Optional sub key.
      aggregation_type: Forwarded to the shared computation.
      class_weights: Forwarded to the shared computation.

    Returns:
      The shared weighted labels/predictions/examples computations followed by
      the derived mean label computation.
    """
    metric_key = metric_types.MetricKey(
        name=name,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key)

    # Make sure weighted_labels_predictions_examples are calculated.
    pipeline = _weighted_labels_predictions_examples(
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        aggregation_type=aggregation_type,
        class_weights=class_weights)
    totals_key = pipeline[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Any]:
        """Returns mean label."""
        totals = metrics[totals_key]
        if np.isclose(totals.total_weighted_examples, 0.0):
            return {metric_key: float('nan')}
        mean = totals.total_weighted_labels / totals.total_weighted_examples
        return {metric_key: mean}

    pipeline.append(
        metric_types.DerivedMetricComputation(keys=[metric_key], result=result))
    return pipeline
def _multi_class_confusion_matrix_at_thresholds(
    thresholds: Optional[List[float]] = None,
    name: Text = MULTI_CLASS_CONFUSION_MATRIX_AT_THRESHOLDS_NAME,
    eval_config: Optional[config_pb2.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
) -> metric_types.MetricComputations:
    """Returns computations for multi-class confusion matrix at thresholds.

    Args:
      thresholds: Thresholds to evaluate at; defaults to [0.5] when empty or
        None.
      name: Name of the resulting metric key.
      eval_config: Eval config.
      model_name: Optional model name.
      output_name: Optional output name.

    Returns:
      The multi-class confusion matrices computations followed by a derived
      computation that republishes the matrices under the public key.
    """
    if not thresholds:
        thresholds = [0.5]

    metric_key = metric_types.MetricKey(
        name=name, model_name=model_name, output_name=output_name)

    # Make sure matrices are calculated.
    pipeline = multi_class_confusion_matrices(
        thresholds=thresholds,
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name)
    matrices_key = pipeline[-1].keys[-1]

    def result(
        metrics: Dict[
            metric_types.MetricKey,
            metrics_for_slice_pb2.MultiClassConfusionMatrixAtThresholds]
    ) -> Dict[metric_types.MetricKey,
              metrics_for_slice_pb2.MultiClassConfusionMatrixAtThresholds]:
        """Passes the computed matrices through unchanged under the new key."""
        matrices = metrics[matrices_key]
        return {metric_key: matrices}

    pipeline.append(
        metric_types.DerivedMetricComputation(keys=[metric_key], result=result))
    return pipeline
def _total_attributions(
    absolute: bool = True,
    name: str = '',
    eval_config: Optional[config_pb2.EvalConfig] = None,
    model_name: str = '',
    output_name: str = '',
    sub_key: Optional[metric_types.SubKey] = None,
    example_weighted: bool = False) -> metric_types.MetricComputations:
    """Returns computations exposing total attributions under a public key.

    The totals come from `_total_attributions_computations`; this function only
    appends a derived computation that copies that result to an
    `AttributionsKey` built from the caller-supplied names.

    Args:
      absolute: Forwarded unchanged to `_total_attributions_computations`.
      name: Name used for the public attributions key.
      eval_config: Eval config.
      model_name: Optional model name.
      output_name: Optional output name.
      sub_key: Optional sub key.
      example_weighted: Forwarded to the shared computation and the key.

    Returns:
      The shared computations followed by the derived re-keying computation.
    """
    public_key = metric_types.AttributionsKey(
        name=name,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        example_weighted=example_weighted)

    # The last key of the last shared computation is where the totals live.
    pipeline = _total_attributions_computations(
        absolute=absolute,
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        example_weighted=example_weighted)
    source_key = pipeline[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.AttributionsKey, Dict[str, Union[float,
                                                            np.ndarray]]]:
        """Copies the privately keyed totals to the public key."""
        totals = metrics[source_key]
        return {public_key: totals}

    pipeline.append(
        metric_types.DerivedMetricComputation(keys=[public_key], result=result))
    return pipeline
def _fairness_indicators_metrics_at_thresholds(
    thresholds: List[float],
    name: Text = FAIRNESS_INDICATORS_METRICS_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns computations for fairness metrics at thresholds.

    One metric key is created per (threshold, sub metric) pair, where the sub
    metric names come from `FAIRNESS_INDICATORS_SUB_METRICS`. All values are
    derived from the binary confusion matrices at the given thresholds; a rate
    whose denominator is zero evaluates to NaN.

    Args:
      thresholds: Thresholds to compute the fairness metrics at.
      name: Prefix of every generated metric key name.
      eval_config: Eval config.
      model_name: Optional model name.
      output_name: Optional output name.
      sub_key: Optional sub key.
      class_weights: Forwarded to the binary confusion matrices.

    Returns:
      The binary confusion matrix computations followed by one derived
      computation producing every fairness metric.
    """
    keys_by_threshold = collections.defaultdict(dict)
    all_keys = []
    digits_num = calculate_digits(thresholds)
    for t in thresholds:
        for m in FAIRNESS_INDICATORS_SUB_METRICS:
            # Key names look like "<name>/positive_rate@0.5"; the number of
            # decimals printed for the threshold is `digits_num`.
            metric_key = metric_types.MetricKey(
                name='%s/%s@%.*f' % (name, m, digits_num, t),
                model_name=model_name,
                output_name=output_name,
                sub_key=sub_key)
            all_keys.append(metric_key)
            keys_by_threshold[t][m] = metric_key

    # Make sure matrices are calculated.
    pipeline = binary_confusion_matrices.binary_confusion_matrices(
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        class_weights=class_weights,
        thresholds=thresholds)
    confusion_matrices_key = pipeline[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Any]:
        """Returns fairness metrics values."""
        matrices = metrics[confusion_matrices_key]
        nan = float('nan')
        output = {}
        for i, threshold in enumerate(thresholds):
            tp, tn = matrices.tp[i], matrices.tn[i]
            fp, fn = matrices.fp[i], matrices.fn[i]
            num_positives = tp + fn
            num_negatives = tn + fp
            # `x or nan` swaps a zero denominator for NaN so the rate is NaN
            # rather than a division error.
            rates = {
                'false_positive_rate': fp / (num_negatives or nan),
                'false_negative_rate': fn / (num_positives or nan),
                'true_positive_rate': tp / (num_positives or nan),
                'true_negative_rate': tn / (num_negatives or nan),
                'positive_rate':
                    (tp + fp) / ((num_positives + num_negatives) or nan),
                'negative_rate':
                    (tn + fn) / ((num_positives + num_negatives) or nan),
                'false_discovery_rate': fp / ((fp + tp) or nan),
                'false_omission_rate': fn / ((fn + tn) or nan),
            }
            for sub_metric, value in rates.items():
                output[keys_by_threshold[threshold][sub_metric]] = value
        return output

    pipeline.append(
        metric_types.DerivedMetricComputation(keys=all_keys, result=result))
    return pipeline
def _wrap_confusion_matrix_metric(
    metric: tf.keras.metrics.Metric, eval_config: config.EvalConfig,
    model_name: Text, output_name: Text,
    sub_key: Optional[metric_types.SubKey],
    class_weights: Optional[Dict[int, float]]
) -> metric_types.MetricComputations:
    """Returns confusion matrix metric wrapped in a more efficient computation.

    Instead of running the keras metric over the examples, the binary confusion
    matrices are computed once and the keras metric's state variables are
    filled in from them before calling `result()` on it.

    Args:
      metric: Keras metric to wrap.
      eval_config: Eval config.
      model_name: Model name.
      output_name: Output name.
      sub_key: Optional sub key; passed through `_verify_and_update_sub_key`.
      class_weights: Optional class weights. For an AUC metric with
        `label_weights` set, the weights are taken from the metric instead.

    Returns:
      The binary confusion matrix computations followed by the derived
      computation evaluating the keras metric.

    Raises:
      ValueError: If both the AUC metric's `label_weights` and `class_weights`
        are set.
      NotImplementedError: If the AUC metric has `multi_label` enabled.
    """
    # Special handling for the AUC metric, which supports aggregation
    # inherently via the multi_label flag.
    if (isinstance(metric, tf.keras.metrics.AUC) and
            hasattr(metric, 'label_weights')):
        if metric.label_weights:
            if class_weights:
                raise ValueError(
                    'class weights are configured in two different places: (1) via the '
                    'tf.keras.metrics.AUC class (using "label_weights") and (2) via '
                    'the MetricsSpecs (using "aggregate.class_weights"). Either remove '
                    'the label_weights settings in the AUC class or remove the '
                    'class_weights from the AggregationOptions: metric={}, '
                    'class_weights={}'.format(metric, class_weights))
            class_weights = dict(enumerate(metric.label_weights))
        if metric.multi_label:
            raise NotImplementedError(
                'AUC.multi_label=True is not implemented yet.')

    sub_key = _verify_and_update_sub_key(model_name, output_name, sub_key,
                                         metric)
    metric_key = metric_types.MetricKey(
        name=metric.name,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key)

    metric_config = tf.keras.metrics.serialize(metric)

    default_count = binary_confusion_matrices.DEFAULT_NUM_THRESHOLDS
    thresholds = None
    num_thresholds = None
    if hasattr(metric, _THRESHOLDS_KEY):
        # A threshold list of the default length is treated as the default
        # count rather than as an explicit list.
        if len(metric.thresholds) == default_count:
            num_thresholds = default_count
        else:
            thresholds = metric.thresholds
    # Only one of either thresholds or num_thresholds should be used. Keras AUC
    # allows both but thresholds has more precedence.
    if thresholds is None and hasattr(metric, _NUM_THRESHOLDS_KEY):
        num_thresholds = metric.num_thresholds

    # By default use separate computations for the confusion matrices since
    # the metrics might be using different thresholds (note, the underlying
    # histogram the confusion matrices are based on will still only be
    # calculated once). Only the default threshold count uses the shared name.
    if num_thresholds is None or num_thresholds != default_count:
        matrices_name = '_{}{}'.format(
            metric.name,
            binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME)
    else:
        matrices_name = binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME

    # Make sure matrices are calculated. Note that the use of class_weights
    # here implies that micro averaging is being performed.
    pipeline = binary_confusion_matrices.binary_confusion_matrices(
        num_thresholds=num_thresholds,
        thresholds=thresholds,
        name=matrices_name,
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        class_weights=class_weights)
    matrices_key = pipeline[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Any]:
        """Returns the keras metric value derived from the confusion matrices."""
        matrices = metrics[matrices_key]
        restored = tf.keras.metrics.deserialize(metric_config)
        keras_metrics = tf.keras.metrics
        # Ordered dispatch: the first matching metric type decides which state
        # variables are filled from which matrix fields. A metric matching no
        # entry is evaluated with its freshly deserialized state.
        state_sources = (
            ((keras_metrics.AUC, keras_metrics.SpecificityAtSensitivity,
              keras_metrics.SensitivityAtSpecificity),
             (('true_positives', 'tp'), ('true_negatives', 'tn'),
              ('false_positives', 'fp'), ('false_negatives', 'fn'))),
            (keras_metrics.Precision,
             (('true_positives', 'tp'), ('false_positives', 'fp'))),
            (keras_metrics.Recall,
             (('true_positives', 'tp'), ('false_negatives', 'fn'))),
            (keras_metrics.TruePositives, (('accumulator', 'tp'),)),
            (keras_metrics.FalsePositives, (('accumulator', 'fp'),)),
            (keras_metrics.TrueNegatives, (('accumulator', 'tn'),)),
            (keras_metrics.FalseNegatives, (('accumulator', 'fn'),)),
        )
        for metric_classes, assignments in state_sources:
            if isinstance(restored, metric_classes):
                for variable_name, field_name in assignments:
                    variable = getattr(restored, variable_name)
                    variable.assign(np.array(getattr(matrices, field_name)))
                break
        return {metric_key: restored.result().numpy()}

    pipeline.append(
        metric_types.DerivedMetricComputation(keys=[metric_key], result=result))
    return pipeline
def _wrap_confusion_matrix_metric(
        metric: tf.keras.metrics.Metric, model_name: Text, output_name: Text,
        sub_key: Optional[metric_types.SubKey],
        class_weights: Optional[Dict[int, float]]
) -> metric_types.MetricComputations:
    """Returns confusion matrix metric wrapped in a more efficient computation.

    The keras metric is serialized up front. At result time it is
    deserialized and its confusion-matrix variables are assigned from the
    binary confusion matrices computation, after which the keras `result()`
    is evaluated.

    Args:
      metric: Keras metric to wrap.
      model_name: Model name used for the metric key.
      output_name: Output name used for the metric key.
      sub_key: Optional sub key; passed through `_verify_and_update_sub_key`.
      class_weights: Optional class weights forwarded to the binary confusion
        matrices computation.

    Returns:
      The binary confusion matrices computations followed by a derived
      computation producing the metric value under `metric.name`.
    """
    sub_key = _verify_and_update_sub_key(
        model_name, output_name, sub_key, metric)
    key = metric_types.MetricKey(
        name=metric.name,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key)

    metric_config = tf.keras.metrics.serialize(metric)

    # By default use separate computations for the confusion matrices since
    # the metrics might be using different thresholds (note, the underlying
    # histogram the confusion matrices are based on will still only be
    # calculated once).
    name = '_{}{}'.format(
        metric.name, binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME)

    thresholds = (
        metric.thresholds if hasattr(metric, _THRESHOLDS_KEY) else None)
    num_thresholds = (
        metric.num_thresholds if hasattr(metric, _NUM_THRESHOLDS_KEY)
        else None)

    # Increase the default number of thresholds if keras defaults were used
    # (this also allows us to share the computation with other confusion
    # based metrics).
    keras_defaults_used = (
        num_thresholds == _DEFAULT_NUM_THRESHOLDS_IN_KERAS
        and _CONFIG_KEY in metric_config
        and _NUM_THRESHOLDS_KEY in metric_config[_CONFIG_KEY])
    if keras_defaults_used:
        settings = metric_config[_CONFIG_KEY]
        name = binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME
        num_thresholds = binary_confusion_matrices.DEFAULT_NUM_THRESHOLDS
        settings[_NUM_THRESHOLDS_KEY] = num_thresholds
        thresholds = None
        if _THRESHOLDS_KEY in settings:
            settings[_THRESHOLDS_KEY] = None

    # Only one of either thresholds or num_thresholds should be used. Keras
    # AUC allows both but thresholds has more precedence.
    if thresholds is not None and num_thresholds is not None:
        num_thresholds = None

    # Make sure matrices are calculated.
    computations = binary_confusion_matrices.binary_confusion_matrices(
        num_thresholds=num_thresholds,
        thresholds=thresholds,
        name=name,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        class_weights=class_weights)
    matrices_key = computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Any]:
        """Returns the keras metric value derived from the confusion matrices."""
        matrices = metrics[matrices_key]
        keras = tf.keras.metrics
        restored = keras.deserialize(metric_config)

        # Each metric family stores a different subset of the confusion
        # matrix counts; only the variables that family owns are assigned.
        if isinstance(restored, (keras.AUC, keras.SpecificityAtSensitivity,
                                 keras.SensitivityAtSpecificity)):
            restored.true_positives.assign(np.array(matrices.tp))
            restored.true_negatives.assign(np.array(matrices.tn))
            restored.false_positives.assign(np.array(matrices.fp))
            restored.false_negatives.assign(np.array(matrices.fn))
        elif isinstance(restored, keras.Precision):
            restored.true_positives.assign(np.array(matrices.tp))
            restored.false_positives.assign(np.array(matrices.fp))
        elif isinstance(restored, keras.Recall):
            restored.true_positives.assign(np.array(matrices.tp))
            restored.false_negatives.assign(np.array(matrices.fn))
        elif isinstance(restored, keras.TruePositives):
            restored.accumulator.assign(np.array(matrices.tp))
        elif isinstance(restored, keras.FalsePositives):
            restored.accumulator.assign(np.array(matrices.fp))
        elif isinstance(restored, keras.TrueNegatives):
            restored.accumulator.assign(np.array(matrices.tn))
        elif isinstance(restored, keras.FalseNegatives):
            restored.accumulator.assign(np.array(matrices.fn))
        return {key: restored.result().numpy()}

    computations.append(
        metric_types.DerivedMetricComputation(keys=[key], result=result))
    return computations
def _flip_rate(
    counterfactual_prediction_key: Optional[str] = None,
    example_id_key: Optional[str] = None,
    example_ids_count: int = flip_count.DEFAULT_NUM_EXAMPLE_IDS,
    name: str = FLIP_RATE_NAME,
    thresholds: Sequence[float] = flip_count.DEFAULT_THRESHOLDS,
    model_name: str = '',
    output_name: str = '',
    eval_config: Optional[config_pb2.EvalConfig] = None,
    example_weighted: bool = False) -> metric_types.MetricComputations:
    """Returns computations for flip rate.

    The flip-count computations are reused and a derived computation is
    appended that, per threshold, divides the flip counts by the combined
    positive and negative example count and forwards the flipped example ids.

    Args:
      counterfactual_prediction_key: Forwarded to `flip_count.flip_count`.
      example_id_key: Forwarded to `flip_count.flip_count`.
      example_ids_count: Forwarded to `flip_count.flip_count`; also the upper
        bound on the number of sampled example ids.
      name: Metric name used when creating the flip rate metric keys.
      thresholds: Thresholds to produce flip rates for.
      model_name: Optional model name.
      output_name: Optional output name.
      eval_config: Eval config.
      example_weighted: Whether the metric keys are example weighted. Sample
        example ids are only emitted when this is False.

    Returns:
      The flip count computations followed by the derived flip rate
      computation.
    """
    keys, metric_key_by_name_by_threshold = flip_count.create_metric_keys(
        thresholds, _METRICS_LIST, name, model_name, output_name,
        example_weighted)

    computations = flip_count.flip_count(
        thresholds=thresholds,
        counterfactual_prediction_key=counterfactual_prediction_key,
        example_id_key=example_id_key,
        example_ids_count=example_ids_count,
        model_name=model_name,
        output_name=output_name,
        eval_config=eval_config,
        example_weighted=example_weighted)

    _, flip_count_metric_key_by_name_by_threshold = (
        flip_count.create_metric_keys(
            thresholds, flip_count.METRICS_LIST, flip_count.FLIP_COUNT_NAME,
            model_name, output_name, example_weighted))

    def pick_overall_flip_examples(ntp_examples: np.ndarray,
                                   ptn_examples: np.ndarray) -> np.ndarray:
        """Samples up to `example_ids_count` ids from both flip directions."""
        output_size = min(example_ids_count,
                          ntp_examples.size + ptn_examples.size)
        # Flatten before joining: stacking row-wise requires both arrays to
        # have the same length, which does not hold when one flip direction
        # has more examples than the other.
        examples = np.concatenate(
            [np.ravel(ntp_examples), np.ravel(ptn_examples)])
        return np.random.choice(examples, size=output_size, replace=False)

    def result(
        metrics: Dict[metric_types.MetricKey, Union[float, np.ndarray]]
    ) -> Dict[metric_types.MetricKey, Union[float, np.ndarray]]:
        """Returns flip rate metrics values."""
        output = {}
        for threshold in thresholds:
            count_keys = flip_count_metric_key_by_name_by_threshold[threshold]
            rate_keys = metric_key_by_name_by_threshold[threshold]

            ptn = count_keys[_POSITIVE_TO_NEGATIVE]
            ntp = count_keys[_NEGATIVE_TO_POSITIVE]
            pos_examples = count_keys[_POSITIVE_TO_NEGATIVE_EXAMPLE_IDS]
            neg_examples = count_keys[_NEGATIVE_TO_POSITIVE_EXAMPLE_IDS]
            pos = count_keys[_POSITIVE_EXAMPLES_COUNT]
            neg = count_keys[_NEGATIVE_EXAMPLES_COUNT]

            # A zero example count yields NaN rather than a division error.
            denominator = (metrics[pos] + metrics[neg]) or float('NaN')

            output[rate_keys[_OVERALL]] = (
                (metrics[ntp] + metrics[ptn]) / denominator)
            output[rate_keys[_POSITIVE_TO_NEGATIVE]] = (
                metrics[ptn] / denominator)
            output[rate_keys[_NEGATIVE_TO_POSITIVE]] = (
                metrics[ntp] / denominator)
            output[rate_keys[_POSITIVE_TO_NEGATIVE_EXAMPLE_IDS]] = (
                metrics[pos_examples])
            output[rate_keys[_NEGATIVE_TO_POSITIVE_EXAMPLE_IDS]] = (
                metrics[neg_examples])
            # TODO(sokeefe): Should this depend on of example_weighted?
            if not example_weighted:
                assert isinstance(metrics[neg_examples], np.ndarray)
                assert isinstance(metrics[pos_examples], np.ndarray)
                output[rate_keys[_SAMPLE_EXAMPLES_IDS]] = (
                    pick_overall_flip_examples(
                        ntp_examples=metrics[neg_examples],
                        ptn_examples=metrics[pos_examples]))
        return output

    derived_computation = metric_types.DerivedMetricComputation(
        keys=keys, result=result)
    computations.append(derived_computation)
    return computations
def weighted_macro_average(
    metric_name: Text,
    sub_keys: List[metric_types.SubKey],
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for computing weighted macro average of metric.

    The weights per class are based on the percentage of positive labels for
    each class.

    Args:
      metric_name: Name of metric weighted average is being computed for.
      sub_keys: Sub keys used to compute the metric.
      eval_config: Eval config.
      model_name: Optional model name.
      output_name: Optional output name.
      class_weights: Optional class weights to apply. If sub_key.class_id is
        not set or not found in the dictionary then 1.0 is assumed. Note that
        these weights are applied in addition to the weights based on the
        positive labels for each class.

    Returns:
      Computation for performing the weighted macro average.
    """
    key = metric_types.MetricKey(
        name=metric_name, model_name=model_name, output_name=output_name)

    # Compute the weights for labels, restricted to sub keys with a class id.
    computations = _class_weights_from_labels(
        class_ids=[sk.class_id for sk in sub_keys if sk.class_id is not None],
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name)
    # Class weights metrics are based on a single computation and key.
    class_weights_from_labels_key = computations[0].keys[0]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, float]:
        """Returns weighted macro average."""
        label_weights = metrics[class_weights_from_labels_key]
        weighted_sum = 0.0
        weight_sum = 0.0
        for child_sub_key in sub_keys:
            child_key = metric_types.MetricKey(
                name=metric_name,
                model_name=model_name,
                output_name=output_name,
                sub_key=child_sub_key)
            weight = 1.0
            has_class_id = (child_key.sub_key is not None
                            and child_key.sub_key.class_id is not None)
            if has_class_id:
                class_id = child_key.sub_key.class_id
                if label_weights and class_id in label_weights:
                    weight = label_weights[class_id]
                if class_weights and class_id in class_weights:
                    weight *= class_weights[class_id]
            weighted_sum += _to_float(metrics[child_key]) * weight
            weight_sum += weight
        if weight_sum:
            average = weighted_sum / weight_sum
        else:
            average = float('nan')
        return {key: average}

    computations.append(
        metric_types.DerivedMetricComputation(keys=[key], result=result))
    return computations
def weighted_macro_average(
    metric_name: Text,
    sub_keys: Iterable[metric_types.SubKey],
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for computing weighted macro average of metric.

    The weights per class are based on the percentage of positive labels for
    each class.

    Args:
      metric_name: Name of metric weighted average is being computed for.
      sub_keys: Sub keys used to compute the metric (e.g. class_ids, etc).
      eval_config: Eval config.
      model_name: Optional model name.
      output_name: Optional output name.
      sub_key: Optional sub key associated with aggregation metric (e.g.
        top_k).
      class_weights: Optional class weights to apply. Required if sub_key is
        not provided. If class_weights are provided, but a sub_key.class_id
        (if sub_key is None) or sub_key.k (if sub_key is top_k) is not set or
        not found in the dictionary then 0.0 is assumed. Note that these
        weights are applied in addition to the weights based on the positive
        labels for each class.

    Returns:
      Computation for performing the weighted macro average.
    """
    # sub_keys is walked here and again on every result() call; materialize
    # it once so that a one-shot iterable (e.g. a generator) is not silently
    # exhausted after the first pass.
    sub_keys = list(sub_keys)

    key = metric_types.MetricKey(
        name=metric_name,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key)

    class_ids = [k.class_id for k in sub_keys if k.class_id is not None]

    # Compute the weights for labels.
    computations = _class_weights_from_labels(
        class_ids=class_ids,
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name)
    # Class weights metrics are based on a single computation and key.
    class_weights_from_labels_key = computations[0].keys[0]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, float]:
        """Returns weighted macro average."""
        class_weights_from_labels = metrics[class_weights_from_labels_key]
        total_value = 0.0
        total_weight = 0.0
        for child_sub_key in sub_keys:
            child_key = metric_types.MetricKey(
                name=metric_name,
                model_name=model_name,
                output_name=output_name,
                sub_key=child_sub_key)
            if child_key not in metrics:
                # Use private name if not found under metric name
                child_key = metric_types.MetricKey(
                    name='_' + metric_name,
                    model_name=model_name,
                    output_name=output_name,
                    sub_key=child_sub_key)
            weight = 1.0 if not class_weights else 0.0
            offset = None
            if (child_key.sub_key is not None and
                    child_key.sub_key.class_id is not None):
                offset = child_key.sub_key.class_id
            elif (child_key.sub_key is not None and
                  child_key.sub_key.k is not None):
                offset = child_key.sub_key.k
            if offset is not None:
                # NOTE(review): membership is tested with class_id while the
                # lookup uses offset, so sub keys that only set `k` never
                # pick up a weight here. Left as-is pending confirmation of
                # the intended behavior for top_k sub keys.
                if (class_weights_from_labels and
                        child_key.sub_key.class_id in
                        class_weights_from_labels):
                    weight = class_weights_from_labels[offset]
                if (class_weights and
                        child_key.sub_key.class_id in class_weights):
                    weight *= class_weights[offset]
            total_value += _to_float(metrics[child_key]) * weight
            total_weight += weight
        average = total_value / total_weight if total_weight else float('nan')
        return {key: average}

    derived_computation = metric_types.DerivedMetricComputation(
        keys=[key], result=result)
    computations.append(derived_computation)
    return computations
def _wrap_confusion_matrix_metric(
        metric: tf.keras.metrics.Metric, eval_config: config_pb2.EvalConfig,
        model_name: Text, output_name: Text,
        sub_key: Optional[metric_types.SubKey],
        aggregation_type: Optional[metric_types.AggregationType],
        class_weights: Optional[Dict[int, float]]
) -> metric_types.MetricComputations:
    """Returns confusion matrix metric wrapped in a more efficient computation.

    The keras metric is serialized up front. At result time it is
    deserialized and its confusion-matrix variables are assigned from the
    binary confusion matrices computation, after which the keras `result()`
    is evaluated.

    Args:
      metric: Keras metric to wrap.
      eval_config: Eval config forwarded to the confusion matrices.
      model_name: Model name used for the metric key.
      output_name: Output name used for the metric key.
      sub_key: Optional sub key; passed through `_verify_and_update_sub_key`.
      aggregation_type: Optional aggregation type used for the metric key and
        forwarded to the confusion matrices.
      class_weights: Optional class weights. Replaced by the AUC metric's
        `label_weights` when those are set.

    Returns:
      The binary confusion matrices computations followed by a derived
      computation producing the metric value under `metric.name`.

    Raises:
      ValueError: If both AUC `label_weights` and `class_weights` are set.
      NotImplementedError: If the AUC metric has `multi_label` enabled.
    """
    # Special handling for AUC metric which supports aggregation inherently
    # via multi_label flag.
    is_auc_with_label_weights = (
        isinstance(metric, tf.keras.metrics.AUC)
        and hasattr(metric, 'label_weights'))
    if is_auc_with_label_weights:
        if metric.label_weights:
            if class_weights:
                raise ValueError(
                    'class weights are configured in two different places: (1) via the '
                    'tf.keras.metrics.AUC class (using "label_weights") and (2) via '
                    'the MetricsSpecs (using "aggregate.class_weights"). Either remove '
                    'the label_weights settings in the AUC class or remove the '
                    'class_weights from the AggregationOptions: metric={}, '
                    'class_weights={}'.format(metric, class_weights))
            class_weights = dict(enumerate(metric.label_weights))
        if metric.multi_label:
            raise NotImplementedError(
                'AUC.multi_label=True is not implemented yet.')

    sub_key = _verify_and_update_sub_key(
        model_name, output_name, sub_key, metric)
    key = metric_types.MetricKey(
        name=metric.name,
        model_name=model_name,
        output_name=output_name,
        aggregation_type=aggregation_type,
        sub_key=sub_key)

    metric_config = tf.keras.metrics.serialize(metric)

    thresholds = None
    num_thresholds = None
    # The top_k metrics have special settings. If we are setting the top_k
    # value outside of keras (i.e. using BinarizeOptions), then we need to
    # set the special threshold ourselves otherwise the default threshold of
    # 0.5 is used.
    top_k_set_outside_keras = bool(sub_key) and sub_key.top_k is not None
    if top_k_set_outside_keras and all(
            _get_config_value(config_key, metric_config) is None
            for config_key in (_TOP_K_KEY, _THRESHOLDS_KEY,
                               _NUM_THRESHOLDS_KEY)):
        thresholds = [float('-inf')]
    elif hasattr(metric, _THRESHOLDS_KEY):
        thresholds = metric.thresholds
    # Only one of either thresholds or num_thresholds should be used. Keras
    # AUC allows both but thresholds has more precedence.
    if thresholds is None and hasattr(metric, _NUM_THRESHOLDS_KEY):
        num_thresholds = metric.num_thresholds

    # Make sure matrices are calculated.
    computations = binary_confusion_matrices.binary_confusion_matrices(
        num_thresholds=num_thresholds,
        thresholds=thresholds,
        eval_config=eval_config,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        aggregation_type=aggregation_type,
        class_weights=class_weights)
    matrices_key = computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Any]:
        """Returns result derived from binary confusion matrices."""
        matrices = metrics[matrices_key]
        keras = tf.keras.metrics
        restored = keras.deserialize(metric_config)

        # Each metric family stores a different subset of the confusion
        # matrix counts; only the variables that family owns are assigned.
        if isinstance(restored, (keras.AUC, keras.SpecificityAtSensitivity,
                                 keras.SensitivityAtSpecificity)):
            restored.true_positives.assign(np.array(matrices.tp))
            restored.true_negatives.assign(np.array(matrices.tn))
            restored.false_positives.assign(np.array(matrices.fp))
            restored.false_negatives.assign(np.array(matrices.fn))
        elif isinstance(restored, keras.Precision):
            restored.true_positives.assign(np.array(matrices.tp))
            restored.false_positives.assign(np.array(matrices.fp))
        elif isinstance(restored, keras.Recall):
            restored.true_positives.assign(np.array(matrices.tp))
            restored.false_negatives.assign(np.array(matrices.fn))
        elif isinstance(restored, keras.TruePositives):
            restored.accumulator.assign(np.array(matrices.tp))
        elif isinstance(restored, keras.FalsePositives):
            restored.accumulator.assign(np.array(matrices.fp))
        elif isinstance(restored, keras.TrueNegatives):
            restored.accumulator.assign(np.array(matrices.tn))
        elif isinstance(restored, keras.FalseNegatives):
            restored.accumulator.assign(np.array(matrices.fn))
        return {key: restored.result().numpy()}

    computations.append(
        metric_types.DerivedMetricComputation(keys=[key], result=result))
    return computations
def binary_confusion_matrices(
    num_thresholds: Optional[int] = None,
    thresholds: Optional[List[float]] = None,
    name: Text = BINARY_CONFUSION_MATRICES_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for computing binary confusion matrices.

    Args:
      num_thresholds: Number of thresholds to use. Thresholds will be
        calculated using linear interpolation between 0.0 and 1.0 with
        equidistant values and boundaries at -epsilon and 1.0+epsilon. Only
        one of num_thresholds or thresholds should be used. If used,
        num_thresholds must be > 1.
      thresholds: A specific set of thresholds to use. The caller is
        responsible for marking the boundaries with +/-epsilon if desired.
        Only one of num_thresholds or thresholds should be used. For metrics
        computed at top k this may be a single negative threshold value (i.e.
        -inf).
      name: Metric name.
      eval_config: Eval config.
      model_name: Optional model name (if multi-model evaluation).
      output_name: Optional output name (if multi-output model type).
      sub_key: Optional sub key.
      aggregation_type: Optional aggregation type.
      class_weights: Optional class weights to apply to multi-class /
        multi-label labels and predictions prior to flattening (when micro
        averaging is used).

    Returns:
      The calibration histogram computations followed by a derived
      computation that converts the histogram into confusion matrices.

    Raises:
      ValueError: If both num_thresholds and thresholds are set at the same
        time, or if num_thresholds is not greater than 1.
    """
    key = metric_types.MetricKey(
        name=name,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key)

    both_unset = num_thresholds is None and thresholds is None
    if num_thresholds is not None and thresholds is not None:
        raise ValueError(
            'only one of thresholds or num_thresholds can be set at a time')
    if both_unset:
        num_thresholds = DEFAULT_NUM_THRESHOLDS
    if num_thresholds is not None:
        if num_thresholds <= 1:
            raise ValueError('num_thresholds must be > 1')
        # The interpolation strategy used here matches that used by keras
        # for AUC.
        interior = [(i + 1) * 1.0 / (num_thresholds - 1)
                    for i in range(num_thresholds - 2)]
        thresholds = [-_EPSILON] + interior + [1.0 + _EPSILON]

    # For precision/recall_at_k were a single large negative threshold is
    # used, we only need one bucket. Note that the histogram will actually
    # have 2 buckets: one that we set (which handles predictions > -1.0) and
    # a default catch-all bucket (i.e. bucket 0) that the histogram creates
    # for large negative predictions (i.e. predictions <= -1.0).
    if len(thresholds) == 1 and thresholds[0] <= 0:
        num_buckets = 1
    else:
        num_buckets = None

    # Use calibration histogram to calculate matrices. For efficiency (unless
    # all predictions are matched - i.e. thresholds <= 0) we will assume that
    # other metrics will make use of the calibration histogram and re-use the
    # default histogram for the given model_name/output_name/sub_key. This is
    # also required to get accurate counts at the threshold boundaries. If
    # this becomes an issue, then calibration histogram can be updated to
    # support non-linear boundaries.
    computations = calibration_histogram.calibration_histogram(
        eval_config=eval_config,
        num_buckets=num_buckets,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        aggregation_type=aggregation_type,
        class_weights=class_weights)
    histogram_key = computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Matrices]:
        """Returns binary confusion matrices."""
        if len(thresholds) == 1 and thresholds[0] < 0:
            # This case is used when all positive prediction values are
            # considered matches (e.g. when calculating top_k for
            # precision/recall where the non-top_k values are expected to
            # have been set to float('-inf')).
            histogram = metrics[histogram_key]
        else:
            # Calibration histogram uses intervals of the form [start, end)
            # where the prediction >= start. The confusion matrices want
            # intervals of the form (start, end] where the prediction >
            # start. Add a small epsilon so that >= checks don't match. This
            # correction shouldn't be needed in practice but allows for
            # correctness in small tests.
            boundaries = [t + _EPSILON if t != 0 else t for t in thresholds]
            if thresholds[0] >= 0:
                # Add -epsilon bucket to account for differences in histogram
                # vs confusion matrix intervals mentioned above. If the
                # epsilon bucket is missing the false negatives and false
                # positives will be 0 for the first threshold.
                boundaries = [-_EPSILON] + boundaries
            if thresholds[-1] < 1.0:
                # If the last threshold < 1.0, then add a fence post at 1.0 +
                # epsilon otherwise true negatives and true positives will be
                # overcounted.
                boundaries = boundaries + [1.0 + _EPSILON]
            histogram = calibration_histogram.rebin(
                boundaries, metrics[histogram_key])
        return {key: _to_binary_confusion_matrices(thresholds, histogram)}

    computations.append(
        metric_types.DerivedMetricComputation(keys=[key], result=result))
    return computations
def _flip_rate(
    counterfactual_prediction_key: str,
    example_id_key: Optional[str] = None,
    example_ids_count: int = flip_count.DEFAULT_NUM_EXAMPLE_IDS,
    name: str = FLIP_RATE_NAME,
    thresholds: Sequence[float] = flip_count.DEFAULT_THRESHOLDS,
    model_name: str = '',
    output_name: str = '',
    eval_config: Optional[config.EvalConfig] = None,
) -> metric_types.MetricComputations:
    """Returns computations for flip rate.

    The flip-count computations are reused and a derived computation is
    appended that, per threshold, turns the flip counts into rates and
    forwards the flipped example ids.

    Args:
      counterfactual_prediction_key: Forwarded to `flip_count.flip_count`.
      example_id_key: Forwarded to `flip_count.flip_count`.
      example_ids_count: Forwarded to `flip_count.flip_count`.
      name: Metric name used when creating the flip rate metric keys.
      thresholds: Thresholds to produce flip rates for.
      model_name: Optional model name.
      output_name: Optional output name.
      eval_config: Eval config.

    Returns:
      The flip count computations followed by the derived flip rate
      computation.
    """
    keys, metric_key_by_name_by_threshold = flip_count.create_metric_keys(
        thresholds, _METRICS_LIST, name, model_name, output_name)

    computations = flip_count.flip_count(
        thresholds=thresholds,
        counterfactual_prediction_key=counterfactual_prediction_key,
        example_id_key=example_id_key,
        example_ids_count=example_ids_count,
        model_name=model_name,
        output_name=output_name,
        eval_config=eval_config)

    _, flip_count_metric_key_by_name_by_threshold = (
        flip_count.create_metric_keys(
            thresholds, flip_count.METRICS_LIST, flip_count.FLIP_COUNT_NAME,
            model_name, output_name))

    def result(
        metrics: Dict[metric_types.MetricKey, float]
    ) -> Dict[metric_types.MetricKey, float]:
        """Returns flip rate metrics values."""
        nan = float('NaN')
        output = {}
        for threshold in thresholds:
            count_keys = flip_count_metric_key_by_name_by_threshold[threshold]
            rate_keys = metric_key_by_name_by_threshold[threshold]

            ptn = count_keys['positive_to_negative']
            ntp = count_keys['negative_to_positive']
            pos_examples = count_keys['positive_to_negative_examples_ids']
            neg_examples = count_keys['negative_to_positive_examples_ids']
            pos = count_keys['positive_examples_count']
            neg = count_keys['negative_examples_count']

            # A zero denominator (no positive and/or negative examples at
            # this threshold) yields NaN instead of a division error.
            output[rate_keys['overall']] = (
                (metrics[ntp] + metrics[ptn]) /
                ((metrics[pos] + metrics[neg]) or nan))
            output[rate_keys['positive_to_negative']] = (
                metrics[ptn] / (metrics[pos] or nan))
            output[rate_keys['negative_to_positive']] = (
                metrics[ntp] / (metrics[neg] or nan))
            output[rate_keys['positive_to_negative_examples_ids']] = (
                metrics[pos_examples])
            output[rate_keys['negative_to_positive_examples_ids']] = (
                metrics[neg_examples])
        return output

    derived_computation = metric_types.DerivedMetricComputation(
        keys=keys, result=result)
    computations.append(derived_computation)
    return computations
def macro_average(
    metric_name: Text,
    sub_keys: Iterable[metric_types.SubKey],
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
    """Returns metric computations for computing macro average of given metric.

    Args:
      metric_name: Name of underlying metric average is being computed for.
      sub_keys: Sub keys used to compute the metric (e.g. class_ids, etc).
      eval_config: Eval config (unused).
      model_name: Optional model name.
      output_name: Optional output name.
      sub_key: Optional sub key associated with aggregation metric (e.g.
        top_k).
      class_weights: Optional class weights to apply. Required if sub_key is
        not provided. If class_weights are provided, but a sub_key.class_id
        (if sub_key is None) or sub_key.k (if sub_key is top_k) is not set or
        not found in the dictionary then 0.0 is assumed. When no class
        weights are given every sub key is weighted 1.0.

    Returns:
      Computation for performing the macro average.
    """
    del eval_config

    # result() may be invoked more than once; materialize sub_keys so that a
    # one-shot iterable (e.g. a generator) is not exhausted by the first call.
    sub_keys = list(sub_keys)

    key = metric_types.MetricKey(
        name=metric_name,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key)

    def result(
        metrics: Dict[metric_types.MetricKey, float]
    ) -> Dict[metric_types.MetricKey, float]:
        """Returns macro average."""
        total_value = 0.0
        total_weight = 0.0
        for child_sub_key in sub_keys:
            child_key = metric_types.MetricKey(
                name=metric_name,
                model_name=model_name,
                output_name=output_name,
                sub_key=child_sub_key)
            if child_key not in metrics:
                # Use private name if not found under metric name
                child_key = metric_types.MetricKey(
                    name='_' + metric_name,
                    model_name=model_name,
                    output_name=output_name,
                    sub_key=child_sub_key)
            weight = 1.0 if not class_weights else 0.0
            offset = None
            if (child_key.sub_key is not None and
                    child_key.sub_key.class_id is not None):
                offset = child_key.sub_key.class_id
            elif (child_key.sub_key is not None and
                  child_key.sub_key.k is not None):
                offset = child_key.sub_key.k
            # class_weights defaults to None; only consult it when it is
            # set, otherwise the membership test would raise TypeError.
            if (class_weights and offset is not None and
                    offset in class_weights):
                weight = class_weights[offset]
            total_value += _to_float(metrics[child_key]) * weight
            total_weight += weight
        average = total_value / total_weight if total_weight else float('nan')
        return {key: average}

    return [metric_types.DerivedMetricComputation(keys=[key], result=result)]
def binary_confusion_matrices(
    num_thresholds: Optional[int] = None,
    thresholds: Optional[List[float]] = None,
    name: Optional[Text] = None,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None,
    use_histogram: Optional[bool] = None,
    extract_label_prediction_and_weight: Optional[Callable[
        ..., Any]] = metric_util.to_label_prediction_example_weight,
    preprocessor: Optional[Callable[..., Any]] = None,
    example_id_key: Optional[Text] = None,
    example_ids_count: Optional[int] = None,
    fractional_labels: bool = True) -> metric_types.MetricComputations:
    """Returns metric computations for computing binary confusion matrices.

    Args:
      num_thresholds: Number of thresholds to use. Thresholds will be
        calculated using linear interpolation between 0.0 and 1.0 with
        equidistant values and boundaries at -epsilon and 1.0+epsilon. Only
        one of num_thresholds or thresholds should be used. If used,
        num_thresholds must be > 1.
      thresholds: A specific set of thresholds to use. The caller is
        responsible for marking the boundaries with +/-epsilon if desired.
        Only one of num_thresholds or thresholds should be used. For metrics
        computed at top k this may be a single negative threshold value (i.e.
        -inf).
      name: Metric name. When None, a name is derived from
        BINARY_CONFUSION_MATRICES_NAME and the threshold settings.
      eval_config: Eval config.
      model_name: Optional model name (if multi-model evaluation).
      output_name: Optional output name (if multi-output model type).
      sub_key: Optional sub key.
      aggregation_type: Optional aggregation type.
      class_weights: Optional class weights to apply to multi-class /
        multi-label labels and predictions prior to flattening (when micro
        averaging is used).
      use_histogram: If true, matrices will be derived from calibration
        histograms. When None, histograms are used if num_thresholds is in
        effect or a single negative threshold was given.
      extract_label_prediction_and_weight: User-provided function argument
        that yields label, prediction, and example weights for use in
        calculations (relevant only when use_histogram flag is not true).
      preprocessor: User-provided preprocessor for including additional
        extracts in StandardMetricInputs (relevant only when use_histogram
        flag is not true).
      example_id_key: Feature key containing example id (relevant only when
        use_histogram flag is not true).
      example_ids_count: Max number of example ids to be extracted for false
        positives and false negatives (relevant only when use_histogram flag
        is not true).
      fractional_labels: If true, each incoming tuple of (label, prediction,
        and example weight) will be split into two tuples as follows (where
        l, p, w represent the resulting label, prediction, and example weight
        values):
          (1) l = 0.0, p = prediction, and w = example_weight * (1.0 - label)
          (2) l = 1.0, p = prediction, and w = example_weight * label
        If enabled, an exception will be raised if labels are not within
        [0, 1]. The implementation is such that tuples associated with a
        weight of zero are not yielded. This means it is safe to enable
        fractional_labels even when the labels only take on the values of 0.0
        or 1.0.

    Returns:
      The underlying histogram or confusion matrix computations followed by
      a derived computation producing the matrices under the metric key.

    Raises:
      ValueError: If both num_thresholds and thresholds are set at the same
        time, or if num_thresholds is not greater than 1.
    """
    if num_thresholds is not None and thresholds is not None:
        raise ValueError(
            'only one of thresholds or num_thresholds can be set at a time')
    if num_thresholds is None and thresholds is None:
        num_thresholds = DEFAULT_NUM_THRESHOLDS
    # Keras AUC turns num_thresholds parameters into thresholds which
    # circumvents sharing of settings. If the thresholds match the
    # interpolated version of the thresholds then reset back to
    # num_thresholds.
    if (name is None and thresholds and
            thresholds == _interpolated_thresholds(len(thresholds))):
        num_thresholds = len(thresholds)
        thresholds = None
    if num_thresholds is not None:
        if num_thresholds <= 1:
            raise ValueError('num_thresholds must be > 1')
        # The interpolation strategy used here matches that used by keras
        # for AUC.
        thresholds = _interpolated_thresholds(num_thresholds)
        if name is None:
            name = '{}_{}'.format(
                BINARY_CONFUSION_MATRICES_NAME, num_thresholds)
    elif name is None:
        name = '{}_{}'.format(
            BINARY_CONFUSION_MATRICES_NAME, list(thresholds))

    key = metric_types.MetricKey(
        name=name,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key)

    if use_histogram is None:
        use_histogram = (
            num_thresholds is not None or
            (len(thresholds) == 1 and thresholds[0] < 0))

    if use_histogram:
        # Use calibration histogram to calculate matrices. For efficiency
        # (unless all predictions are matched - i.e. thresholds <= 0) we will
        # assume that other metrics will make use of the calibration
        # histogram and re-use the default histogram for the given
        # model_name/output_name/sub_key. This is also required to get
        # accurate counts at the threshold boundaries. If this becomes an
        # issue, then calibration histogram can be updated to support
        # non-linear boundaries.
        computations = calibration_histogram.calibration_histogram(
            eval_config=eval_config,
            num_buckets=(
                # For precision/recall_at_k were a single large negative
                # threshold is used, we only need one bucket. Note that the
                # histogram will actually have 2 buckets: one that we set
                # (which handles predictions > -1.0) and a default catch-all
                # bucket (i.e. bucket 0) that the histogram creates for large
                # negative predictions (i.e. predictions <= -1.0).
                1 if len(thresholds) == 1 and thresholds[0] <= 0 else None),
            model_name=model_name,
            output_name=output_name,
            sub_key=sub_key,
            aggregation_type=aggregation_type,
            class_weights=class_weights)
    else:
        computations = _binary_confusion_matrix_computation(
            eval_config=eval_config,
            thresholds=thresholds,
            model_name=model_name,
            output_name=output_name,
            sub_key=sub_key,
            extract_label_prediction_and_weight=(
                extract_label_prediction_and_weight),
            preprocessor=preprocessor,
            example_id_key=example_id_key,
            example_ids_count=example_ids_count,
            aggregation_type=aggregation_type,
            class_weights=class_weights,
            fractional_labels=fractional_labels)
    # Either branch exposes its output under the last key of its last
    # computation.
    metric_key = computations[-1].keys[-1]

    def result(
        metrics: Dict[metric_types.MetricKey, Any]
    ) -> Dict[metric_types.MetricKey, Matrices]:
        """Returns binary confusion matrices."""
        if not use_histogram:
            return {
                key: _matrix_to_binary_confusion_matrices(
                    thresholds, metrics[metric_key])
            }

        if len(thresholds) == 1 and thresholds[0] < 0:
            # This case is used when all positive prediction values are
            # relevant matches (e.g. when calculating top_k for
            # precision/recall where the non-top_k values are expected to
            # have been set to float('-inf')).
            histogram = metrics[metric_key]
        else:
            # Calibration histogram uses intervals of the form [start, end)
            # where the prediction >= start. The confusion matrices want
            # intervals of the form (start, end] where the prediction >
            # start. Add a small epsilon so that >= checks don't match. This
            # correction shouldn't be needed in practice but allows for
            # correctness in small tests.
            rebin_thresholds = [
                t + _EPSILON if t != 0 else t for t in thresholds
            ]
            if thresholds[0] >= 0:
                # Add -epsilon bucket to account for differences in histogram
                # vs confusion matrix intervals mentioned above. If the
                # epsilon bucket is missing the false negatives and false
                # positives will be 0 for the first threshold.
                rebin_thresholds = [-_EPSILON] + rebin_thresholds
            if thresholds[-1] < 1.0:
                # If the last threshold < 1.0, then add a fence post at 1.0 +
                # epsilon otherwise true negatives and true positives will be
                # overcounted.
                rebin_thresholds = rebin_thresholds + [1.0 + _EPSILON]
            histogram = calibration_histogram.rebin(
                rebin_thresholds, metrics[metric_key])
        return {
            key: _historgram_to_binary_confusion_matrices(
                thresholds, histogram)
        }

    derived_computation = metric_types.DerivedMetricComputation(
        keys=[key], result=result)
    computations.append(derived_computation)
    return computations
def binary_confusion_matrices(
    num_thresholds: Optional[int] = None,
    thresholds: Optional[List[float]] = None,
    name: Text = BINARY_CONFUSION_MATRICES_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
  """Returns metric computations for computing binary confusion matrices.

  The matrices are derived from a calibration histogram that is rebinned at
  the requested thresholds.

  Args:
    num_thresholds: Number of thresholds to use. Thresholds will be calculated
      using linear interpolation between 0.0 and 1.0 with equidistant values
      and boundaries at -epsilon and 1.0+epsilon. Values must be > 0. Only one
      of num_thresholds or thresholds should be used. If neither is given,
      DEFAULT_NUM_THRESHOLDS is used.
    thresholds: A specific set of thresholds to use. The caller is responsible
      for marking the boundaries with +/-epsilon if desired. Only one of
      num_thresholds or thresholds should be used.
    name: Metric name.
    eval_config: Eval config.
    model_name: Optional model name (if multi-model evaluation).
    output_name: Optional output name (if multi-output model type).
    sub_key: Optional sub key.
    class_weights: Optional class weights to apply to multi-class / multi-label
      labels and predictions prior to flattening (when micro averaging is
      used).

  Returns:
    The calibration histogram computations followed by a derived computation
    that maps the histogram to binary confusion matrices stored under a
    MetricKey built from name/model_name/output_name/sub_key.

  Raises:
    ValueError: If both num_thresholds and thresholds are set at the same time.
  """
  key = metric_types.MetricKey(
      name=name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key)

  if num_thresholds is not None:
    if thresholds is not None:
      raise ValueError(
          'only one of thresholds or num_thresholds can be set at a time')
  elif thresholds is None:
    # Neither argument was supplied: fall back to the default resolution.
    num_thresholds = DEFAULT_NUM_THRESHOLDS

  if num_thresholds is not None:
    # The interpolation strategy used here matches that used by keras for AUC:
    # equidistant interior points fenced by -epsilon and 1.0 + epsilon.
    interval_count = num_thresholds - 1
    thresholds = [-_EPSILON]
    thresholds.extend(
        (i + 1) * 1.0 / interval_count for i in range(num_thresholds - 2))
    thresholds.append(1.0 + _EPSILON)

  # Use calibration histogram to calculate matrices. For efficiency (unless all
  # predictions are matched - i.e. thresholds <= 0) we will assume that other
  # metrics will make use of the calibration histogram and re-use the default
  # histogram for the given model_name/output_name/sub_key. This is also
  # required to get accurate counts at the threshold boundaries. If this becomes
  # an issue, then calibration histogram can be updated to support non-linear
  # boundaries.
  if len(thresholds) == 1 and thresholds[0] <= 0:
    num_buckets = 1
  else:
    num_buckets = None
  computations = calibration_histogram.calibration_histogram(
      eval_config=eval_config,
      num_buckets=num_buckets,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      class_weights=class_weights)
  histogram_key = computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Matrices]:
    """Returns binary confusion matrices."""
    # Calibration histogram uses intervals of the form [start, end) where the
    # prediction >= start. The confusion matrices want intervals of the form
    # (start, end] where the prediction > start. Add a small epsilon so that >=
    # checks don't match. This correction shouldn't be needed in practice but
    # allows for correctness in small tests.
    single_threshold = len(thresholds) == 1
    if not single_threshold:
      rebin_thresholds = [thresholds[0]]
      rebin_thresholds.extend(t + _EPSILON for t in thresholds[1:])
    elif thresholds[0] < 0:
      # With only one threshold the boundaries around it have to be adjusted
      # for the <, >= comparisons. This case is used when all prediction values
      # are considered matches (e.g. when calculating top_k for
      # precision/recall).
      only = thresholds[0]
      rebin_thresholds = [only, only + _EPSILON]
    else:
      # This case is used for a single threshold within [0, 1] (e.g. 0.5).
      rebin_thresholds = [-_EPSILON, thresholds[0] + _EPSILON, 1.0 + _EPSILON]

    rebinned = calibration_histogram.rebin(rebin_thresholds,
                                           metrics[histogram_key])
    matrices = _to_binary_confusion_matrices(thresholds, rebinned)
    if single_threshold:
      # Reset back to 1 bucket: only the entry at index 1 is kept.
      matrices = Matrices(
          thresholds,
          tp=[matrices.tp[1]],
          fp=[matrices.fp[1]],
          tn=[matrices.tn[1]],
          fn=[matrices.fn[1]])
    return {key: matrices}

  computations.append(
      metric_types.DerivedMetricComputation(keys=[key], result=result))
  return computations
def flip_count(
    counterfactual_prediction_key: Optional[str] = None,
    example_id_key: Optional[str] = None,
    example_ids_count: int = DEFAULT_NUM_EXAMPLE_IDS,
    name: str = FLIP_COUNT_NAME,
    thresholds: Sequence[float] = DEFAULT_THRESHOLDS,
    model_name: str = '',
    output_name: str = '',
    eval_config: Optional[config_pb2.EvalConfig] = None,
    example_weighted: bool = False) -> metric_types.MetricComputations:
  """Returns metric computations for computing flip counts.

  A binary confusion matrix is computed with the original prediction in the
  label slot and the counterfactual prediction in the prediction slot. False
  negatives are reported as positive-to-negative flips and false positives as
  negative-to-positive flips.

  Args:
    counterfactual_prediction_key: Optional feature key holding the
      counterfactual prediction. If it is None or not present in the features,
      the prediction of the baseline model listed in eval_config is used.
    example_id_key: Optional feature key. When set it is added to the features
      requested from the preprocessor and forwarded to the binary confusion
      matrices computation.
    example_ids_count: Forwarded to the binary confusion matrices computation.
    name: Metric name.
    thresholds: Thresholds at which the flip counts are computed.
    model_name: Optional model name (if multi-model evaluation).
    output_name: Optional output name (if multi-output model type).
    eval_config: Eval config.
    example_weighted: True if example weights should be applied.

  Returns:
    The binary confusion matrices computations followed by a derived
    computation that produces the flip count metrics for every threshold.
  """
  keys, metric_key_by_name_by_threshold = create_metric_keys(
      thresholds, METRICS_LIST, name, model_name, output_name, example_weighted)

  feature_keys = [counterfactual_prediction_key]
  if example_id_key:
    feature_keys.append(example_id_key)

  def extract_label_prediction_and_weight(
      inputs: metric_types.StandardMetricInputs,
      eval_config: Optional[config_pb2.EvalConfig] = None,
      model_name: str = '',
      output_name: str = '',
      sub_key: Optional[metric_types.SubKey] = None,
      aggregation_type: Optional[metric_types.AggregationType] = None,
      class_weights: Optional[Dict[int, float]] = None,
      example_weighted: bool = False,
      fractional_labels: bool = False,
      flatten: bool = True,
  ) -> Iterator[Tuple[np.ndarray, np.ndarray, np.ndarray]]:
    """Yields label, prediction, and example weights to be used in calculations.

    This function is a customized
    metric_util.to_label_prediction_example_weight function which yields the
    original prediction as label and the counterfactual prediction as
    prediction, so that flip count metrics can be derived from the false
    positives and false negatives of the binary confusion matrix.

    Args:
      inputs: Standard metric inputs.
      eval_config: Eval config.
      model_name: Optional model name (if multi-model evaluation).
      output_name: Optional output name (if multi-output model type).
      sub_key: Optional sub key. (unused)
      aggregation_type: Optional aggregation type. (unused)
      class_weights: Optional class weights to apply to multi-class /
        multi-label labels and predictions. (unused)
      example_weighted: True if example weights should be applied.
      fractional_labels: If true, each incoming tuple of (label, prediction,
        example weight) will be split into two tuples as follows (where l, p, w
        represent the resulting label, prediction, and example weight values):
          (1) l = 0.0, p = prediction, and w = example_weight * (1.0 - label)
          (2) l = 1.0, p = prediction, and w = example_weight * label
        If enabled, an exception will be raised if labels are not within
        [0, 1]. The implementation is such that tuples associated with a weight
        of zero are not yielded. This means it is safe to enable
        fractional_labels even when the labels only take on the values of 0.0
        or 1.0. (unused)
      flatten: True to flatten the final label and prediction outputs so that
        the yielded values are always arrays of size 1. For example,
        multi-class / multi-label outputs would be converted into label and
        prediction pairs that could then be processed by a binary
        classification metric in order to compute a micro average over all
        classes. (unused)

    Yields:
      Tuple of (label, prediction, example_weight), each an array of size 1.

    Raises:
      ValueError: If the counterfactual prediction is found neither as a
        feature nor as the prediction of a baseline model.
      ValueError: If the counterfactual prediction is None.
      ValueError: If prediction and counterfactual prediction differ in size.
      ValueError: If the prediction is empty.
    """
    del (sub_key, aggregation_type, class_weights, fractional_labels,
         flatten)  # unused

    # TODO(sokeefe): Look into removing the options to pass counterfactual
    # predictions in a feature and instead as a baseline model.
    if (counterfactual_prediction_key is not None and
        counterfactual_prediction_key in inputs.features):
      counterfactual_prediction = inputs.features[
          counterfactual_prediction_key]
    elif eval_config is not None:
      counterfactual_model_spec = model_util.get_baseline_model_spec(
          eval_config)
      if counterfactual_model_spec is not None:
        _, counterfactual_prediction, _ = next(
            metric_util.to_label_prediction_example_weight(
                inputs,
                eval_config=eval_config,
                model_name=counterfactual_model_spec.name,
                output_name=output_name,
                example_weighted=example_weighted,
                fractional_labels=False,  # Labels are ignored for flip counts.
                flatten=False,  # Flattened below
                allow_none=True,  # Allow None labels
                require_single_example_weight=True))
      else:
        raise ValueError(
            'The Counterfactual model must be listed with '
            f'`is_baseline` equal to `True`. Found: {eval_config}')
    else:
      raise ValueError(
          '`counterfactual_prediction` was not found within the provided '
          'inputs. It must be included as either a feature key or within the '
          'predictions. Found:\n'
          f'`counterfactual_prediction_key`: {counterfactual_prediction_key}\n'
          f'`inputs.prediction`:{inputs.prediction}')

    if counterfactual_prediction is None:
      raise ValueError(
          '%s feature key is None (required for FlipCount metric)' %
          counterfactual_prediction_key)

    def get_by_keys(value: Any, keys: List[str]) -> Any:
      """Returns the value nested under keys, or value itself if not found."""
      if isinstance(value, dict):
        new_value = util.get_by_keys(value, keys, optional=True)
        if new_value is not None:
          return new_value
      return value

    # The counterfactual prediction may be keyed by model and/or output name.
    if model_name:
      counterfactual_prediction = get_by_keys(counterfactual_prediction,
                                              [model_name])
    if output_name:
      counterfactual_prediction = get_by_keys(counterfactual_prediction,
                                              [output_name])

    _, prediction, example_weight = next(
        metric_util.to_label_prediction_example_weight(
            inputs,
            eval_config=eval_config,
            model_name=model_name,
            output_name=output_name,
            example_weighted=example_weighted,
            fractional_labels=False,  # Labels are ignored for flip counts.
            flatten=False,  # Flattened below
            allow_none=True,  # Allow None labels
            require_single_example_weight=True))

    if prediction.size != counterfactual_prediction.size:
      # Sizes are integers, so format them with %d rather than %f.
      raise ValueError(
          'prediction and counterfactual_prediction size should be same for '
          'FlipCount metric, %d != %d' %
          (prediction.size, counterfactual_prediction.size))

    if prediction.size == 0:
      raise ValueError('prediction is empty (required for FlipCount metric)')

    # Always flatten. The single example weight is repeated once per flattened
    # prediction value. Sizing by prediction.size (instead of the last
    # dimension) keeps the weights aligned with prediction.flatten() for
    # predictions of any rank; otherwise zip() would silently drop values.
    weights = np.full(prediction.size, float(example_weight))
    for p, cfp, w in zip(prediction.flatten(),
                         counterfactual_prediction.flatten(), weights):
      yield np.array([p]), np.array([cfp]), np.array([w])

  # Setting fractional label to false, since prediction is being used as label
  # and it can be a non-binary value.
  computations = binary_confusion_matrices.binary_confusion_matrices(
      thresholds=list(thresholds),
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      example_weighted=example_weighted,
      extract_label_prediction_and_weight=extract_label_prediction_and_weight,
      preprocessor=metric_types.FeaturePreprocessor(
          feature_keys=feature_keys),
      example_id_key=example_id_key,
      example_ids_count=example_ids_count,
      fractional_labels=False)
  examples_metric_key, matrices_metric_key = computations[-1].keys

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Any]:
    """Returns flip count metrics values."""
    matrix = metrics[matrices_metric_key]
    examples = metrics[examples_metric_key]

    output = {}
    for i, threshold in enumerate(matrix.thresholds):
      key_by_name = metric_key_by_name_by_threshold[threshold]
      # The original prediction acts as the label, so a false negative is a
      # positive that flipped to negative and a false positive the reverse.
      output[key_by_name['positive_to_negative']] = matrix.fn[i]
      output[key_by_name['negative_to_positive']] = matrix.fp[i]
      output[key_by_name['positive_to_negative_examples_ids']] = np.array(
          examples.fn_examples[i])
      output[key_by_name['negative_to_positive_examples_ids']] = np.array(
          examples.fp_examples[i])
      output[key_by_name['positive_examples_count']] = (
          matrix.fn[i] + matrix.tp[i])
      output[key_by_name['negative_examples_count']] = (
          matrix.fp[i] + matrix.tn[i])

    return output

  derived_computation = metric_types.DerivedMetricComputation(
      keys=keys, result=result)
  computations.append(derived_computation)
  return computations