def testBinaryConfusionMatrices(self, kwargs, expected_matrices):
  computations = binary_confusion_matrices.binary_confusion_matrices(**kwargs)
  histogram = computations[0]
  matrices = computations[1]

  example1 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.0]),
      'example_weights': np.array([1.0])
  }
  example2 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.5]),
      'example_weights': np.array([1.0])
  }
  example3 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.3]),
      'example_weights': np.array([1.0])
  }
  example4 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.9]),
      'example_weights': np.array([1.0])
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2, example3, example4])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
        | 'ComputeMatrices' >> beam.Map(
            lambda x: (x[0], matrices.result(x[1]))))  # pyformat: disable
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        self.assertLen(got_metrics, 1)
        name = '_binary_confusion_matrices_{}'.format(
            kwargs['num_thresholds']
            if 'num_thresholds' in kwargs else kwargs['thresholds'])
        key = metric_types.MetricKey(name=name)
        self.assertIn(key, got_metrics)
        got_matrices = got_metrics[key]
        self.assertEqual(got_matrices, expected_matrices)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')

def _confusion_matrix_at_thresholds(
    thresholds: List[float],
    name: Text = CONFUSION_MATRIX_AT_THRESHOLDS_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None
) -> metric_types.MetricComputations:
  """Returns metric computations for confusion matrix at thresholds."""
  key = metric_types.MetricKey(
      name=name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key)

  # Make sure matrices are calculated.
  matrices_computations = binary_confusion_matrices.binary_confusion_matrices(
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      thresholds=thresholds)
  matrices_key = matrices_computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey,
                    metrics_for_slice_pb2.ConfusionMatrixAtThresholds]
  ) -> Dict[metric_types.MetricKey, Any]:
    return {key: to_proto(thresholds, metrics[matrices_key])}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations = matrices_computations
  computations.append(derived_computation)
  return computations

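# Hedged sketch (not TFMA code) of the derived-computation pattern used
# above: a dict keyed by metric key is threaded through the computation
# chain, and the derived `result` fn only reads the shared matrices entry.
# `FakeMatrices` and `derive_accuracy` below are illustrative stand-ins for
# the real types, with assumed counts.
import collections

FakeMatrices = collections.namedtuple('FakeMatrices',
                                      ['thresholds', 'tp', 'fp', 'tn', 'fn'])


def derive_accuracy(metrics, matrices_key, out_key):
  """Derives accuracy-at-threshold from shared confusion matrix counts."""
  m = metrics[matrices_key]
  return {
      out_key: [(m.tp[i] + m.tn[i]) /
                (m.tp[i] + m.tn[i] + m.fp[i] + m.fn[i])
                for i in range(len(m.thresholds))]
  }


shared = {
    'matrices': FakeMatrices([0.5], tp=[2.0], fp=[1.0], tn=[3.0], fn=[2.0])
}
assert derive_accuracy(shared, 'matrices', 'accuracy') == {'accuracy': [0.625]}
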
def _confusion_matrix_plot(
    num_thresholds: int = DEFAULT_NUM_THRESHOLDS,
    name: Text = CONFUSION_MATRIX_PLOT_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
  """Returns metric computations for confusion matrix plots."""
  key = metric_types.PlotKey(
      name=name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key)

  # The interpolation strategy used here matches how the legacy post export
  # metrics calculated their plots.
  thresholds = [i * 1.0 / num_thresholds for i in range(0, num_thresholds + 1)]
  thresholds = [-1e-6] + thresholds

  # Make sure matrices are calculated.
  matrices_computations = binary_confusion_matrices.binary_confusion_matrices(
      # Use a custom name since we have a custom interpolation strategy which
      # will cause the default naming used by the binary confusion matrix to
      # be very long.
      name=(binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME + '_' +
            name),
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      aggregation_type=aggregation_type,
      class_weights=class_weights,
      thresholds=thresholds)
  matrices_key = matrices_computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey,
            metrics_for_slice_pb2.ConfusionMatrixAtThresholds]:
    return {
        key:
            confusion_matrix_metrics.to_proto(thresholds,
                                              metrics[matrices_key])
    }

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations = matrices_computations
  computations.append(derived_computation)
  return computations

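# For concreteness, the threshold list built above for an assumed
# num_thresholds=4 is [-1e-06, 0.0, 0.25, 0.5, 0.75, 1.0]: evenly spaced
# cutoffs plus one slightly-negative entry so that predictions exactly equal
# to 0.0 still land above the first threshold.
num_thresholds = 4
thresholds = [-1e-6] + [i * 1.0 / num_thresholds
                        for i in range(0, num_thresholds + 1)]
assert thresholds == [-1e-6, 0.0, 0.25, 0.5, 0.75, 1.0]
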
def _metric_computation(
    self,
    thresholds: Optional[List[float]] = None,
    name: Text = '',
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
  """Returns metric computations for specificity."""
  key = metric_types.MetricKey(
      name=name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key)

  if not thresholds:
    thresholds = [0.5]

  # Make sure matrices are calculated.
  matrices_computations = binary_confusion_matrices.binary_confusion_matrices(
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      aggregation_type=aggregation_type,
      class_weights=class_weights,
      thresholds=thresholds)
  matrices_key = matrices_computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Union[float, np.ndarray]]:
    matrices = metrics[matrices_key]
    values = []
    for i in range(len(thresholds)):
      values.append(
          self.result(matrices.tp[i], matrices.tn[i], matrices.fp[i],
                      matrices.fn[i]))
    return {key: values[0] if len(thresholds) == 1 else np.array(values)}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations = matrices_computations
  computations.append(derived_computation)
  return computations

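# For reference, `self.result` for specificity computes tn / (tn + fp). A
# standalone, hedged check with assumed counts (`_specificity` below is
# illustrative, not the library implementation):
def _specificity(tp: float, tn: float, fp: float, fn: float) -> float:
  del tp, fn  # Specificity depends only on the negative-class counts.
  denominator = tn + fp
  return tn / denominator if denominator > 0 else float('nan')


assert _specificity(tp=1.0, tn=3.0, fp=1.0, fn=2.0) == 0.75
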
def _auc_plot(
    num_thresholds: int = DEFAULT_NUM_THRESHOLDS,
    name: Text = AUC_PLOT_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
  """Returns metric computations for AUC plots."""
  key = metric_types.PlotKey(
      name=name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key)

  # The interpolation strategy used here matches how the legacy post export
  # metrics calculated their plots.
  thresholds = [i * 1.0 / num_thresholds for i in range(0, num_thresholds + 1)]
  thresholds = [-1e-6] + thresholds

  # Make sure matrices are calculated.
  matrices_computations = binary_confusion_matrices.binary_confusion_matrices(
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      class_weights=class_weights,
      thresholds=thresholds)
  matrices_key = matrices_computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey,
            metrics_for_slice_pb2.ConfusionMatrixAtThresholds]:
    return {
        key:
            confusion_matrix_at_thresholds.to_proto(thresholds,
                                                    metrics[matrices_key])
    }

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations = matrices_computations
  computations.append(derived_computation)
  return computations

def _wrap_confusion_matrix_metric(
    metric: tf.keras.metrics.Metric,
    eval_config: config_pb2.EvalConfig,
    model_name: Text,
    output_name: Text,
    sub_key: Optional[metric_types.SubKey],
    aggregation_type: Optional[metric_types.AggregationType],
    class_weights: Optional[Dict[int,
                                 float]]) -> metric_types.MetricComputations:
  """Returns confusion matrix metric wrapped in a more efficient computation."""
  # Special handling for AUC metric which supports aggregation inherently via
  # multi_label flag.
  if (isinstance(metric, tf.keras.metrics.AUC) and
      hasattr(metric, 'label_weights')):
    if metric.label_weights:
      if class_weights:
        raise ValueError(
            'class weights are configured in two different places: (1) via the '
            'tf.keras.metrics.AUC class (using "label_weights") and (2) via '
            'the MetricsSpecs (using "aggregate.class_weights"). Either remove '
            'the label_weights settings in the AUC class or remove the '
            'class_weights from the AggregationOptions: metric={}, '
            'class_weights={}'.format(metric, class_weights))
      class_weights = {i: v for i, v in enumerate(metric.label_weights)}
    if metric.multi_label:
      raise NotImplementedError('AUC.multi_label=True is not implemented yet.')

  sub_key = _verify_and_update_sub_key(model_name, output_name, sub_key,
                                       metric)
  key = metric_types.MetricKey(
      name=metric.name,
      model_name=model_name,
      output_name=output_name,
      aggregation_type=aggregation_type,
      sub_key=sub_key)

  metric_config = tf.keras.metrics.serialize(metric)

  thresholds = None
  num_thresholds = None
  # The top_k metrics have special settings. If we are setting the top_k value
  # outside of keras (i.e. using BinarizeOptions), then we need to set the
  # special threshold ourselves otherwise the default threshold of 0.5 is
  # used.
  if (sub_key and sub_key.top_k is not None and
      _get_config_value(_TOP_K_KEY, metric_config) is None and
      _get_config_value(_THRESHOLDS_KEY, metric_config) is None and
      _get_config_value(_NUM_THRESHOLDS_KEY, metric_config) is None):
    thresholds = [float('-inf')]
  elif hasattr(metric, _THRESHOLDS_KEY):
    thresholds = metric.thresholds
  # Only one of either thresholds or num_thresholds should be used. Keras AUC
  # allows both but thresholds has more precedence.
  if thresholds is None and hasattr(metric, _NUM_THRESHOLDS_KEY):
    num_thresholds = metric.num_thresholds

  # Make sure matrices are calculated.
  computations = binary_confusion_matrices.binary_confusion_matrices(
      num_thresholds=num_thresholds,
      thresholds=thresholds,
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      aggregation_type=aggregation_type,
      class_weights=class_weights)
  matrices_key = computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Any]:
    """Returns result derived from binary confusion matrices."""
    matrices = metrics[matrices_key]

    metric = tf.keras.metrics.deserialize(metric_config)
    if (isinstance(metric, tf.keras.metrics.AUC) or
        isinstance(metric, tf.keras.metrics.SpecificityAtSensitivity) or
        isinstance(metric, tf.keras.metrics.SensitivityAtSpecificity)):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.true_negatives.assign(np.array(matrices.tn))
      metric.false_positives.assign(np.array(matrices.fp))
      metric.false_negatives.assign(np.array(matrices.fn))
    elif isinstance(metric, tf.keras.metrics.Precision):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.false_positives.assign(np.array(matrices.fp))
    elif isinstance(metric, tf.keras.metrics.Recall):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.false_negatives.assign(np.array(matrices.fn))
    elif isinstance(metric, tf.keras.metrics.TruePositives):
      metric.accumulator.assign(np.array(matrices.tp))
    elif isinstance(metric, tf.keras.metrics.FalsePositives):
      metric.accumulator.assign(np.array(matrices.fp))
    elif isinstance(metric, tf.keras.metrics.TrueNegatives):
      metric.accumulator.assign(np.array(matrices.tn))
    elif isinstance(metric, tf.keras.metrics.FalseNegatives):
      metric.accumulator.assign(np.array(matrices.fn))
    return {key: metric.result().numpy()}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations.append(derived_computation)
  return computations

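# The state-assignment trick above can be reproduced with public Keras APIs
# alone: Precision keeps one `true_positives`/`false_positives` slot per
# threshold, so assigning precomputed counts and calling result() yields the
# same value as replaying every example. The counts below are assumed for
# illustration.
import numpy as np
import tensorflow as tf

precision = tf.keras.metrics.Precision(thresholds=[0.25, 0.5, 0.75])
precision.true_positives.assign(np.array([2.0, 1.0, 1.0]))
precision.false_positives.assign(np.array([2.0, 1.0, 0.0]))
print(precision.result().numpy())  # [0.5 0.5 1. ]
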
def _fairness_indicators_metrics_at_thresholds(
    thresholds: List[float],
    name: Text = FAIRNESS_INDICATORS_METRICS_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
  """Returns computations for fairness metrics at thresholds."""
  metric_key_by_name_by_threshold = collections.defaultdict(dict)
  keys = []
  digits_num = calculate_digits(thresholds)
  for t in thresholds:
    for m in FAIRNESS_INDICATORS_SUB_METRICS:
      key = metric_types.MetricKey(
          # e.g. "fairness_indicators_metrics/false_positive_rate@0.30"
          name='%s/%s@%.*f' % (name, m, digits_num, t),
          model_name=model_name,
          output_name=output_name,
          sub_key=sub_key)
      keys.append(key)
      metric_key_by_name_by_threshold[t][m] = key

  # Make sure matrices are calculated.
  computations = binary_confusion_matrices.binary_confusion_matrices(
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      class_weights=class_weights,
      thresholds=thresholds)
  confusion_matrices_key = computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Any]:
    """Returns fairness metrics values."""
    metric = metrics[confusion_matrices_key]
    output = {}
    for i, threshold in enumerate(thresholds):
      num_positives = metric.tp[i] + metric.fn[i]
      num_negatives = metric.tn[i] + metric.fp[i]

      tpr = metric.tp[i] / (num_positives or float('nan'))
      tnr = metric.tn[i] / (num_negatives or float('nan'))
      fpr = metric.fp[i] / (num_negatives or float('nan'))
      fnr = metric.fn[i] / (num_positives or float('nan'))
      pr = (metric.tp[i] + metric.fp[i]) / (
          (num_positives + num_negatives) or float('nan'))
      nr = (metric.tn[i] + metric.fn[i]) / (
          (num_positives + num_negatives) or float('nan'))
      fdr = metric.fp[i] / ((metric.fp[i] + metric.tp[i]) or float('nan'))
      fomr = metric.fn[i] / ((metric.fn[i] + metric.tn[i]) or float('nan'))

      output[metric_key_by_name_by_threshold[threshold]
             ['false_positive_rate']] = fpr
      output[metric_key_by_name_by_threshold[threshold]
             ['false_negative_rate']] = fnr
      output[metric_key_by_name_by_threshold[threshold]
             ['true_positive_rate']] = tpr
      output[metric_key_by_name_by_threshold[threshold]
             ['true_negative_rate']] = tnr
      output[metric_key_by_name_by_threshold[threshold]['positive_rate']] = pr
      output[metric_key_by_name_by_threshold[threshold]['negative_rate']] = nr
      output[metric_key_by_name_by_threshold[threshold]
             ['false_discovery_rate']] = fdr
      output[metric_key_by_name_by_threshold[threshold]
             ['false_omission_rate']] = fomr
    return output

  derived_computation = metric_types.DerivedMetricComputation(
      keys=keys, result=result)
  computations.append(derived_computation)
  return computations

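# Worked example of the rate definitions above on a single assumed matrix
# (tp=30, fn=10, fp=20, tn=40). Note the `(x or float('nan'))` idiom: a zero
# denominator is falsy, so the division yields NaN rather than raising
# ZeroDivisionError.
tp, fn, fp, tn = 30.0, 10.0, 20.0, 40.0
num_positives = tp + fn  # 40.0
num_negatives = tn + fp  # 60.0
tpr = tp / (num_positives or float('nan'))  # 0.75
fpr = fp / (num_negatives or float('nan'))  # 0.333...
pr = (tp + fp) / ((num_positives + num_negatives) or float('nan'))  # 0.5
fdr = fp / ((fp + tp) or float('nan'))  # 0.4
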
def _wrap_confusion_matrix_metric(
    metric: tf.keras.metrics.Metric,
    eval_config: config.EvalConfig,
    model_name: Text,
    output_name: Text,
    sub_key: Optional[metric_types.SubKey],
    class_weights: Optional[Dict[int,
                                 float]]) -> metric_types.MetricComputations:
  """Returns confusion matrix metric wrapped in a more efficient computation."""
  # Special handling for AUC metric which supports aggregation inherently via
  # multi_label flag.
  if (isinstance(metric, tf.keras.metrics.AUC) and
      hasattr(metric, 'label_weights')):
    if metric.label_weights:
      if class_weights:
        raise ValueError(
            'class weights are configured in two different places: (1) via the '
            'tf.keras.metrics.AUC class (using "label_weights") and (2) via '
            'the MetricsSpecs (using "aggregate.class_weights"). Either remove '
            'the label_weights settings in the AUC class or remove the '
            'class_weights from the AggregationOptions: metric={}, '
            'class_weights={}'.format(metric, class_weights))
      class_weights = {i: v for i, v in enumerate(metric.label_weights)}
    if metric.multi_label:
      raise NotImplementedError('AUC.multi_label=True is not implemented yet.')

  sub_key = _verify_and_update_sub_key(model_name, output_name, sub_key,
                                       metric)
  key = metric_types.MetricKey(
      name=metric.name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key)

  metric_config = tf.keras.metrics.serialize(metric)

  thresholds = None
  num_thresholds = None
  if hasattr(metric, _THRESHOLDS_KEY):
    if (len(metric.thresholds) ==
        binary_confusion_matrices.DEFAULT_NUM_THRESHOLDS):
      num_thresholds = binary_confusion_matrices.DEFAULT_NUM_THRESHOLDS
    else:
      thresholds = metric.thresholds
  # Only one of either thresholds or num_thresholds should be used. Keras AUC
  # allows both but thresholds has more precedence.
  if thresholds is None and hasattr(metric, _NUM_THRESHOLDS_KEY):
    num_thresholds = metric.num_thresholds

  # By default use separate computations for the confusion matrices since the
  # metrics might be using different thresholds (note, the underlying
  # histogram the confusion matrices are based on will still only be
  # calculated once).
  if (num_thresholds is not None and
      num_thresholds == binary_confusion_matrices.DEFAULT_NUM_THRESHOLDS):
    name = binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME
  else:
    name = '_{}{}'.format(
        metric.name, binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME)

  # Make sure matrices are calculated. Note that the use of class_weights here
  # implies that micro averaging is being performed.
  computations = binary_confusion_matrices.binary_confusion_matrices(
      num_thresholds=num_thresholds,
      thresholds=thresholds,
      name=name,
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      class_weights=class_weights)
  matrices_key = computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Any]:
    """Returns AUC derived from binary confusion matrices."""
    matrices = metrics[matrices_key]

    metric = tf.keras.metrics.deserialize(metric_config)
    if (isinstance(metric, tf.keras.metrics.AUC) or
        isinstance(metric, tf.keras.metrics.SpecificityAtSensitivity) or
        isinstance(metric, tf.keras.metrics.SensitivityAtSpecificity)):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.true_negatives.assign(np.array(matrices.tn))
      metric.false_positives.assign(np.array(matrices.fp))
      metric.false_negatives.assign(np.array(matrices.fn))
    elif isinstance(metric, tf.keras.metrics.Precision):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.false_positives.assign(np.array(matrices.fp))
    elif isinstance(metric, tf.keras.metrics.Recall):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.false_negatives.assign(np.array(matrices.fn))
    elif isinstance(metric, tf.keras.metrics.TruePositives):
      metric.accumulator.assign(np.array(matrices.tp))
    elif isinstance(metric, tf.keras.metrics.FalsePositives):
      metric.accumulator.assign(np.array(matrices.fp))
    elif isinstance(metric, tf.keras.metrics.TrueNegatives):
      metric.accumulator.assign(np.array(matrices.tn))
    elif isinstance(metric, tf.keras.metrics.FalseNegatives):
      metric.accumulator.assign(np.array(matrices.fn))
    return {key: metric.result().numpy()}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations.append(derived_computation)
  return computations

def _wrap_confusion_matrix_metric(
    metric: tf.keras.metrics.Metric, model_name: Text, output_name: Text,
    sub_key: Optional[metric_types.SubKey],
    class_weights: Optional[Dict[int,
                                 float]]) -> metric_types.MetricComputations:
  """Returns confusion matrix metric wrapped in a more efficient computation."""
  sub_key = _verify_and_update_sub_key(model_name, output_name, sub_key,
                                       metric)
  key = metric_types.MetricKey(
      name=metric.name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key)

  metric_config = tf.keras.metrics.serialize(metric)

  # By default use separate computations for the confusion matrices since the
  # metrics might be using different thresholds (note, the underlying
  # histogram the confusion matrices are based on will still only be
  # calculated once).
  name = '_{}{}'.format(
      metric.name, binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME)
  thresholds = None
  if hasattr(metric, _THRESHOLDS_KEY):
    thresholds = metric.thresholds
  num_thresholds = None
  if hasattr(metric, _NUM_THRESHOLDS_KEY):
    num_thresholds = metric.num_thresholds
  # Increase the default number of thresholds if keras defaults were used
  # (this also allows us to share the computation with other confusion based
  # metrics).
  if (num_thresholds == _DEFAULT_NUM_THRESHOLDS_IN_KERAS and
      _CONFIG_KEY in metric_config and
      _NUM_THRESHOLDS_KEY in metric_config[_CONFIG_KEY]):
    name = binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME
    num_thresholds = binary_confusion_matrices.DEFAULT_NUM_THRESHOLDS
    metric_config[_CONFIG_KEY][_NUM_THRESHOLDS_KEY] = num_thresholds
    thresholds = None
    if _THRESHOLDS_KEY in metric_config[_CONFIG_KEY]:
      metric_config[_CONFIG_KEY][_THRESHOLDS_KEY] = None
  # Only one of either thresholds or num_thresholds should be used. Keras AUC
  # allows both but thresholds has more precedence.
  if thresholds is not None and num_thresholds is not None:
    num_thresholds = None

  # Make sure matrices are calculated.
  computations = binary_confusion_matrices.binary_confusion_matrices(
      num_thresholds=num_thresholds,
      thresholds=thresholds,
      name=name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      class_weights=class_weights)
  matrices_key = computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Any]:
    """Returns AUC derived from binary confusion matrices."""
    matrices = metrics[matrices_key]

    metric = tf.keras.metrics.deserialize(metric_config)
    if (isinstance(metric, tf.keras.metrics.AUC) or
        isinstance(metric, tf.keras.metrics.SpecificityAtSensitivity) or
        isinstance(metric, tf.keras.metrics.SensitivityAtSpecificity)):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.true_negatives.assign(np.array(matrices.tn))
      metric.false_positives.assign(np.array(matrices.fp))
      metric.false_negatives.assign(np.array(matrices.fn))
    elif isinstance(metric, tf.keras.metrics.Precision):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.false_positives.assign(np.array(matrices.fp))
    elif isinstance(metric, tf.keras.metrics.Recall):
      metric.true_positives.assign(np.array(matrices.tp))
      metric.false_negatives.assign(np.array(matrices.fn))
    elif isinstance(metric, tf.keras.metrics.TruePositives):
      metric.accumulator.assign(np.array(matrices.tp))
    elif isinstance(metric, tf.keras.metrics.FalsePositives):
      metric.accumulator.assign(np.array(matrices.fp))
    elif isinstance(metric, tf.keras.metrics.TrueNegatives):
      metric.accumulator.assign(np.array(matrices.tn))
    elif isinstance(metric, tf.keras.metrics.FalseNegatives):
      metric.accumulator.assign(np.array(matrices.fn))
    return {key: metric.result().numpy()}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations.append(derived_computation)
  return computations

def testBinaryConfusionMatricesTopK(self):
  computations = binary_confusion_matrices.binary_confusion_matrices(
      thresholds=[float('-inf')],
      sub_key=metric_types.SubKey(top_k=3),
      use_histogram=True)
  histogram = computations[0]
  matrices = computations[1]

  example1 = {
      'labels': np.array([2]),
      'predictions': np.array([0.1, 0.2, 0.1, 0.25, 0.35]),
      'example_weights': np.array([1.0])
  }
  example2 = {
      'labels': np.array([1]),
      'predictions': np.array([0.2, 0.3, 0.05, 0.15, 0.3]),
      'example_weights': np.array([1.0])
  }
  example3 = {
      'labels': np.array([3]),
      'predictions': np.array([0.01, 0.2, 0.09, 0.5, 0.2]),
      'example_weights': np.array([1.0])
  }
  example4 = {
      'labels': np.array([4]),
      'predictions': np.array([0.3, 0.2, 0.05, 0.4, 0.05]),
      'example_weights': np.array([1.0])
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2, example3, example4])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
        | 'ComputeMatrices' >> beam.Map(
            lambda x: (x[0], matrices.result(x[1]))))  # pyformat: disable
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        self.assertLen(got_metrics, 1)
        key = metric_types.MetricKey(
            name='_binary_confusion_matrices_[-inf]',
            sub_key=metric_types.SubKey(top_k=3))
        self.assertIn(key, got_metrics)
        got_matrices = got_metrics[key]
        self.assertEqual(
            got_matrices,
            binary_confusion_matrices.Matrices(
                thresholds=[float('-inf')],
                tp=[2.0],
                fp=[10.0],
                tn=[6.0],
                fn=[2.0],
                tp_examples=[],
                tn_examples=[],
                fp_examples=[],
                fn_examples=[]))
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')

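# Hand-check of the expected counts in the test above (`_top_k_counts` is an
# illustrative helper, not part of the test): with top_k=3 and a -inf
# threshold, each example's three highest-scoring classes count as predicted
# positive and the remaining two as predicted negative.
import numpy as np


def _top_k_counts(label, predictions, k=3):
  top = set(np.argsort(predictions)[-k:])
  tp = int(label in top)
  fn = 1 - tp
  fp = k - tp
  tn = (len(predictions) - k) - fn
  return tp, fp, tn, fn


examples = [(2, [0.1, 0.2, 0.1, 0.25, 0.35]),
            (1, [0.2, 0.3, 0.05, 0.15, 0.3]),
            (3, [0.01, 0.2, 0.09, 0.5, 0.2]),
            (4, [0.3, 0.2, 0.05, 0.4, 0.05])]
totals = np.sum([_top_k_counts(l, np.array(p)) for l, p in examples], axis=0)
print(totals)  # [ 2 10  6  2] -> tp=2.0, fp=10.0, tn=6.0, fn=2.0
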
def flip_count(
    counterfactual_prediction_key: Optional[str] = None,
    example_id_key: Optional[str] = None,
    example_ids_count: int = DEFAULT_NUM_EXAMPLE_IDS,
    name: str = FLIP_COUNT_NAME,
    thresholds: Sequence[float] = DEFAULT_THRESHOLDS,
    model_name: str = '',
    output_name: str = '',
    eval_config: Optional[config_pb2.EvalConfig] = None,
    example_weighted: bool = False) -> metric_types.MetricComputations:
  """Returns metric computations for computing flip counts."""
  keys, metric_key_by_name_by_threshold = create_metric_keys(
      thresholds, METRICS_LIST, name, model_name, output_name,
      example_weighted)

  feature_keys = [counterfactual_prediction_key]
  if example_id_key:
    feature_keys.append(example_id_key)

  def extract_label_prediction_and_weight(
      inputs: metric_types.StandardMetricInputs,
      eval_config: Optional[config_pb2.EvalConfig] = None,
      model_name: str = '',
      output_name: str = '',
      sub_key: Optional[metric_types.SubKey] = None,
      aggregation_type: Optional[metric_types.AggregationType] = None,
      class_weights: Optional[Dict[int, float]] = None,
      example_weighted: bool = False,
      fractional_labels: bool = False,
      flatten: bool = True,
  ) -> Iterator[Tuple[np.ndarray, np.ndarray, np.ndarray]]:
    """Yields label, prediction, and example weights to be used in calculations.

    This function is a customized
    metric_util.to_label_prediction_example_weight function which yields the
    original prediction as the label and the counterfactual prediction as the
    prediction, and derives the flip count metrics from the false positives
    and false negatives of the binary confusion matrix.

    Args:
      inputs: Standard metric inputs.
      eval_config: Eval config.
      model_name: Optional model name (if multi-model evaluation).
      output_name: Optional output name (if multi-output model type).
      sub_key: Optional sub key. (unused)
      aggregation_type: Optional aggregation type. (unused)
      class_weights: Optional class weights to apply to multi-class /
        multi-label labels and predictions. (unused)
      example_weighted: True if example weights should be applied.
      fractional_labels: If true, each incoming tuple of (label, prediction,
        example weight) will be split into two tuples as follows (where l, p,
        w represent the resulting label, prediction, and example weight
        values):
          (1) l = 0.0, p = prediction, and w = example_weight * (1.0 - label)
          (2) l = 1.0, p = prediction, and w = example_weight * label
        If enabled, an exception will be raised if labels are not within
        [0, 1]. The implementation is such that tuples associated with a
        weight of zero are not yielded. This means it is safe to enable
        fractional_labels even when the labels only take on the values of 0.0
        or 1.0. (unused)
      flatten: True to flatten the final label and prediction outputs so that
        the yielded values are always arrays of size 1. For example,
        multi-class / multi-label outputs would be converted into label and
        prediction pairs that could then be processed by a binary
        classification metric in order to compute a micro average over all
        classes. (unused)

    Yields:
      Tuple of (label, prediction, example_weight).

    Raises:
      ValueError: If the counterfactual prediction key is not found within
        either the features or predictions.
      ValueError: If predictions is None or empty.
    """
    del (sub_key, aggregation_type, class_weights, fractional_labels,
         flatten)  # unused

    # TODO(sokeefe): Look into removing the options to pass counterfactual
    # predictions in a feature and instead as a baseline model.
    if (counterfactual_prediction_key is not None and
        counterfactual_prediction_key in inputs.features):
      counterfactual_prediction = inputs.features[
          counterfactual_prediction_key]
    elif eval_config is not None:
      counterfactual_model_spec = model_util.get_baseline_model_spec(
          eval_config)
      if counterfactual_model_spec is not None:
        _, counterfactual_prediction, _ = next(
            metric_util.to_label_prediction_example_weight(
                inputs,
                eval_config=eval_config,
                model_name=counterfactual_model_spec.name,
                output_name=output_name,
                example_weighted=example_weighted,
                fractional_labels=False,  # Labels are ignored for flip counts.
                flatten=False,  # Flattened below
                allow_none=True,  # Allow None labels
                require_single_example_weight=True))
      else:
        raise ValueError(
            'The Counterfactual model must be listed with '
            f'`is_baseline` equal to `True`. Found: {eval_config}')
    else:
      raise ValueError(
          '`counterfactual_prediction` was not found within the provided '
          'inputs. It must be included as either a feature key or within the '
          'predictions. Found:\n'
          f'`counterfactual_prediction_key`: {counterfactual_prediction_key}\n'
          f'`inputs.prediction`: {inputs.prediction}')

    if counterfactual_prediction is None:
      raise ValueError(
          '%s feature key is None (required for FlipCount metric)' %
          counterfactual_prediction_key)

    def get_by_keys(value: Any, keys: List[str]) -> Any:
      if isinstance(value, dict):
        new_value = util.get_by_keys(value, keys, optional=True)
        if new_value is not None:
          return new_value
      return value

    if model_name:
      counterfactual_prediction = get_by_keys(counterfactual_prediction,
                                              [model_name])
    if output_name:
      counterfactual_prediction = get_by_keys(counterfactual_prediction,
                                              [output_name])

    _, prediction, example_weight = next(
        metric_util.to_label_prediction_example_weight(
            inputs,
            eval_config=eval_config,
            model_name=model_name,
            output_name=output_name,
            example_weighted=example_weighted,
            fractional_labels=False,  # Labels are ignored for flip counts.
            flatten=False,  # Flattened below
            allow_none=True,  # Allow None labels
            require_single_example_weight=True))

    if prediction.size != counterfactual_prediction.size:
      raise ValueError(
          'prediction and counterfactual_prediction size should be same for '
          'FlipCount metric, %f != %f' %
          (prediction.size, counterfactual_prediction.size))

    if prediction.size == 0:
      raise ValueError('prediction is empty (required for FlipCount metric)')
    else:  # Always flatten
      example_weight = np.array(
          [float(example_weight) for i in range(prediction.shape[-1])])
      for p, cfp, w in zip(prediction.flatten(),
                           counterfactual_prediction.flatten(),
                           example_weight.flatten()):
        yield np.array([p]), np.array([cfp]), np.array([w])

  # Setting fractional_labels to False, since the prediction is being used as
  # the label and it can be a non-binary value.
  computations = binary_confusion_matrices.binary_confusion_matrices(
      thresholds=list(thresholds),
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      example_weighted=example_weighted,
      extract_label_prediction_and_weight=extract_label_prediction_and_weight,
      preprocessor=metric_types.FeaturePreprocessor(feature_keys=feature_keys),
      example_id_key=example_id_key,
      example_ids_count=example_ids_count,
      fractional_labels=False)
  examples_metric_key, matrices_metric_key = computations[-1].keys

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Any]:
    """Returns flip count metrics values."""
    matrix = metrics[matrices_metric_key]
    examples = metrics[examples_metric_key]

    output = {}
    for i, threshold in enumerate(matrix.thresholds):
      output[metric_key_by_name_by_threshold[threshold]
             ['positive_to_negative']] = matrix.fn[i]
      output[metric_key_by_name_by_threshold[threshold]
             ['negative_to_positive']] = matrix.fp[i]
      output[metric_key_by_name_by_threshold[threshold]
             ['positive_to_negative_examples_ids']] = np.array(
                 examples.fn_examples[i])
      output[metric_key_by_name_by_threshold[threshold]
             ['negative_to_positive_examples_ids']] = np.array(
                 examples.fp_examples[i])
      output[metric_key_by_name_by_threshold[threshold]
             ['positive_examples_count']] = matrix.fn[i] + matrix.tp[i]
      output[metric_key_by_name_by_threshold[threshold]
             ['negative_examples_count']] = matrix.fp[i] + matrix.tn[i]
    return output

  derived_computation = metric_types.DerivedMetricComputation(
      keys=keys, result=result)
  computations.append(derived_computation)
  return computations

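# Intuition check for the key mapping above, with assumed values and a single
# 0.5 threshold: when the original prediction plays the role of "label" and
# the counterfactual prediction the role of "prediction", a false negative is
# exactly a positive-to-negative flip and a false positive a
# negative-to-positive flip.
threshold = 0.5
pairs = [(0.9, 0.2), (0.8, 0.7), (0.1, 0.6), (0.3, 0.2)]  # (original, counterfactual)
pos_to_neg = sum(p > threshold and cf <= threshold for p, cf in pairs)  # fn-like: 1
neg_to_pos = sum(p <= threshold and cf > threshold for p, cf in pairs)  # fp-like: 1
print(pos_to_neg, neg_to_pos)  # 1 1
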