def _calibration_plot(
    num_buckets: int = DEFAULT_NUM_BUCKETS,
    left: Optional[float] = None,
    right: Optional[float] = None,
    name: Text = CALIBRATION_PLOT_NAME,
    eval_config: Optional[config_pb2.EvalConfig] = None,
    schema: Optional[schema_pb2.Schema] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
  """Returns metric computations for calibration plot."""
  key = metric_types.PlotKey(
      name=name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key)

  label_left, label_right = None, None
  if (left is None or right is None) and eval_config and schema:
    label_left, label_right = _find_label_domain(eval_config, schema,
                                                 model_name, output_name)
  if left is None:
    left = label_left if label_left is not None else 0.0
  if right is None:
    right = label_right if label_right is not None else 1.0

  # Make sure the calibration histogram is calculated. Note we are using the
  # default number of buckets assigned to the histogram instead of the value
  # used for the plots just in case the computation is shared with other
  # metrics and plots that need higher precision. It will be downsampled later.
  computations = calibration_histogram.calibration_histogram(
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      left=left,
      right=right,
      aggregation_type=aggregation_type,
      class_weights=class_weights)
  histogram_key = computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Any]:
    thresholds = [
        left + i * (right - left) / num_buckets for i in range(num_buckets + 1)
    ]
    thresholds = [float('-inf')] + thresholds
    histogram = calibration_histogram.rebin(
        thresholds, metrics[histogram_key], left=left, right=right)
    return {key: _to_proto(thresholds, histogram)}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations.append(derived_computation)
  return computations

def _calibration_plot(
    num_buckets: int = DEFAULT_NUM_BUCKETS,
    left: float = 0.0,
    right: float = 1.0,
    name: Text = CALIBRATION_PLOT_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None
) -> metric_types.MetricComputations:
  """Returns metric computations for calibration plot."""
  key = metric_types.PlotKey(
      name=name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key)

  # Make sure the calibration histogram is calculated. Note we are using the
  # default number of buckets assigned to the histogram instead of the value
  # used for the plots just in case the computation is shared with other
  # metrics and plots that need higher precision. It will be downsampled later.
  computations = calibration_histogram.calibration_histogram(
      eval_config=eval_config,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      left=left,
      right=right)
  histogram_key = computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Any]:
    thresholds = [
        left + i * (right - left) / num_buckets for i in range(num_buckets + 1)
    ]
    thresholds = [float('-inf')] + thresholds
    histogram = calibration_histogram.rebin(
        thresholds, metrics[histogram_key], left=left, right=right)
    return {key: _to_proto(thresholds, histogram)}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations.append(derived_computation)
  return computations

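# Illustrative sketch (not part of the library): the `result` callback above
# builds num_buckets + 1 evenly spaced boundaries over [left, right] and
# prepends -inf as a catch-all for out-of-range predictions before rebinning.
# The helper name `_plot_thresholds` is hypothetical.
def _plot_thresholds(num_buckets, left=0.0, right=1.0):
  thresholds = [
      left + i * (right - left) / num_buckets for i in range(num_buckets + 1)
  ]
  return [float('-inf')] + thresholds


# For example, with num_buckets=4 over [0.0, 1.0]:
assert _plot_thresholds(4) == [float('-inf'), 0.0, 0.25, 0.5, 0.75, 1.0]
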
def testCalibrationHistogram(self):
  histogram = calibration_histogram.calibration_histogram()[0]

  example1 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.2]),
      'example_weights': np.array([1.0])
  }
  example2 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.8]),
      'example_weights': np.array([2.0])
  }
  example3 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.5]),
      'example_weights': np.array([3.0])
  }
  example4 = {
      'labels': np.array([1.0]),
      'predictions': np.array([-0.1]),
      'example_weights': np.array([4.0])
  }
  example5 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.5]),
      'example_weights': np.array([5.0])
  }
  example6 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.8]),
      'example_weights': np.array([6.0])
  }
  example7 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.2]),
      'example_weights': np.array([7.0])
  }
  example8 = {
      'labels': np.array([1.0]),
      'predictions': np.array([1.1]),
      'example_weights': np.array([8.0])
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([
            example1, example2, example3, example4, example5, example6,
            example7, example8
        ])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_plots = got[0]
        self.assertEqual(got_slice_key, ())
        self.assertLen(got_plots, 1)
        key = metric_types.PlotKey('_calibration_histogram_10000')
        self.assertIn(key, got_plots)
        got_histogram = got_plots[key]
        self.assertLen(got_histogram, 5)
        self.assertEqual(
            got_histogram[0],
            calibration_histogram.Bucket(
                bucket_id=0,
                weighted_labels=1.0 * 4.0,
                weighted_predictions=-0.1 * 4.0,
                weighted_examples=4.0))
        self.assertEqual(
            got_histogram[1],
            calibration_histogram.Bucket(
                bucket_id=2001,
                weighted_labels=0.0 + 0.0,
                weighted_predictions=0.2 + 7 * 0.2,
                weighted_examples=1.0 + 7.0))
        self.assertEqual(
            got_histogram[2],
            calibration_histogram.Bucket(
                bucket_id=5001,
                weighted_labels=1.0 * 5.0,
                weighted_predictions=0.5 * 3.0 + 0.5 * 5.0,
                weighted_examples=3.0 + 5.0))
        self.assertEqual(
            got_histogram[3],
            calibration_histogram.Bucket(
                bucket_id=8001,
                weighted_labels=1.0 * 2.0 + 1.0 * 6.0,
                weighted_predictions=0.8 * 2.0 + 0.8 * 6.0,
                weighted_examples=2.0 + 6.0))
        self.assertEqual(
            got_histogram[4],
            calibration_histogram.Bucket(
                bucket_id=10001,
                weighted_labels=1.0 * 8.0,
                weighted_predictions=1.1 * 8.0,
                weighted_examples=8.0))
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')

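# Illustrative sketch (not part of the library): how the bucket ids asserted
# above line up with predictions for the default 10000-bucket histogram.
# Bucket 0 catches predictions below `left`, bucket num_buckets + 1 catches
# predictions above `right`, and in-range predictions map to buckets 1..N.
# This reproduces the observed ids (-0.1 -> 0, 0.2 -> 2001, 0.5 -> 5001,
# 0.8 -> 8001, 1.1 -> 10001); the exact library implementation may differ.
def _demo_bucket_id(prediction, num_buckets=10000, left=0.0, right=1.0):
  if prediction < left:
    return 0
  if prediction > right:
    return num_buckets + 1
  return int((prediction - left) / (right - left) * num_buckets) + 1


assert _demo_bucket_id(-0.1) == 0
assert _demo_bucket_id(0.2) == 2001
assert _demo_bucket_id(0.5) == 5001
assert _demo_bucket_id(0.8) == 8001
assert _demo_bucket_id(1.1) == 10001
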
def binary_confusion_matrices(
    num_thresholds: Optional[int] = None,
    thresholds: Optional[List[float]] = None,
    name: Optional[Text] = None,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None,
    use_histogram: Optional[bool] = None,
    extract_label_prediction_and_weight: Optional[Callable[
        ..., Any]] = metric_util.to_label_prediction_example_weight,
    preprocessor: Optional[Callable[..., Any]] = None,
    example_id_key: Optional[Text] = None,
    example_ids_count: Optional[int] = None,
    fractional_labels: bool = True) -> metric_types.MetricComputations:
  """Returns metric computations for computing binary confusion matrices.

  Args:
    num_thresholds: Number of thresholds to use. Thresholds will be calculated
      using linear interpolation between 0.0 and 1.0 with equidistant values
      and boundaries at -epsilon and 1.0+epsilon. Values must be > 0. Only one
      of num_thresholds or thresholds should be used. If used, num_thresholds
      must be > 1.
    thresholds: A specific set of thresholds to use. The caller is responsible
      for marking the boundaries with +/-epsilon if desired. Only one of
      num_thresholds or thresholds should be used. For metrics computed at
      top k this may be a single negative threshold value (i.e. -inf).
    name: Metric name.
    eval_config: Eval config.
    model_name: Optional model name (if multi-model evaluation).
    output_name: Optional output name (if multi-output model type).
    sub_key: Optional sub key.
    aggregation_type: Optional aggregation type.
    class_weights: Optional class weights to apply to multi-class / multi-label
      labels and predictions prior to flattening (when micro averaging is
      used).
    use_histogram: If true, matrices will be derived from calibration
      histograms.
    extract_label_prediction_and_weight: User-provided function argument that
      yields label, prediction, and example weights for use in calculations
      (relevant only when use_histogram flag is not true).
    preprocessor: User-provided preprocessor for including additional extracts
      in StandardMetricInputs (relevant only when use_histogram flag is not
      true).
    example_id_key: Feature key containing example id (relevant only when
      use_histogram flag is not true).
    example_ids_count: Max number of example ids to be extracted for false
      positives and false negatives (relevant only when use_histogram flag is
      not true).
    fractional_labels: If true, each incoming tuple of (label, prediction, and
      example weight) will be split into two tuples as follows (where l, p, w
      represent the resulting label, prediction, and example weight values):
        (1) l = 0.0, p = prediction, and w = example_weight * (1.0 - label)
        (2) l = 1.0, p = prediction, and w = example_weight * label
      If enabled, an exception will be raised if labels are not within [0, 1].
      The implementation is such that tuples associated with a weight of zero
      are not yielded. This means it is safe to enable fractional_labels even
      when the labels only take on the values of 0.0 or 1.0.

  Raises:
    ValueError: If both num_thresholds and thresholds are set at the same time.
  """
  if num_thresholds is not None and thresholds is not None:
    raise ValueError(
        'only one of thresholds or num_thresholds can be set at a time')
  if num_thresholds is None and thresholds is None:
    num_thresholds = DEFAULT_NUM_THRESHOLDS
  # Keras AUC turns the num_thresholds parameter into thresholds, which
  # circumvents sharing of settings. If the thresholds match the interpolated
  # version of the thresholds then reset back to num_thresholds.
  if (name is None and thresholds and
      thresholds == _interpolated_thresholds(len(thresholds))):
    num_thresholds = len(thresholds)
    thresholds = None
  if num_thresholds is not None:
    if num_thresholds <= 1:
      raise ValueError('num_thresholds must be > 1')
    # The interpolation strategy used here matches that used by keras for AUC.
    thresholds = _interpolated_thresholds(num_thresholds)
    if name is None:
      name = '{}_{}'.format(BINARY_CONFUSION_MATRICES_NAME, num_thresholds)
  elif name is None:
    name = '{}_{}'.format(BINARY_CONFUSION_MATRICES_NAME, list(thresholds))

  key = metric_types.MetricKey(
      name=name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key)

  computations = []
  metric_key = None
  if use_histogram is None:
    use_histogram = (
        num_thresholds is not None or
        (len(thresholds) == 1 and thresholds[0] < 0))
  if use_histogram:
    # Use the calibration histogram to calculate matrices. For efficiency
    # (unless all predictions are matched - i.e. thresholds <= 0) we will
    # assume that other metrics will make use of the calibration histogram and
    # re-use the default histogram for the given model_name/output_name/
    # sub_key. This is also required to get accurate counts at the threshold
    # boundaries. If this becomes an issue, then the calibration histogram can
    # be updated to support non-linear boundaries.
    computations = calibration_histogram.calibration_histogram(
        eval_config=eval_config,
        num_buckets=(
            # For precision/recall_at_k where a single large negative
            # threshold is used, we only need one bucket. Note that the
            # histogram will actually have 2 buckets: one that we set (which
            # handles predictions > -1.0) and a default catch-all bucket
            # (i.e. bucket 0) that the histogram creates for large negative
            # predictions (i.e. predictions <= -1.0).
            1 if len(thresholds) == 1 and thresholds[0] <= 0 else None),
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        aggregation_type=aggregation_type,
        class_weights=class_weights)
    metric_key = computations[-1].keys[-1]
  else:
    computations = _binary_confusion_matrix_computation(
        eval_config=eval_config,
        thresholds=thresholds,
        model_name=model_name,
        output_name=output_name,
        sub_key=sub_key,
        extract_label_prediction_and_weight=extract_label_prediction_and_weight,
        preprocessor=preprocessor,
        example_id_key=example_id_key,
        example_ids_count=example_ids_count,
        aggregation_type=aggregation_type,
        class_weights=class_weights,
        fractional_labels=fractional_labels)
    metric_key = computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Matrices]:
    """Returns binary confusion matrices."""
    matrices = None
    if use_histogram:
      if len(thresholds) == 1 and thresholds[0] < 0:
        # This case is used when all positive prediction values are relevant
        # matches (e.g. when calculating top_k for precision/recall where the
        # non-top_k values are expected to have been set to float('-inf')).
        histogram = metrics[metric_key]
      else:
        # The calibration histogram uses intervals of the form [start, end)
        # where the prediction >= start. The confusion matrices want intervals
        # of the form (start, end] where the prediction > start. Add a small
        # epsilon so that >= checks don't match. This correction shouldn't be
        # needed in practice but allows for correctness in small tests.
        rebin_thresholds = [t + _EPSILON if t != 0 else t for t in thresholds]
        if thresholds[0] >= 0:
          # Add a -epsilon bucket to account for differences in histogram vs
          # confusion matrix intervals mentioned above. If the epsilon bucket
          # is missing, the false negatives and false positives will be 0 for
          # the first threshold.
          rebin_thresholds = [-_EPSILON] + rebin_thresholds
        if thresholds[-1] < 1.0:
          # If the last threshold < 1.0, then add a fence post at 1.0 + epsilon
          # otherwise true negatives and true positives will be overcounted.
          rebin_thresholds = rebin_thresholds + [1.0 + _EPSILON]
        histogram = calibration_histogram.rebin(rebin_thresholds,
                                                metrics[metric_key])
      matrices = _historgram_to_binary_confusion_matrices(
          thresholds, histogram)
    else:
      matrices = _matrix_to_binary_confusion_matrices(thresholds,
                                                      metrics[metric_key])
    return {key: matrices}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations.append(derived_computation)
  return computations

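# Illustrative sketch (not part of the library): the Keras-style interpolation
# referenced above produces num_thresholds equidistant values fenced by
# -epsilon and 1.0 + epsilon. The names below are hypothetical stand-ins for
# the library's private _interpolated_thresholds helper and _EPSILON constant.
_DEMO_EPSILON = 1e-7


def _demo_interpolated_thresholds(num_thresholds):
  thresholds = [
      (i + 1) * 1.0 / (num_thresholds - 1) for i in range(num_thresholds - 2)
  ]
  return [-_DEMO_EPSILON] + thresholds + [1.0 + _DEMO_EPSILON]


print(_demo_interpolated_thresholds(5))
# [-1e-07, 0.25, 0.5, 0.75, 1.0000001]
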
def binary_confusion_matrices(
    num_thresholds: Optional[int] = None,
    thresholds: Optional[List[float]] = None,
    name: Text = BINARY_CONFUSION_MATRICES_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
  """Returns metric computations for computing binary confusion matrices.

  Args:
    num_thresholds: Number of thresholds to use. Thresholds will be calculated
      using linear interpolation between 0.0 and 1.0 with equidistant values
      and boundaries at -epsilon and 1.0+epsilon. Values must be > 0. Only one
      of num_thresholds or thresholds should be used.
    thresholds: A specific set of thresholds to use. The caller is responsible
      for marking the boundaries with +/-epsilon if desired. Only one of
      num_thresholds or thresholds should be used.
    name: Metric name.
    eval_config: Eval config.
    model_name: Optional model name (if multi-model evaluation).
    output_name: Optional output name (if multi-output model type).
    sub_key: Optional sub key.
    class_weights: Optional class weights to apply to multi-class / multi-label
      labels and predictions prior to flattening (when micro averaging is
      used).

  Raises:
    ValueError: If both num_thresholds and thresholds are set at the same time.
  """
  key = metric_types.MetricKey(
      name=name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key)

  if num_thresholds is not None and thresholds is not None:
    raise ValueError(
        'only one of thresholds or num_thresholds can be set at a time')
  if num_thresholds is None and thresholds is None:
    num_thresholds = DEFAULT_NUM_THRESHOLDS
  if num_thresholds is not None:
    # The interpolation strategy used here matches that used by keras for AUC.
    thresholds = [
        (i + 1) * 1.0 / (num_thresholds - 1) for i in range(num_thresholds - 2)
    ]
    thresholds = [-_EPSILON] + thresholds + [1.0 + _EPSILON]

  # Use the calibration histogram to calculate matrices. For efficiency (unless
  # all predictions are matched - i.e. thresholds <= 0) we will assume that
  # other metrics will make use of the calibration histogram and re-use the
  # default histogram for the given model_name/output_name/sub_key. This is
  # also required to get accurate counts at the threshold boundaries. If this
  # becomes an issue, then the calibration histogram can be updated to support
  # non-linear boundaries.
  num_buckets = 1 if len(thresholds) == 1 and thresholds[0] <= 0 else None
  histogram_computations = calibration_histogram.calibration_histogram(
      eval_config=eval_config,
      num_buckets=num_buckets,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      class_weights=class_weights)
  histogram_key = histogram_computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Matrices]:
    """Returns binary confusion matrices."""
    # The calibration histogram uses intervals of the form [start, end) where
    # the prediction >= start. The confusion matrices want intervals of the
    # form (start, end] where the prediction > start. Add a small epsilon so
    # that >= checks don't match. This correction shouldn't be needed in
    # practice but allows for correctness in small tests.
    if len(thresholds) == 1:
      # When there is only one threshold, we need to make adjustments so that
      # we have proper boundaries around the threshold for <, >= comparisons.
      if thresholds[0] < 0:
        # This case is used when all prediction values are considered matches
        # (e.g. when calculating top_k for precision/recall).
        rebin_thresholds = [thresholds[0], thresholds[0] + _EPSILON]
      else:
        # This case is used for a single threshold within [0, 1] (e.g. 0.5).
        rebin_thresholds = [
            -_EPSILON, thresholds[0] + _EPSILON, 1.0 + _EPSILON
        ]
    else:
      rebin_thresholds = ([thresholds[0]] +
                          [t + _EPSILON for t in thresholds[1:]])
    histogram = calibration_histogram.rebin(rebin_thresholds,
                                            metrics[histogram_key])
    matrices = _to_binary_confusion_matrices(thresholds, histogram)
    if len(thresholds) == 1:
      # Reset back to 1 bucket.
      matrices = Matrices(
          thresholds,
          tp=[matrices.tp[1]],
          fp=[matrices.fp[1]],
          tn=[matrices.tn[1]],
          fn=[matrices.fn[1]])
    return {key: matrices}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations = histogram_computations
  computations.append(derived_computation)
  return computations

def binary_confusion_matrices(
    num_thresholds: Optional[int] = None,
    thresholds: Optional[List[float]] = None,
    name: Text = BINARY_CONFUSION_MATRICES_NAME,
    eval_config: Optional[config.EvalConfig] = None,
    model_name: Text = '',
    output_name: Text = '',
    sub_key: Optional[metric_types.SubKey] = None,
    aggregation_type: Optional[metric_types.AggregationType] = None,
    class_weights: Optional[Dict[int, float]] = None
) -> metric_types.MetricComputations:
  """Returns metric computations for computing binary confusion matrices.

  Args:
    num_thresholds: Number of thresholds to use. Thresholds will be calculated
      using linear interpolation between 0.0 and 1.0 with equidistant values
      and boundaries at -epsilon and 1.0+epsilon. Values must be > 0. Only one
      of num_thresholds or thresholds should be used. If used, num_thresholds
      must be > 1.
    thresholds: A specific set of thresholds to use. The caller is responsible
      for marking the boundaries with +/-epsilon if desired. Only one of
      num_thresholds or thresholds should be used. For metrics computed at
      top k this may be a single negative threshold value (i.e. -inf).
    name: Metric name.
    eval_config: Eval config.
    model_name: Optional model name (if multi-model evaluation).
    output_name: Optional output name (if multi-output model type).
    sub_key: Optional sub key.
    aggregation_type: Optional aggregation type.
    class_weights: Optional class weights to apply to multi-class / multi-label
      labels and predictions prior to flattening (when micro averaging is
      used).

  Raises:
    ValueError: If both num_thresholds and thresholds are set at the same time.
  """
  key = metric_types.MetricKey(
      name=name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key)

  if num_thresholds is not None and thresholds is not None:
    raise ValueError(
        'only one of thresholds or num_thresholds can be set at a time')
  if num_thresholds is None and thresholds is None:
    num_thresholds = DEFAULT_NUM_THRESHOLDS
  if num_thresholds is not None:
    if num_thresholds <= 1:
      raise ValueError('num_thresholds must be > 1')
    # The interpolation strategy used here matches that used by keras for AUC.
    thresholds = [
        (i + 1) * 1.0 / (num_thresholds - 1) for i in range(num_thresholds - 2)
    ]
    thresholds = [-_EPSILON] + thresholds + [1.0 + _EPSILON]

  # Use the calibration histogram to calculate matrices. For efficiency (unless
  # all predictions are matched - i.e. thresholds <= 0) we will assume that
  # other metrics will make use of the calibration histogram and re-use the
  # default histogram for the given model_name/output_name/sub_key. This is
  # also required to get accurate counts at the threshold boundaries. If this
  # becomes an issue, then the calibration histogram can be updated to support
  # non-linear boundaries.
  histogram_computations = calibration_histogram.calibration_histogram(
      eval_config=eval_config,
      num_buckets=(
          # For precision/recall_at_k where a single large negative threshold
          # is used, we only need one bucket. Note that the histogram will
          # actually have 2 buckets: one that we set (which handles
          # predictions > -1.0) and a default catch-all bucket (i.e. bucket 0)
          # that the histogram creates for large negative predictions (i.e.
          # predictions <= -1.0).
          1 if len(thresholds) == 1 and thresholds[0] <= 0 else None),
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      aggregation_type=aggregation_type,
      class_weights=class_weights)
  histogram_key = histogram_computations[-1].keys[-1]

  def result(
      metrics: Dict[metric_types.MetricKey, Any]
  ) -> Dict[metric_types.MetricKey, Matrices]:
    """Returns binary confusion matrices."""
    if len(thresholds) == 1 and thresholds[0] < 0:
      # This case is used when all positive prediction values are considered
      # matches (e.g. when calculating top_k for precision/recall where the
      # non-top_k values are expected to have been set to float('-inf')).
      histogram = metrics[histogram_key]
    else:
      # The calibration histogram uses intervals of the form [start, end)
      # where the prediction >= start. The confusion matrices want intervals
      # of the form (start, end] where the prediction > start. Add a small
      # epsilon so that >= checks don't match. This correction shouldn't be
      # needed in practice but allows for correctness in small tests.
      rebin_thresholds = [t + _EPSILON if t != 0 else t for t in thresholds]
      if thresholds[0] >= 0:
        # Add a -epsilon bucket to account for differences in histogram vs
        # confusion matrix intervals mentioned above. If the epsilon bucket is
        # missing, the false negatives and false positives will be 0 for the
        # first threshold.
        rebin_thresholds = [-_EPSILON] + rebin_thresholds
      if thresholds[-1] < 1.0:
        # If the last threshold < 1.0, then add a fence post at 1.0 + epsilon
        # otherwise true negatives and true positives will be overcounted.
        rebin_thresholds = rebin_thresholds + [1.0 + _EPSILON]
      histogram = calibration_histogram.rebin(rebin_thresholds,
                                              metrics[histogram_key])
    matrices = _to_binary_confusion_matrices(thresholds, histogram)
    return {key: matrices}

  derived_computation = metric_types.DerivedMetricComputation(
      keys=[key], result=result)
  computations = histogram_computations
  computations.append(derived_computation)
  return computations

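# Illustrative sketch (not part of the library): how the epsilon shift in
# `result` converts confusion-matrix thresholds (prediction > t) into
# histogram rebin boundaries (prediction >= t). _DEMO_EPSILON stands in for
# the library's private _EPSILON constant and the helper name is hypothetical.
_DEMO_EPSILON = 1e-7


def _demo_rebin_thresholds(thresholds):
  rebin = [t + _DEMO_EPSILON if t != 0 else t for t in thresholds]
  if thresholds[0] >= 0:
    # Ensure the first threshold still has a bucket below it.
    rebin = [-_DEMO_EPSILON] + rebin
  if thresholds[-1] < 1.0:
    # Fence post so predictions above the last threshold are not overcounted.
    rebin = rebin + [1.0 + _DEMO_EPSILON]
  return rebin


# A single threshold at 0.5 gains a bucket on each side:
print(_demo_rebin_thresholds([0.5]))  # [-1e-07, 0.5000001, 1.0000001]
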
def _lift_metrics(
    num_buckets: Optional[int] = None,
    left: Optional[float] = None,
    right: Optional[float] = None,
    name: Optional[str] = None,
    eval_config: Optional[config_pb2.EvalConfig] = None,
    model_name: str = '',
    output_name: str = '',
    aggregation_type: Optional[metric_types.AggregationType] = None,
    sub_key: Optional[metric_types.SubKey] = None,
    class_weights: Optional[Dict[int, float]] = None,
    example_weighted: bool = False,
    ignore_out_of_bound_examples: bool = False,
) -> metric_types.MetricComputations:
  """Returns computations for lift metrics."""
  if eval_config is None or not eval_config.cross_slicing_specs:
    raise ValueError(
        'tfma.CrossSlicingSpec with a baseline and at least one comparison '
        'slicing spec must be provided for Lift metrics')

  if num_buckets is None:
    num_buckets = DEFAULT_NUM_BUCKETS

  key = metric_types.MetricKey(
      name=name,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      example_weighted=example_weighted)

  computations = calibration_histogram.calibration_histogram(
      eval_config=eval_config,
      num_buckets=num_buckets,
      left=left,
      right=right,
      model_name=model_name,
      output_name=output_name,
      sub_key=sub_key,
      aggregation_type=aggregation_type,
      class_weights=class_weights,
      example_weighted=example_weighted,
      prediction_based_bucketing=False,
      fractional_labels=False)
  metric_key = computations[-1].keys[-1]

  def cross_slice_comparison(
      baseline_metrics: Dict[metric_types.MetricKey, Any],
      comparison_metrics: Dict[metric_types.MetricKey, Any],
  ) -> Dict[metric_types.MetricKey, Any]:
    """Returns lift metrics values."""
    baseline_histogram = baseline_metrics[metric_key]
    comparison_histogram = comparison_metrics[metric_key]

    baseline_bucket = {}
    comparison_bucket = {}
    bucket_ids = set()

    for bucket in baseline_histogram:
      baseline_bucket[bucket.bucket_id] = bucket
      bucket_ids.add(bucket.bucket_id)

    for bucket in comparison_histogram:
      comparison_bucket[bucket.bucket_id] = bucket
      bucket_ids.add(bucket.bucket_id)

    baseline_pred_values = 0.0
    comparison_pred_values = 0.0
    comparison_num_examples = 0.0

    for bucket_id in bucket_ids:
      if ignore_out_of_bound_examples:
        # Ignore buckets having examples with out of bound label values.
        if bucket_id <= 0 or bucket_id > num_buckets:
          continue
      num_examples = 0.0
      if bucket_id in comparison_bucket:
        num_examples = comparison_bucket[bucket_id].weighted_examples
        comparison_pred_values += comparison_bucket[
            bucket_id].weighted_predictions
        comparison_num_examples += num_examples

      if bucket_id in baseline_bucket:
        # To compute background/baseline re-weighted average prediction values.
        # Background re-weighting is done by dividing the in-slice ground truth
        # density by the background density so that the marginal ground truth
        # distributions of in-slice items and background items appear similar.
        weight = num_examples / baseline_bucket[bucket_id].weighted_examples
        baseline_pred_values += weight * baseline_bucket[
            bucket_id].weighted_predictions

    lift_value = (comparison_pred_values -
                  baseline_pred_values) / comparison_num_examples
    return {key: lift_value}

  cross_slice_computation = metric_types.CrossSliceMetricComputation(
      keys=[key], cross_slice_comparison=cross_slice_comparison)

  computations.append(cross_slice_computation)
  return computations

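# Illustrative sketch (not part of the library): the cross-slice comparison
# above computes a background re-weighted lift. For each label bucket b, the
# baseline's weighted predictions are scaled by the comparison slice's share
# of examples in that bucket, so
#
#   lift = (sum_b comparison_preds[b]
#           - sum_b (comparison_examples[b] / baseline_examples[b])
#                   * baseline_preds[b]) / sum_b comparison_examples[b]
#
# A tiny worked example with two buckets and made-up numbers:
comparison_preds = {1: 9.0, 2: 5.0}       # e.g. 10 examples at 0.9, 10 at 0.5
comparison_examples = {1: 10.0, 2: 10.0}
baseline_preds = {1: 12.0, 2: 8.0}        # e.g. 20 examples at 0.6, 20 at 0.4
baseline_examples = {1: 20.0, 2: 20.0}

reweighted_baseline = sum(
    (comparison_examples[b] / baseline_examples[b]) * baseline_preds[b]
    for b in comparison_examples)
lift = (sum(comparison_preds.values()) - reweighted_baseline) / sum(
    comparison_examples.values())
print(lift)  # (14.0 - (0.5 * 12.0 + 0.5 * 8.0)) / 20.0 = 0.2
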
def testTopKCalibrationHistogramWithTopK(self):
  histogram = calibration_histogram.calibration_histogram(
      sub_key=metric_types.SubKey(top_k=2), example_weighted=True)[0]

  example1 = {
      'labels': np.array([2]),
      'predictions': np.array([0.2, 0.05, 0.5, 0.05]),
      'example_weights': np.array([1.0])
  }
  example2 = {
      'labels': np.array([2]),
      'predictions': np.array([0.8, 0.1, 0.8, 0.5]),
      'example_weights': np.array([2.0])
  }
  example3 = {
      'labels': np.array([3]),
      'predictions': np.array([0.2, 0.5, 0.1, 0.1]),
      'example_weights': np.array([3.0])
  }
  example4 = {
      'labels': np.array([0]),
      'predictions': np.array([-0.1, 1.1, -0.7, -0.4]),
      'example_weights': np.array([4.0])
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2, example3, example4])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_plots = got[0]
        self.assertEqual(got_slice_key, ())
        self.assertLen(got_plots, 1)
        key = metric_types.PlotKey(
            name='_calibration_histogram_10000',
            sub_key=metric_types.SubKey(top_k=2),
            example_weighted=True)
        self.assertIn(key, got_plots)
        got_histogram = got_plots[key]
        self.assertLen(got_histogram, 5)
        self.assertEqual(
            got_histogram[0],
            calibration_histogram.Bucket(
                bucket_id=0,
                weighted_labels=3.0 + 4.0,
                weighted_predictions=(2 * 1.0 * float('-inf') +
                                      2 * 2.0 * float('-inf') +
                                      2 * 3.0 * float('-inf') +
                                      2 * 4.0 * float('-inf') + -0.1 * 4.0),
                weighted_examples=(1.0 * 2.0 + 2.0 * 2.0 + 3.0 * 2.0 +
                                   4.0 * 3.0)))
        self.assertEqual(
            got_histogram[1],
            calibration_histogram.Bucket(
                bucket_id=2001,
                weighted_labels=0.0 + 0.0,
                weighted_predictions=0.2 + 3 * 0.2,
                weighted_examples=1.0 + 3.0))
        self.assertEqual(
            got_histogram[2],
            calibration_histogram.Bucket(
                bucket_id=5001,
                weighted_labels=1.0 + 0.0 * 3.0,
                weighted_predictions=0.5 * 1.0 + 0.5 * 3.0,
                weighted_examples=1.0 + 3.0))
        self.assertEqual(
            got_histogram[3],
            calibration_histogram.Bucket(
                bucket_id=8001,
                weighted_labels=0.0 * 2.0 + 1.0 * 2.0,
                weighted_predictions=0.8 * 2.0 + 0.8 * 2.0,
                weighted_examples=2.0 + 2.0))
        self.assertEqual(
            got_histogram[4],
            calibration_histogram.Bucket(
                bucket_id=10001,
                weighted_labels=0.0 * 4.0,
                weighted_predictions=1.1 * 4.0,
                weighted_examples=4.0))
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')