Example #1
    def testCalculateConfidenceInterval(self):
        sampling_data_list = [
            np.array([
                [0, 0, 2, 7, 0.77777779, 1],
                [1, 0, 2, 6, 0.75, 0.85714287],
                [4, 0, 2, 3, 0.60000002, 0.42857143],
                [4, 2, 0, 3, 1, 0.42857143],
                [7, 2, 0, 0, float('nan'), 0],
            ]),
            np.array([
                [7, 2, 0, 0, float('nan'), 0],
                [0, 0, 2, 7, 0.77777779, 1],
                [1, 0, 2, 6, 0.75, 0.85714287],
                [4, 0, 2, 3, 0.60000002, 0.42857143],
                [4, 2, 0, 3, 1, 0.42857143],
            ]),
        ]
        unsampled_data = np.array([
            [4, 2, 0, 3, 1, 0.42857143],
            [7, 2, 0, 0, float('nan'), 0],
            [0, 0, 2, 7, 0.77777779, 1],
            [1, 0, 2, 6, 0.75, 0.85714287],
            [4, 0, 2, 3, 0.60000002, 0.42857143],
        ])
        result = aggregate._calculate_confidence_interval(
            sampling_data_list, unsampled_data)
        print(result)
        self.assertIsInstance(result, np.ndarray)
        self.assertEqual(result.shape, (5, 6))
        self.assertAlmostEqual(result[0][0].value, 3.5, delta=0.1)
        self.assertAlmostEqual(result[0][0].lower_bound, -40.97, delta=0.1)
        self.assertAlmostEqual(result[0][0].upper_bound, 47.97, delta=0.1)
        self.assertEqual(result[0][0].unsampled_value, 4.0)
        self.assertAlmostEqual(result[0][4].value, 0.77, delta=0.1)
        self.assertTrue(np.isnan(result[0][4].lower_bound))
        self.assertTrue(np.isnan(result[0][4].upper_bound))
        self.assertEqual(result[0][4].unsampled_value, 1.0)

        sampling_data_list = [
            np.array([1, 2]),
            np.array([1, 2]),
            np.array([1, float('nan')])
        ]
        unsampled_data = np.array([1, 2])
        result = aggregate._calculate_confidence_interval(
            sampling_data_list, unsampled_data)
        self.assertIsInstance(result, np.ndarray)
        self.assertEqual(result.tolist(), [
            types.ValueWithConfidenceInterval(
                value=1.0, lower_bound=1.0, upper_bound=1.0,
                unsampled_value=1),
            types.ValueWithConfidenceInterval(
                value=2.0, lower_bound=2.0, upper_bound=2.0, unsampled_value=2)
        ])
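The bounds asserted above come from a Student's t interval over the per-sample values. Here is a minimal sketch of that math for the first cell (which takes the values 0 and 7 across the two resampled arrays), using scipy.stats directly; the variable names are illustrative only:

import numpy as np
from scipy.stats import sem, t

samples = [0.0, 7.0]  # first cell of each resampled array above
confidence = 0.95
sample_mean = np.mean(samples)                           # 3.5
std_err = sem(samples)                                   # 3.5
t_stat = t.ppf((1 + confidence) / 2, len(samples) - 1)   # ~12.71 with 1 degree of freedom
print(sample_mean - t_stat * std_err)                    # ~-40.97, the asserted lower_bound
print(sample_mean + t_stat * std_err)                    # ~47.97, the asserted upper_bound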
Example #2
 def testUncertaintyValuedMetrics(self):
     slice_key = _make_slice_key()
     slice_metrics = {
         'one_dim':
         types.ValueWithConfidenceInterval(2.0, 1.0, 3.0),
         'nans':
         types.ValueWithConfidenceInterval(float('nan'), float('nan'),
                                           float('nan')),
     }
     expected_metrics_for_slice = text_format.Parse(
         """
     slice_key {}
     metrics {
       key: "one_dim"
       value {
         bounded_value {
           value {
             value: 2.0
           }
           lower_bound {
             value: 1.0
           }
           upper_bound {
             value: 3.0
           }
           methodology: POISSON_BOOTSTRAP
         }
       }
     }
     metrics {
       key: "nans"
       value {
         bounded_value {
           value {
             value: nan
           }
           lower_bound {
             value: nan
           }
           upper_bound {
             value: nan
           }
           methodology: POISSON_BOOTSTRAP
         }
       }
     }
     """, metrics_for_slice_pb2.MetricsForSlice())
     got = metrics_and_plots_evaluator._serialize_metrics(
         (slice_key, slice_metrics), [])
     self.assertProtoEquals(
         expected_metrics_for_slice,
         metrics_for_slice_pb2.MetricsForSlice.FromString(got))
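For reference, a hedged sketch of reading the serialized result back out. The import path is an assumption and `got` is the serialized bytes produced by `_serialize_metrics` above; the field layout follows the text_format proto shown in the test:

from tensorflow_model_analysis.proto import metrics_for_slice_pb2  # path is an assumption

parsed = metrics_for_slice_pb2.MetricsForSlice.FromString(got)
bounded = parsed.metrics['one_dim'].bounded_value
print(bounded.value.value)        # 2.0
print(bounded.lower_bound.value)  # 1.0
print(bounded.upper_bound.value)  # 3.0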
Example #3
# Imports assumed by this excerpt (module paths are assumptions, not shown in
# the original listing):
#   from typing import List, Union
#   import numpy as np
#   from numpy import mean
#   from scipy.stats import sem, t
#   from tensorflow_model_analysis import types
def _calculate_confidence_interval(sampling_data_list: List[Union[int, float,
                                                                  np.ndarray]],
                                   unsampled_data: Union[int, float,
                                                         np.ndarray]):
    """Calculate the confidence interval of the data.

  Args:
    sampling_data_list: A list of number or np.ndarray.
    unsampled_data: Individual number or np.ndarray. The format of the
      unsampled_data should match the format of the element inside
      sampling_data_list.

  Returns:
    Confidence Interval value stored inside
    types.ValueWithConfidenceInterval.
  """
    if isinstance(sampling_data_list[0], (np.ndarray, list)):
        merged_data = sampling_data_list[0][:]
        if isinstance(sampling_data_list[0], np.ndarray):
            merged_data = merged_data.astype(object)
        for index in range(len(merged_data)):
            merged_data[index] = _calculate_confidence_interval(
                [data[index] for data in sampling_data_list],
                unsampled_data[index])
        return merged_data
    else:
        # The data must be numeric, so drop NaN values.
        sampling_data_list = [
            data for data in sampling_data_list if not np.isnan(data)
        ]
        n_samples = len(sampling_data_list)
        if n_samples:
            confidence = 0.95
            sample_mean = mean(sampling_data_list)
            std_err = sem(sampling_data_list)
            t_stat = t.ppf((1 + confidence) / 2, n_samples - 1)
            upper_bound = sample_mean + t_stat * std_err
            lower_bound = sample_mean - t_stat * std_err
            # Set [mean, lower_bound, upper_bound] for each metric component.
            return types.ValueWithConfidenceInterval(sample_mean, lower_bound,
                                                     upper_bound,
                                                     unsampled_data)
        else:
            return types.ValueWithConfidenceInterval(float('nan'),
                                                     float('nan'),
                                                     float('nan'),
                                                     float('nan'))
Example #4
    def testSerializeMetricsRanges(self):
        slice_key = _make_slice_key('age', 5, 'language', 'english', 'price',
                                    0.3)
        slice_metrics = {
            'accuracy': types.ValueWithConfidenceInterval(0.8, 0.7, 0.9),
            _full_key(metric_keys.AUPRC): 0.1,
            _full_key(metric_keys.lower_bound(metric_keys.AUPRC)): 0.05,
            _full_key(metric_keys.upper_bound(metric_keys.AUPRC)): 0.17,
            _full_key(metric_keys.AUC): 0.2,
            _full_key(metric_keys.lower_bound(metric_keys.AUC)): 0.1,
            _full_key(metric_keys.upper_bound(metric_keys.AUC)): 0.3
        }
        expected_metrics_for_slice = text_format.Parse(
            string.Template("""
        slice_key {
          single_slice_keys {
            column: 'age'
            int64_value: 5
          }
          single_slice_keys {
            column: 'language'
            bytes_value: 'english'
          }
          single_slice_keys {
            column: 'price'
            float_value: 0.3
          }
        }
        metrics {
          key: "accuracy"
          value {
            bounded_value {
              value {
                value: 0.8
              }
              lower_bound {
                value: 0.7
              }
              upper_bound {
                value: 0.9
              }
              methodology: POISSON_BOOTSTRAP
            }
          }
        }
        metrics {
          key: "$auc"
          value {
            bounded_value {
              lower_bound {
                value: 0.1
              }
              upper_bound {
                value: 0.3
              }
              value {
                value: 0.2
              }
              methodology: RIEMANN_SUM
            }
          }
        }
        metrics {
          key: "$auprc"
          value {
            bounded_value {
              lower_bound {
                value: 0.05
              }
              upper_bound {
                value: 0.17
              }
              value {
                value: 0.1
              }
              methodology: RIEMANN_SUM
            }
          }
        }""").substitute(auc=_full_key(metric_keys.AUC),
                         auprc=_full_key(metric_keys.AUPRC)),
            metrics_for_slice_pb2.MetricsForSlice())

        got = metrics_and_plots_evaluator._serialize_metrics(
            (slice_key, slice_metrics),
            [post_export_metrics.auc(),
             post_export_metrics.auc(curve='PR')])
        self.assertProtoEquals(
            expected_metrics_for_slice,
            metrics_for_slice_pb2.MetricsForSlice.FromString(got))
Example #5
    def process(self, element, unsampled_results):
        slice_key, metrics = element
        # metrics should be a list of dicts, but the dataflow runner has a quirk
        # that requires specific casting.
        metrics = list(metrics)
        side_input_results = {}

        for result in unsampled_results:
            unsampled_slice_key, unsampled_metrics = result
            side_input_results[unsampled_slice_key] = unsampled_metrics
        if len(metrics) == 1:
            yield slice_key, metrics[0]
            return

        original_structure = copy.copy(metrics[0])
        uber_metrics = {}
        unsampled_metrics = {}
        for m_dict in metrics:
            # For each metric in each slice, aggregate values over all of the computed
            # samples.
            for key in m_dict:
                _collect_metrics(m_dict[key], key, uber_metrics)
                unsampled_slice_key = slice_key
                _collect_metrics(side_input_results[unsampled_slice_key][key],
                                 key, unsampled_metrics)

        for key in uber_metrics:
            # Compute confidence interval given the data points per metric.
            confidence = 0.95
            data = uber_metrics[key]
            # The data must be numeric, so drop NaN values.
            n_samples = len(data)
            if n_samples:
                sample_mean = mean(data)
                std_err = sem(data)
                t_stat = t.ppf((1 + confidence) / 2, n_samples - 1)
                upper_bound = sample_mean + t_stat * std_err
                lower_bound = sample_mean - t_stat * std_err
                # Set [mean, lower_bound, upper_bound] for each metric component.
                uber_metrics[key] = types.ValueWithConfidenceInterval(
                    sample_mean, lower_bound, upper_bound,
                    unsampled_metrics[key][0])
            else:
                uber_metrics[key] = types.ValueWithConfidenceInterval(
                    float('nan'), float('nan'), float('nan'), float('nan'))

        # Convert metrics back into expected format with bounded values.
        for sub_key in uber_metrics:
            # Break sub-key into components.
            key_components = sub_key.split(',')
            original_key = key_components[0]
            metric_structure = original_structure[original_key]
            if isinstance(metric_structure, np.ndarray):
                metric_structure = np.array(metric_structure, dtype=object)
                _populate_bounded_metrics(key_components[1:], metric_structure,
                                          uber_metrics[sub_key])
            else:
                metric_structure = uber_metrics[sub_key]
            original_structure[original_key] = metric_structure

        yield slice_key, original_structure
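Given the extra unsampled_results argument to process(), this DoFn presumably receives the unsampled results as a Beam side input. Below is a minimal, hypothetical wiring sketch; the class name, pipeline shape, and dummy data are illustrative, not taken from the library:

import apache_beam as beam

class _MergeBootstrapFn(beam.DoFn):
    def process(self, element, unsampled_results):
        # The real body is the process() method shown above; pass-through here.
        yield element

with beam.Pipeline() as p:
    # Elements are (slice_key, list-of-metric-dicts) for the bootstrap samples
    # and (slice_key, metric-dict) for the unsampled run.
    sampled = p | 'Sampled' >> beam.Create([(('slice',), [{'metric': 1.0}])])
    unsampled = p | 'Unsampled' >> beam.Create([(('slice',), {'metric': 1.0})])
    merged = sampled | 'Merge' >> beam.ParDo(
        _MergeBootstrapFn(),
        unsampled_results=beam.pvalue.AsIter(unsampled))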