Example #3
            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())
                    self.assertLen(got_metrics, 1)
                    key = metric_types.MetricKey(
                        name='confusion_matrix_at_thresholds')
                    self.assertIn(key, got_metrics)
                    got_metric = got_metrics[key]
                    self.assertEqual(
                        binary_confusion_matrices.Matrices(
                            thresholds=[0.3, 0.5, 0.8],
                            tp=[1.0, 1.0, 1.0],
                            tn=[1.0, 2.0, 2.0],
                            fp=[1.0, 0.0, 0.0],
                            fn=[1.0, 1.0, 1.0]), got_metric)

                except AssertionError as err:
                    raise util.BeamAssertException(err)
Example #4
    def testCalculateConfidenceIntervalConfusionMatrices(self):
        mid, lb, ub = math_util.calculate_confidence_interval(
            types.ValueWithTDistribution(
                sample_mean=binary_confusion_matrices.Matrices(
                    thresholds=[0.5], tp=[0.0], tn=[2.0], fp=[1.0], fn=[1.0]),
                sample_standard_deviation=binary_confusion_matrices.Matrices(
                    thresholds=[0.5],
                    tp=[0.0],
                    tn=[2.051956704170308],
                    fp=[1.025978352085154],
                    fn=[1.2139539573337679]),
                sample_degrees_of_freedom=19,
                unsampled_value=binary_confusion_matrices.Matrices(
                    thresholds=[0.5], tp=[0.0], tn=[2.0], fp=[1.0], fn=[1.0])))

        expected_mid = binary_confusion_matrices.Matrices(thresholds=[0.5],
                                                          tp=[0.0],
                                                          tn=[2.0],
                                                          fp=[1.0],
                                                          fn=[1.0])
        self.assertEqual(expected_mid, mid)

        expected_lb = binary_confusion_matrices.Matrices(
            thresholds=[0.5],
            tp=[0.0],
            tn=[-2.2947947404327547],
            fp=[-1.1473973702163773],
            fn=[-1.5408348336436783])
        self.assertEqual(expected_lb.thresholds, lb.thresholds)
        np.testing.assert_almost_equal(lb.tp, expected_lb.tp)
        np.testing.assert_almost_equal(lb.fp, expected_lb.fp)
        np.testing.assert_almost_equal(lb.tn, expected_lb.tn)
        np.testing.assert_almost_equal(lb.fn, expected_lb.fn)

        expected_ub = binary_confusion_matrices.Matrices(
            thresholds=[0.5],
            tp=[0.0],
            tn=[6.294794740432755],
            fp=[3.1473973702163773],
            fn=[3.5408348336436783])
        self.assertEqual(expected_ub.thresholds, ub.thresholds)
        np.testing.assert_almost_equal(ub.tp, expected_ub.tp)
        np.testing.assert_almost_equal(ub.fp, expected_ub.fp)
        np.testing.assert_almost_equal(ub.tn, expected_ub.tn)
        np.testing.assert_almost_equal(ub.fn, expected_ub.fn)
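
# Rough cross-check of the bounds above (an illustrative sketch, not the
# library's own code): assuming calculate_confidence_interval forms a two-sided
# 95% interval as value +/- t(0.975, dof) * sample_standard_deviation, the
# expected numbers follow directly:
from scipy import stats

t_crit = stats.t.ppf(0.975, 19)           # ~2.093 for 19 degrees of freedom
print(2.0 - t_crit * 2.051956704170308)   # ~-2.2948, matches expected_lb.tn
print(2.0 + t_crit * 2.051956704170308)   # ~6.2948, matches expected_ub.tn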
Example #5
      def check_result(got):
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          self.assertLen(got_metrics, 1)
          key = metric_types.MetricKey(
              name='{}_[-inf]'.format(
                  binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME),
              sub_key=metric_types.SubKey(top_k=3))
          self.assertIn(key, got_metrics)
          got_matrices = got_metrics[key]
          self.assertEqual(
              got_matrices,
              binary_confusion_matrices.Matrices(
                  thresholds=[float('-inf')],
                  tp=[2.0],
                  fp=[10.0],
                  tn=[6.0],
                  fn=[2.0]))

        except AssertionError as err:
          raise util.BeamAssertException(err)
class BinaryConfusionMatricesTest(testutil.TensorflowModelAnalysisTest,
                                  parameterized.TestCase):
    @parameterized.named_parameters(
        ('using_num_thresholds', {
            'num_thresholds': 3,
        },
         binary_confusion_matrices.Matrices(
             thresholds=[-1e-7, 0.5, 1.0 + 1e-7],
             tp=[2.0, 1.0, 0.0],
             fp=[2.0, 0.0, 0.0],
             tn=[0.0, 2.0, 2.0],
             fn=[0.0, 1.0, 2.0],
             tp_examples=[],
             tn_examples=[],
             fp_examples=[],
             fn_examples=[])),
        ('single_threshold', {
            'thresholds': [0.5],
            'use_histogram': True,
        },
         binary_confusion_matrices.Matrices(thresholds=[0.5],
                                            tp=[1.0],
                                            fp=[0.0],
                                            tn=[2.0],
                                            fn=[1.0],
                                            tp_examples=[],
                                            tn_examples=[],
                                            fp_examples=[],
                                            fn_examples=[])),
        ('inner_thresholds', {
            'thresholds': [0.25, 0.75],
            'use_histogram': True,
        },
         binary_confusion_matrices.Matrices(thresholds=[0.25, 0.75],
                                            tp=[2.0, 1.0],
                                            fp=[1.0, 0.0],
                                            tn=[1.0, 2.0],
                                            fn=[0.0, 1.0],
                                            tp_examples=[],
                                            tn_examples=[],
                                            fp_examples=[],
                                            fn_examples=[])),
        ('boundary_thresholds', {
            'thresholds': [0.0, 1.0],
            'use_histogram': True,
        },
         binary_confusion_matrices.Matrices(thresholds=[0.0, 1.0],
                                            tp=[2.0, 0.0],
                                            fp=[2.0, 0.0],
                                            tn=[0.0, 2.0],
                                            fn=[0.0, 2.0],
                                            tp_examples=[],
                                            tn_examples=[],
                                            fp_examples=[],
                                            fn_examples=[])),
        ('left_boundary', {
            'thresholds': [0.0, 0.5],
            'use_histogram': True,
        },
         binary_confusion_matrices.Matrices(thresholds=[0.0, 0.5],
                                            tp=[2.0, 1.0],
                                            fp=[2.0, 0.0],
                                            tn=[0.0, 2.0],
                                            fn=[0.0, 1.0],
                                            tp_examples=[],
                                            tn_examples=[],
                                            fp_examples=[],
                                            fn_examples=[])),
        ('right_boundary', {
            'thresholds': [0.5, 1.0],
            'use_histogram': True,
        },
         binary_confusion_matrices.Matrices(thresholds=[0.5, 1.0],
                                            tp=[1.0, 0.0],
                                            fp=[0.0, 0.0],
                                            tn=[2.0, 2.0],
                                            fn=[1.0, 2.0],
                                            tp_examples=[],
                                            tn_examples=[],
                                            fp_examples=[],
                                            fn_examples=[])),
    )
    def testBinaryConfusionMatrices(self, kwargs, expected_matrices):
        computations = binary_confusion_matrices.binary_confusion_matrices(
            **kwargs)
        histogram = computations[0]
        matrices = computations[1]

        example1 = {
            'labels': np.array([0.0]),
            'predictions': np.array([0.0]),
            'example_weights': np.array([1.0])
        }
        example2 = {
            'labels': np.array([0.0]),
            'predictions': np.array([0.5]),
            'example_weights': np.array([1.0])
        }
        example3 = {
            'labels': np.array([1.0]),
            'predictions': np.array([0.3]),
            'example_weights': np.array([1.0])
        }
        example4 = {
            'labels': np.array([1.0]),
            'predictions': np.array([0.9]),
            'example_weights': np.array([1.0])
        }

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(
                    [example1, example2, example3, example4])
                | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
                | 'AddSlice' >> beam.Map(lambda x: ((), x))
                | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
                | 'ComputeMatrices' >> beam.Map(lambda x:
                                                (x[0], matrices.result(x[1])))
            )  # pyformat: disable

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())
                    self.assertLen(got_metrics, 1)
                    name = '_binary_confusion_matrices_{}'.format(
                        kwargs['num_thresholds'] if 'num_thresholds' in
                        kwargs else kwargs['thresholds'])
                    key = metric_types.MetricKey(name=name)
                    self.assertIn(key, got_metrics)
                    got_matrices = got_metrics[key]
                    self.assertEqual(got_matrices, expected_matrices)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
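
            # Illustrative note (added for clarity, not from the original
            # test): with labels [0, 0, 1, 1] and predictions [0.0, 0.5, 0.3,
            # 0.9] above, only the 0.9 prediction exceeds a 0.5 threshold, so
            # the 'single_threshold' case expects tp=1, fp=0, tn=2, fn=1. This
            # is consistent with a strict prediction > threshold comparison
            # (the 0.5 prediction with label 0 counts as a true negative).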

    @parameterized.named_parameters(
        ('using_num_thresholds', {
            'num_thresholds': 3,
            'use_histogram': False,
        },
         binary_confusion_matrices.Matrices(
             thresholds=[-1e-7, 0.5, 1.0 + 1e-7],
             tp=[2.0, 1.0, 0.0],
             fp=[2.0, 0.0, 0.0],
             tn=[0.0, 2.0, 2.0],
             fn=[0.0, 1.0, 2.0],
             tp_examples=[[], [], []],
             tn_examples=[[], [], []],
             fp_examples=[[], [], []],
             fn_examples=[[], [], []])),
        ('single_threshold', {
            'thresholds': [0.5],
        },
         binary_confusion_matrices.Matrices(thresholds=[0.5],
                                            tp=[1.0],
                                            fp=[0.0],
                                            tn=[2.0],
                                            fn=[1.0],
                                            tp_examples=[[]],
                                            tn_examples=[[]],
                                            fp_examples=[[]],
                                            fn_examples=[[]])),
        ('multiple_thresholds', {
            'thresholds': [0.25, 0.75],
        },
         binary_confusion_matrices.Matrices(thresholds=[0.25, 0.75],
                                            tp=[2.0, 1.0],
                                            fp=[1.0, 0.0],
                                            tn=[1.0, 2.0],
                                            fn=[0.0, 1.0],
                                            tp_examples=[[], []],
                                            tn_examples=[[], []],
                                            fp_examples=[[], []],
                                            fn_examples=[[], []])),
        ('with_example_ids', {
            'thresholds': [0.1, 0.9],
            'example_id_key': 'example_id_key',
            'example_ids_count': 2,
        },
         binary_confusion_matrices.Matrices(
             thresholds=[0.1, 0.9],
             tp=[2.0, 0.0],
             fp=[1.0, 0.0],
             tn=[1.0, 2.0],
             fn=[0.0, 2.0],
             tp_examples=[['id_3', 'id_4'], []],
             tn_examples=[['id_1'], ['id_1', 'id_2']],
             fp_examples=[['id_2'], []],
             fn_examples=[[], ['id_3', 'id_4']])))
    def testBinaryConfusionMatrices_noHistograms(self, kwargs,
                                                 expected_matrices):
        computations = binary_confusion_matrices.binary_confusion_matrices(
            **kwargs)
        histogram = computations[0]
        matrices = computations[1]

        example1 = {
            'labels': np.array([0.0]),
            'predictions': np.array([0.0]),
            'example_weights': np.array([1.0]),
            'features': {
                'example_id_key': np.array(['id_1']),
            },
        }
        example2 = {
            'labels': np.array([0.0]),
            'predictions': np.array([0.5]),
            'example_weights': np.array([1.0]),
            'features': {
                'example_id_key': np.array(['id_2']),
            },
        }
        example3 = {
            'labels': np.array([1.0]),
            'predictions': np.array([0.3]),
            'example_weights': np.array([1.0]),
            'features': {
                'example_id_key': np.array(['id_3']),
            },
        }
        example4 = {
            'labels': np.array([1.0]),
            'predictions': np.array([0.9]),
            'example_weights': np.array([1.0]),
            'features': {
                'example_id_key': np.array(['id_4']),
            },
        }

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(
                    [example1, example2, example3, example4])
                | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
                | 'AddSlice' >> beam.Map(lambda x: ((), x))
                | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
                | 'ComputeMatrices' >> beam.Map(lambda x:
                                                (x[0], matrices.result(x[1])))
            )  # pyformat: disable

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())
                    self.assertLen(got_metrics, 1)
                    name = '_binary_confusion_matrices_{}'.format(
                        kwargs['num_thresholds'] if 'num_thresholds' in
                        kwargs else kwargs['thresholds'])
                    key = metric_types.MetricKey(name=name)
                    self.assertIn(key, got_metrics)
                    got_matrices = got_metrics[key]
                    self.assertEqual(got_matrices, expected_matrices)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
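
            # Illustrative note (added for clarity, not from the original
            # test): in the 'with_example_ids' case the *_examples fields list
            # the example ids that fall into each cell, presumably limited to
            # at most example_ids_count entries per cell. At threshold 0.1,
            # id_3 and id_4 (label 1, predictions 0.3 and 0.9) are true
            # positives, id_2 (label 0, prediction 0.5) is a false positive,
            # and id_1 (label 0, prediction 0.0) is a true negative; at
            # threshold 0.9 nothing is predicted positive, so id_1/id_2 become
            # true negatives and id_3/id_4 false negatives.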

    def testBinaryConfusionMatricesTopK(self):
        computations = binary_confusion_matrices.binary_confusion_matrices(
            thresholds=[float('-inf')],
            sub_key=metric_types.SubKey(top_k=3),
            use_histogram=True)
        histogram = computations[0]
        matrices = computations[1]

        example1 = {
            'labels': np.array([2]),
            'predictions': np.array([0.1, 0.2, 0.1, 0.25, 0.35]),
            'example_weights': np.array([1.0])
        }
        example2 = {
            'labels': np.array([1]),
            'predictions': np.array([0.2, 0.3, 0.05, 0.15, 0.3]),
            'example_weights': np.array([1.0])
        }
        example3 = {
            'labels': np.array([3]),
            'predictions': np.array([0.01, 0.2, 0.09, 0.5, 0.2]),
            'example_weights': np.array([1.0])
        }
        example4 = {
            'labels': np.array([4]),
            'predictions': np.array([0.3, 0.2, 0.05, 0.4, 0.05]),
            'example_weights': np.array([1.0])
        }

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(
                    [example1, example2, example3, example4])
                | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
                | 'AddSlice' >> beam.Map(lambda x: ((), x))
                | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
                | 'ComputeMatrices' >> beam.Map(lambda x:
                                                (x[0], matrices.result(x[1])))
            )  # pyformat: disable

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())
                    self.assertLen(got_metrics, 1)
                    key = metric_types.MetricKey(
                        name='_binary_confusion_matrices_[-inf]',
                        sub_key=metric_types.SubKey(top_k=3))
                    self.assertIn(key, got_metrics)
                    got_matrices = got_metrics[key]
                    self.assertEqual(
                        got_matrices,
                        binary_confusion_matrices.Matrices(
                            thresholds=[float('-inf')],
                            tp=[2.0],
                            fp=[10.0],
                            tn=[6.0],
                            fn=[2.0],
                            tp_examples=[],
                            tn_examples=[],
                            fp_examples=[],
                            fn_examples=[]))

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
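
            # Illustrative note (added for clarity, not from the original
            # test): with top_k=3, each example contributes 5 binarized
            # (label, prediction) pairs, but only its 3 highest-scoring
            # classes can be predicted positive. The true class is in the top
            # 3 only for examples 2 and 3, so tp=2 and fn=2; the remaining
            # 4*3 - 2 = 10 positive predictions are false positives and the
            # other 6 pairs are true negatives, matching the expected matrix.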
    def test_jackknife_sample_combine_fn(self):
        x_key = metric_types.MetricKey('x')
        y_key = metric_types.MetricKey('y')
        cm_key = metric_types.MetricKey('confusion_matrix')
        cm_metric = binary_confusion_matrices.Matrices(thresholds=[0.5],
                                                       tp=[0],
                                                       fp=[1],
                                                       tn=[2],
                                                       fn=[3])
        slice_key1 = (('slice_feature', 1), )
        slice_key2 = (('slice_feature', 2), )
        samples = [
            # point estimate for slice 1
            (slice_key1,
             confidence_intervals_util.SampleMetrics(
                 sample_id=jackknife._FULL_SAMPLE_ID,
                 metrics={
                     x_key: 1.6,
                     y_key: 16,
                     cm_key: cm_metric,
                 })),
            # sample values 1 of 2 for slice 1
            (slice_key1,
             confidence_intervals_util.SampleMetrics(sample_id=0,
                                                     metrics={
                                                         x_key: 1,
                                                         y_key: 10,
                                                         cm_key: cm_metric - 1,
                                                     })),
            # sample values 2 of 2 for slice 1
            (slice_key1,
             confidence_intervals_util.SampleMetrics(sample_id=1,
                                                     metrics={
                                                         x_key: 2,
                                                         y_key: 20,
                                                         cm_key: cm_metric + 1,
                                                     })),
            # point estimate for slice 2
            (slice_key2,
             confidence_intervals_util.SampleMetrics(
                 sample_id=jackknife._FULL_SAMPLE_ID,
                 metrics={
                     x_key: 3.3,
                     y_key: 33,
                     cm_key: cm_metric,
                 })),
            # sample values 1 of 2 for slice 2
            (slice_key2,
             confidence_intervals_util.SampleMetrics(sample_id=0,
                                                     metrics={
                                                         x_key: 2,
                                                         y_key: 20,
                                                         cm_key:
                                                         cm_metric - 10,
                                                     })),
            # sample values 2 of 2 for slice 2
            (slice_key2,
             confidence_intervals_util.SampleMetrics(sample_id=1,
                                                     metrics={
                                                         x_key: 4,
                                                         y_key: 40,
                                                         cm_key:
                                                         cm_metric + 10,
                                                     })),
        ]

        with beam.Pipeline() as pipeline:
            result = (pipeline
                      | 'Create' >> beam.Create(samples, reshuffle=False)
                      | 'CombineJackknifeSamplesPerKey' >> beam.CombinePerKey(
                          jackknife._JackknifeSampleCombineFn(
                              num_jackknife_samples=2)))

            # WARNING: Do not change this test without carefully considering the
            # impact on clients due to changed CI bounds. The current implementation
            # follows the jackknife cookie bucket method described in:
            # go/rasta-confidence-intervals
            def check_result(got_pcoll):
                expected_pcoll = [
                    (slice_key1, {
                        x_key:
                        types.ValueWithTDistribution(
                            sample_mean=1.5,
                            sample_standard_deviation=0.5,
                            sample_degrees_of_freedom=1,
                            unsampled_value=1.6),
                        y_key:
                        types.ValueWithTDistribution(
                            sample_mean=15.,
                            sample_standard_deviation=5,
                            sample_degrees_of_freedom=1,
                            unsampled_value=16),
                        cm_key:
                        types.ValueWithTDistribution(
                            sample_mean=cm_metric,
                            sample_standard_deviation=(
                                binary_confusion_matrices.Matrices(
                                    thresholds=[0.5],
                                    tp=[1],
                                    fp=[1],
                                    tn=[1],
                                    fn=[1])),
                            sample_degrees_of_freedom=1,
                            unsampled_value=cm_metric),
                    }),
                    (slice_key2, {
                        x_key:
                        types.ValueWithTDistribution(
                            sample_mean=3.,
                            sample_standard_deviation=1,
                            sample_degrees_of_freedom=1,
                            unsampled_value=3.3),
                        y_key:
                        types.ValueWithTDistribution(
                            sample_mean=30.,
                            sample_standard_deviation=10,
                            sample_degrees_of_freedom=1,
                            unsampled_value=33),
                        cm_key:
                        types.ValueWithTDistribution(
                            sample_mean=cm_metric,
                            sample_standard_deviation=(
                                binary_confusion_matrices.Matrices(
                                    thresholds=[0.5],
                                    tp=[10],
                                    fp=[10],
                                    tn=[10],
                                    fn=[10])),
                            sample_degrees_of_freedom=1,
                            unsampled_value=cm_metric),
                    }),
                ]
                self.assertCountEqual(expected_pcoll, got_pcoll)

            util.assert_that(result, check_result)
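
            # For reference (added for clarity, not from the original test):
            # the two jackknife samples of x for slice 1 are [1, 2], so
            # sample_mean=1.5, and the expected sample_standard_deviation of
            # 0.5 equals np.std([1, 2]) (the population standard deviation);
            # likewise 5 == np.std([10, 20]) for y. See the comment above for
            # the method used to derive these values.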
    def test_jackknife_merge_jackknife_samples(self):
        x_key = metric_types.MetricKey(u'x')
        y_key = metric_types.MetricKey(u'y')
        cm_key = metric_types.MetricKey(u'confusion_matrix')
        cm_metric = binary_confusion_matrices.Matrices(thresholds=[0.5],
                                                       tp=[0],
                                                       fp=[1],
                                                       tn=[2],
                                                       fn=[3],
                                                       tp_examples=[],
                                                       tn_examples=[],
                                                       fp_examples=[],
                                                       fn_examples=[])
        example_count_key = metric_types.MetricKey(
            example_count.EXAMPLE_COUNT_NAME)
        slice_key1 = (u'slice_feature', 1)
        slice_key2 = (u'slice_feature', 2)
        sliced_derived_metrics = [
            # unsampled value for slice 1
            ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY,
                           jackknife._JACKKNIFE_FULL_SAMPLE_ID)), {
                               x_key: 1.6,
                               y_key: 16,
                               cm_key: cm_metric,
                               example_count_key: 100,
                               jackknife._JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY:
                               100
                           }),
            # sample values 1 of 2 for slice 1
            ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 0)), {
                x_key: 1,
                y_key: 10,
                cm_key: cm_metric,
                example_count_key: 45,
            }),
            # sample values 2 of 2 for slice 1
            ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 1)), {
                x_key: 2,
                y_key: 20,
                cm_key: cm_metric,
                example_count_key: 55,
            }),
            # unsampled value for slice 2
            ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY,
                           jackknife._JACKKNIFE_FULL_SAMPLE_ID)), {
                               x_key: 3.3,
                               y_key: 33,
                               cm_key: cm_metric,
                               example_count_key: 1000,
                               jackknife._JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY:
                               1000
                           }),
            # sample values 1 of 2 for slice 2
            ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 0)), {
                x_key: 2,
                y_key: 20,
                cm_key: cm_metric,
                example_count_key: 450,
            }),
            # sample values 2 of 2 for slice 2
            ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 1)), {
                x_key: 4,
                y_key: 40,
                cm_key: cm_metric,
                example_count_key: 550,
            }),
        ]

        with beam.Pipeline() as pipeline:
            result = (
                pipeline
                | 'Create' >> beam.Create(sliced_derived_metrics,
                                          reshuffle=False)
                | 'JackknifeCombinePerKey' >> jackknife.MergeJackknifeSamples(
                    num_jackknife_samples=2,
                    skip_ci_metric_keys=[example_count_key]))

            # For standard error calculations, see the delete-d jackknife formula in:
            # https://www.stat.berkeley.edu/~hhuang/STAT152/Jackknife-Bootstrap.pdf
            # Rather than normalize by all possible n-choose-d samples, we normalize
            # by the actual number of samples (2).
            def check_result(got_pcoll):
                expected_pcoll = [
                    (
                        (slice_key1, ),
                        {
                            x_key:
                            types.ValueWithTDistribution(
                                sample_mean=1.5,
                                # (((100 - 100/2)/(100/2))*np.var([1, 2]))**0.5
                                sample_standard_deviation=.5,
                                sample_degrees_of_freedom=1,
                                unsampled_value=1.6),
                            y_key:
                            types.ValueWithTDistribution(
                                sample_mean=15,
                                # (((100 - 100/2)/(100/2))*np.var([10, 20]))**0.5
                                sample_standard_deviation=5,
                                sample_degrees_of_freedom=1,
                                unsampled_value=16),
                            cm_key:
                            cm_metric,
                            example_count_key:
                            100,
                        }),
                    (
                        (slice_key2, ),
                        {
                            x_key:
                            types.ValueWithTDistribution(
                                sample_mean=3,
                                # (((1000 - 1000/2)/(1000/2))*np.var([2, 4]))**0.5
                                sample_standard_deviation=1,
                                sample_degrees_of_freedom=1,
                                unsampled_value=3.3),
                            y_key:
                            types.ValueWithTDistribution(
                                sample_mean=30,
                                # (((1000 - 1000/2)/(1000/2))*np.var([20, 40]))**0.5
                                sample_standard_deviation=10,
                                sample_degrees_of_freedom=1,
                                unsampled_value=33),
                            cm_key:
                            cm_metric,
                            example_count_key:
                            1000,
                        }),
                ]
                self.assertCountEqual(expected_pcoll, got_pcoll)

            util.assert_that(result, check_result)
  def test_bootstrap_sample_combine_fn_per_slice(self):
    x_key = metric_types.MetricKey('x')
    y_key = metric_types.MetricKey('y')
    cm_key = metric_types.MetricKey('confusion_matrix')
    cm_metric = binary_confusion_matrices.Matrices(
        thresholds=[0.5], tp=[0], fp=[1], tn=[2], fn=[3])
    skipped_metric_key = metric_types.MetricKey('skipped_metric')
    slice_key1 = (('slice_feature', 1),)
    slice_key2 = (('slice_feature', 2),)
    samples = [
        # unsampled value for slice 1
        (slice_key1,
         confidence_intervals_util.SampleMetrics(
             sample_id=poisson_bootstrap._FULL_SAMPLE_ID,
             metrics={
                 x_key: 1.6,
                 y_key: 16,
                 cm_key: cm_metric,
                 skipped_metric_key: 100,
             })),
        # sample values 1 of 2 for slice 1
        (slice_key1,
         confidence_intervals_util.SampleMetrics(
             sample_id=0,
             metrics={
                 x_key: 1,
                 y_key: 10,
                 cm_key: cm_metric,
                 skipped_metric_key: 45,
             })),
        # sample values 2 of 2 for slice 1
        (slice_key1,
         confidence_intervals_util.SampleMetrics(
             sample_id=1,
             metrics={
                 x_key: 2,
                 y_key: 20,
                 cm_key: cm_metric,
                 skipped_metric_key: 55,
             })),
        # unsampled value for slice 2
        (slice_key2,
         confidence_intervals_util.SampleMetrics(
             sample_id=poisson_bootstrap._FULL_SAMPLE_ID,
             metrics={
                 x_key: 3.3,
                 y_key: 33,
                 cm_key: cm_metric,
                 skipped_metric_key: 1000,
             })),
        # sample values 1 of 2 for slice 2
        (slice_key2,
         confidence_intervals_util.SampleMetrics(
             sample_id=0,
             metrics={
                 x_key: 2,
                 y_key: 20,
                 cm_key: cm_metric,
                 skipped_metric_key: 450,
             })),
        # sample values 2 of 2 for slice 2
        (slice_key2,
         confidence_intervals_util.SampleMetrics(
             sample_id=1,
             metrics={
                 x_key: 4,
                 y_key: 40,
                 cm_key: cm_metric,
                 skipped_metric_key: 550,
             })),
    ]

    with beam.Pipeline() as pipeline:
      result = (
          pipeline
          | 'Create' >> beam.Create(samples, reshuffle=False)
          | 'CombineSamplesPerKey' >> beam.CombinePerKey(
              poisson_bootstrap._BootstrapSampleCombineFn(
                  num_bootstrap_samples=2,
                  skip_ci_metric_keys=[skipped_metric_key])))

      def check_result(got_pcoll):
        expected_pcoll = [
            (
                slice_key1,
                {
                    x_key:
                        types.ValueWithTDistribution(
                            sample_mean=1.5,
                            # sample_standard_deviation=0.5
                            sample_standard_deviation=np.std([1, 2], ddof=1),
                            sample_degrees_of_freedom=1,
                            unsampled_value=1.6),
                    y_key:
                        types.ValueWithTDistribution(
                            sample_mean=15.,
                            # sample_standard_deviation=5,
                            sample_standard_deviation=np.std([10, 20], ddof=1),
                            sample_degrees_of_freedom=1,
                            unsampled_value=16),
                    cm_key:
                        types.ValueWithTDistribution(
                            sample_mean=cm_metric,
                            sample_standard_deviation=cm_metric * 0,
                            sample_degrees_of_freedom=1,
                            unsampled_value=cm_metric),
                    skipped_metric_key:
                        100,
                }),
            (
                slice_key2,
                {
                    x_key:
                        types.ValueWithTDistribution(
                            sample_mean=3.,
                            # sample_standard_deviation=1,
                            sample_standard_deviation=np.std([2, 4], ddof=1),
                            sample_degrees_of_freedom=1,
                            unsampled_value=3.3),
                    y_key:
                        types.ValueWithTDistribution(
                            sample_mean=30.,
                            # sample_standard_deviation=10,
                            sample_standard_deviation=np.std([20, 40], ddof=1),
                            sample_degrees_of_freedom=1,
                            unsampled_value=33),
                    cm_key:
                        types.ValueWithTDistribution(
                            sample_mean=cm_metric,
                            sample_standard_deviation=cm_metric * 0,
                            sample_degrees_of_freedom=1,
                            unsampled_value=cm_metric),
                    skipped_metric_key:
                        1000,
                }),
        ]
        self.assertCountEqual(expected_pcoll, got_pcoll)

      util.assert_that(result, check_result)
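
# Hypothetical back-of-envelope check (illustrative only, not part of the
# test): the expected bootstrap standard deviations above are simply the
# sample standard deviation (ddof=1) of the per-sample metric values.
import numpy as np

print(np.std([1, 2], ddof=1))    # ~0.7071, sample_standard_deviation for x_key
print(np.std([10, 20], ddof=1))  # ~7.0711, sample_standard_deviation for y_key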
class ConfidenceIntervalsUtilTest(parameterized.TestCase):

  @parameterized.named_parameters(
      {
          'testcase_name': '_ints',
          'values': [0, 1, 2],
          'ddof': 1,
          'expected_mean': 1,
          'expected_std': np.std([0, 1, 2], ddof=1),
      }, {
          'testcase_name': '_ndarrays',
          'values': [np.array([0]), np.array([1]),
                     np.array([2])],
          'ddof': 1,
          'expected_mean': np.array([1]),
          'expected_std': np.array([np.std([0, 1, 2], ddof=1)]),
      }, {
          'testcase_name':
              '_confusion_matrices',
          'values': [
              binary_confusion_matrices.Matrices(
                  thresholds=[0.5], tp=[0], fp=[1], tn=[2], fn=[3]),
              binary_confusion_matrices.Matrices(
                  thresholds=[0.5], tp=[4], fp=[5], tn=[6], fn=[7]),
              binary_confusion_matrices.Matrices(
                  thresholds=[0.5], tp=[8], fp=[9], tn=[10], fn=[11])
          ],
          'ddof':
              1,
          'expected_mean':
              binary_confusion_matrices.Matrices(
                  thresholds=[0.5],
                  tp=np.mean([0, 4, 8]),
                  fp=np.mean([1, 5, 9]),
                  tn=np.mean([2, 6, 10]),
                  fn=np.mean([3, 7, 11])),
          'expected_std':
              binary_confusion_matrices.Matrices(
                  thresholds=[0.5],
                  tp=np.std([0, 4, 8], ddof=1),
                  fp=np.std([1, 5, 9], ddof=1),
                  tn=np.std([2, 6, 10], ddof=1),
                  fn=np.std([3, 7, 11], ddof=1)),
      })
  def test_mean_and_std(self, values, ddof, expected_mean, expected_std):
    actual_mean, actual_std = confidence_intervals_util.mean_and_std(
        values, ddof)
    self.assertEqual(expected_mean, actual_mean)
    self.assertEqual(expected_std, actual_std)

  def test_sample_combine_fn(self):
    metric_key = metric_types.MetricKey('metric')
    array_metric_key = metric_types.MetricKey('array_metric')
    missing_sample_metric_key = metric_types.MetricKey('missing_metric')
    non_numeric_metric_key = metric_types.MetricKey('non_numeric_metric')
    non_numeric_array_metric_key = metric_types.MetricKey('non_numeric_array')
    skipped_metric_key = metric_types.MetricKey('skipped_metric')
    slice_key1 = (('slice_feature', 1),)
    slice_key2 = (('slice_feature', 2),)
    # the sample value is irrelevant for this test as we only verify counters.
    samples = [
        # unsampled value for slice 1
        (slice_key1,
         confidence_intervals_util.SampleMetrics(
             sample_id=_FULL_SAMPLE_ID,
             metrics={
                 metric_key: 2.1,
                 array_metric_key: np.array([1, 2]),
                 missing_sample_metric_key: 3,
                 non_numeric_metric_key: 'a',
                 non_numeric_array_metric_key: np.array(['a', 'aaa']),
                 skipped_metric_key: 16
             })),
        # sample values for slice 1
        (slice_key1,
         confidence_intervals_util.SampleMetrics(
             sample_id=0,
             metrics={
                 metric_key: 1,
                 array_metric_key: np.array([2, 3]),
                 missing_sample_metric_key: 2,
                 non_numeric_metric_key: 'b',
                 non_numeric_array_metric_key: np.array(['a', 'aaa']),
                 skipped_metric_key: 7
             })),
        # sample values for slice 1 missing missing_sample_metric_key
        (slice_key1,
         confidence_intervals_util.SampleMetrics(
             sample_id=1,
             metrics={
                 metric_key: 2,
                 array_metric_key: np.array([0, 1]),
                 non_numeric_metric_key: 'c',
                 non_numeric_array_metric_key: np.array(['a', 'aaa']),
                 skipped_metric_key: 8
             })),
        # unsampled value for slice 2
        (slice_key2,
         confidence_intervals_util.SampleMetrics(
             sample_id=_FULL_SAMPLE_ID,
             metrics={
                 metric_key: 6.3,
                 array_metric_key: np.array([10, 20]),
                 missing_sample_metric_key: 6,
                 non_numeric_metric_key: 'd',
                 non_numeric_array_metric_key: np.array(['a', 'aaa']),
                 skipped_metric_key: 10000
             })),
        # Only 1 sample value (missing sample ID 1) for slice 2
        (slice_key2,
         confidence_intervals_util.SampleMetrics(
             sample_id=0,
             metrics={
                 metric_key: 3,
                 array_metric_key: np.array([20, 30]),
                 missing_sample_metric_key: 12,
                 non_numeric_metric_key: 'd',
                 non_numeric_array_metric_key: np.array(['a', 'aaa']),
                 skipped_metric_key: 5000
             })),
    ]

    with beam.Pipeline() as pipeline:
      result = (
          pipeline
          | 'Create' >> beam.Create(samples, reshuffle=False)
          | 'CombineSamplesPerKey' >> beam.CombinePerKey(
              _ValidateSampleCombineFn(
                  num_samples=2,
                  full_sample_id=_FULL_SAMPLE_ID,
                  skip_ci_metric_keys=[skipped_metric_key])))

      def check_result(got_pcoll):
        self.assertLen(got_pcoll, 2)
        accumulators_by_slice = dict(got_pcoll)

        self.assertIn(slice_key1, accumulators_by_slice)
        slice1_accumulator = accumulators_by_slice[slice_key1]
        # check unsampled value
        self.assertIn(metric_key, slice1_accumulator.point_estimates)
        self.assertEqual(2.1, slice1_accumulator.point_estimates[metric_key])
        # check numeric case sample_values
        self.assertIn(metric_key, slice1_accumulator.metric_samples)
        self.assertEqual([1, 2], slice1_accumulator.metric_samples[metric_key])
        # check numeric array in sample_values
        self.assertIn(array_metric_key, slice1_accumulator.metric_samples)
        array_metric_samples = (
            slice1_accumulator.metric_samples[array_metric_key])
        self.assertLen(array_metric_samples, 2)
        testing.assert_array_equal(np.array([2, 3]), array_metric_samples[0])
        testing.assert_array_equal(np.array([0, 1]), array_metric_samples[1])
        # check that non-numeric metric sample_values are not present
        self.assertIn(non_numeric_metric_key,
                      slice1_accumulator.point_estimates)
        self.assertNotIn(non_numeric_metric_key,
                         slice1_accumulator.metric_samples)
        self.assertIn(non_numeric_array_metric_key,
                      slice1_accumulator.point_estimates)
        self.assertNotIn(non_numeric_array_metric_key,
                         slice1_accumulator.metric_samples)
        # check that single metric missing samples generates error
        error_key = metric_types.MetricKey('__ERROR__')
        self.assertIn(error_key, slice1_accumulator.point_estimates)
        self.assertRegex(slice1_accumulator.point_estimates[error_key],
                         'CI not computed for.*missing_metric.*')
        # check that skipped metrics have no samples
        self.assertNotIn(skipped_metric_key, slice1_accumulator.metric_samples)

        self.assertIn(slice_key2, accumulators_by_slice)
        slice2_accumulator = accumulators_by_slice[slice_key2]
        # check unsampled value
        self.assertIn(metric_key, slice2_accumulator.point_estimates)
        self.assertEqual(6.3, slice2_accumulator.point_estimates[metric_key])
        # check that entirely missing sample generates error
        self.assertIn(
            metric_types.MetricKey('__ERROR__'),
            slice2_accumulator.point_estimates)
        self.assertRegex(slice2_accumulator.point_estimates[error_key],
                         'CI not computed because only 1.*Expected 2.*')

      util.assert_that(result, check_result)

      runner_result = pipeline.run()
      # we expect one missing samples counter increment for slice2, since we
      # expected 2 samples, but only saw 1.
      metric_filter = beam.metrics.metric.MetricsFilter().with_name(
          'num_slices_missing_samples')
      counters = runner_result.metrics().query(filter=metric_filter)['counters']
      self.assertLen(counters, 1)
      self.assertEqual(1, counters[0].committed)

      # verify total slice counter
      metric_filter = beam.metrics.metric.MetricsFilter().with_name(
          'num_slices')
      counters = runner_result.metrics().query(filter=metric_filter)['counters']
      self.assertLen(counters, 1)
      self.assertEqual(2, counters[0].committed)