def check_result(got):
  try:
    self.assertLen(got, 1)
    got_slice_key, got_metrics = got[0]
    self.assertEqual(got_slice_key, ())
    self.assertLen(got_metrics, 1)
    key = metric_types.MetricKey(name='confusion_matrix_at_thresholds')
    self.assertIn(key, got_metrics)
    got_metric = got_metrics[key]
    self.assertEqual(
        binary_confusion_matrices.Matrices(
            thresholds=[0.3, 0.5, 0.8],
            tp=[1.0, 1.0, 1.0],
            tn=[1.0, 2.0, 2.0],
            fp=[1.0, 0.0, 0.0],
            fn=[1.0, 1.0, 1.0]), got_metric)
  except AssertionError as err:
    raise util.BeamAssertException(err)
def testCalculateConfidenceIntervalConfusionMatrices(self):
  mid, lb, ub = math_util.calculate_confidence_interval(
      types.ValueWithTDistribution(
          sample_mean=binary_confusion_matrices.Matrices(
              thresholds=[0.5], tp=[0.0], tn=[2.0], fp=[1.0], fn=[1.0]),
          sample_standard_deviation=binary_confusion_matrices.Matrices(
              thresholds=[0.5],
              tp=[0.0],
              tn=[2.051956704170308],
              fp=[1.025978352085154],
              fn=[1.2139539573337679]),
          sample_degrees_of_freedom=19,
          unsampled_value=binary_confusion_matrices.Matrices(
              thresholds=[0.5], tp=[0.0], tn=[2.0], fp=[1.0], fn=[1.0])))
  expected_mid = binary_confusion_matrices.Matrices(
      thresholds=[0.5], tp=[0.0], tn=[2.0], fp=[1.0], fn=[1.0])
  self.assertEqual(expected_mid, mid)
  expected_lb = binary_confusion_matrices.Matrices(
      thresholds=[0.5],
      tp=[0.0],
      tn=[-2.2947947404327547],
      fp=[-1.1473973702163773],
      fn=[-1.5408348336436783])
  self.assertEqual(expected_lb.thresholds, lb.thresholds)
  np.testing.assert_almost_equal(lb.tp, expected_lb.tp)
  np.testing.assert_almost_equal(lb.fp, expected_lb.fp)
  np.testing.assert_almost_equal(lb.tn, expected_lb.tn)
  np.testing.assert_almost_equal(lb.fn, expected_lb.fn)
  expected_ub = binary_confusion_matrices.Matrices(
      thresholds=[0.5],
      tp=[0.0],
      tn=[6.294794740432755],
      fp=[3.1473973702163773],
      fn=[3.5408348336436783])
  self.assertEqual(expected_ub.thresholds, ub.thresholds)
  np.testing.assert_almost_equal(ub.tp, expected_ub.tp)
  np.testing.assert_almost_equal(ub.fp, expected_ub.fp)
  np.testing.assert_almost_equal(ub.tn, expected_ub.tn)
  np.testing.assert_almost_equal(ub.fn, expected_ub.fn)
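
# Illustration (not part of the original test): the expected bounds above are
# consistent with an elementwise two-sided 95% t-interval around the sample
# mean, i.e. mean +/- t_crit * std. The helper below is a hypothetical sketch
# using scipy.stats (assumed available); math_util.calculate_confidence_interval
# may compute the interval differently internally.
def _sketch_t_interval_95(mean, std, degrees_of_freedom):
  """Returns (lower, upper) of a two-sided 95% t-confidence interval."""
  from scipy import stats

  t_crit = stats.t.ppf(0.975, degrees_of_freedom)  # ~2.093 for df=19.
  return mean - t_crit * std, mean + t_crit * std


# E.g. for tn: 2.0 -/+ 2.093 * 2.0520 gives (-2.2948, 6.2948), matching
# expected_lb.tn and expected_ub.tn above.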
def check_result(got):
  try:
    self.assertLen(got, 1)
    got_slice_key, got_metrics = got[0]
    self.assertEqual(got_slice_key, ())
    self.assertLen(got_metrics, 1)
    key = metric_types.MetricKey(
        name='{}_[-inf]'.format(
            binary_confusion_matrices.BINARY_CONFUSION_MATRICES_NAME),
        sub_key=metric_types.SubKey(top_k=3))
    self.assertIn(key, got_metrics)
    got_matrices = got_metrics[key]
    self.assertEqual(
        got_matrices,
        binary_confusion_matrices.Matrices(
            thresholds=[float('-inf')],
            tp=[2.0],
            fp=[10.0],
            tn=[6.0],
            fn=[2.0]))
  except AssertionError as err:
    raise util.BeamAssertException(err)
class BinaryConfusionMatricesTest(testutil.TensorflowModelAnalysisTest,
                                  parameterized.TestCase):

  @parameterized.named_parameters(
      ('using_num_thresholds', {
          'num_thresholds': 3,
      },
       binary_confusion_matrices.Matrices(
           thresholds=[-1e-7, 0.5, 1.0 + 1e-7],
           tp=[2.0, 1.0, 0.0],
           fp=[2.0, 0.0, 0.0],
           tn=[0.0, 2.0, 2.0],
           fn=[0.0, 1.0, 2.0],
           tp_examples=[],
           tn_examples=[],
           fp_examples=[],
           fn_examples=[])),
      ('single_threshold', {
          'thresholds': [0.5],
          'use_histogram': True,
      },
       binary_confusion_matrices.Matrices(
           thresholds=[0.5],
           tp=[1.0],
           fp=[0.0],
           tn=[2.0],
           fn=[1.0],
           tp_examples=[],
           tn_examples=[],
           fp_examples=[],
           fn_examples=[])),
      ('inner_thresholds', {
          'thresholds': [0.25, 0.75],
          'use_histogram': True,
      },
       binary_confusion_matrices.Matrices(
           thresholds=[0.25, 0.75],
           tp=[2.0, 1.0],
           fp=[1.0, 0.0],
           tn=[1.0, 2.0],
           fn=[0.0, 1.0],
           tp_examples=[],
           tn_examples=[],
           fp_examples=[],
           fn_examples=[])),
      ('boundary_thresholds', {
          'thresholds': [0.0, 1.0],
          'use_histogram': True,
      },
       binary_confusion_matrices.Matrices(
           thresholds=[0.0, 1.0],
           tp=[2.0, 0.0],
           fp=[2.0, 0.0],
           tn=[0.0, 2.0],
           fn=[0.0, 2.0],
           tp_examples=[],
           tn_examples=[],
           fp_examples=[],
           fn_examples=[])),
      ('left_boundary', {
          'thresholds': [0.0, 0.5],
          'use_histogram': True,
      },
       binary_confusion_matrices.Matrices(
           thresholds=[0.0, 0.5],
           tp=[2.0, 1.0],
           fp=[2.0, 0.0],
           tn=[0.0, 2.0],
           fn=[0.0, 1.0],
           tp_examples=[],
           tn_examples=[],
           fp_examples=[],
           fn_examples=[])),
      ('right_boundary', {
          'thresholds': [0.5, 1.0],
          'use_histogram': True,
      },
       binary_confusion_matrices.Matrices(
           thresholds=[0.5, 1.0],
           tp=[1.0, 0.0],
           fp=[0.0, 0.0],
           tn=[2.0, 2.0],
           fn=[1.0, 2.0],
           tp_examples=[],
           tn_examples=[],
           fp_examples=[],
           fn_examples=[])),
  )
  def testBinaryConfusionMatrices(self, kwargs, expected_matrices):
    computations = binary_confusion_matrices.binary_confusion_matrices(
        **kwargs)
    histogram = computations[0]
    matrices = computations[1]
    example1 = {
        'labels': np.array([0.0]),
        'predictions': np.array([0.0]),
        'example_weights': np.array([1.0])
    }
    example2 = {
        'labels': np.array([0.0]),
        'predictions': np.array([0.5]),
        'example_weights': np.array([1.0])
    }
    example3 = {
        'labels': np.array([1.0]),
        'predictions': np.array([0.3]),
        'example_weights': np.array([1.0])
    }
    example4 = {
        'labels': np.array([1.0]),
        'predictions': np.array([0.9]),
        'example_weights': np.array([1.0])
    }

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      result = (
          pipeline
          | 'Create' >> beam.Create([example1, example2, example3, example4])
          | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
          | 'AddSlice' >> beam.Map(lambda x: ((), x))
          | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
          | 'ComputeMatrices' >> beam.Map(
              lambda x: (x[0], matrices.result(x[1]))))  # pyformat: disable
      # pylint: enable=no-value-for-parameter

      def check_result(got):
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          self.assertLen(got_metrics, 1)
          name = '_binary_confusion_matrices_{}'.format(
              kwargs['num_thresholds']
              if 'num_thresholds' in kwargs else kwargs['thresholds'])
          key = metric_types.MetricKey(name=name)
          self.assertIn(key, got_metrics)
          got_matrices = got_metrics[key]
          self.assertEqual(got_matrices, expected_matrices)
        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result, label='result')

  @parameterized.named_parameters(
      ('using_num_thresholds', {
          'num_thresholds': 3,
          'use_histogram': False,
      },
       binary_confusion_matrices.Matrices(
           thresholds=[-1e-7, 0.5, 1.0 + 1e-7],
           tp=[2.0, 1.0, 0.0],
           fp=[2.0, 0.0, 0.0],
           tn=[0.0, 2.0, 2.0],
           fn=[0.0, 1.0, 2.0],
           tp_examples=[[], [], []],
           tn_examples=[[], [], []],
           fp_examples=[[], [], []],
           fn_examples=[[], [], []])),
      ('single_threshold', {
          'thresholds': [0.5],
      },
       binary_confusion_matrices.Matrices(
           thresholds=[0.5],
           tp=[1.0],
           fp=[0.0],
           tn=[2.0],
           fn=[1.0],
           tp_examples=[[]],
           tn_examples=[[]],
           fp_examples=[[]],
           fn_examples=[[]])),
      ('multiple_thresholds', {
          'thresholds': [0.25, 0.75],
      },
       binary_confusion_matrices.Matrices(
           thresholds=[0.25, 0.75],
           tp=[2.0, 1.0],
           fp=[1.0, 0.0],
           tn=[1.0, 2.0],
           fn=[0.0, 1.0],
           tp_examples=[[], []],
           tn_examples=[[], []],
           fp_examples=[[], []],
           fn_examples=[[], []])),
      ('with_example_ids', {
          'thresholds': [0.1, 0.9],
          'example_id_key': 'example_id_key',
          'example_ids_count': 2,
      },
       binary_confusion_matrices.Matrices(
           thresholds=[0.1, 0.9],
           tp=[2.0, 0.0],
           fp=[1.0, 0.0],
           tn=[1.0, 2.0],
           fn=[0.0, 2.0],
           tp_examples=[['id_3', 'id_4'], []],
           tn_examples=[['id_1'], ['id_1', 'id_2']],
           fp_examples=[['id_2'], []],
           fn_examples=[[], ['id_3', 'id_4']])),
  )
  def testBinaryConfusionMatrices_noHistograms(self, kwargs,
                                               expected_matrices):
    computations = binary_confusion_matrices.binary_confusion_matrices(
        **kwargs)
    histogram = computations[0]
    matrices = computations[1]
    example1 = {
        'labels': np.array([0.0]),
        'predictions': np.array([0.0]),
        'example_weights': np.array([1.0]),
        'features': {
            'example_id_key': np.array(['id_1']),
        },
    }
    example2 = {
        'labels': np.array([0.0]),
        'predictions': np.array([0.5]),
        'example_weights': np.array([1.0]),
        'features': {
            'example_id_key': np.array(['id_2']),
        },
    }
    example3 = {
        'labels': np.array([1.0]),
        'predictions': np.array([0.3]),
        'example_weights': np.array([1.0]),
        'features': {
            'example_id_key': np.array(['id_3']),
        },
    }
    example4 = {
        'labels': np.array([1.0]),
        'predictions': np.array([0.9]),
        'example_weights': np.array([1.0]),
        'features': {
            'example_id_key': np.array(['id_4']),
        },
    }

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      result = (
          pipeline
          | 'Create' >> beam.Create([example1, example2, example3, example4])
          | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
          | 'AddSlice' >> beam.Map(lambda x: ((), x))
          | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
          | 'ComputeMatrices' >> beam.Map(
              lambda x: (x[0], matrices.result(x[1]))))  # pyformat: disable
      # pylint: enable=no-value-for-parameter

      def check_result(got):
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          self.assertLen(got_metrics, 1)
          name = '_binary_confusion_matrices_{}'.format(
              kwargs['num_thresholds']
              if 'num_thresholds' in kwargs else kwargs['thresholds'])
          key = metric_types.MetricKey(name=name)
          self.assertIn(key, got_metrics)
          got_matrices = got_metrics[key]
          self.assertEqual(got_matrices, expected_matrices)
        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result, label='result')

  def testBinaryConfusionMatricesTopK(self):
    computations = binary_confusion_matrices.binary_confusion_matrices(
        thresholds=[float('-inf')],
        sub_key=metric_types.SubKey(top_k=3),
        use_histogram=True)
    histogram = computations[0]
    matrices = computations[1]
    example1 = {
        'labels': np.array([2]),
        'predictions': np.array([0.1, 0.2, 0.1, 0.25, 0.35]),
        'example_weights': np.array([1.0])
    }
    example2 = {
        'labels': np.array([1]),
        'predictions': np.array([0.2, 0.3, 0.05, 0.15, 0.3]),
        'example_weights': np.array([1.0])
    }
    example3 = {
        'labels': np.array([3]),
        'predictions': np.array([0.01, 0.2, 0.09, 0.5, 0.2]),
        'example_weights': np.array([1.0])
    }
    example4 = {
        'labels': np.array([4]),
        'predictions': np.array([0.3, 0.2, 0.05, 0.4, 0.05]),
        'example_weights': np.array([1.0])
    }

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      result = (
          pipeline
          | 'Create' >> beam.Create([example1, example2, example3, example4])
          | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
          | 'AddSlice' >> beam.Map(lambda x: ((), x))
          | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
          | 'ComputeMatrices' >> beam.Map(
              lambda x: (x[0], matrices.result(x[1]))))  # pyformat: disable
      # pylint: enable=no-value-for-parameter

      def check_result(got):
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          self.assertLen(got_metrics, 1)
          key = metric_types.MetricKey(
              name='_binary_confusion_matrices_[-inf]',
              sub_key=metric_types.SubKey(top_k=3))
          self.assertIn(key, got_metrics)
          got_matrices = got_metrics[key]
          self.assertEqual(
              got_matrices,
              binary_confusion_matrices.Matrices(
                  thresholds=[float('-inf')],
                  tp=[2.0],
                  fp=[10.0],
                  tn=[6.0],
                  fn=[2.0],
                  tp_examples=[],
                  tn_examples=[],
                  fp_examples=[],
                  fn_examples=[]))
        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result, label='result')
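
# Illustration (not part of the original test): with top_k=3, predictions
# outside each example's top 3 classes are effectively excluded, and at a
# threshold of -inf every retained class counts as a positive prediction. The
# hypothetical sketch below re-derives the expected totals (tp=2, fp=10, tn=6,
# fn=2) from the four examples above with plain numpy; tie-breaking here uses
# argsort order, which happens to reproduce the expected counts for these
# inputs, though TFMA may break ties differently.
def _sketch_top_k_counts():
  labels = [2, 1, 3, 4]
  predictions = [
      [0.1, 0.2, 0.1, 0.25, 0.35],
      [0.2, 0.3, 0.05, 0.15, 0.3],
      [0.01, 0.2, 0.09, 0.5, 0.2],
      [0.3, 0.2, 0.05, 0.4, 0.05],
  ]
  tp = fp = tn = fn = 0.0
  for label, scores in zip(labels, predictions):
    top3 = np.argsort(scores)[-3:]  # Indices of the 3 highest scores.
    for cls in range(len(scores)):
      predicted = bool(cls in top3)  # Threshold -inf: any kept class is positive.
      actual = cls == label
      tp += predicted and actual
      fp += predicted and not actual
      tn += not predicted and not actual
      fn += not predicted and actual
  return tp, fp, tn, fn  # -> (2.0, 10.0, 6.0, 2.0)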
def test_jackknife_sample_combine_fn(self):
  x_key = metric_types.MetricKey('x')
  y_key = metric_types.MetricKey('y')
  cm_key = metric_types.MetricKey('confusion_matrix')
  cm_metric = binary_confusion_matrices.Matrices(
      thresholds=[0.5], tp=[0], fp=[1], tn=[2], fn=[3])
  slice_key1 = (('slice_feature', 1),)
  slice_key2 = (('slice_feature', 2),)
  samples = [
      # point estimate for slice 1
      (slice_key1,
       confidence_intervals_util.SampleMetrics(
           sample_id=jackknife._FULL_SAMPLE_ID,
           metrics={
               x_key: 1.6,
               y_key: 16,
               cm_key: cm_metric,
           })),
      # sample values 1 of 2 for slice 1
      (slice_key1,
       confidence_intervals_util.SampleMetrics(
           sample_id=0,
           metrics={
               x_key: 1,
               y_key: 10,
               cm_key: cm_metric - 1,
           })),
      # sample values 2 of 2 for slice 1
      (slice_key1,
       confidence_intervals_util.SampleMetrics(
           sample_id=1,
           metrics={
               x_key: 2,
               y_key: 20,
               cm_key: cm_metric + 1,
           })),
      # point estimate for slice 2
      (slice_key2,
       confidence_intervals_util.SampleMetrics(
           sample_id=jackknife._FULL_SAMPLE_ID,
           metrics={
               x_key: 3.3,
               y_key: 33,
               cm_key: cm_metric,
           })),
      # sample values 1 of 2 for slice 2
      (slice_key2,
       confidence_intervals_util.SampleMetrics(
           sample_id=0,
           metrics={
               x_key: 2,
               y_key: 20,
               cm_key: cm_metric - 10,
           })),
      # sample values 2 of 2 for slice 2
      (slice_key2,
       confidence_intervals_util.SampleMetrics(
           sample_id=1,
           metrics={
               x_key: 4,
               y_key: 40,
               cm_key: cm_metric + 10,
           })),
  ]

  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | 'Create' >> beam.Create(samples, reshuffle=False)
        | 'CombineJackknifeSamplesPerKey' >> beam.CombinePerKey(
            jackknife._JackknifeSampleCombineFn(num_jackknife_samples=2)))

    # WARNING: Do not change this test without carefully considering the
    # impact on clients due to changed CI bounds. The current implementation
    # follows the jackknife cookie bucket method described in:
    # go/rasta-confidence-intervals
    def check_result(got_pcoll):
      expected_pcoll = [
          (slice_key1, {
              x_key:
                  types.ValueWithTDistribution(
                      sample_mean=1.5,
                      sample_standard_deviation=0.5,
                      sample_degrees_of_freedom=1,
                      unsampled_value=1.6),
              y_key:
                  types.ValueWithTDistribution(
                      sample_mean=15.,
                      sample_standard_deviation=5,
                      sample_degrees_of_freedom=1,
                      unsampled_value=16),
              cm_key:
                  types.ValueWithTDistribution(
                      sample_mean=cm_metric,
                      sample_standard_deviation=(
                          binary_confusion_matrices.Matrices(
                              thresholds=[0.5], tp=[1], fp=[1], tn=[1],
                              fn=[1])),
                      sample_degrees_of_freedom=1,
                      unsampled_value=cm_metric),
          }),
          (slice_key2, {
              x_key:
                  types.ValueWithTDistribution(
                      sample_mean=3.,
                      sample_standard_deviation=1,
                      sample_degrees_of_freedom=1,
                      unsampled_value=3.3),
              y_key:
                  types.ValueWithTDistribution(
                      sample_mean=30.,
                      sample_standard_deviation=10,
                      sample_degrees_of_freedom=1,
                      unsampled_value=33),
              cm_key:
                  types.ValueWithTDistribution(
                      sample_mean=cm_metric,
                      sample_standard_deviation=(
                          binary_confusion_matrices.Matrices(
                              thresholds=[0.5], tp=[10], fp=[10], tn=[10],
                              fn=[10])),
                      sample_degrees_of_freedom=1,
                      unsampled_value=cm_metric),
          }),
      ]
      self.assertCountEqual(expected_pcoll, got_pcoll)

    util.assert_that(result, check_result)
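
# Illustration (not part of the original test): the expected
# sample_standard_deviation values above are consistent with the standard
# jackknife standard-error formula over the m leave-out samples,
# sqrt((m - 1) / m * sum((x_i - x_bar)**2)). A hypothetical numpy check:
def _sketch_jackknife_std(sample_values):
  m = len(sample_values)
  deviations = np.asarray(sample_values) - np.mean(sample_values)
  return np.sqrt((m - 1) / m * np.sum(deviations**2))


# _sketch_jackknife_std([1, 2]) -> 0.5 and _sketch_jackknife_std([2, 4]) -> 1.0,
# matching the x_key expectations for slice 1 and slice 2 above.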
def test_jackknife_merge_jackknife_samples(self):
  x_key = metric_types.MetricKey('x')
  y_key = metric_types.MetricKey('y')
  cm_key = metric_types.MetricKey('confusion_matrix')
  cm_metric = binary_confusion_matrices.Matrices(
      thresholds=[0.5],
      tp=[0],
      fp=[1],
      tn=[2],
      fn=[3],
      tp_examples=[],
      tn_examples=[],
      fp_examples=[],
      fn_examples=[])
  example_count_key = metric_types.MetricKey(example_count.EXAMPLE_COUNT_NAME)
  slice_key1 = ('slice_feature', 1)
  slice_key2 = ('slice_feature', 2)
  sliced_derived_metrics = [
      # unsampled value for slice 1
      ((slice_key1,
        (jackknife._JACKKNIFE_SAMPLE_ID_KEY,
         jackknife._JACKKNIFE_FULL_SAMPLE_ID)), {
             x_key: 1.6,
             y_key: 16,
             cm_key: cm_metric,
             example_count_key: 100,
             jackknife._JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY: 100,
         }),
      # sample values 1 of 2 for slice 1
      ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 0)), {
          x_key: 1,
          y_key: 10,
          cm_key: cm_metric,
          example_count_key: 45,
      }),
      # sample values 2 of 2 for slice 1
      ((slice_key1, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 1)), {
          x_key: 2,
          y_key: 20,
          cm_key: cm_metric,
          example_count_key: 55,
      }),
      # unsampled value for slice 2
      ((slice_key2,
        (jackknife._JACKKNIFE_SAMPLE_ID_KEY,
         jackknife._JACKKNIFE_FULL_SAMPLE_ID)), {
             x_key: 3.3,
             y_key: 33,
             cm_key: cm_metric,
             example_count_key: 1000,
             jackknife._JACKKNIFE_EXAMPLE_COUNT_METRIC_KEY: 1000,
         }),
      # sample values 1 of 2 for slice 2
      ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 0)), {
          x_key: 2,
          y_key: 20,
          cm_key: cm_metric,
          example_count_key: 450,
      }),
      # sample values 2 of 2 for slice 2
      ((slice_key2, (jackknife._JACKKNIFE_SAMPLE_ID_KEY, 1)), {
          x_key: 4,
          y_key: 40,
          cm_key: cm_metric,
          example_count_key: 550,
      }),
  ]

  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | 'Create' >> beam.Create(sliced_derived_metrics, reshuffle=False)
        | 'JackknifeCombinePerKey' >> jackknife.MergeJackknifeSamples(
            num_jackknife_samples=2, skip_ci_metric_keys=[example_count_key]))

    # For the standard error calculations, see the delete-d jackknife formula
    # in: https://www.stat.berkeley.edu/~hhuang/STAT152/Jackknife-Bootstrap.pdf
    # Rather than normalize by all possible n-choose-d samples, we normalize
    # by the actual number of samples (2).
    def check_result(got_pcoll):
      expected_pcoll = [
          ((slice_key1,), {
              x_key:
                  types.ValueWithTDistribution(
                      sample_mean=1.5,
                      # (((100 - 100/2)/(100/2))*np.var([1, 2]))**0.5
                      sample_standard_deviation=.5,
                      sample_degrees_of_freedom=1,
                      unsampled_value=1.6),
              y_key:
                  types.ValueWithTDistribution(
                      sample_mean=15,
                      # (((100 - 100/2)/(100/2))*np.var([10, 20]))**0.5
                      sample_standard_deviation=5,
                      sample_degrees_of_freedom=1,
                      unsampled_value=16),
              cm_key: cm_metric,
              example_count_key: 100,
          }),
          ((slice_key2,), {
              x_key:
                  types.ValueWithTDistribution(
                      sample_mean=3,
                      # (((1000 - 1000/2)/(1000/2))*np.var([2, 4]))**0.5
                      sample_standard_deviation=1,
                      sample_degrees_of_freedom=1,
                      unsampled_value=3.3),
              y_key:
                  types.ValueWithTDistribution(
                      sample_mean=30,
                      # (((1000 - 1000/2)/(1000/2))*np.var([20, 40]))**0.5
                      sample_standard_deviation=10,
                      sample_degrees_of_freedom=1,
                      unsampled_value=33),
              cm_key: cm_metric,
              example_count_key: 1000,
          }),
      ]
      self.assertCountEqual(expected_pcoll, got_pcoll)

    util.assert_that(result, check_result)
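
# Illustration (not part of the original test): the expected standard
# deviations above follow the delete-d jackknife expression shown in the
# inline comments, sqrt((n - d) / d * np.var(samples)), where n is the
# slice's example count and d = n / num_samples is the number of examples
# deleted per sample. A hypothetical numpy sketch:
def _sketch_delete_d_jackknife_std(sample_values, n_examples):
  d = n_examples / len(sample_values)
  return np.sqrt((n_examples - d) / d * np.var(sample_values))


# _sketch_delete_d_jackknife_std([10, 20], 100) -> 5.0 and
# _sketch_delete_d_jackknife_std([20, 40], 1000) -> 10.0, matching the y_key
# expectations for slice 1 and slice 2 above.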
def test_bootstrap_sample_combine_fn_per_slice(self):
  x_key = metric_types.MetricKey('x')
  y_key = metric_types.MetricKey('y')
  cm_key = metric_types.MetricKey('confusion_matrix')
  cm_metric = binary_confusion_matrices.Matrices(
      thresholds=[0.5], tp=[0], fp=[1], tn=[2], fn=[3])
  skipped_metric_key = metric_types.MetricKey('skipped_metric')
  slice_key1 = (('slice_feature', 1),)
  slice_key2 = (('slice_feature', 2),)
  samples = [
      # unsampled value for slice 1
      (slice_key1,
       confidence_intervals_util.SampleMetrics(
           sample_id=poisson_bootstrap._FULL_SAMPLE_ID,
           metrics={
               x_key: 1.6,
               y_key: 16,
               cm_key: cm_metric,
               skipped_metric_key: 100,
           })),
      # sample values 1 of 2 for slice 1
      (slice_key1,
       confidence_intervals_util.SampleMetrics(
           sample_id=0,
           metrics={
               x_key: 1,
               y_key: 10,
               cm_key: cm_metric,
               skipped_metric_key: 45,
           })),
      # sample values 2 of 2 for slice 1
      (slice_key1,
       confidence_intervals_util.SampleMetrics(
           sample_id=1,
           metrics={
               x_key: 2,
               y_key: 20,
               cm_key: cm_metric,
               skipped_metric_key: 55,
           })),
      # unsampled value for slice 2
      (slice_key2,
       confidence_intervals_util.SampleMetrics(
           sample_id=poisson_bootstrap._FULL_SAMPLE_ID,
           metrics={
               x_key: 3.3,
               y_key: 33,
               cm_key: cm_metric,
               skipped_metric_key: 1000,
           })),
      # sample values 1 of 2 for slice 2
      (slice_key2,
       confidence_intervals_util.SampleMetrics(
           sample_id=0,
           metrics={
               x_key: 2,
               y_key: 20,
               cm_key: cm_metric,
               skipped_metric_key: 450,
           })),
      # sample values 2 of 2 for slice 2
      (slice_key2,
       confidence_intervals_util.SampleMetrics(
           sample_id=1,
           metrics={
               x_key: 4,
               y_key: 40,
               cm_key: cm_metric,
               skipped_metric_key: 550,
           })),
  ]

  with beam.Pipeline() as pipeline:
    result = (
        pipeline
        | 'Create' >> beam.Create(samples, reshuffle=False)
        | 'CombineSamplesPerKey' >> beam.CombinePerKey(
            poisson_bootstrap._BootstrapSampleCombineFn(
                num_bootstrap_samples=2,
                skip_ci_metric_keys=[skipped_metric_key])))

    def check_result(got_pcoll):
      expected_pcoll = [
          (slice_key1, {
              x_key:
                  types.ValueWithTDistribution(
                      sample_mean=1.5,
                      sample_standard_deviation=np.std([1, 2], ddof=1),
                      sample_degrees_of_freedom=1,
                      unsampled_value=1.6),
              y_key:
                  types.ValueWithTDistribution(
                      sample_mean=15.,
                      sample_standard_deviation=np.std([10, 20], ddof=1),
                      sample_degrees_of_freedom=1,
                      unsampled_value=16),
              cm_key:
                  types.ValueWithTDistribution(
                      sample_mean=cm_metric,
                      sample_standard_deviation=cm_metric * 0,
                      sample_degrees_of_freedom=1,
                      unsampled_value=cm_metric),
              skipped_metric_key: 100,
          }),
          (slice_key2, {
              x_key:
                  types.ValueWithTDistribution(
                      sample_mean=3.,
                      sample_standard_deviation=np.std([2, 4], ddof=1),
                      sample_degrees_of_freedom=1,
                      unsampled_value=3.3),
              y_key:
                  types.ValueWithTDistribution(
                      sample_mean=30.,
                      sample_standard_deviation=np.std([20, 40], ddof=1),
                      sample_degrees_of_freedom=1,
                      unsampled_value=33),
              cm_key:
                  types.ValueWithTDistribution(
                      sample_mean=cm_metric,
                      sample_standard_deviation=cm_metric * 0,
                      sample_degrees_of_freedom=1,
                      unsampled_value=cm_metric),
              skipped_metric_key: 1000,
          }),
      ]
      self.assertCountEqual(expected_pcoll, got_pcoll)

    util.assert_that(result, check_result)
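
# Illustration (not part of the original test): unlike the jackknife combiner,
# the bootstrap combiner's expected standard deviation is the plain sample
# standard deviation of the replicates with Bessel's correction (ddof=1),
# exactly as the expected values above are written. Because every replicate
# here carries the identical cm_metric, the elementwise deviation is zero,
# spelled as cm_metric * 0 (Matrices supports elementwise arithmetic, which
# cm_metric - 1 and cm_metric + 1 in the jackknife test also rely on).
def _sketch_bootstrap_std(sample_values):
  return np.std(sample_values, ddof=1)  # [1, 2] -> ~0.707, [2, 4] -> ~1.414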
class ConfidenceIntervalsUtilTest(parameterized.TestCase):

  @parameterized.named_parameters(
      {
          'testcase_name': '_ints',
          'values': [0, 1, 2],
          'ddof': 1,
          'expected_mean': 1,
          'expected_std': np.std([0, 1, 2], ddof=1),
      },
      {
          'testcase_name': '_ndarrays',
          'values': [np.array([0]), np.array([1]), np.array([2])],
          'ddof': 1,
          'expected_mean': np.array([1]),
          'expected_std': np.array([np.std([0, 1, 2], ddof=1)]),
      },
      {
          'testcase_name': '_confusion_matrices',
          'values': [
              binary_confusion_matrices.Matrices(
                  thresholds=[0.5], tp=[0], fp=[1], tn=[2], fn=[3]),
              binary_confusion_matrices.Matrices(
                  thresholds=[0.5], tp=[4], fp=[5], tn=[6], fn=[7]),
              binary_confusion_matrices.Matrices(
                  thresholds=[0.5], tp=[8], fp=[9], tn=[10], fn=[11]),
          ],
          'ddof': 1,
          'expected_mean':
              binary_confusion_matrices.Matrices(
                  thresholds=[0.5],
                  tp=np.mean([0, 4, 8]),
                  fp=np.mean([1, 5, 9]),
                  tn=np.mean([2, 6, 10]),
                  fn=np.mean([3, 7, 11])),
          'expected_std':
              binary_confusion_matrices.Matrices(
                  thresholds=[0.5],
                  tp=np.std([0, 4, 8], ddof=1),
                  fp=np.std([1, 5, 9], ddof=1),
                  tn=np.std([2, 6, 10], ddof=1),
                  fn=np.std([3, 7, 11], ddof=1)),
      })
  def test_mean_and_std(self, values, ddof, expected_mean, expected_std):
    actual_mean, actual_std = confidence_intervals_util.mean_and_std(
        values, ddof)
    self.assertEqual(expected_mean, actual_mean)
    self.assertEqual(expected_std, actual_std)

  def test_sample_combine_fn(self):
    metric_key = metric_types.MetricKey('metric')
    array_metric_key = metric_types.MetricKey('array_metric')
    missing_sample_metric_key = metric_types.MetricKey('missing_metric')
    non_numeric_metric_key = metric_types.MetricKey('non_numeric_metric')
    non_numeric_array_metric_key = metric_types.MetricKey('non_numeric_array')
    skipped_metric_key = metric_types.MetricKey('skipped_metric')
    slice_key1 = (('slice_feature', 1),)
    slice_key2 = (('slice_feature', 2),)
    # the sample value is irrelevant for this test as we only verify counters.
    samples = [
        # unsampled value for slice 1
        (slice_key1,
         confidence_intervals_util.SampleMetrics(
             sample_id=_FULL_SAMPLE_ID,
             metrics={
                 metric_key: 2.1,
                 array_metric_key: np.array([1, 2]),
                 missing_sample_metric_key: 3,
                 non_numeric_metric_key: 'a',
                 non_numeric_array_metric_key: np.array(['a', 'aaa']),
                 skipped_metric_key: 16,
             })),
        # sample values for slice 1
        (slice_key1,
         confidence_intervals_util.SampleMetrics(
             sample_id=0,
             metrics={
                 metric_key: 1,
                 array_metric_key: np.array([2, 3]),
                 missing_sample_metric_key: 2,
                 non_numeric_metric_key: 'b',
                 non_numeric_array_metric_key: np.array(['a', 'aaa']),
                 skipped_metric_key: 7,
             })),
        # sample values for slice 1, missing missing_sample_metric_key
        (slice_key1,
         confidence_intervals_util.SampleMetrics(
             sample_id=1,
             metrics={
                 metric_key: 2,
                 array_metric_key: np.array([0, 1]),
                 non_numeric_metric_key: 'c',
                 non_numeric_array_metric_key: np.array(['a', 'aaa']),
                 skipped_metric_key: 8,
             })),
        # unsampled value for slice 2
        (slice_key2,
         confidence_intervals_util.SampleMetrics(
             sample_id=_FULL_SAMPLE_ID,
             metrics={
                 metric_key: 6.3,
                 array_metric_key: np.array([10, 20]),
                 missing_sample_metric_key: 6,
                 non_numeric_metric_key: 'd',
                 non_numeric_array_metric_key: np.array(['a', 'aaa']),
                 skipped_metric_key: 10000,
             })),
        # only 1 sample value (missing sample ID 1) for slice 2
        (slice_key2,
         confidence_intervals_util.SampleMetrics(
             sample_id=0,
             metrics={
                 metric_key: 3,
                 array_metric_key: np.array([20, 30]),
                 missing_sample_metric_key: 12,
                 non_numeric_metric_key: 'd',
                 non_numeric_array_metric_key: np.array(['a', 'aaa']),
                 skipped_metric_key: 5000,
             })),
    ]

    with beam.Pipeline() as pipeline:
      result = (
          pipeline
          | 'Create' >> beam.Create(samples, reshuffle=False)
          | 'CombineSamplesPerKey' >> beam.CombinePerKey(
              _ValidateSampleCombineFn(
                  num_samples=2,
                  full_sample_id=_FULL_SAMPLE_ID,
                  skip_ci_metric_keys=[skipped_metric_key])))

      def check_result(got_pcoll):
        self.assertLen(got_pcoll, 2)
        accumulators_by_slice = dict(got_pcoll)

        self.assertIn(slice_key1, accumulators_by_slice)
        slice1_accumulator = accumulators_by_slice[slice_key1]
        # check unsampled value
        self.assertIn(metric_key, slice1_accumulator.point_estimates)
        self.assertEqual(2.1, slice1_accumulator.point_estimates[metric_key])
        # check numeric case sample_values
        self.assertIn(metric_key, slice1_accumulator.metric_samples)
        self.assertEqual([1, 2],
                         slice1_accumulator.metric_samples[metric_key])
        # check numeric array in sample_values
        self.assertIn(array_metric_key, slice1_accumulator.metric_samples)
        array_metric_samples = (
            slice1_accumulator.metric_samples[array_metric_key])
        self.assertLen(array_metric_samples, 2)
        testing.assert_array_equal(np.array([2, 3]), array_metric_samples[0])
        testing.assert_array_equal(np.array([0, 1]), array_metric_samples[1])
        # check that non-numeric metric sample_values are not present
        self.assertIn(non_numeric_metric_key,
                      slice1_accumulator.point_estimates)
        self.assertNotIn(non_numeric_metric_key,
                         slice1_accumulator.metric_samples)
        self.assertIn(non_numeric_array_metric_key,
                      slice1_accumulator.point_estimates)
        self.assertNotIn(non_numeric_array_metric_key,
                         slice1_accumulator.metric_samples)
        # check that a single metric missing samples generates an error
        error_key = metric_types.MetricKey('__ERROR__')
        self.assertIn(error_key, slice1_accumulator.point_estimates)
        self.assertRegex(slice1_accumulator.point_estimates[error_key],
                         'CI not computed for.*missing_metric.*')
        # check that skipped metrics have no samples
        self.assertNotIn(skipped_metric_key,
                         slice1_accumulator.metric_samples)

        self.assertIn(slice_key2, accumulators_by_slice)
        slice2_accumulator = accumulators_by_slice[slice_key2]
        # check unsampled value
        self.assertIn(metric_key, slice2_accumulator.point_estimates)
        self.assertEqual(6.3, slice2_accumulator.point_estimates[metric_key])
        # check that an entirely missing sample generates an error
        self.assertIn(
            metric_types.MetricKey('__ERROR__'),
            slice2_accumulator.point_estimates)
        self.assertRegex(slice2_accumulator.point_estimates[error_key],
                         'CI not computed because only 1.*Expected 2.*')

      util.assert_that(result, check_result)

    runner_result = pipeline.run()
    # we expect one missing samples counter increment for slice2, since we
    # expected 2 samples, but only saw 1.
    metric_filter = beam.metrics.metric.MetricsFilter().with_name(
        'num_slices_missing_samples')
    counters = runner_result.metrics().query(filter=metric_filter)['counters']
    self.assertLen(counters, 1)
    self.assertEqual(1, counters[0].committed)
    # verify total slice counter
    metric_filter = beam.metrics.metric.MetricsFilter().with_name('num_slices')
    counters = runner_result.metrics().query(filter=metric_filter)['counters']
    self.assertLen(counters, 1)
    self.assertEqual(2, counters[0].committed)