def test_compute(self):
  multiclass_paired_metrics = metrics.MulticlassPairedMetrics()

  indices = ['7f7f85', '345ac4', '3a3112', '88bcda']
  metas = [{'parentId': '345ac4'}, {}, {}, {'parentId': '3a3112'}]

  # No swaps.
  result = multiclass_paired_metrics.compute_with_metadata(
      ['1', '1', '0', '0'], [[0, 1], [0, 1], [1, 0], [1, 0]],
      types.CategoryLabel(),
      types.MulticlassPreds(vocab=['0', '1'], null_idx=0), indices, metas)
  testing_utils.assert_dicts_almost_equal(self, result, {
      'mean_jsd': 0.0,
      'num_pairs': 2,
      'swap_rate': 0.0
  })

  # One swap.
  result = multiclass_paired_metrics.compute_with_metadata(
      ['1', '1', '0', '0'], [[0, 1], [1, 0], [1, 0], [1, 0]],
      types.CategoryLabel(),
      types.MulticlassPreds(vocab=['0', '1'], null_idx=0), indices, metas)
  testing_utils.assert_dicts_almost_equal(self, result, {
      'mean_jsd': 0.34657,
      'num_pairs': 2,
      'swap_rate': 0.5
  })

  # Two swaps.
  result = multiclass_paired_metrics.compute_with_metadata(
      ['1', '1', '0', '0'], [[0, 1], [1, 0], [1, 0], [0, 1]],
      types.CategoryLabel(),
      types.MulticlassPreds(vocab=['0', '1'], null_idx=0), indices, metas)
  testing_utils.assert_dicts_almost_equal(self, result, {
      'mean_jsd': 0.69315,
      'num_pairs': 2,
      'swap_rate': 1.0
  })

  # Two swaps, no null index.
  result = multiclass_paired_metrics.compute_with_metadata(
      ['1', '1', '0', '0'], [[0, 1], [1, 0], [1, 0], [0, 1]],
      types.CategoryLabel(), types.MulticlassPreds(vocab=['0', '1']),
      indices, metas)
  testing_utils.assert_dicts_almost_equal(self, result, {
      'mean_jsd': 0.69315,
      'num_pairs': 2,
      'swap_rate': 1.0
  })

  # Empty predictions, indices, and metadata.
  result = multiclass_paired_metrics.compute_with_metadata(
      [], [], types.CategoryLabel(),
      types.MulticlassPreds(vocab=['0', '1'], null_idx=0), [], [])
  testing_utils.assert_dicts_almost_equal(self, result, {})
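# A note on the expected values above (a sketch of the arithmetic, on the
# assumption that the paired metric uses the natural-log Jensen-Shannon
# divergence, which the pinned values are consistent with): for the disjoint
# distributions [0, 1] and [1, 0], JSD = ln(2) ~= 0.69315, so one swapped
# pair out of two averages to 0.34657 and two swapped pairs average to
# 0.69315. The `metas` entries pair '7f7f85' with its parent '345ac4' and
# '88bcda' with its parent '3a3112', giving num_pairs = 2.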
def test_compute(self):
  multiclass_metrics = metrics.MulticlassMetrics()

  # All correct predictions.
  result = multiclass_metrics.compute(
      ['1', '2', '0', '1'],
      [[0, 1, 0], [0, 0, 1], [1, 0, 0], [0, 1, 0]],
      types.CategoryLabel(),
      types.MulticlassPreds(vocab=['0', '1', '2'], null_idx=0))
  testing_utils.assert_dicts_almost_equal(self, result, {
      'accuracy': 1.0,
      'f1': 1.0,
      'precision': 1.0,
      'recall': 1.0
  })

  # Some incorrect predictions.
  result = multiclass_metrics.compute(
      ['1', '2', '0', '1'],
      [[.1, .4, .5], [0, .1, .9], [.1, 0, .9], [0, 1, 0]],
      types.CategoryLabel(),
      types.MulticlassPreds(vocab=['0', '1', '2'], null_idx=0))
  testing_utils.assert_dicts_almost_equal(self, result, {
      'accuracy': 0.5,
      'f1': 0.57143,
      'precision': 0.5,
      'recall': 0.66666
  })

  # All incorrect predictions.
  result = multiclass_metrics.compute(
      ['1', '2', '0', '1'],
      [[.1, .4, .5], [.2, .7, .1], [.1, 0, .9], [1, 0, 0]],
      types.CategoryLabel(),
      types.MulticlassPreds(vocab=['0', '1', '2'], null_idx=0))
  testing_utils.assert_dicts_almost_equal(self, result, {
      'accuracy': 0.0,
      'f1': 0.0,
      'precision': 0.0,
      'recall': 0.0
  })

  # No null index.
  result = multiclass_metrics.compute(
      ['1', '2', '0', '1'],
      [[.1, .4, .5], [0, .1, .9], [.1, 0, .9], [0, 1, 0]],
      types.CategoryLabel(),
      types.MulticlassPreds(vocab=['0', '1', '2']))
  testing_utils.assert_dicts_almost_equal(self, result, {'accuracy': 0.5})

  # Empty labels and predictions.
  result = multiclass_metrics.compute(
      [], [], types.CategoryLabel(),
      types.MulticlassPreds(vocab=['0', '1', '2'], null_idx=0))
  testing_utils.assert_dicts_almost_equal(self, result, {})
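# A note on the expected values above: the results are consistent with
# precision/recall/F1 being micro-averaged over the non-null classes when
# null_idx is set. In the "some incorrect" case the argmax predictions are
# ['2', '2', '2', '1'] against labels ['1', '2', '0', '1']: 2 true positives
# over 4 non-null predictions gives precision 0.5, over 3 non-null labels
# gives recall 2/3 ~= 0.66666, and F1 = 2 * 0.5 * (2/3) / (0.5 + 2/3)
# ~= 0.57143. Without a null index there is no designated negative class,
# so only accuracy is reported.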
def test_compute(self):
  corpus_bleu_metrics = metrics.CorpusBLEU()

  # All correct predictions.
  result = corpus_bleu_metrics.compute(
      ['This is a test.', 'Test two', 'A third test example'],
      ['This is a test.', 'Test two', 'A third test example'],
      types.GeneratedText(), types.GeneratedText())
  testing_utils.assert_dicts_almost_equal(self, result,
                                          {'corpus_bleu': 100.00000})

  # Some incorrect predictions.
  result = corpus_bleu_metrics.compute(
      ['This is a test.', 'Test one', 'A third test'],
      ['This is a test.', 'Test two', 'A third test example'],
      types.GeneratedText(), types.GeneratedText())
  testing_utils.assert_dicts_almost_equal(self, result,
                                          {'corpus_bleu': 68.037493})

  # Mostly incorrect predictions.
  result = corpus_bleu_metrics.compute(
      ['This is a test.', 'Test one', 'A third test'],
      ['these test.', 'Test two', 'A third test example'],
      types.GeneratedText(), types.GeneratedText())
  testing_utils.assert_dicts_almost_equal(
      self, result, {'corpus_bleu': 29.508062388758525})

  # Empty labels and predictions.
  result = corpus_bleu_metrics.compute(
      [], [], types.GeneratedText(), types.GeneratedText())
  testing_utils.assert_dicts_almost_equal(self, result, {})
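# A note on the expected values above: the scores are on the 0-100 scale
# used by sacrebleu (a perfect match scores 100.0), which is consistent with
# CorpusBLEU wrapping a sacrebleu-style corpus_bleu implementation; that
# wrapping is an assumption here. The exact non-perfect scores depend on the
# tokenization and smoothing method, so they are pinned as regression values
# rather than derived by hand.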
def test_compute(self):
  regression_metrics = metrics.RegressionMetrics()

  # All correct predictions.
  result = regression_metrics.compute([1, 2, 3, 4], [1, 2, 3, 4],
                                      types.RegressionScore(),
                                      types.RegressionScore())
  testing_utils.assert_dicts_almost_equal(self, result, {
      'mse': 0,
      'pearsonr': 1.0,
      'spearmanr': 1.0
  })

  # Some incorrect predictions.
  result = regression_metrics.compute([1, 2, 3, 4], [1, 2, 5.5, 6.3],
                                      types.RegressionScore(),
                                      types.RegressionScore())
  testing_utils.assert_dicts_almost_equal(self, result, {
      'mse': 2.885,
      'pearsonr': 0.96566,
      'spearmanr': 1.0
  })

  # All incorrect predictions (and not monotonic).
  result = regression_metrics.compute([1, 2, 3, 4], [-5, -10, 5, 6],
                                      types.RegressionScore(),
                                      types.RegressionScore())
  testing_utils.assert_dicts_almost_equal(self, result, {
      'mse': 47.0,
      'pearsonr': 0.79559,
      'spearmanr': 0.79999
  })

  # Empty labels and predictions.
  result = regression_metrics.compute([], [], types.RegressionScore(),
                                      types.RegressionScore())
  testing_utils.assert_dicts_almost_equal(self, result, {})
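# A note on the expected values above: for labels [1, 2, 3, 4] and
# predictions [1, 2, 5.5, 6.3], the squared errors are 0, 0, 2.5**2 = 6.25,
# and 2.3**2 = 5.29, so MSE = 11.54 / 4 = 2.885; spearmanr stays 1.0 because
# the predictions are still monotonically increasing. In the non-monotonic
# case the squared errors are 36, 144, 4, and 4, giving MSE = 188 / 4 = 47.0,
# and swapping the rank order of the first two predictions gives
# spearmanr = 1 - 6 * 2 / (4 * (4**2 - 1)) = 0.8.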