Example #1
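All four examples are test methods shown without their surrounding test class; judging by the identifiers, they come from LIT (lit_nlp) and presumably rely on imports along these lines (an assumption, not part of the original snippets):

    from lit_nlp.api import types
    from lit_nlp.components import metrics
    from lit_nlp.lib import testing_utils

This first test exercises metrics.MulticlassPairedMetrics, which matches each perturbed example to its original via the parentId metadata field and reports the number of pairs, the rate at which the argmax prediction swaps within a pair, and the mean Jensen-Shannon divergence between the paired prediction distributions.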
    def test_compute(self):
        multiclass_paired_metrics = metrics.MulticlassPairedMetrics()

        indices = ['7f7f85', '345ac4', '3a3112', '88bcda']
        metas = [{'parentId': '345ac4'}, {}, {}, {'parentId': '3a3112'}]

        # No swaps.
        result = multiclass_paired_metrics.compute_with_metadata(
            ['1', '1', '0', '0'], [[0, 1], [0, 1], [1, 0], [1, 0]],
            types.CategoryLabel(),
            types.MulticlassPreds(vocab=['0', '1'],
                                  null_idx=0), indices, metas)
        testing_utils.assert_dicts_almost_equal(self, result, {
            'mean_jsd': 0.0,
            'num_pairs': 2,
            'swap_rate': 0.0
        })

        # One swap.
        result = multiclass_paired_metrics.compute_with_metadata(
            ['1', '1', '0', '0'], [[0, 1], [1, 0], [1, 0], [1, 0]],
            types.CategoryLabel(),
            types.MulticlassPreds(vocab=['0', '1'],
                                  null_idx=0), indices, metas)
        testing_utils.assert_dicts_almost_equal(self, result, {
            'mean_jsd': 0.34657,
            'num_pairs': 2,
            'swap_rate': 0.5
        })

        # Two swaps.
        result = multiclass_paired_metrics.compute_with_metadata(
            ['1', '1', '0', '0'], [[0, 1], [1, 0], [1, 0], [0, 1]],
            types.CategoryLabel(),
            types.MulticlassPreds(vocab=['0', '1'],
                                  null_idx=0), indices, metas)
        testing_utils.assert_dicts_almost_equal(self, result, {
            'mean_jsd': 0.69315,
            'num_pairs': 2,
            'swap_rate': 1.0
        })

        # Two swaps, no null index.
        result = multiclass_paired_metrics.compute_with_metadata(
            ['1', '1', '0', '0'], [[0, 1], [1, 0], [1, 0], [0, 1]],
            types.CategoryLabel(), types.MulticlassPreds(vocab=['0', '1']),
            indices, metas)
        testing_utils.assert_dicts_almost_equal(self, result, {
            'mean_jsd': 0.69315,
            'num_pairs': 2,
            'swap_rate': 1.0
        })

        # Empty predictions, indices, and meta.
        result = multiclass_paired_metrics.compute_with_metadata(
            [], [], types.CategoryLabel(),
            types.MulticlassPreds(vocab=['0', '1'], null_idx=0), [], [])
        testing_utils.assert_dicts_almost_equal(self, result, {})
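The expected mean_jsd values can be checked by hand: the Jensen-Shannon divergence (with natural log) between the fully swapped distributions [0, 1] and [1, 0] is ln 2 ≈ 0.69315, so one swapped pair out of two averages to ln 2 / 2 ≈ 0.34657 and two swapped pairs average to ln 2 itself. A minimal sketch of that check, using scipy rather than the code under test:

    import numpy as np
    from scipy.spatial.distance import jensenshannon

    # scipy returns the Jensen-Shannon *distance*, the square root of the
    # divergence (natural log by default), so square it to get the divergence.
    jsd = jensenshannon([0, 1], [1, 0]) ** 2
    print(jsd, np.log(2))  # both ~0.69315; one swap in two pairs gives half that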
Example #2
    def test_compute(self):
        multiclass_metrics = metrics.MulticlassMetrics()

        # All correct predictions.
        result = multiclass_metrics.compute(
            ['1', '2', '0', '1'], [[0, 1, 0], [0, 0, 1], [1, 0, 0], [0, 1, 0]],
            types.CategoryLabel(),
            types.MulticlassPreds(vocab=['0', '1', '2'], null_idx=0))
        testing_utils.assert_dicts_almost_equal(self, result, {
            'accuracy': 1.0,
            'f1': 1.0,
            'precision': 1.0,
            'recall': 1.0
        })

        # Some incorrect predictions.
        result = multiclass_metrics.compute(
            ['1', '2', '0', '1'],
            [[.1, .4, .5], [0, .1, .9], [.1, 0, .9], [0, 1, 0]],
            types.CategoryLabel(),
            types.MulticlassPreds(vocab=['0', '1', '2'], null_idx=0))
        testing_utils.assert_dicts_almost_equal(self, result, {
            'accuracy': 0.5,
            'f1': 0.57143,
            'precision': 0.5,
            'recall': 0.66666
        })

        # All incorrect predictions.
        result = multiclass_metrics.compute(
            ['1', '2', '0', '1'],
            [[.1, .4, .5], [.2, .7, .1], [.1, 0, .9], [1, 0, 0]],
            types.CategoryLabel(),
            types.MulticlassPreds(vocab=['0', '1', '2'], null_idx=0))
        testing_utils.assert_dicts_almost_equal(self, result, {
            'accuracy': 0.0,
            'f1': 0.0,
            'precision': 0.0,
            'recall': 0.0
        })

        # No null index.
        result = multiclass_metrics.compute(
            ['1', '2', '0', '1'],
            [[.1, .4, .5], [0, .1, .9], [.1, 0, .9], [0, 1, 0]],
            types.CategoryLabel(),
            types.MulticlassPreds(vocab=['0', '1', '2']))
        testing_utils.assert_dicts_almost_equal(self, result,
                                                {'accuracy': 0.5})

        # Empty labels and predictions.
        result = multiclass_metrics.compute([], [], types.CategoryLabel(),
                                            types.MulticlassPreds(
                                                vocab=['0', '1', '2'],
                                                null_idx=0))
        testing_utils.assert_dicts_almost_equal(self, result, {})
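With null_idx=0, the class at vocab index 0 is treated as the negative class, which is why precision, recall, and F1 are reported only when null_idx is set (the "No null index" case yields accuracy alone). For the "some incorrect" case the numbers work out as in this minimal sketch, which mimics but is not the LIT implementation:

    import numpy as np

    labels = ['1', '2', '0', '1']
    probs = [[.1, .4, .5], [0, .1, .9], [.1, 0, .9], [0, 1, 0]]
    vocab = ['0', '1', '2']

    preds = [vocab[int(np.argmax(p))] for p in probs]             # ['2', '2', '2', '1']
    tp = sum(p == y and y != '0' for p, y in zip(preds, labels))  # 2 true positives
    precision = tp / sum(p != '0' for p in preds)                 # 2/4 = 0.5
    recall = tp / sum(y != '0' for y in labels)                   # 2/3 ~= 0.66666
    f1 = 2 * precision * recall / (precision + recall)            # ~0.57143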
Example #3
    def test_compute(self):
        corpusbleu_metrics = metrics.CorpusBLEU()

        # All correct predictions.
        result = corpusbleu_metrics.compute(
            ['This is a test.', 'Test two', 'A third test example'],
            ['This is a test.', 'Test two', 'A third test example'],
            types.GeneratedText(), types.GeneratedText())
        testing_utils.assert_dicts_almost_equal(self, result,
                                                {'corpus_bleu': 100.00000})

        # Some incorrect predictions.
        result = corpusbleu_metrics.compute(
            ['This is a test.', 'Test one', 'A third test'],
            ['This is a test.', 'Test two', 'A third test example'],
            types.GeneratedText(), types.GeneratedText())
        testing_utils.assert_dicts_almost_equal(self, result,
                                                {'corpus_bleu': 68.037493})

        result = corpusbleu_metrics.compute(
            ['This is a test.', 'Test one', 'A third test'],
            ['these test.', 'Test two', 'A third test example'],
            types.GeneratedText(), types.GeneratedText())
        testing_utils.assert_dicts_almost_equal(
            self, result, {'corpus_bleu': 29.508062388758525})

        # Empty labels and predictions.
        result = corpusbleu_metrics.compute([], [], types.GeneratedText(),
                                            types.GeneratedText())
        testing_utils.assert_dicts_almost_equal(self, result, {})
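The perfect-match case shows that CorpusBLEU reports scores on sacrebleu's 0-100 scale. A minimal sketch of corpus-level BLEU with sacrebleu, assuming that library as the backend (LIT's exact smoothing settings may differ, so scores need not match the expected values digit for digit):

    import sacrebleu

    preds = ['This is a test.', 'Test one', 'A third test']
    labels = ['This is a test.', 'Test two', 'A third test example']

    # sacrebleu takes the hypotheses plus a list of reference streams,
    # one stream per reference set.
    score = sacrebleu.corpus_bleu(preds, [labels])
    print(score.score)  # corpus BLEU on a 0-100 scale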
Example #4
    def test_compute(self):
        regression_metrics = metrics.RegressionMetrics()

        # All correct predictions.
        result = regression_metrics.compute([1, 2, 3, 4], [1, 2, 3, 4],
                                            types.RegressionScore(),
                                            types.RegressionScore())
        testing_utils.assert_dicts_almost_equal(self, result, {
            'mse': 0,
            'pearsonr': 1.0,
            'spearmanr': 1.0
        })

        # Some incorrect predictions.
        result = regression_metrics.compute([1, 2, 3, 4], [1, 2, 5.5, 6.3],
                                            types.RegressionScore(),
                                            types.RegressionScore())
        testing_utils.assert_dicts_almost_equal(self, result, {
            'mse': 2.885,
            'pearsonr': 0.96566,
            'spearmanr': 1.0
        })

        # All incorrect predictions (and not monotonic).
        result = regression_metrics.compute([1, 2, 3, 4], [-5, -10, 5, 6],
                                            types.RegressionScore(),
                                            types.RegressionScore())
        testing_utils.assert_dicts_almost_equal(self, result, {
            'mse': 47.0,
            'pearsonr': 0.79559,
            'spearmanr': 0.79999
        })

        # Empty labels and predictions.
        result = regression_metrics.compute([], [], types.RegressionScore(),
                                            types.RegressionScore())
        testing_utils.assert_dicts_almost_equal(self, result, {})
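The regression metrics are the standard ones: mean squared error plus Pearson and Spearman rank correlation. A minimal sketch of the same quantities for the "some incorrect" case, using numpy and scipy.stats rather than the code under test:

    import numpy as np
    from scipy.stats import pearsonr, spearmanr

    labels = np.array([1, 2, 3, 4])
    preds = np.array([1, 2, 5.5, 6.3])

    mse = np.mean((labels - preds) ** 2)    # (2.5**2 + 2.3**2) / 4 = 2.885
    pearson, _ = pearsonr(labels, preds)    # ~0.96566
    spearman, _ = spearmanr(labels, preds)  # 1.0: prediction order matches label order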