def test_task2_system_level(self):
        """Spot-check the aggregated task 2 metrics against reference values.

        Reads the summary-level metrics from ``_task2_metrics_file_path``,
        runs them through ``aggregate_metrics`` and compares a handful of
        entries with the figures from the NIST files named in the comments
        below, using a 1% relative tolerance.
        """
        per_summary = JsonlReader(_task2_metrics_file_path, Metrics).read()
        per_system = aggregate_metrics(per_summary)

        # updateEval/ROUGE/rouge2.jk.m.avg
        for peer, recall in [('D', 14.499), ('C', 14.330), ('G', 13.942),
                             ('40', 11.189), ('55', 9.851)]:
            assert per_system[peer]['rouge-2_jk']['recall'] == pytest.approx(
                recall, rel=1e-2)

        # updateEval/ROUGE/rougeSU4.jk.m.avg
        for peer, recall in [('D', 17.998), ('C', 17.923), ('E', 17.689),
                             ('40', 14.306), ('44', 13.607)]:
            assert per_system[peer]['rouge-su4_jk']['recall'] == pytest.approx(
                recall, rel=1e-2)

        # updateEval/manual/Responsiveness/avg_content.all
        for peer, score in [('D', 4.833), ('C', 4.833), ('E', 4.750),
                            ('40', 2.967), ('36', 2.800)]:
            assert per_system[peer]['content_responsiveness'] == pytest.approx(
                score, rel=1e-2)

        # updateEval/BE/simple.jk.m.hm.avg
        for peer, recall in [('D', 10.687), ('C', 10.214), ('E', 10.177),
                             ('40', 7.219), ('44', 5.544)]:
            assert per_system[peer]['rouge-be-hm_jk'][
                'recall'] == pytest.approx(recall, rel=1e-2)
    def test_task1_system_level(self):
        """Spot-check the aggregated task 1 metrics against reference values.

        Reads the summary-level metrics from ``_task1_metrics_file_path``,
        runs them through ``aggregate_metrics`` and compares a handful of
        entries with the figures from the NIST files named in the comments
        below, using a 1% relative tolerance.
        """
        per_summary = JsonlReader(_task1_metrics_file_path, Metrics).read()
        per_system = aggregate_metrics(per_summary)

        # mainEval/ROUGE/rouge2.jk.m.avg
        for peer, recall in [('D', 17.528), ('C', 15.055), ('B', 13.992),
                             ('15', 12.448), ('29', 12.028)]:
            assert per_system[peer]['rouge-2_jk']['recall'] == pytest.approx(
                recall, rel=1e-2)

        # mainEval/ROUGE/rougeSU4.jk.m.avg
        for peer, recall in [('D', 21.892), ('C', 19.921), ('E', 19.396),
                             ('15', 17.711), ('24', 17.593)]:
            assert per_system[peer]['rouge-su4_jk']['recall'] == pytest.approx(
                recall, rel=1e-2)

        # mainEval/manual/avg_content
        for peer, score in [('D', 4.944), ('I', 4.889), ('G', 4.889),
                            ('4', 3.400), ('23', 3.311)]:
            assert per_system[peer]['content_responsiveness'] == pytest.approx(
                score, rel=1e-2)

        # mainEval/BE/simple.jk.m.hm.avg
        for peer, recall in [('D', 12.284), ('C', 9.593), ('B', 9.146),
                             ('15', 6.632), ('24', 6.577)]:
            assert per_system[peer]['rouge-be-hm_jk'][
                'recall'] == pytest.approx(recall, rel=1e-2)
Ejemplo n.º 3
0
    def test_system_level(self):
        """Spot-check aggregated ROUGE scores against reference values.

        Reads the summary-level metrics from ``_metrics_file_path``, runs
        them through ``aggregate_metrics`` and compares selected recall and
        precision entries with the figures from the NIST files named in the
        comments below, using a 1% relative tolerance.
        """
        per_summary = JsonlReader(_metrics_file_path, Metrics).read()
        per_system = aggregate_metrics(per_summary)

        # ROUGE/rouge2.jk.m.avg
        # NOTE(review): the first two rows read 'rouge-1_jk' although this
        # header names the rouge2 file -- confirm the intended source file.
        rouge_n_checks = [
            ('32', 'rouge-1_jk', 'recall', 32.4835488888889),
            ('32', 'rouge-1_jk', 'precision', 33.3402216666667),
            ('21', 'rouge-2_jk', 'recall', 5.72633166666666),
            ('21', 'rouge-2_jk', 'precision', 5.70379111111111),
            ('E', 'rouge-2_jk', 'recall', 10.5482258064516),
            ('E', 'rouge-2_jk', 'precision', 11.9319677419355),
        ]
        for peer, metric, field, expected in rouge_n_checks:
            assert per_system[peer][metric][field] == pytest.approx(
                expected, rel=1e-2)

        # ROUGE/rougeSU4.jk.m.avg
        su4_checks = [
            ('H', 14.8429),
            ('F', 15.8717666666667),
            ('E', 15.9369677419355),
            ('23', 5.56854166666667),
            ('1', 8.71616333333333),
        ]
        for peer, recall in su4_checks:
            assert per_system[peer]['rouge-su4_jk']['recall'] == pytest.approx(
                recall, rel=1e-2)
Ejemplo n.º 4
0
    def test_system_level(self):
        """Spot-check aggregated ROUGE, BE and responsiveness values.

        Reads the summary-level metrics from ``_metrics_file_path``, runs
        them through ``aggregate_metrics`` and compares a handful of entries
        with the figures from the NIST files named in the comments below,
        using a 1% relative tolerance.
        """
        per_summary = JsonlReader(_metrics_file_path, Metrics).read()
        per_system = aggregate_metrics(per_summary)

        # NISTeval2/ROUGE/rouge2.jk.m.avg
        for peer, recall in [('C', 13.260), ('D', 12.380), ('B', 11.788),
                             ('24', 9.558), ('15', 9.097)]:
            assert per_system[peer]['rouge-2_jk']['recall'] == pytest.approx(
                recall, rel=1e-2)

        # NISTeval2/ROUGE/rougeSU4.jk.m.avg
        for peer, recall in [('C', 18.385), ('D', 17.814), ('B', 17.665),
                             ('24', 15.529), ('12', 14.755)]:
            assert per_system[peer]['rouge-su4_jk']['recall'] == pytest.approx(
                recall, rel=1e-2)

        # NISTeval/responsiveness/avg_content
        for peer, score in [('D', 4.9), ('C', 4.9), ('B', 4.9), ('27', 3.08),
                            ('23', 3.0)]:
            assert per_system[peer]['content_responsiveness'] == pytest.approx(
                score, rel=1e-2)

        # NISTeval/responsiveness/avg_overall
        for peer, score in [('E', 4.9), ('D', 4.9), ('I', 4.8), ('27', 2.84),
                            ('23', 2.76)]:
            assert per_system[peer]['overall_responsiveness'] == pytest.approx(
                score, rel=1e-2)

        # NISTeval2/BE/simple.jk.m.hm.avg
        for peer, recall in [('C', 9.905), ('B', 7.847), ('D', 7.466),
                             ('24', 5.107), ('23', 5.049)]:
            assert per_system[peer]['rouge-be-hm_jk'][
                'recall'] == pytest.approx(recall, rel=1e-2)
    def test_system_level(self):
        """Spot-check aggregated ROUGE, manual and BE values.

        Reads the summary-level metrics from ``_metrics_file_path``, runs
        them through ``aggregate_metrics`` and compares a sample of entries
        with the figures from the NIST files named in the comments below,
        using a 1% relative tolerance throughout.
        """
        per_summary = JsonlReader(_metrics_file_path, Metrics).read()
        per_system = aggregate_metrics(per_summary)

        # Each entry: (metric key, [(system id, expected recall), ...]).
        rouge_checks = [
            # ROUGE/rouge2.m.avg
            ('rouge-2', [('43', 10.382), ('13', 9.900), ('14', 9.773),
                         ('2', 9.610), ('65', 9.558)]),
            # ROUGE/rouge2.jk.m.avg
            ('rouge-2_jk', [('D', 13.197), ('F', 12.896), ('H', 12.010),
                            ('43', 10.395), ('13', 9.901)]),
            # ROUGE/rougeSU4.m.avg
            ('rouge-su4', [('43', 13.625), ('37', 13.574), ('60', 13.570),
                           ('2', 13.419), ('14', 13.283)]),
            # ROUGE/rougeSU4.jk.m.avg
            ('rouge-su4_jk', [('D', 16.878), ('F', 16.490), ('H', 15.565),
                              ('43', 13.646), ('37', 13.592)]),
        ]
        for metric, rows in rouge_checks:
            for peer, recall in rows:
                assert per_system[peer][metric]['recall'] == pytest.approx(
                    recall, rel=1e-2)

        # manual/manual.model.avg
        # Expected values are listed in the order of systems A, B, C.
        model_checks = [
            ('num_scus_jk', (8.021, 8.479, 8.208)),
            ('modified_pyramid_score_jk', (0.608, 0.625, 0.651)),
            ('linguistic_quality', (4.833, 4.812, 4.604)),
            ('overall_responsiveness', (4.688, 4.583, 4.500)),
        ]
        for metric, values in model_checks:
            for peer, expected in zip(('A', 'B', 'C'), values):
                assert per_system[peer][metric] == pytest.approx(
                    expected, rel=1e-2)

        # manual/manual.peer.avg
        # Expected values are listed in the order of systems 0, 1, 2.
        peer_checks = [
            ('modified_pyramid_score', (0.166, 0.265, 0.280)),
            ('num_scus', (2.635, 3.854, 4.000)),
            ('num_repetitions', (0.688, 0.885, 1.156)),
            ('modified_pyramid_score_jk', (0.163, 0.261, 0.276)),
            ('linguistic_quality', (3.333, 2.719, 2.354)),
            ('overall_responsiveness', (2.073, 2.427, 2.385)),
        ]
        for metric, values in peer_checks:
            for peer, expected in zip(('0', '1', '2'), values):
                assert per_system[peer][metric] == pytest.approx(
                    expected, rel=1e-2)

        be_checks = [
            # BE/simple.m.hm.avg
            ('rouge-be-hm', [('14', 6.462), ('65', 6.276), ('43', 6.257),
                             ('49', 6.247), ('60', 6.198)]),
            # BE/simple.jk.m.hm.avg
            ('rouge-be-hm_jk', [('D', 9.959), ('F', 9.553), ('G', 9.154),
                                ('14', 6.480), ('65', 6.293)]),
        ]
        for metric, rows in be_checks:
            for peer, recall in rows:
                assert per_system[peer][metric]['recall'] == pytest.approx(
                    recall, rel=1e-2)
Ejemplo n.º 6
0
    def test_system_level_B(self):
        """Spot-check the aggregated "B" metrics against reference values.

        Reads the summary-level metrics from ``_metrics_B_file_path``, runs
        them through ``aggregate_metrics`` and compares a sample of entries
        with the figures from the NIST files named in the comments below.
        Most checks use a 1% relative tolerance; a few rows carry a looser
        10% tolerance, which is kept exactly as it was.
        """
        per_summary = JsonlReader(_metrics_B_file_path, Metrics).read()
        per_system = aggregate_metrics(per_summary)

        # Each row: (system id, metric key, expected recall, rel. tolerance).
        rouge_checks = [
            # ROUGE/rouge2_B.m.avg
            ('43', 'rouge-2', 9.581, 1e-2),
            ('25', 'rouge-2', 9.259, 1e-2),
            ('17', 'rouge-2', 8.855, 1e-2),
            # ROUGE/rouge2_B.jk.m.avg
            ('E', 'rouge-2_jk', 11.474, 1e-2),
            ('H', 'rouge-2_jk', 10.865, 1e-1),
            ('43', 'rouge-2_jk', 9.589, 1e-2),
            # ROUGE/rougeSU4_B.m.avg
            ('43', 'rouge-su4', 13.080, 1e-2),
            ('24', 'rouge-su4', 12.803, 1e-2),
            ('17', 'rouge-su4', 12.792, 1e-2),
            # ROUGE/rougeSU4_B.jk.m.avg
            ('E', 'rouge-su4_jk', 14.941, 1e-2),
            ('D', 'rouge-su4_jk', 14.368, 1e-2),
            ('43', 'rouge-su4_jk', 13.086, 1e-2),
        ]
        for peer, metric, recall, tolerance in rouge_checks:
            assert per_system[peer][metric]['recall'] == pytest.approx(
                recall, rel=tolerance)

        # manual/manual.model.B.avg
        # Expected values are listed in the order of systems A, B, C.
        model_checks = [
            ('num_scus_jk', (6.682, 5.409, 5.864)),
            ('modified_pyramid_score_jk', (0.663, 0.554, 0.565)),
            ('linguistic_quality', (4.909, 4.909, 4.955)),
            ('overall_responsiveness', (4.773, 4.500, 4.682)),
        ]
        for metric, values in model_checks:
            for peer, expected in zip(('A', 'B', 'C'), values):
                assert per_system[peer][metric] == pytest.approx(
                    expected, rel=1e-2)

        # manual/manual.peer.B.avg
        # Expected values are listed in the order of systems 1, 2, 3.
        peer_checks = [
            ('modified_pyramid_score', (0.237, 0.284, 0.327)),
            ('num_scus', (2.636, 3.136, 3.682)),
            ('num_repetitions', (0.364, 0.568, 0.727)),
            ('modified_pyramid_score_jk', (0.234, 0.280, 0.322)),
            ('linguistic_quality', (3.455, 2.841, 2.886)),
            ('overall_responsiveness', (2.091, 2.114, 2.500)),
        ]
        for metric, values in peer_checks:
            for peer, expected in zip(('1', '2', '3'), values):
                assert per_system[peer][metric] == pytest.approx(
                    expected, rel=1e-2)

        be_checks = [
            # BE/simple_B.m.hm.avg
            ('43', 'rouge-be-hm', 6.473, 1e-2),
            ('25', 'rouge-be-hm', 5.937, 1e-2),
            ('26', 'rouge-be-hm', 5.717, 1e-1),
            # BE/simplejk_B.m.hm.avg
            ('E', 'rouge-be-hm_jk', 7.970, 1e-2),
            ('D', 'rouge-be-hm_jk', 7.341, 1e-1),
            ('43', 'rouge-be-hm_jk', 6.480, 1e-1),
        ]
        for peer, metric, recall, tolerance in be_checks:
            assert per_system[peer][metric]['recall'] == pytest.approx(
                recall, rel=tolerance)

        # aesop_allpeers_B
        aesop_checks = [
            ('B', '2', 0.1278890909),
            ('E', '4', 0.4831818182),
            ('6', '8', 1.003988068),
        ]
        for peer, key, expected in aesop_checks:
            assert per_system[peer]['aesop'][key] == pytest.approx(
                expected, rel=1e-2)
Ejemplo n.º 7
0
    def test_system_level_A(self):
        """Spot-check the aggregated "A" metrics against reference values.

        Reads the summary-level metrics from ``_metrics_A_file_path``, runs
        them through ``aggregate_metrics`` and compares a sample of entries
        with the figures from the NIST files named in the comments below,
        using a 1% relative tolerance throughout.
        """
        per_summary = JsonlReader(_metrics_A_file_path, Metrics).read()
        per_system = aggregate_metrics(per_summary)

        # Each entry: (metric key, [(system id, expected recall), ...]).
        rouge_checks = [
            # ROUGE/rouge2_A.m.avg
            ('rouge-2', [('43', 13.440), ('17', 12.994), ('25', 12.821)]),
            # ROUGE/rouge2_A.jk.m.avg
            ('rouge-2_jk', [('D', 12.820), ('43', 13.447), ('17', 12.993)]),
            # ROUGE/rougeSU4_A.m.avg
            ('rouge-su4', [('43', 16.519), ('17', 15.984), ('24', 15.975)]),
            # ROUGE/rougeSU4_A.jk.m.avg
            ('rouge-su4_jk', [('D', 16.412), ('A', 16.118), ('43', 16.519)]),
        ]
        for metric, rows in rouge_checks:
            for peer, recall in rows:
                assert per_system[peer][metric]['recall'] == pytest.approx(
                    recall, rel=1e-2)

        # manual/manual.model.A.avg
        # Expected values are listed in the order of systems A, B, C.
        model_checks = [
            ('num_scus_jk', (10.227, 9.773, 9.818)),
            ('modified_pyramid_score_jk', (0.771, 0.781, 0.752)),
            ('linguistic_quality', (4.864, 4.818, 5.000)),
            ('overall_responsiveness', (4.818, 4.727, 4.955)),
        ]
        for metric, values in model_checks:
            for peer, expected in zip(('A', 'B', 'C'), values):
                assert per_system[peer][metric] == pytest.approx(
                    expected, rel=1e-2)

        # manual/manual.peer.A.avg
        # Expected values are listed in the order of systems 1, 2, 3.
        peer_checks = [
            ('modified_pyramid_score', (0.304, 0.362, 0.439)),
            ('num_scus', (3.909, 4.614, 5.750)),
            ('num_repetitions', (0.455, 1.432, 1.409)),
            ('modified_pyramid_score_jk', (0.300, 0.358, 0.433)),
            ('linguistic_quality', (3.205, 2.818, 2.705)),
            ('overall_responsiveness', (2.500, 2.841, 3.045)),
        ]
        for metric, values in peer_checks:
            for peer, expected in zip(('1', '2', '3'), values):
                assert per_system[peer][metric] == pytest.approx(
                    expected, rel=1e-2)

        be_checks = [
            # BE/simple_A.m.hm.avg
            ('rouge-be-hm', [('43', 8.565), ('17', 8.153), ('25', 8.012)]),
            # BE/simplejk_A.m.hm.avg
            ('rouge-be-hm_jk', [('D', 9.085), ('E', 8.628), ('43', 8.553)]),
        ]
        for metric, rows in be_checks:
            for peer, recall in rows:
                assert per_system[peer][metric]['recall'] == pytest.approx(
                    recall, rel=1e-2)

        # aesop_allpeers_A
        aesop_checks = [
            ('A', '1', 0.1191786364),
            ('C', '8', 3.853212409),
            ('4', '13', 0.4008335416),
        ]
        for peer, key, expected in aesop_checks:
            assert per_system[peer]['aesop'][key] == pytest.approx(
                expected, rel=1e-2)
    def test_system_level_B(self):
        summary_level_metrics = JsonlReader(_metrics_B_file_path,
                                            Metrics).read()
        system_level_metrics = aggregate_metrics(summary_level_metrics)

        # Check a few metrics to make sure they are equal to what's in the NIST files
        # ROUGE/rouge2_B.m.avg
        assert system_level_metrics['16']['rouge-2'][
            'recall'] == pytest.approx(8.024, 1e-2)
        assert system_level_metrics['13']['rouge-2'][
            'recall'] == pytest.approx(7.913, 1e-2)
        assert system_level_metrics['36']['rouge-2'][
            'recall'] == pytest.approx(7.311, 1e-2)
        assert system_level_metrics['8']['rouge-2']['recall'] == pytest.approx(
            7.251, 1e-2)
        assert system_level_metrics['4']['rouge-2']['recall'] == pytest.approx(
            7.058, 1e-2)

        # ROUGE/rouge2_B.jk.m.avg
        assert system_level_metrics['D']['rouge-2_jk'][
            'recall'] == pytest.approx(13.021, 1e-2)
        assert system_level_metrics['E']['rouge-2_jk'][
            'recall'] == pytest.approx(10.196, 1e-1)
        assert system_level_metrics['F']['rouge-2_jk'][
            'recall'] == pytest.approx(9.777, 1e-2)
        assert system_level_metrics['16']['rouge-2_jk'][
            'recall'] == pytest.approx(7.993, 1e-2)
        assert system_level_metrics['13']['rouge-2_jk'][
            'recall'] == pytest.approx(7.902, 1e-2)

        # ROUGE/rougeSU4_B.m.avg
        assert system_level_metrics['16']['rouge-su4'][
            'recall'] == pytest.approx(12.006, 1e-2)
        assert system_level_metrics['13']['rouge-su4'][
            'recall'] == pytest.approx(11.878, 1e-2)
        assert system_level_metrics['6']['rouge-su4'][
            'recall'] == pytest.approx(11.198, 1e-2)
        assert system_level_metrics['22']['rouge-su4'][
            'recall'] == pytest.approx(11.107, 1e-2)
        assert system_level_metrics['8']['rouge-su4'][
            'recall'] == pytest.approx(11.039, 1e-2)

        # ROUGE/rougeSU4_B.jk.m.avg
        assert system_level_metrics['D']['rouge-su4_jk'][
            'recall'] == pytest.approx(16.193, 1e-2)
        assert system_level_metrics['E']['rouge-su4_jk'][
            'recall'] == pytest.approx(13.978, 1e-2)
        assert system_level_metrics['G']['rouge-su4_jk'][
            'recall'] == pytest.approx(13.573, 1e-2)
        assert system_level_metrics['16']['rouge-su4_jk'][
            'recall'] == pytest.approx(11.979, 1e-2)
        assert system_level_metrics['13']['rouge-su4_jk'][
            'recall'] == pytest.approx(11.869, 1e-2)

        # manual/manual.model.B.avg
        assert system_level_metrics['A']['num_scus_jk'] == pytest.approx(
            6.609, 1e-2)
        assert system_level_metrics['B']['num_scus_jk'] == pytest.approx(
            7.696, 1e-2)
        assert system_level_metrics['C']['num_scus_jk'] == pytest.approx(
            5.913, 1e-2)

        assert system_level_metrics['A'][
            'modified_pyramid_score_jk'] == pytest.approx(0.629, 1e-2)
        assert system_level_metrics['B'][
            'modified_pyramid_score_jk'] == pytest.approx(0.729, 1e-2)
        assert system_level_metrics['C'][
            'modified_pyramid_score_jk'] == pytest.approx(0.551, 1e-2)

        assert system_level_metrics['A'][
            'linguistic_quality'] == pytest.approx(4.913, 1e-2)
        assert system_level_metrics['B'][
            'linguistic_quality'] == pytest.approx(4.826, 1e-2)
        assert system_level_metrics['C'][
            'linguistic_quality'] == pytest.approx(4.870, 1e-2)

        assert system_level_metrics['A'][
            'overall_responsiveness'] == pytest.approx(4.783, 1e-2)
        assert system_level_metrics['B'][
            'overall_responsiveness'] == pytest.approx(4.783, 1e-2)
        assert system_level_metrics['C'][
            'overall_responsiveness'] == pytest.approx(4.826, 1e-2)

        # manual/manual.peer.B.avg
        assert system_level_metrics['1'][
            'modified_pyramid_score'] == pytest.approx(0.187, 1e-2)
        assert system_level_metrics['2'][
            'modified_pyramid_score'] == pytest.approx(0.262, 1e-2)
        assert system_level_metrics['3'][
            'modified_pyramid_score'] == pytest.approx(0.235, 1e-2)

        assert system_level_metrics['1']['num_scus'] == pytest.approx(
            2.065, 1e-2)
        assert system_level_metrics['2']['num_scus'] == pytest.approx(
            2.804, 1e-2)
        assert system_level_metrics['3']['num_scus'] == pytest.approx(
            2.609, 1e-2)

        assert system_level_metrics['1']['num_repetitions'] == pytest.approx(
            0.348, 1e-2)
        assert system_level_metrics['2']['num_repetitions'] == pytest.approx(
            0.522, 1e-2)
        assert system_level_metrics['3']['num_repetitions'] == pytest.approx(
            0.348, 1e-2)

        assert system_level_metrics['1'][
            'modified_pyramid_score_jk'] == pytest.approx(0.184, 1e-2)
        assert system_level_metrics['2'][
            'modified_pyramid_score_jk'] == pytest.approx(0.256, 1e-2)
        assert system_level_metrics['3'][
            'modified_pyramid_score_jk'] == pytest.approx(0.228, 1e-2)

        assert system_level_metrics['1'][
            'linguistic_quality'] == pytest.approx(3.739, 1e-2)
        assert system_level_metrics['2'][
            'linguistic_quality'] == pytest.approx(2.696, 1e-2)
        assert system_level_metrics['3'][
            'linguistic_quality'] == pytest.approx(2.957, 1e-2)

        assert system_level_metrics['1'][
            'overall_responsiveness'] == pytest.approx(2.022, 1e-2)
        assert system_level_metrics['2'][
            'overall_responsiveness'] == pytest.approx(2.478, 1e-2)
        assert system_level_metrics['3'][
            'overall_responsiveness'] == pytest.approx(2.217, 1e-2)

        # BE/simple_B.m.hm.avg
        assert system_level_metrics['16']['rouge-be-hm'][
            'recall'] == pytest.approx(4.445, 1e-2)
        assert system_level_metrics['13']['rouge-be-hm'][
            'recall'] == pytest.approx(4.417, 1e-2)
        assert system_level_metrics['8']['rouge-be-hm'][
            'recall'] == pytest.approx(4.350, 1e-1)
        assert system_level_metrics['4']['rouge-be-hm'][
            'recall'] == pytest.approx(4.115, 1e-2)
        assert system_level_metrics['22']['rouge-be-hm'][
            'recall'] == pytest.approx(4.050, 1e-2)

        # BE/simplejk_B.m.hm.avg
        assert system_level_metrics['D']['rouge-be-hm_jk'][
            'recall'] == pytest.approx(8.842, 1e-2)
        assert system_level_metrics['F']['rouge-be-hm_jk'][
            'recall'] == pytest.approx(7.842, 1e-1)
        assert system_level_metrics['B']['rouge-be-hm_jk'][
            'recall'] == pytest.approx(7.081, 1e-1)
        assert system_level_metrics['16']['rouge-be-hm_jk'][
            'recall'] == pytest.approx(4.411, 1e-2)
        assert system_level_metrics['13']['rouge-be-hm_jk'][
            'recall'] == pytest.approx(4.402, 1e-2)

        # aesop_allpeers_B
        assert system_level_metrics['B']['aesop']['2'] == pytest.approx(
            0.1358091304, 1e-2)
        assert system_level_metrics['E']['aesop']['4'] == pytest.approx(
            0.1376682609, 1e-2)
        assert system_level_metrics['6']['aesop']['7'] == pytest.approx(
            0.2641304348, 1e-2)
        assert system_level_metrics['9']['aesop']['20'] == pytest.approx(
            0.09438347826, 1e-2)
        assert system_level_metrics['14']['aesop']['22'] == pytest.approx(
            0.3394478261, 1e-2)
    def test_system_level_A(self):
        """Spot-check aggregated system-level metrics against the NIST files.

        Reads the summary-level metrics from ``_metrics_A_file_path``, passes
        them to ``aggregate_metrics`` and compares a handful of systems per
        NIST file with the published figures.
        """
        # NOTE(review): another ``test_system_level_A`` is defined further
        # down in this file; if both sit in the same class the later one
        # shadows this one -- confirm they belong to different test classes.
        summary_level_metrics = JsonlReader(_metrics_A_file_path,
                                            Metrics).read()
        system_level_metrics = aggregate_metrics(summary_level_metrics)

        def lookup(*keys):
            # Walk the nested metrics mapping one key at a time.
            node = system_level_metrics
            for key in keys:
                node = node[key]
            return node

        def near(expected, rel=1e-2):
            # Relative tolerance is 1e-2 unless a check passes a looser one.
            return pytest.approx(expected, rel)

        # ROUGE/rouge2_A.m.avg
        assert lookup('22', 'rouge-2', 'recall') == near(9.574)
        assert lookup('18', 'rouge-2', 'recall') == near(9.418)
        assert lookup('23', 'rouge-2', 'recall') == near(9.404)
        assert lookup('24', 'rouge-2', 'recall') == near(9.196)
        assert lookup('36', 'rouge-2', 'recall') == near(9.194)

        # ROUGE/rouge2_A.jk.m.avg
        assert lookup('D', 'rouge-2_jk', 'recall') == near(12.862)
        assert lookup('H', 'rouge-2_jk', 'recall') == near(12.841, 1e-1)
        assert lookup('F', 'rouge-2_jk', 'recall') == near(12.556)
        assert lookup('22', 'rouge-2_jk', 'recall') == near(9.620)
        assert lookup('18', 'rouge-2_jk', 'recall') == near(9.451)

        # ROUGE/rougeSU4_A.m.avg
        assert lookup('22', 'rouge-su4', 'recall') == near(13.014)
        assert lookup('23', 'rouge-su4', 'recall') == near(12.963)
        assert lookup('24', 'rouge-su4', 'recall') == near(12.829)
        assert lookup('18', 'rouge-su4', 'recall') == near(12.407)
        assert lookup('34', 'rouge-su4', 'recall') == near(12.283)

        # ROUGE/rougeSU4_A.jk.m.avg
        assert lookup('H', 'rouge-su4_jk', 'recall') == near(16.294)
        assert lookup('F', 'rouge-su4_jk', 'recall') == near(16.212)
        assert lookup('D', 'rouge-su4_jk', 'recall') == near(16.200)
        assert lookup('22', 'rouge-su4_jk', 'recall') == near(13.049)
        assert lookup('23', 'rouge-su4_jk', 'recall') == near(12.978)

        # manual/manual.model.A.avg
        assert lookup('A', 'num_scus_jk') == near(10.870)
        assert lookup('B', 'num_scus_jk') == near(11.087)
        assert lookup('C', 'num_scus_jk') == near(9.826)

        assert lookup('A', 'modified_pyramid_score_jk') == near(0.779)
        assert lookup('B', 'modified_pyramid_score_jk') == near(0.747)
        assert lookup('C', 'modified_pyramid_score_jk') == near(0.661)

        assert lookup('A', 'linguistic_quality') == near(4.913)
        assert lookup('B', 'linguistic_quality') == near(4.870)
        assert lookup('C', 'linguistic_quality') == near(4.826)

        assert lookup('A', 'overall_responsiveness') == near(4.783)
        assert lookup('B', 'overall_responsiveness') == near(4.696)
        assert lookup('C', 'overall_responsiveness') == near(4.565)

        # manual/manual.peer.A.avg
        assert lookup('1', 'modified_pyramid_score') == near(0.233)
        assert lookup('2', 'modified_pyramid_score') == near(0.296)
        assert lookup('3', 'modified_pyramid_score') == near(0.399)

        assert lookup('1', 'num_scus') == near(3.304)
        assert lookup('2', 'num_scus') == near(4.217)
        assert lookup('3', 'num_scus') == near(5.500)

        assert lookup('1', 'num_repetitions') == near(0.522)
        assert lookup('2', 'num_repetitions') == near(1.217)
        assert lookup('3', 'num_repetitions') == near(1.413)

        assert lookup('1', 'modified_pyramid_score_jk') == near(0.229)
        assert lookup('2', 'modified_pyramid_score_jk') == near(0.291)
        assert lookup('3', 'modified_pyramid_score_jk') == near(0.393)

        assert lookup('1', 'linguistic_quality') == near(3.652)
        assert lookup('2', 'linguistic_quality') == near(2.717)
        assert lookup('3', 'linguistic_quality') == near(3.043)

        assert lookup('1', 'overall_responsiveness') == near(2.174)
        assert lookup('2', 'overall_responsiveness') == near(2.500)
        assert lookup('3', 'overall_responsiveness') == near(2.978)

        # BE/simple_A.m.hm.avg
        assert lookup('22', 'rouge-be-hm', 'recall') == near(5.937)
        assert lookup('23', 'rouge-be-hm', 'recall') == near(5.809)
        assert lookup('18', 'rouge-be-hm', 'recall') == near(5.749)
        assert lookup('13', 'rouge-be-hm', 'recall') == near(5.553)
        assert lookup('16', 'rouge-be-hm', 'recall') == near(5.497)

        # BE/simplejk_A.m.hm.avg
        assert lookup('F', 'rouge-be-hm_jk', 'recall') == near(9.114)
        assert lookup('H', 'rouge-be-hm_jk', 'recall') == near(8.690, 1e-1)
        assert lookup('D', 'rouge-be-hm_jk', 'recall') == near(8.449, 1e-1)
        assert lookup('22', 'rouge-be-hm_jk', 'recall') == near(5.973)
        assert lookup('23', 'rouge-be-hm_jk', 'recall') == near(5.828)

        # aesop_allpeers_A
        assert lookup('A', 'aesop', '1') == near(0.09517478261)
        assert lookup('C', 'aesop', '8') == near(0.0)
        assert lookup('4', 'aesop', '13') == near(0.6150630435)
        assert lookup('8', 'aesop', '22') == near(0.3684913043)
        assert lookup('16', 'aesop', '27') == near(11.80434783)
    def test_system_level_B(self):
        """Spot-check aggregated system-level metrics against the NIST files.

        Reads the summary-level metrics from ``_metrics_B_file_path``, passes
        them to ``aggregate_metrics`` and compares a handful of systems per
        NIST file with the published figures.
        """
        summary_level_metrics = JsonlReader(_metrics_B_file_path,
                                            Metrics).read()
        system_level_metrics = aggregate_metrics(summary_level_metrics)

        def lookup(*keys):
            # Walk the nested metrics mapping one key at a time.
            node = system_level_metrics
            for key in keys:
                node = node[key]
            return node

        def near(expected, rel=1e-2):
            # Relative tolerance is 1e-2 unless a check passes a looser one.
            return pytest.approx(expected, rel)

        # ROUGE/rouge2_B.m.avg
        assert lookup('2', 'rouge-2', 'recall') == near(31.956)
        assert lookup('34', 'rouge-2', 'recall') == near(10.386)
        assert lookup('40', 'rouge-2', 'recall') == near(10.373)
        assert lookup('35', 'rouge-2', 'recall') == near(10.104)
        assert lookup('3', 'rouge-2', 'recall') == near(9.820)

        # ROUGE/rouge2_B.jk.m.avg
        # The check for model 'C' (NIST value 12.550, tolerance 1e-2) is
        # intentionally left out: the aggregated value seemed to be off by a
        # bit when this test was written.
        assert lookup('H', 'rouge-2_jk', 'recall') == near(12.436)
        assert lookup('E', 'rouge-2_jk', 'recall') == near(11.001)
        assert lookup('2', 'rouge-2_jk', 'recall') == near(31.932)
        assert lookup('34', 'rouge-2_jk', 'recall') == near(10.417)

        # ROUGE/rougeSU4_B.m.avg
        assert lookup('2', 'rouge-su4', 'recall') == near(33.688)
        assert lookup('40', 'rouge-su4', 'recall') == near(13.948)
        assert lookup('34', 'rouge-su4', 'recall') == near(13.851)
        assert lookup('35', 'rouge-su4', 'recall') == near(13.839)
        assert lookup('51', 'rouge-su4', 'recall') == near(13.650)

        # ROUGE/rougeSU4_B.jk.m.avg
        assert lookup('C', 'rouge-su4_jk', 'recall') == near(16.386)
        assert lookup('H', 'rouge-su4_jk', 'recall') == near(16.602)
        assert lookup('E', 'rouge-su4_jk', 'recall') == near(15.152)
        assert lookup('2', 'rouge-su4_jk', 'recall') == near(33.668)
        assert lookup('40', 'rouge-su4_jk', 'recall') == near(13.959)

        # manual/manual.model.B.avg
        assert lookup('A', 'num_scus_jk') == near(6.455)
        assert lookup('B', 'num_scus_jk') == near(8.591)
        assert lookup('C', 'num_scus_jk') == near(8.545)

        assert lookup('A', 'modified_pyramid_score_jk') == near(0.481)
        assert lookup('B', 'modified_pyramid_score_jk') == near(0.663)
        assert lookup('C', 'modified_pyramid_score_jk') == near(0.640)

        assert lookup('A', 'linguistic_quality') == near(8.727)
        assert lookup('B', 'linguistic_quality') == near(8.545)
        assert lookup('C', 'linguistic_quality') == near(9.364)

        assert lookup('A', 'overall_responsiveness') == near(8.364)
        assert lookup('B', 'overall_responsiveness') == near(8.318)
        assert lookup('C', 'overall_responsiveness') == near(9.136)

        # manual/manual.peer.B.avg
        assert lookup('1', 'modified_pyramid_score') == near(0.160)
        assert lookup('2', 'modified_pyramid_score') == near(0.690)
        assert lookup('3', 'modified_pyramid_score') == near(0.329)

        assert lookup('1', 'num_scus') == near(2.386)
        assert lookup('2', 'num_scus') == near(9.886)
        assert lookup('3', 'num_scus') == near(4.545)

        assert lookup('1', 'num_repetitions') == near(0.841)
        assert lookup('2', 'num_repetitions') == near(1.955)
        assert lookup('3', 'num_repetitions') == near(0.955)

        assert lookup('1', 'modified_pyramid_score_jk') == near(0.158)
        assert lookup('2', 'modified_pyramid_score_jk') == near(0.677)
        assert lookup('3', 'modified_pyramid_score_jk') == near(0.324)

        assert lookup('1', 'linguistic_quality') == near(6.455)
        assert lookup('2', 'linguistic_quality') == near(5.886)
        assert lookup('3', 'linguistic_quality') == near(7.250)

        assert lookup('1', 'overall_responsiveness') == near(4.318)
        assert lookup('2', 'overall_responsiveness') == near(6.182)
        assert lookup('3', 'overall_responsiveness') == near(6.114)

        # BE/simple_B.m.hm.avg
        assert lookup('2', 'rouge-be-hm', 'recall') == near(25.041)
        assert lookup('24', 'rouge-be-hm', 'recall') == near(6.389)
        assert lookup('40', 'rouge-be-hm', 'recall') == near(6.162)
        assert lookup('34', 'rouge-be-hm', 'recall') == near(6.118)
        assert lookup('35', 'rouge-be-hm', 'recall') == near(5.813)

        # BE/simplejk_B.m.hm.avg
        # Model 'F' is off by a little (presumably why it alone is checked
        # with the looser 1e-1 tolerance).
        assert lookup('C', 'rouge-be-hm_jk', 'recall') == near(6.795)
        assert lookup('H', 'rouge-be-hm_jk', 'recall') == near(7.040)
        assert lookup('F', 'rouge-be-hm_jk', 'recall') == near(6.094, 1e-1)
        assert lookup('2', 'rouge-be-hm_jk', 'recall') == near(25.042)
        assert lookup('34', 'rouge-be-hm_jk', 'recall') == near(6.134)

        # aesop_allpeers_B
        assert lookup('B', 'aesop', '2') == near(0.04890409091)
        assert lookup('E', 'aesop', '4') == near(0.2740872727)
        assert lookup('6', 'aesop', '7') == near(0.5850288957)
        assert lookup('9', 'aesop', '20') == near(0.06261788636)
        assert lookup('14', 'aesop', '34') == near(0.3664196656)
    def test_system_level_A(self):
        """Spot-check aggregated system-level metrics against the NIST files.

        Reads the summary-level metrics from ``_metrics_A_file_path``, passes
        them to ``aggregate_metrics`` and compares a handful of systems per
        NIST file with the published figures.
        """
        # NOTE(review): an earlier ``test_system_level_A`` exists in this
        # file; if both sit in the same class this definition shadows the
        # earlier one -- confirm they belong to different test classes.
        summary_level_metrics = JsonlReader(_metrics_A_file_path,
                                            Metrics).read()
        system_level_metrics = aggregate_metrics(summary_level_metrics)

        def lookup(*keys):
            # Walk the nested metrics mapping one key at a time.
            node = system_level_metrics
            for key in keys:
                node = node[key]
            return node

        def near(expected, rel=1e-2):
            # Every check in this test uses the same 1e-2 relative tolerance.
            return pytest.approx(expected, rel)

        # ROUGE/rouge2_A.m.avg
        assert lookup('2', 'rouge-2', 'recall') == near(33.165)
        assert lookup('34', 'rouge-2', 'recall') == near(12.163)
        assert lookup('40', 'rouge-2', 'recall') == near(12.089)
        assert lookup('35', 'rouge-2', 'recall') == near(10.869)
        assert lookup('3', 'rouge-2', 'recall') == near(10.655)

        # ROUGE/rouge2_A.jk.m.avg
        assert lookup('C', 'rouge-2_jk', 'recall') == near(14.864)
        assert lookup('H', 'rouge-2_jk', 'recall') == near(13.457)
        assert lookup('E', 'rouge-2_jk', 'recall') == near(13.341)
        assert lookup('2', 'rouge-2_jk', 'recall') == near(33.133)
        assert lookup('34', 'rouge-2_jk', 'recall') == near(12.184)

        # ROUGE/rougeSU4_A.m.avg
        assert lookup('2', 'rouge-su4', 'recall') == near(34.421)
        assert lookup('40', 'rouge-su4', 'recall') == near(15.101)
        assert lookup('34', 'rouge-su4', 'recall') == near(15.030)
        assert lookup('35', 'rouge-su4', 'recall') == near(14.487)
        assert lookup('51', 'rouge-su4', 'recall') == near(14.165)

        # ROUGE/rougeSU4_A.jk.m.avg
        assert lookup('C', 'rouge-su4_jk', 'recall') == near(18.355)
        assert lookup('H', 'rouge-su4_jk', 'recall') == near(17.199)
        assert lookup('E', 'rouge-su4_jk', 'recall') == near(16.917)
        assert lookup('2', 'rouge-su4_jk', 'recall') == near(34.399)
        assert lookup('40', 'rouge-su4_jk', 'recall') == near(15.131)

        # manual/manual.model.A.avg
        assert lookup('A', 'num_scus_jk') == near(10.364)
        assert lookup('B', 'num_scus_jk') == near(9.5)
        assert lookup('C', 'num_scus_jk') == near(12.364)

        assert lookup('A', 'modified_pyramid_score_jk') == near(0.685)
        assert lookup('B', 'modified_pyramid_score_jk') == near(0.616)
        assert lookup('C', 'modified_pyramid_score_jk') == near(0.720)

        assert lookup('A', 'linguistic_quality') == near(8.636)
        assert lookup('B', 'linguistic_quality') == near(9.136)
        assert lookup('C', 'linguistic_quality') == near(9.136)

        assert lookup('A', 'overall_responsiveness') == near(8.455)
        assert lookup('B', 'overall_responsiveness') == near(8.727)
        assert lookup('C', 'overall_responsiveness') == near(9.318)

        # manual/manual.peer.A.avg
        assert lookup('1', 'modified_pyramid_score') == near(0.175)
        assert lookup('2', 'modified_pyramid_score') == near(0.646)
        assert lookup('3', 'modified_pyramid_score') == near(0.358)

        assert lookup('1', 'num_scus') == near(3.182)
        assert lookup('2', 'num_scus') == near(11.977)
        assert lookup('3', 'num_scus') == near(6.00)

        assert lookup('1', 'num_repetitions') == near(1.318)
        assert lookup('2', 'num_repetitions') == near(2.455)
        assert lookup('3', 'num_repetitions') == near(1.568)

        assert lookup('1', 'modified_pyramid_score_jk') == near(0.172)
        assert lookup('2', 'modified_pyramid_score_jk') == near(0.635)
        assert lookup('3', 'modified_pyramid_score_jk') == near(0.352)

        assert lookup('1', 'linguistic_quality') == near(6.705)
        assert lookup('2', 'linguistic_quality') == near(5.477)
        assert lookup('3', 'linguistic_quality') == near(7.477)

        assert lookup('1', 'overall_responsiveness') == near(3.636)
        assert lookup('2', 'overall_responsiveness') == near(6.364)
        assert lookup('3', 'overall_responsiveness') == near(6.341)

        # BE/simple_A.m.hm.avg
        assert lookup('2', 'rouge-be-hm', 'recall') == near(24.820)
        assert lookup('34', 'rouge-be-hm', 'recall') == near(6.356)
        assert lookup('40', 'rouge-be-hm', 'recall') == near(6.321)
        assert lookup('45', 'rouge-be-hm', 'recall') == near(5.899)
        assert lookup('4', 'rouge-be-hm', 'recall') == near(5.843)

        # BE/simplejk_A.m.hm.avg
        assert lookup('C', 'rouge-be-hm_jk', 'recall') == near(7.876)
        assert lookup('E', 'rouge-be-hm_jk', 'recall') == near(6.909)
        assert lookup('F', 'rouge-be-hm_jk', 'recall') == near(6.840)
        assert lookup('2', 'rouge-be-hm_jk', 'recall') == near(24.830)
        assert lookup('34', 'rouge-be-hm_jk', 'recall') == near(6.379)

        # aesop_allpeers_A
        assert lookup('A', 'aesop', '1') == near(0.154895909090909)
        assert lookup('C', 'aesop', '8') == near(0.0419939626932389)
        assert lookup('4', 'aesop', '13') == near(0.2186197727)
        assert lookup('8', 'aesop', '22') == near(0.1286081818)
        assert lookup('16', 'aesop', '30') == near(0.2341865909)