def test_single_metric(self):
     """A provider-built SENTENCEBLEU scorer must score like a directly built one."""
     tokens = self.tokenize("Consistency is the last refuge of the unimaginative")
     expected_scorer = SentenceBleuScorer('n=4')
     actual_scorer = ScorerProvider().get("SENTENCEBLEU n=4")
     # Both scorers get the same reference, which is also used as hypothesis.
     for scorer in (expected_scorer, actual_scorer):
         scorer.set_reference(tokens)
     expected_score = expected_scorer.score(tokens)
     actual_score = actual_scorer.score(tokens)
     self.assertEqual(expected_score, actual_score)
 def test_interpolated_metrics(self):
     """Interpolating SENTENCEBLEU with itself must equal a single SENTENCEBLEU score."""
     tokens = self.tokenize(
         "Consistency is the last refuge of the unimaginative")
     plain_scorer = SentenceBleuScorer('n=4')
     # Both interpolated components are the same metric, so the weighted
     # combination is expected to match the plain scorer.
     interpolated_scorer = ScorerProvider().get(
         "INTERPOLATE w=0.3,0.7; SENTENCEBLEU n=4; SENTENCEBLEU n=4")
     plain_scorer.set_reference(tokens)
     interpolated_scorer.set_reference(tokens)
     plain_score = plain_scorer.score(tokens)
     interpolated_score = interpolated_scorer.score(tokens)
     self.assertEqual(plain_score, interpolated_score)
 def test_single_metric(self):
     """ScorerProvider.get("SENTENCEBLEU n=4") must agree with SentenceBleuScorer('n=4')."""
     sentence = self.tokenize(
         "Consistency is the last refuge of the unimaginative")
     direct = SentenceBleuScorer('n=4')
     from_provider = ScorerProvider().get("SENTENCEBLEU n=4")
     direct.set_reference(sentence)
     from_provider.set_reference(sentence)
     # The sentence is scored against itself by both scorers.
     self.assertEqual(direct.score(sentence), from_provider.score(sentence))
 def test_interpolated_metrics(self):
     """An interpolation of two identical BLEU scorers must match one BLEU scorer."""
     sentence = self.tokenize("Consistency is the last refuge of the unimaginative")
     single = SentenceBleuScorer('n=4')
     # The same metric appears twice in the interpolation, so the combined
     # score should be identical to the single-metric score.
     combined = ScorerProvider().get(
         "INTERPOLATE w=0.3,0.7; SENTENCEBLEU n=4; SENTENCEBLEU n=4")
     for scorer in (single, combined):
         scorer.set_reference(sentence)
     single_score = single.score(sentence)
     combined_score = combined.score(sentence)
     self.assertEqual(single_score, combined_score)
def _average_sentence_bleus(scorer, system_files, reference_files):
    """
    Score every system file line by line and average the scores over systems.

    :param scorer: Scorer exposing ``set_reference(tokens)`` and ``score(tokens)``.
    :param system_files: Sequence of filenames containing hypotheses, one per system.
    :param reference_files: Sequence of reference filenames, indexed in parallel
                            with ``system_files``.
    :return: Array with one entry per sentence position: the mean, across all
             systems, of the sentence-level score multiplied by 100.
    """
    per_system_bleus = []
    for n_system, data_filename in enumerate(system_files):
        hypotheses = file2list(data_filename)
        references = file2list(reference_files[n_system])
        bleus = []
        # zip stops at the shorter file, so surplus lines are ignored.
        for hyp_line, ref_line in zip(hypotheses, references):
            scorer.set_reference(ref_line.split())
            bleus.append(scorer.score(hyp_line.split()) * 100)
        per_system_bleus.append(bleus)
    # Rows are systems, columns are sentences; after transposing, axis 1 runs
    # over the systems. An empty file list yields a 1-D array, for which
    # axis 1 does not exist, so at least one system is required.
    return np.transpose(np.asarray(per_system_bleus)).mean(axis=1)


def evaluate_from_file(args):
    """
    Compute sentence-level BLEU averages for hypothesis and baseline systems.

    Each file in ``args.hypotheses`` is scored against the file at the same
    index in ``args.references``; each file in ``args.baselines`` is scored
    against the file at the same index in ``args.base_references``.

    :param args: Evaluation parameters providing the attributes ``hypotheses``,
                 ``references``, ``baselines`` and ``base_references``.
    :return: Tuple ``(average_bleus, average_bleu_baselines)``; each element
             holds, per sentence position, the score (scaled by 100) averaged
             over the corresponding systems.
    """
    sentence_bleu_scorer = SentenceBleuScorer('')

    average_bleus = _average_sentence_bleus(
        sentence_bleu_scorer, args.hypotheses, args.references)
    average_bleu_baselines = _average_sentence_bleus(
        sentence_bleu_scorer, args.baselines, args.base_references)

    return average_bleus, average_bleu_baselines
Example #6
0
    def get(self, config_string):
        """
        Build the scorer described by ``config_string``.

        The string has the form ``"<NAME> <arguments>"``: the text before the
        first space selects the scorer, and the remainder is passed unchanged
        to that scorer's constructor (an empty string if there is no space).
        Recognised names are ``SENTENCEBLEU``, ``METEOR``, ``BEER`` and
        ``CHRF``.

        Example: ScorerProvider.get("SENTENCEBLEU n=4") returns
                 SentenceBleuScorer("n=4").

        A string starting with ``INTERPOLATE`` is handed over as a whole to
        ``si.ScorerInterpolator``.

        Example: ScorerProvider.get("INTERPOLATE w=0.5,0.5; SENTENCEBLEU n=4; METEOR meteor_language=fr, meteor_path=/foo/bar/meteor")

        :param config_string: metric name, optionally followed by arguments
        :return: the scorer object constructed for the requested metric
        :raises NotImplementedError: if the metric name is not recognised
        """
        # interpolation: the complete string goes to the interpolator
        if config_string.startswith("INTERPOLATE"):
            return si.ScorerInterpolator(config_string)

        # Split at the first space only; without a space the arguments are ''.
        name, _, params = config_string.partition(" ")

        if name == 'SENTENCEBLEU':
            return SentenceBleuScorer(params)
        if name == 'METEOR':
            return MeteorScorer(params)
        if name == 'BEER':
            return BeerScorer(params)
        if name == 'CHRF':
            return CharacterFScorer(params)
        # add other scorers here
        raise NotImplementedError("No such scorer: %s" % name)
 def test_clipping(self):
     """A hypothesis repeating one reference word must not get a perfect unigram score."""
     reference = self.tokenize("The very nice man")
     hypothesis = self.tokenize("man man man man")
     unigram_scorer = SentenceBleuScorer('n=1')
     unigram_scorer.set_reference(reference)
     score = unigram_scorer.score(hypothesis)
     self.assertNotEqual(score, 1.0)
 def test_completely_different_segments(self):
     """Segments that share no token must score exactly 0.0."""
     reference = self.tokenize("A A A")
     hypothesis = self.tokenize("B B B")
     bleu = SentenceBleuScorer('n=4')
     bleu.set_reference(reference)
     score = bleu.score(hypothesis)
     self.assertEqual(score, 0.0)
 def test_identical_segments(self):
     """Scoring a segment against itself must give exactly 1.0."""
     tokens = self.tokenize("Consistency is the last refuge of the unimaginative")
     bleu = SentenceBleuScorer('n=4')
     bleu.set_reference(tokens)
     score = bleu.score(tokens)
     self.assertEqual(score, 1.0)
Example #10
0
 def test_clipping(self):
     """Repeated matches of a single reference token must not reach a score of 1.0."""
     ref_tokens = self.tokenize("The very nice man")
     hyp_tokens = self.tokenize("man man man man")
     # Only unigrams are considered (n=1).
     bleu = SentenceBleuScorer('n=1')
     bleu.set_reference(ref_tokens)
     self.assertNotEqual(bleu.score(hyp_tokens), 1.0)
Example #11
0
 def test_completely_different_segments(self):
     """A hypothesis with no token in common with the reference must score 0.0."""
     ref_tokens = self.tokenize("A A A")
     hyp_tokens = self.tokenize("B B B")
     four_gram_scorer = SentenceBleuScorer('n=4')
     four_gram_scorer.set_reference(ref_tokens)
     self.assertEqual(four_gram_scorer.score(hyp_tokens), 0.0)
Example #12
0
 def test_identical_segments(self):
     """A hypothesis equal to its reference must receive the score 1.0."""
     sentence = self.tokenize(
         "Consistency is the last refuge of the unimaginative")
     four_gram_scorer = SentenceBleuScorer('n=4')
     four_gram_scorer.set_reference(sentence)
     self.assertEqual(four_gram_scorer.score(sentence), 1.0)