def test_single_metric(self):
    """A provider-built BLEU scorer must score exactly like a directly constructed one."""
    sentence = self.tokenize("Consistency is the last refuge of the unimaginative")
    direct = SentenceBleuScorer('n=4')
    via_provider = ScorerProvider().get("SENTENCEBLEU n=4")
    direct.set_reference(sentence)
    via_provider.set_reference(sentence)
    self.assertEqual(direct.score(sentence), via_provider.score(sentence))
def test_interpolated_metrics(self):
    """Interpolating BLEU with BLEU must be indistinguishable from a single BLEU scorer."""
    sentence = self.tokenize("Consistency is the last refuge of the unimaginative")
    plain_bleu = SentenceBleuScorer('n=4')
    interpolated = ScorerProvider().get("INTERPOLATE w=0.3,0.7; SENTENCEBLEU n=4; SENTENCEBLEU n=4")
    for scorer in (plain_bleu, interpolated):
        scorer.set_reference(sentence)
    self.assertEqual(plain_bleu.score(sentence), interpolated.score(sentence))
def test_single_metric(self):
    """get() with a single-metric config must reproduce the scorer it names."""
    tokens = self.tokenize("Consistency is the last refuge of the unimaginative")
    expected = SentenceBleuScorer('n=4')
    actual = ScorerProvider().get("SENTENCEBLEU n=4")
    expected.set_reference(tokens)
    actual.set_reference(tokens)
    self.assertEqual(expected.score(tokens), actual.score(tokens))
def test_interpolated_metrics(self):
    """A BLEU+BLEU interpolation must score the same as a lone BLEU scorer."""
    tokens = self.tokenize("Consistency is the last refuge of the unimaginative")
    baseline = SentenceBleuScorer('n=4')
    # Interpolating BLEU with BLEU should obviously collapse to plain BLEU,
    # regardless of the weights.
    combined = ScorerProvider().get("INTERPOLATE w=0.3,0.7; SENTENCEBLEU n=4; SENTENCEBLEU n=4")
    baseline.set_reference(tokens)
    combined.set_reference(tokens)
    self.assertEqual(baseline.score(tokens), combined.score(tokens))
def _corpus_sentence_bleus(scorer, hypothesis_files, reference_files):
    """Score each system's hypotheses against its references.

    :param scorer: a SentenceBleuScorer instance (reused across systems)
    :param hypothesis_files: list of hypothesis filenames, one per system
    :param reference_files: list of reference filenames, aligned with
        hypothesis_files
    :return: ndarray of shape (n_systems, n_sentences) with sentence-level
        BLEU scores scaled to percentages
    """
    per_system = []
    for hyp_filename, ref_filename in zip(hypothesis_files, reference_files):
        hypotheses = file2list(hyp_filename)
        references = file2list(ref_filename)
        bleus = []
        for hyp_line, ref_line in zip(hypotheses, references):
            scorer.set_reference(ref_line.split())
            # scorer.score returns BLEU in [0, 1]; report percentages.
            bleus.append(scorer.score(hyp_line.split()) * 100)
        per_system.append(bleus)
    return np.asarray(per_system)


def evaluate_from_file(args):
    """
    Evaluate translation hypotheses from files of hypotheses and references.

    :param args: Evaluation parameters; must provide the aligned filename
        lists `hypotheses`/`references` and `baselines`/`base_references`.
    :return: tuple (average_bleus, average_bleu_baselines) — per-sentence
        BLEU percentages averaged over systems, for the hypotheses and the
        baselines respectively.
    """
    sentence_bleu_scorer = SentenceBleuScorer('')
    # mean over axis 0 averages each sentence position across systems
    # (equivalent to transposing and averaging over axis 1).
    average_bleus = _corpus_sentence_bleus(
        sentence_bleu_scorer, args.hypotheses, args.references).mean(axis=0)
    average_bleu_baselines = _corpus_sentence_bleus(
        sentence_bleu_scorer, args.baselines, args.base_references).mean(axis=0)
    return average_bleus, average_bleu_baselines
def get(self, config_string):
    """
    Return a scorer matching the metric and parameters defined in
    @param config_string.

    Example: ScorerProvider.get("SENTENCEBLEU n=4") returns a
    SentenceBleuScorer object that considers n-gram precision up to n=4.

    If more than one metric is provided (separated by `;`), an interpolated
    scorer is returned.
    Example: ScorerProvider.get("INTERPOLATE w=0.5,0.5; SENTENCEBLEU n=4;
    METEOR meteor_language=fr, meteor_path=/foo/bar/meteor") returns an
    InterpolatedScorer object that scores hypotheses using
    0.5 * bleu_score + 0.5 * meteor_score.

    :raises NotImplementedError: if the metric name is not recognized.
    """
    # Interpolation configs are handled wholesale by the interpolator,
    # which parses the weights and sub-configs itself.
    if config_string.startswith("INTERPOLATE"):
        return si.ScorerInterpolator(config_string)
    # "NAME arg1,arg2" -> ("NAME", "arg1,arg2"); with no space, the
    # argument string is simply empty.
    scorer, _, arguments = config_string.partition(" ")
    if scorer == 'SENTENCEBLEU':
        return SentenceBleuScorer(arguments)
    elif scorer == 'METEOR':
        return MeteorScorer(arguments)
    elif scorer == 'BEER':
        return BeerScorer(arguments)
    elif scorer == 'CHRF':
        return CharacterFScorer(arguments)
    # add other scorers here
    else:
        raise NotImplementedError("No such scorer: %s" % scorer)
def test_clipping(self):
    """Repeating a reference token must not yield a perfect unigram score (clipping)."""
    reference = self.tokenize("The very nice man")
    hypothesis = self.tokenize("man man man man")
    unigram_bleu = SentenceBleuScorer('n=1')
    unigram_bleu.set_reference(reference)
    self.assertNotEqual(unigram_bleu.score(hypothesis), 1.0)
def test_completely_different_segments(self):
    """BLEU must be zero when hypothesis and reference share no tokens."""
    reference = self.tokenize("A A A")
    hypothesis = self.tokenize("B B B")
    bleu = SentenceBleuScorer('n=4')
    bleu.set_reference(reference)
    self.assertEqual(bleu.score(hypothesis), 0.0)
def test_identical_segments(self):
    """Scoring a segment against itself must give a perfect BLEU of 1.0."""
    tokens = self.tokenize("Consistency is the last refuge of the unimaginative")
    bleu = SentenceBleuScorer('n=4')
    bleu.set_reference(tokens)
    self.assertEqual(bleu.score(tokens), 1.0)
def test_identical_segments(self):
    """A segment scored against itself yields the maximum BLEU score of 1.0."""
    sentence = self.tokenize("Consistency is the last refuge of the unimaginative")
    scorer = SentenceBleuScorer('n=4')
    scorer.set_reference(sentence)
    self.assertEqual(scorer.score(sentence), 1.0)