def test_from_file(self):
    expected_scores = Scores([
        Score([2, 3]),
        Score([4, 12]),
        Score([22, 500]),
        Score([3.1, 4.355]),
    ])
    self.assertEqual(
        expected_scores,
        Scores.from_file(
            open(
                os.path.dirname(os.path.realpath(__file__))
                + "/resources/example_scores")))
def get_numerators_and_denominators(score_file):
    """Transform score files obtained by the CoNLL scorer.

    This function transforms files obtained by the reference coreference
    scorer (https://code.google.com/p/reference-coreference-scorers/) into a
    format suitable for performing significance testing for differences in
    F1 score.

    Args:
        score_file: A file obtained by running the reference coreference
            scorer for a single metric, as in
            $ perl scorer.pl muc key response > conll_score_file

    Returns:
        A Scores object containing numerator/denominator for recall and
        precision for each document described in the score file.
    """
    scores_from_file = Scores()
    temp_mapping = {}

    for line in score_file.readlines():
        # readlines() keeps the trailing newline, so strip before comparing.
        if line.strip() == '====== TOTALS =======':
            break
        elif line.startswith("("):
            identifier = line.strip()
        elif line.startswith('Recall:'):
            entries = line.split()
            recall_numerator = entries[1].replace("(", "")
            recall_denominator = entries[3].replace(")", "")
            precision_numerator = entries[6].replace("(", "")
            precision_denominator = entries[8].replace(")", "")
            temp_mapping[identifier] = [
                recall_numerator,
                recall_denominator,
                precision_numerator,
                precision_denominator
            ]
            identifier = None

    for identifier in sorted(temp_mapping.keys()):
        scores_from_file.append(Score(temp_mapping[identifier]))

    return scores_from_file
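# Hedged usage sketch (not taken from the repository): feeding two CoNLL score
# files through get_numerators_and_denominators and testing the F1 difference.
# The file names and trial count are made up for illustration; the import paths
# assume an `art`-style package layout and may need adjusting to this project's
# actual module names.
from art import aggregators
from art import significance_tests

with open("muc_score_file_system1") as file1, open("muc_score_file_system2") as file2:
    scores_system1 = get_numerators_and_denominators(file1)
    scores_system2 = get_numerators_and_denominators(file2)

test = significance_tests.ApproximateRandomizationTest(
    scores_system1, scores_system2, aggregators.f_1, trials=10000)
print("Significance level:", test.run())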
def test_average(self):
    scores_for_average = Scores([
        Score([1]),
        Score([5]),
        Score([3]),
        Score([0]),
    ])
    self.assertEqual(9.0 / 4, aggregators.average(scores_for_average))
def evaluate_from_file(args):
    """Evaluate translation hypotheses against references read from files.

    :param args: Evaluation parameters
    :return: None
    """
    hypotheses = file2list(args.hypotheses)
    references = file2list(args.references)
    baseline = file2list(args.baseline)
    base_refs = args.references if args.base_references is None else args.base_references
    baseline_refs = file2list(base_refs)
    sentence_bleu_scorer = SentenceBleuScorer('')

    # Sentence-level BLEU for the system hypotheses.
    bleus = []
    for hyp_line, ref_line in zip(hypotheses, references):
        sentence_bleu_scorer.set_reference(ref_line.split())
        bleus.append(sentence_bleu_scorer.score(hyp_line.split()))

    # Sentence-level BLEU for the baseline hypotheses.
    bleus_baseline = []
    for hyp_line, ref_line in zip(baseline, baseline_refs):
        sentence_bleu_scorer.set_reference(ref_line.split())
        bleus_baseline.append(sentence_bleu_scorer.score(hyp_line.split()))

    print("Average BLEU hypotheses: " + str(float(sum(bleus)) / len(bleus)))
    print("Average BLEU baseline: " + str(float(sum(bleus_baseline)) / len(bleus_baseline)))

    # Wrap each sentence-level BLEU in a Score object and run the paired
    # approximate randomization test on the two score collections.
    scores_system = [Score([bleu]) for bleu in bleus]
    scores_baseline = [Score([bleu]) for bleu in bleus_baseline]
    scores_sys = Scores(scores_system)
    scores_bsln = Scores(scores_baseline)

    test = significance_tests.ApproximateRandomizationTest(
        scores_sys,
        scores_bsln,
        aggregators.average,
        trials=int(args.n_reps))
    print("\t Significance level:", test.run())
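# One plausible CLI wiring for evaluate_from_file, not taken from the
# repository itself: the option names are assumptions, but the attribute names
# (hypotheses, references, baseline, base_references, n_reps) mirror what the
# function reads from `args`. file2list and the BLEU scorer are assumed to be
# importable from the surrounding project.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(
        description="Sentence-BLEU significance test between a system and a baseline.")
    parser.add_argument("--hypotheses", required=True,
                        help="File with system hypotheses, one sentence per line.")
    parser.add_argument("--references", required=True,
                        help="File with reference translations, one sentence per line.")
    parser.add_argument("--baseline", required=True,
                        help="File with baseline hypotheses, one sentence per line.")
    parser.add_argument("--base_references", default=None,
                        help="Optional separate references for the baseline.")
    parser.add_argument("--n_reps", default=10000,
                        help="Number of trials for the approximate randomization test.")
    return parser.parse_args()

if __name__ == "__main__":
    evaluate_from_file(parse_args())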
def test_enum_sum_div_by_denom_sum(self):
    scores_for_enum_sum_div_by_denom_sum = Scores([
        Score([2, 3]),
        Score([4, 12]),
        Score([22, 500]),
        Score([3.1, 4.355]),
    ])
    self.assertEqual(
        31.1 / 519.355,
        aggregators.enum_sum_div_by_denom_sum(
            scores_for_enum_sum_div_by_denom_sum))
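# Worked check of the expected value above, in plain Python and independent of
# the library: the test implies enum_sum_div_by_denom_sum divides the sum of
# the first entries (numerators) by the sum of the second entries
# (denominators), i.e. a micro-averaged ratio rather than a mean of per-item
# ratios. This is a standalone sketch, not the library's implementation.
pairs = [(2, 3), (4, 12), (22, 500), (3.1, 4.355)]
numerator_sum = sum(n for n, _ in pairs)      # 31.1
denominator_sum = sum(d for _, d in pairs)    # 519.355
assert abs(numerator_sum / denominator_sum - 31.1 / 519.355) < 1e-12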
def run(self):
    """Compute the statistical significance of a difference between
    the systems via a paired two-sided approximate randomization test.

    Returns:
        An approximation of the probability of observing corpus-wide
        differences in scores at least as extreme as observed here, when
        there is no difference between the systems.
    """
    absolute_difference = math.fabs(
        self.aggregator(self.system1_scores)
        - self.aggregator(self.system2_scores))
    shuffled_was_at_least_as_high = 0

    for i in range(0, self.trials):
        pseudo_system1_scores = Scores()
        pseudo_system2_scores = Scores()

        for score1, score2 in zip(self.system1_scores, self.system2_scores):
            if random.randint(0, 1) == 0:
                pseudo_system1_scores.append(score1)
                pseudo_system2_scores.append(score2)
            else:
                pseudo_system1_scores.append(score2)
                pseudo_system2_scores.append(score1)

        pseudo_difference = math.fabs(
            self.aggregator(pseudo_system1_scores)
            - self.aggregator(pseudo_system2_scores))

        if pseudo_difference >= absolute_difference:
            shuffled_was_at_least_as_high += 1

    significance_level = (shuffled_was_at_least_as_high + 1) / (self.trials + 1)

    return significance_level
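# Hedged usage sketch for ApproximateRandomizationTest.run(): the per-document
# scores below are invented example numbers, and the import paths assume an
# `art`-style package layout (adjust to this repository's actual module names).
# The constructor signature (scores1, scores2, aggregator, trials=...) follows
# the calls used elsewhere in this repository.
from art import aggregators
from art import significance_tests
from art.scores import Score, Scores

scores_system_a = Scores([Score([0.41]), Score([0.37]), Score([0.52])])
scores_system_b = Scores([Score([0.39]), Score([0.36]), Score([0.50])])

test = significance_tests.ApproximateRandomizationTest(
    scores_system_a, scores_system_b, aggregators.average, trials=10000)
print("Significance level:", test.run())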
def test_f_1(self):
    scores_for_f_1 = Scores([
        Score([2, 3, 7, 8]),
        Score([4, 12, 33, 50]),
        Score([22, 500, 12.3, 15.9]),
        Score([3.1, 4.355, 1, 2]),
    ])
    recall = 31.1 / 519.355
    precision = 53.3 / 75.9
    f_1 = 2 * recall * precision / (recall + precision)
    self.assertEqual(f_1, aggregators.f_1(scores_for_f_1))
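# Worked check of the expected F1 above, in plain Python and independent of the
# library: each Score holds [recall_num, recall_denom, precision_num,
# precision_denom], so the test implies aggregators.f_1 pools the counts across
# documents before computing F1 rather than averaging per-document F1 values.
# Standalone sketch, not the library's implementation.
quadruples = [(2, 3, 7, 8), (4, 12, 33, 50), (22, 500, 12.3, 15.9), (3.1, 4.355, 1, 2)]
recall = sum(q[0] for q in quadruples) / sum(q[1] for q in quadruples)     # 31.1 / 519.355
precision = sum(q[2] for q in quadruples) / sum(q[3] for q in quadruples)  # 53.3 / 75.9
f_1 = 2 * recall * precision / (recall + precision)                        # ~0.110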
baseline_scores = []
for base_line, ref_line in zip(baselines, references):
    chrf3_score = computeChrF([ref_line.strip()], [base_line.strip()])
    baseline_scores.append(chrf3_score)

system_scores = []
for sys_line, ref_line in zip(system, references):
    chrf3_score = computeChrF([ref_line.strip()], [sys_line.strip()])
    system_scores.append(chrf3_score)

# Wrap each sentence-level chrF3 score in a Score object, keeping system and
# baseline scores under their matching names.
sc_system = [Score([chrf3_score]) for chrf3_score in system_scores]
sc_baseline = [Score([chrf3_score]) for chrf3_score in baseline_scores]

sc_system_ar = Scores(sc_system)
sc_baseline_ar = Scores(sc_baseline)

test = significance_tests.ApproximateRandomizationTest(
    sc_system_ar, sc_baseline_ar, aggregators.average, trials=10000)
print("\t Significance level:", test.run())