Example no. 1
 def test_from_file(self):
     expected_scores = Scores([
         Score([2, 3]),
         Score([4, 12]),
         Score([22, 500]),
         Score([3.1, 4.355]),
     ])
     self.assertEqual(
         expected_scores,
         Scores.from_file(
             open(
                 os.path.dirname(os.path.realpath(__file__)) +
                 "/resources/example_scores")))
def get_numerators_and_denominators(score_file):
    """Transform score files obtained by the CoNLL scorer.

    This function transforms files obtained by the reference coreference
    scorer (https://code.google.com/p/reference-coreference-scorers/) into
    a format suitable for performing significance testing for differences in
    F1 score.

    Args:
        score_file: A file obtained via running the reference coreference
                    scorer for a single metric, as in
                     $ perl scorer.pl muc key response > conll_score_file

    Returns:
        A Scores object containing the numerator/denominator for recall and
        precision for each document described in the score file.
    """
    scores_from_file = Scores()

    temp_mapping = {}

    for line in score_file.readlines():
        if line.strip() == '====== TOTALS =======':
            break
        elif line.startswith("("):
            identifier = line.strip()
        elif line.startswith('Recall:'):
            entries = line.split()
            recall_numerator = entries[1].replace("(", "")
            recall_denominator = entries[3].replace(")", "")
            precision_numerator = entries[6].replace("(", "")
            precision_denominator = entries[8].replace(")", "")

            temp_mapping[identifier] = [
                    recall_numerator,
                    recall_denominator,
                    precision_numerator,
                    precision_denominator
            ]

            identifier = None

    for identifier in sorted(temp_mapping.keys()):
        scores_from_file.append(
            Score(temp_mapping[identifier])
        )

    return scores_from_file
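For context, a minimal sketch of how get_numerators_and_denominators can feed a significance test. It assumes aggregators and significance_tests are imported from the art package as in the other examples, that Score accepts the numerator/denominator values as parsed above, and the score file names are hypothetical:

# Hypothetical CoNLL scorer outputs, e.g. produced with
#   perl scorer.pl muc key response1 > muc_scores_system1
#   perl scorer.pl muc key response2 > muc_scores_system2
with open("muc_scores_system1") as file1, open("muc_scores_system2") as file2:
    system1_scores = get_numerators_and_denominators(file1)
    system2_scores = get_numerators_and_denominators(file2)

# Compare the two systems on F1 with a paired approximate randomization test.
test = significance_tests.ApproximateRandomizationTest(
    system1_scores, system2_scores, aggregators.f_1, trials=10000)
print("Significance level:", test.run())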
Example no. 3
 def test_average(self):
     scores_for_average = Scores([
         Score([1]),
         Score([5]),
         Score([3]),
         Score([0]),
     ])
     self.assertEqual(9.0 / 4, aggregators.average(scores_for_average))
Example no. 4
def evaluate_from_file(args):
    """
    Evaluate translation hypotheses from a file against one or more reference files.
    :param args: Evaluation parameters
    :return: None
    """
    hypotheses = file2list(args.hypotheses)
    references = file2list(args.references)
    baseline = file2list(args.baseline)

    base_refs = args.references if args.base_references is None else args.base_references
    baseline_refs = file2list(base_refs)
    sentence_bleu_scorer = SentenceBleuScorer('')
    bleus = []
    for hyp_line, ref_line in zip(hypotheses, references):
        sentence_bleu_scorer.set_reference(ref_line.split())
        bleu = sentence_bleu_scorer.score(hyp_line.split())
        bleus.append(bleu)
    bleus_baseline = []
    for hyp_line, ref_line in zip(baseline, baseline_refs):
        sentence_bleu_scorer.set_reference(ref_line.split())
        bleu = sentence_bleu_scorer.score(hyp_line.split())
        bleus_baseline.append(bleu)

    print("Average BLEU hypotheses: " + str(float(sum(bleus)) / len(bleus)))
    print("Average BLEU baseline:   " + str(float(sum(bleus_baseline)) / len(bleus_baseline)))

    scores_system = []
    scores_baseline = []
    for bleu in bleus:
        scores_system.append(Score([bleu]))

    for bleu in bleus_baseline:
        scores_baseline.append(Score([bleu]))

    scores_sys = Scores(scores_system)
    scores_bsln = Scores(scores_baseline)

    test = significance_tests.ApproximateRandomizationTest(
        scores_sys,
        scores_bsln,
        aggregators.average,
        trials=int(args.n_reps))

    print("\t Significance level:", test.run())
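A hedged sketch of how evaluate_from_file might be wired to the command line; the flag names simply mirror the attributes read above (hypotheses, references, baseline, base_references, n_reps) and are an assumption, not part of the original script:

import argparse

if __name__ == '__main__':
    # Hypothetical CLI wrapper; file2list, SentenceBleuScorer etc. are assumed
    # to come from the script's own imports.
    parser = argparse.ArgumentParser(
        description='BLEU significance test between a system and a baseline.')
    parser.add_argument('--hypotheses', required=True)
    parser.add_argument('--references', required=True)
    parser.add_argument('--baseline', required=True)
    parser.add_argument('--base_references', default=None)
    parser.add_argument('--n_reps', default=10000)
    evaluate_from_file(parser.parse_args())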
def get_numerators_and_denominators(score_file):
    """Transform score files obtained by the CoNLL scorer.

    This function transforms files obtained by the reference coreference
    scorer (https://code.google.com/p/reference-coreference-scorers/) into
    a format suitable for performing significance testing for differences in
    F1 score.

    Args:
        score_file: A file obtained via running the reference coreference
                    scorer for a single metric, as in
                     $ perl scorer.pl muc key response > conll_score_file

    Returns:
        A Scores object containing the numerator/denominator for recall and
        precision for each document described in the score file.
    """
    scores_from_file = Scores()

    temp_mapping = {}

    for line in score_file.readlines():
        if line.strip() == '====== TOTALS =======':
            break
        elif line.startswith("("):
            identifier = line.strip()
        elif line.startswith('Recall:'):
            entries = line.split()
            recall_numerator = entries[1].replace("(", "")
            recall_denominator = entries[3].replace(")", "")
            precision_numerator = entries[6].replace("(", "")
            precision_denominator = entries[8].replace(")", "")

            temp_mapping[identifier] = [
                recall_numerator, recall_denominator, precision_numerator,
                precision_denominator
            ]

            identifier = None

    for identifier in sorted(temp_mapping.keys()):
        scores_from_file.append(Score(temp_mapping[identifier]))

    return scores_from_file
Example no. 6
 def test_enum_sum_div_by_denom_sum(self):
     scores_for_enum_sum_div_by_denom_sum = Scores([
         Score([2, 3]),
         Score([4, 12]),
         Score([22, 500]),
         Score([3.1, 4.355]),
     ])
     self.assertEqual(
         31.1 / 519.355,
         aggregators.enum_sum_div_by_denom_sum(
             scores_for_enum_sum_div_by_denom_sum))
Example no. 7
 def test_from_file(self):
     expected_scores = Scores(
         [
             Score([2, 3]),
             Score([4, 12]),
             Score([22, 500]),
             Score([3.1, 4.355]),
         ]
     )
     self.assertEqual(expected_scores, Scores.from_file(open(
         os.path.dirname(os.path.realpath(__file__)) +
     "/resources/example_scores")))
Example no. 8
    def run(self):
        """Compute the statistical significance of a difference between
        the systems via a paired two-sided approximate randomization test.

        Returns:
            An approximation of the probability of observing corpus-wide
            differences in scores at least as extreme as observed here, when
            there is no difference between the systems.
        """

        absolute_difference = math.fabs(
            self.aggregator(self.system1_scores) -
            self.aggregator(self.system2_scores))
        shuffled_was_at_least_as_high = 0

        for i in range(0, self.trials):
            pseudo_system1_scores = Scores()
            pseudo_system2_scores = Scores()

            for score1, score2 in zip(self.system1_scores,
                                      self.system2_scores):
                if random.randint(0, 1) == 0:
                    pseudo_system1_scores.append(score1)
                    pseudo_system2_scores.append(score2)
                else:
                    pseudo_system1_scores.append(score2)
                    pseudo_system2_scores.append(score1)

            pseudo_difference = math.fabs(
                self.aggregator(pseudo_system1_scores) -
                self.aggregator(pseudo_system2_scores))

            if pseudo_difference >= absolute_difference:
                shuffled_was_at_least_as_high += 1

        significance_level = (shuffled_was_at_least_as_high + 1) / (
            self.trials + 1)

        return significance_level
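A self-contained usage sketch for this test; the import paths assume the art package layout (art.scores, art.aggregators, art.significance_tests) and the per-document scores are made up:

from art.scores import Score, Scores
from art import aggregators, significance_tests

# Fabricated per-document scores; with aggregators.average each Score holds a
# single value, e.g. a sentence-level BLEU or chrF score.
system1 = Scores([Score([0.31]), Score([0.42]), Score([0.27]), Score([0.35])])
system2 = Scores([Score([0.29]), Score([0.40]), Score([0.30]), Score([0.33])])

test = significance_tests.ApproximateRandomizationTest(
    system1, system2, aggregators.average, trials=10000)
# run() returns (shuffled_was_at_least_as_high + 1) / (trials + 1), the
# approximate probability of a difference this large under the null.
print("Significance level:", test.run())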
Example no. 9
    def test_f_1(self):
        scores_for_f_1 = Scores([
            Score([2, 3, 7, 8]),
            Score([4, 12, 33, 50]),
            Score([22, 500, 12.3, 15.9]),
            Score([3.1, 4.355, 1, 2]),
        ])

        recall = 31.1 / 519.355
        precision = 53.3 / 75.9
        f_1 = 2 * recall * precision / (recall + precision)

        self.assertEqual(f_1, aggregators.f_1(scores_for_f_1))
# baseline_scores holds per-sentence chrF3 scores for the baseline system;
# baselines, references, system and computeChrF are defined earlier in the script.
baseline_scores = []
for base_line, ref_line in zip(baselines, references):
    chrf3_score = computeChrF([ref_line.strip()], [base_line.strip()])
    baseline_scores.append(chrf3_score)

system_scores = []
for sys_line, ref_line in zip(system, references):
    chrf3_score = computeChrF([ref_line.strip()], [sys_line.strip()])
    system_scores.append(chrf3_score)

# Wrap each per-sentence score in a Score object so the test can shuffle them.
sc_system = []
sc_baseline = []
for chrf3_score in system_scores:
    sc_system.append(Score([chrf3_score]))

for chrf3_score in baseline_scores:
    sc_baseline.append(Score([chrf3_score]))

sc_system_ar = Scores(sc_system)
sc_baseline_ar = Scores(sc_baseline)

test = significance_tests.ApproximateRandomizationTest(
    sc_system_ar, sc_baseline_ar, aggregators.average, trials=10000)
print("\t Significance level:", test.run())
Example no. 11
    def run(self):
        """Compute the statistical significance of a difference between
        the systems via a paired two-sided approximate randomization test.

        Returns:
            An approximation of the probability of observing corpus-wide
            differences in scores at least as extreme as observed here, when
            there is no difference between the systems.
        """

        absolute_difference = math.fabs(
            self.aggregator(self.system1_scores) -
            self.aggregator(self.system2_scores))
        shuffled_was_at_least_as_high = 0

        for i in range(0, self.trials):
            pseudo_system1_scores = Scores()
            pseudo_system2_scores = Scores()

            for score1, score2 in zip(self.system1_scores,
                                      self.system2_scores):
                if random.randint(0, 1) == 0:
                    pseudo_system1_scores.append(score1)
                    pseudo_system2_scores.append(score2)
                else:
                    pseudo_system1_scores.append(score2)
                    pseudo_system2_scores.append(score1)

            pseudo_difference = math.fabs(
                self.aggregator(pseudo_system1_scores) -
                self.aggregator(pseudo_system2_scores))

            if pseudo_difference >= absolute_difference:
                shuffled_was_at_least_as_high += 1

        significance_level = (shuffled_was_at_least_as_high +
                              1) / (self.trials + 1)

        return significance_level