Example #1
    def __init__(self):
        """
			Recall Oriented Understudy of Gisting Evaluation.
			Use 'rouge-metric' package as backend.
		"""
        super().__init__()
        self.rouge = PyRouge(rouge_l=True)
Example #2
def evaluation_metrics(summaries, hypotheses_list):
    rouge = PyRouge(rouge_n=(1, 2, 4),
                    rouge_l=True,
                    rouge_w=True,
                    rouge_w_weight=1.2,
                    rouge_s=True,
                    rouge_su=True,
                    skip_gap=4)
    actual_summary_list = []
    references_list = []
    folder_path = "../BBCNewsSummary/Summaries/business"
    for filename in glob.glob(os.path.join(folder_path, '*.txt')):
        with open(filename, 'r') as f:
            text = f.read()
            references = []
            actual_summary_list.append(text)
            # Pre-process and tokenize the summaries as you like
            references.append(text.split())
            references_list.append(references)

    # Accumulate per-document scores (fuzzy match ratio, ROUGE-1 recall and F1).
    fuzz_ratio, recall_scores_list, f_scores_list = [], [], []
    for i in range(len(summaries)):
        fuzz_ratio.append(fuzz.ratio(summaries[i], actual_summary_list[i]))
        scores = rouge.evaluate_tokenized(hypotheses_list[i],
                                          references_list[i])
        recall_scores_list.append(scores['rouge-1']['r'])
        f_scores_list.append(scores['rouge-1']['f'])
    return fuzz_ratio, recall_scores_list, f_scores_list
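The snippet above relies on the pre-tokenized entry point of 'rouge-metric'. A minimal self-contained sketch of that flow, with made-up one-line documents (tokenization is a plain whitespace split, as in the loop above):

from rouge_metric import PyRouge

# Made-up data: one hypothesis per document, one or more references per document.
hypotheses = ['the cat sat on the mat', 'it is sunny today']
references = [['a cat was sitting on the mat', 'the cat is on the mat'],
              ['it is a sunny day']]

# Pre-process and tokenize the summaries as you like (here: whitespace split).
hypotheses_tok = [hyp.split() for hyp in hypotheses]
references_tok = [[ref.split() for ref in refs] for refs in references]

rouge = PyRouge(rouge_n=(1, 2), rouge_l=True)
scores = rouge.evaluate_tokenized(hypotheses_tok, references_tok)
print(scores['rouge-1']['r'], scores['rouge-1']['f'])  # recall / F1, as used above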
Example #3
def calc_metrics(refs, hyps, metric="all"):
    metrics = dict()
    metrics["count"] = len(hyps)
    metrics["ref_example"] = refs[-1][-1]
    metrics["hyp_example"] = hyps[-1]
    if metric in ("bleu", "all"):
        metrics["bleu"] = corpus_bleu(refs, hyps)
    if metric in ("rouge", "all"):
        rouge = PyRouge(rouge_l=True, multi_ref_mode="best")
        scores = rouge.evaluate(hyps, refs)
        metrics.update(scores)
    return metrics
Example #4
def test_compare_multi_ref_summaries():
    for hyp, ref in load_multi_ref_summary_pairs():
        gt = PerlRouge(MAX_N, True, True, 1.2, True, True, 4,
                       'best').evaluate([hyp], [ref])
        out = PyRouge(MAX_N, True, True, 1.2, True, True, 4,
                      'best').evaluate([hyp], [ref])
        assert_close_rouge(out, gt)

        gt = PerlRouge(MAX_N, True, True, 1.2, True, True, 4,
                       'average').evaluate([hyp], [ref])
        out = PyRouge(MAX_N, True, True, 1.2, True, True, 4,
                      'average').evaluate([hyp], [ref])
        assert_close_rouge(out, gt)
Example #5
class RougeL(Module):
    def __init__(self):
        """
			Recall Oriented Understudy of Gisting Evaluation.
			Use 'rouge-metric' package as backend.
		"""
        super().__init__()
        self.rouge = PyRouge(rouge_l=True)

    def forward(self, hypothesis: List[List[str]],
                references: List[List[List[str]]]) -> float:
        if len(hypothesis) != len(references):
            raise ValueError(
                f'Batch sizes of hypotheses and references differ ({len(hypothesis)} != {len(references)}).'
            )

        hypothesis = [' '.join(hyp) for hyp in hypothesis]
        references = [[' '.join(ref) for ref in refs] for refs in references]

        scores = self.rouge.evaluate(hypotheses=hypothesis,
                                     multi_references=references)
        rouge_l_scores = scores['rouge-l']
        # 3 scores = Recall r, Precision p, FScore f
        # {'r': ..., 'p': ..., 'f': ...}
        f_score = rouge_l_scores['f']

        return f_score
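Assuming the base Module here is torch.nn.Module (so calling the instance dispatches to forward), a hypothetical usage sketch with made-up token lists:

# Illustrative only: inputs are pre-tokenized, batch size 1, two references.
metric = RougeL()
hypothesis = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
references = [[['a', 'cat', 'was', 'on', 'the', 'mat'],
               ['the', 'cat', 'is', 'on', 'the', 'mat']]]
f_score = metric(hypothesis, references)  # ROUGE-L F1 as a float
print(f_score)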
Example #6
def test_compare_all_multi_ref_summaries():
    hyp, ref = load_all_multi_ref_summaries()
    avg = PyRouge(MAX_N, True, True, 1.2, True, True, 4,
                  mode='average').evaluate(hyp, ref)
    indiv = PyRouge(MAX_N, True, True, 1.2, True, True, 4,
                    mode='individual').evaluate(hyp, ref)
    for key, avg_score in avg.items():
        # The reported F-score must be consistent with its own precision/recall pair.
        assert avg_score['f'] == py_rouge._f_score(avg_score['p'],
                                                   avg_score['r'], 0.5)
        # 'average' mode should be the mean of the 'individual' scores: scale the
        # average back up and subtract every individual score; the remainder
        # should be (numerically) zero, which is asserted below.
        avg_score['p'] *= len(indiv)
        avg_score['r'] *= len(indiv)
        for case in indiv:
            avg_score['p'] -= case[key]['p']
            avg_score['r'] -= case[key]['r']
    for score in avg.values():
        assert isclose(score['p'], 0, abs_tol=1e-9)
        assert isclose(score['r'], 0, abs_tol=1e-9)
Example #7
def rouge(hypotheses, references):
    """
    calculate the rouge score for each system
    :param hypotheses: dict type, hypotheses data 
    :param references: dict type, references data 
    :return rougeScore: dict type, including rouge-1, rouge-2, rouge-L
    """
    rougeScore = dict()
    for ids in range(1, 21):
        systemId = 'S_' + str(ids)
        rougeScore[systemId] = list()
    r_references = list(map(list, zip(*references)))

    rouge = PyRouge(rouge_n=2, rouge_l=True)
    for systemId in hypotheses:
        pred = hypotheses[systemId]
        scores = rouge.evaluate(pred, r_references)
        rougeScore[systemId].append(scores)
    return rougeScore
Example #8
File: rouge.py Project: Labbeti/MLU
class RougeL(Metric):
	def __init__(self):
		"""
			Recall Oriented Understudy of Gisting Evaluation.
			Use 'rouge-metric' package as backend.
		"""
		super().__init__()
		self.rouge = PyRouge(rouge_l=True)

	def compute_score(self, references: List[str], hypothesis: List[List[str]]) -> Tensor:
		scores = self.rouge.evaluate(references, hypothesis)
		rouge_l_scores = scores['rouge-l']
		# 3 scores = Recall r, Precision p, FScore f
		# {'r': ..., 'p': ..., 'f': ...}
		f_score = rouge_l_scores['f']
		return torch.scalar_tensor(f_score)
Example #9
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if precision == 0 and recall == 0:
        f1 = 0
    else:
        f1 = 2 * (precision * recall) / (precision + recall)
    f_scores.append(f1)
    #print(precision, recall, f1)

print('F1-Score:', mean(f_scores))

#-------ROUGE---------

references = []
for i in range(len(predictions_df)):
    for k in range(3):
        references.append(test_df['eng'].loc[
            test_df['de'] == predictions_df['de'].iloc[i]].reset_index(
                drop=True).to_list())

# Evaluate document-wise ROUGE scores
rouge = PyRouge(rouge_n=False,
                rouge_l=True,
                rouge_w=True,
                rouge_w_weight=1.2,
                rouge_s=True,
                rouge_su=True,
                skip_gap=4)
scores = rouge.evaluate(preds_bleu_3, references)
print(scores)
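The scores printed above form a nested dict: each key is a metric name and each value holds recall, precision and F1 under 'r', 'p' and 'f' (the same layout the earlier examples index into). A small follow-on sketch for readable output; the formatting choices here are illustrative:

# Format the nested scores dict returned by rouge.evaluate() for readability.
for metric_name, values in scores.items():
    print(f"{metric_name}: recall={values['r']:.4f} "
          f"precision={values['p']:.4f} f1={values['f']:.4f}")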