def test_bleu(self):
    """BLEU from a raw string must match BLEU from pre-tokenized input,
    and Japanese tokenization must score the parallel example identically."""
    calculator = BLEUCalculator()
    from_string = calculator.bleu(
        "I am waiting on the beach",
        "He is walking on the beach",
    )
    from_tokens = calculator.bleu(
        "I am waiting on the beach".split(),
        ["He is walking on the beach".split()],
    )
    self.assertLess(abs(from_string - from_tokens), 1e-8)
    # The Japanese calculator tokenizes internally; the parallel sentence
    # pair should produce the same score as the English example.
    calculator_ja = BLEUCalculator(lang="ja")
    japanese_score = calculator_ja.bleu("私はビーチで待ってる", "彼がベンチで待ってる")
    self.assertLess(abs(from_string - japanese_score), 1e-8)
def convert_excel_to_df_and_evaluate(self):
    """Load the Excel input file into a DataFrame and append evaluation columns.

    Adds one BLEU, ROUGE-1, ROUGE-2 and ROUGE-L score per row, computed
    with Japanese tokenization, and returns the augmented DataFrame.
    """
    input_df = pd.ExcelFile(self.input_file).parse()
    bleu_ja = BLEUCalculator(lang="ja")
    rouge = RougeCalculator(lang="ja")
    metric_names = ['bleu', 'rouge_1', 'rouge_2', 'rouge_long']
    scores = {name: [] for name in metric_names}
    for _, row in input_df.iterrows():
        ref, hyp = row['ref_text'], row['input_text']
        # NOTE(review): 'ref_text' is passed as the summary and 'input_text'
        # as the reference, mirroring the original code — confirm the intent.
        scores['bleu'].append(bleu_ja.bleu(ref, hyp))
        scores['rouge_1'].append(rouge.rouge_n(summary=ref, references=hyp, n=1))
        scores['rouge_2'].append(rouge.rouge_n(summary=ref, references=hyp, n=2))
        scores['rouge_long'].append(rouge.rouge_l(summary=ref, references=hyp))
    for name in metric_names:
        input_df[name] = scores[name]
    return input_df
def eval_rouges(refrence_summary, model_summary):
    """Score `model_summary` against `refrence_summary`.

    Returns a 5-tuple: (ROUGE-1, ROUGE-2, ROUGE-L, ROUGE-BE, BLEU).
    """
    rouge = RougeCalculator(stopwords=True, lang="en")
    rouge_1 = rouge.rouge_n(
        summary=model_summary, references=refrence_summary, n=1)
    rouge_2 = rouge.rouge_n(
        summary=model_summary, references=[refrence_summary], n=2)
    rouge_l = rouge.rouge_l(
        summary=model_summary, references=[refrence_summary])
    # ROUGE-BE requires spaCy to be installed.
    rouge_be = rouge.rouge_be(
        summary=model_summary, references=[refrence_summary])
    bleu_score = BLEUCalculator().bleu(
        summary=model_summary, references=[refrence_summary])
    return rouge_1, rouge_2, rouge_l, rouge_be, bleu_score
def main(args):
    """Print the average sentence-level BLEU of the system output
    against the parallel reference file."""
    system_out = read_file(args.system_output)
    reference_list = read_file(args.reference)
    calculator = BLEUCalculator()
    # One BLEU score per output sentence, paired with its reference by index.
    bleu_list = [
        calculator.bleu(summary=sentence, references=reference_list[i])
        for i, sentence in enumerate(system_out)
    ]
    print('SACRE_BLEU\t%.6f' % (np.average(bleu_list)))
def evaluate_bleu(summary, references, lang="zh"):
    """Compute sentence-level BLEU for paired summaries and references.

    Args:
        summary: sequence of hypothesis strings.
        references: sequence of reference strings/lists, same length as `summary`.
        lang: tokenization language passed to BLEUCalculator (default "zh").

    Returns:
        (score_avg, scores): the mean BLEU and the per-sentence score list.
        Empty input yields (0.0, []) instead of crashing.

    Raises:
        AssertionError: if the two sequences differ in length.
    """
    assert len(summary) == len(references), "number of summary and references should be equal"
    bleu_calc = BLEUCalculator(lang=lang)
    scores = [bleu_calc.bleu(s, rs) for s, rs in zip(summary, references)]
    # Fix: the original divided by len(scores) unconditionally, which raised
    # ZeroDivisionError for empty (but equal-length) inputs.
    score_avg = sum(scores) / len(scores) if scores else 0.0
    return score_avg, scores
def cal_bleu(prediction_str, target_str):
    """Mean sentence BLEU between token-list predictions and targets.

    Each element of `prediction_str` / `target_str` is a list of tokens that
    is joined with spaces; everything from '<eos>' onward (including the
    space before it) is discarded before scoring.

    Fixes two truncation bugs in the original:
    - targets were sliced with find('<eos>') - 1 even when '<eos>' was
      absent (find() == -1), silently chopping the last two characters;
    - an '<eos>' at position 0 was ignored for predictions (`eos_index > 0`).
    """
    def _strip_eos(text):
        # Keep only the text before '<eos>', dropping the separator space too.
        eos_index = text.find('<eos>')
        if eos_index >= 0:
            return text[:max(eos_index - 1, 0)]
        return text

    bleu = BLEUCalculator()
    total_bleu = []
    for index in range(len(prediction_str)):
        prediction_rel = _strip_eos(' '.join(prediction_str[index]))
        target_rel = _strip_eos(' '.join(target_str[index]))
        total_bleu.append(bleu.bleu(prediction_rel, target_rel))
    return np.mean(total_bleu)
def myeval(valid_x, valid_y, vocab, model):
    """Evaluate `model` on a fixed number of validation minibatches.

    Greedy-decodes each batch, writes predictions and references to files
    under tmp/, accumulates ROUGE-1/2/L and BLEU (Chinese tokenization),
    and prints the averaged metrics plus perplexity. Restores the model
    to training mode before returning.
    """
    rouge = RougeCalculator(stopwords=True, lang="zh")
    bleu_ch = BLEUCalculator(lang="zh")
    model.eval()  # evaluation mode (disables dropout etc.)
    eval_batch_num = 0
    sum_rouge_1 = 0
    sum_rouge_2 = 0
    sum_rouge_L = 0
    score_ch = 0
    sum_loss = 0
    limit = 63  # number of minibatches to evaluate
    logging.info('Evaluating on %d minibatches...' % limit)
    # Invert the vocab (word -> id) into id -> word for decoding token ids.
    i2w = {key: value for value, key in vocab.items()}
    # assumes args.ckpt_file carries a fixed 9-char prefix to strip — TODO confirm
    ckpt_file = args.ckpt_file[9:]
    fout_pred = open(os.path.join('tmp/systems', '%s.txt' % ckpt_file), "w")
    fout_y = open(os.path.join('tmp/models', 'ref_%s.txt' % ckpt_file), "w")
    while eval_batch_num < limit:
        with torch.no_grad():  # no gradients needed during evaluation
            loss = run_batch(valid_x, valid_y, model)
            sum_loss += loss
            _, x = valid_x.next_batch()
            pred = greedy(model, x, vocab)  # greedy decoding of this batch
            _, y = valid_y.next_batch()
            y = y[:, 1:].tolist()  # drop the leading start token
            for idx in range(len(pred)):
                # Strip end-of-sequence and padding ids before joining to text.
                line_pred = [i2w[tok] for tok in pred[idx]
                             if tok != vocab[config.end_tok] and tok != vocab[config.pad_tok]]
                line_y = [i2w[tok] for tok in y[idx]
                          if tok != vocab[config.end_tok] and tok != vocab[config.pad_tok]]
                fout_pred.write(" ".join(line_pred) + "\n")
                fout_y.write(" ".join(line_y) + "\n")
                sum_rouge_1 += rouge.rouge_n(references=" ".join(line_y), summary=" ".join(line_pred), n=1)
                sum_rouge_2 += rouge.rouge_n(references=" ".join(line_y), summary=" ".join(line_pred), n=2)
                sum_rouge_L += rouge.rouge_l(references=" ".join(line_y), summary=" ".join(line_pred))
                # NOTE(review): the reference text is passed as BLEU's first
                # (summary) argument here — confirm the intended order.
                score_ch += bleu_ch.bleu(" ".join(line_y), " ".join(line_pred))
        eval_batch_num += 1
    fout_pred.close()
    fout_y.close()
    # Averages use len(pred) of the LAST batch — assumes equal-sized batches.
    avg_rouge_1 = sum_rouge_1 / (len(pred) * limit)
    avg_rouge_2 = sum_rouge_2 / (len(pred) * limit)
    avg_rouge_L = sum_rouge_L / (len(pred) * limit)
    avg_bleu_ch = score_ch / (len(pred) * limit)
    avg_loss = sum_loss / limit
    print("ROUGE_1 = ", avg_rouge_1)
    print("ROUGE_2 = ", avg_rouge_2)
    print("ROUGE_L = ", avg_rouge_L)
    print("BLEU = ", avg_bleu_ch)
    print("Perplexity = ", math.pow(2, avg_loss))
    model.train()  # restore training mode
class SacreBleu():
    """Reward function that scores sampled sequences with sentence BLEU."""

    def __init__(self):
        # One calculator instance is reused across every reward computation.
        self.bleu_calc = BLEUCalculator()

    def compute_reward(self, samples, sequence, model):
        """Return one BLEU-based reward per sample, scaled into [0, 1].

        References come from each pair's full target tokens; summaries come
        from the decoded sequences. Both are cut off at the first " EOS".
        """
        references = []
        for pair in samples:
            target_text = pair.get_text(pair.full_target_tokens, model.vocab)
            references.append(target_text.split(" EOS")[0])
        summaries = []
        for decoded in sequence:
            joined = " ".join(str(token) for token in decoded)
            summaries.append(joined.split(" EOS")[0])
        # sumeval BLEU is on a 0-100 scale; normalize to 0-1.
        return [
            self.bleu_calc.bleu(summaries[i], references[i]) / 100
            for i in range(len(references))
        ]
def computeSacreBleu(translation_path, reference_path, lang, detokenize_trans=True, detokenize_ref=False):
    """Compute (and print) corpus BLEU between a translation file and a reference file.

    Either side can optionally be run through the Moses detokenizer for
    `lang` before scoring. Returns the BLEU score.
    """
    bleu = BLEUCalculator(lang=lang)
    trans = readSentences(translation_path)
    reference = readSentences(reference_path)
    if detokenize_trans or detokenize_ref:
        detok = MosesDetokenizer(lang)
        if detokenize_trans:
            trans = [detok([line]) for line in trans]
        if detokenize_ref:
            reference = [detok([line]) for line in reference]
    bleu_score = bleu.bleu(summary=trans, references=[reference], score_only=True)
    print(bleu_score)
    return bleu_score
def test_custom_lang(self):
    """A user-defined BaseLang with '/' tokenization works for both ROUGE and BLEU."""
    class SlashLang(BaseLang):
        def __init__(self):
            super(SlashLang, self).__init__("cs")

        def tokenize(self, text):
            # Tokens are separated by slashes instead of whitespace.
            return text.split("/")

    custom = SlashLang()
    rouge_score = RougeCalculator(lang=custom).rouge_n(
        summary="I/went/to/the/Mars/from/my/living/town.",
        references="I/went/to/Mars",
        n=1)
    bleu_score = BLEUCalculator(lang=custom).bleu(
        "I/am/waiting/on/the/beach",
        "He/is/walking/on/the/beach")
    self.assertGreater(rouge_score, 0)
    self.assertGreater(bleu_score, 0)
class SumEvaluator:
    """Evaluator class for generation.

    A wrapper class of the sumeval library: scores predicted generations
    against references with BLEU and ROUGE variants and returns a DataFrame.
    """

    # Immutable default metric set (avoids the mutable-default-argument pitfall).
    DEFAULT_METRICS = ("rouge_1", "rouge_2", "rouge_l", "rouge_be", "bleu")

    def __init__(self,
                 metrics: List[str] = None,
                 lang: str = "en",
                 stopwords: bool = True,
                 stemming: bool = True,
                 use_porter=True):
        """
        Args:
            metrics: metric names to compute; defaults to DEFAULT_METRICS.
                (Fix: the original used a mutable list literal as the default.)
            lang: language passed to the BLEU calculator only.
            stopwords: drop stopwords in ROUGE scoring.
            stemming: apply stemming in ROUGE scoring.
            use_porter: use the "en-porter" tokenizer for ROUGE.
        """
        if metrics is None:
            metrics = list(self.DEFAULT_METRICS)
        # NOTE(review): ROUGE is always English ("en"/"en-porter") regardless
        # of `lang`; only BLEU honors `lang`. Preserved as-is — confirm intent.
        rouge_lang = "en-porter" if use_porter else "en"
        self.rouge = RougeCalculator(stopwords=stopwords,
                                     stemming=stemming,
                                     lang=rouge_lang)
        self.bleu = BLEUCalculator(lang=lang)
        # Sorted so the score order below matches the column order.
        self.metrics = sorted(metrics)

    def eval(self, true_gens: List[str], pred_gens: List[str]):
        """Score each (true, pred) pair and return a DataFrame.

        Columns: "pred", "true", then the sorted metric names. The append
        order below (bleu, rouge_1, rouge_2, rouge_be, rouge_l) matches the
        lexicographic order of DEFAULT_METRICS, keeping values aligned.

        Raises:
            AssertionError: if the two lists differ in length.
        """
        assert len(true_gens) == len(pred_gens)
        eval_list = []
        for true_gen, pred_gen in zip(true_gens, pred_gens):
            evals = []
            if "bleu" in self.metrics:
                # sumeval BLEU is 0-100; divide to align with ROUGE's 0-1 scale.
                evals.append(self.bleu.bleu(pred_gen, true_gen) / 100.0)
            if "rouge_1" in self.metrics:
                evals.append(self.rouge.rouge_n(summary=pred_gen,
                                                references=[true_gen],
                                                n=1))
            if "rouge_2" in self.metrics:
                evals.append(self.rouge.rouge_n(summary=pred_gen,
                                                references=[true_gen],
                                                n=2))
            if "rouge_be" in self.metrics:
                evals.append(self.rouge.rouge_be(summary=pred_gen,
                                                 references=[true_gen]))
            if "rouge_l" in self.metrics:
                evals.append(self.rouge.rouge_l(summary=pred_gen,
                                                references=[true_gen]))
            eval_list.append([pred_gen, true_gen] + evals)
        return pd.DataFrame(eval_list,
                            columns=["pred", "true"] + self.metrics)
# Evaluation task ##################################
from sumeval.metrics.rouge import RougeCalculator

rouge = RougeCalculator(stopwords=True, lang="en")
rouge_1 = rouge.rouge_n(
    summary="I went to the Mars from my living town.",
    references="I went to Mars", n=1)
rouge_2 = rouge.rouge_n(
    summary="I went to the Mars from my living town.",
    references=["I went to Mars", "It's my living town"], n=2)
rouge_l = rouge.rouge_l(
    summary="I went to the Mars from my living town.",
    references=["I went to Mars", "It's my living town"])
# You need spaCy to calculate ROUGE-BE
rouge_be = rouge.rouge_be(
    summary="I went to the Mars from my living town.",
    references=["I went to Mars", "It's my living town"])
print("ROUGE-1: {}, ROUGE-2: {}, ROUGE-L: {}, ROUGE-BE: {}".format(
    rouge_1, rouge_2, rouge_l, rouge_be).replace(", ", "\n"))

from sumeval.metrics.bleu import BLEUCalculator

bleu = BLEUCalculator()
score = bleu.bleu("I am waiting on the beach",
                  "He is walking on the beach")
# Fix: this variable is named bleu_ja but was constructed with lang="en";
# a Japanese-tokenizing calculator requires lang="ja" (per sumeval's docs).
bleu_ja = BLEUCalculator(lang="ja")
    n=2))  # NOTE(review): closes a rouge.rouge_n(...) call begun above this chunk
# Score `summary` against each of the 5 reference summaries with ROUGE-2 ...
rouge_2_scores.append(rouge.rouge_n(
    summary=summary, references=summaries[2], n=2))
rouge_2_scores.append(rouge.rouge_n(
    summary=summary, references=summaries[3], n=2))
rouge_2_scores.append(rouge.rouge_n(
    summary=summary, references=summaries[4], n=2))
mean_rouge_2_score = np.mean(rouge_2_scores)
# ... and with sentence BLEU, then average each metric over the references.
bleu_scores = []
bleu_scores.append(bleu.bleu(summary=summary, references=summaries[0]))
bleu_scores.append(bleu.bleu(summary=summary, references=summaries[1]))
bleu_scores.append(bleu.bleu(summary=summary, references=summaries[2]))
bleu_scores.append(bleu.bleu(summary=summary, references=summaries[3]))
bleu_scores.append(bleu.bleu(summary=summary, references=summaries[4]))
mean_bleu_scores = np.mean(bleu_scores)
# Score the generated summary against each of the 5 reference summaries
# with ROUGE-2 and sentence BLEU, then average each metric.
rouge_2_scores = [
    rouge.rouge_n(summary=generated_summary, references=summaries[ref_idx], n=2)
    for ref_idx in range(5)
]
mean_rouge_2_score = np.mean(rouge_2_scores)

bleu_scores = [
    bleu.bleu(summary=generated_summary, references=summaries[ref_idx])
    for ref_idx in range(5)
]
mean_bleu_scores = np.mean(bleu_scores)
# NOTE(review): this chunk is cut from the middle of a larger script — the
# enclosing loops (power-iteration loop above, per-document loop around the
# scoring statements) start outside this view; indentation below is approximate.
if abs((currentR - R).sum()) < 0.00001:
    break  # power iteration converged
# Boost the first and last sentence scores before picking the best sentence.
R[0] *= 3
R[-1] *= 3
max_index = np.argmax(R)
if max_index == 0:
    count_max_0 += 1  # track how often the first sentence wins
ans.append(sents[max_index])
print(index)
# print(ref[index])
# print(ans[index])
# Accumulate ROUGE-1/2/L and BLEU of the selected sentence vs. the reference.
rouge_1 += rouge.rouge_n(summary=ans[index], references=ref[index], n=1)
rouge_2 += rouge.rouge_n(summary=ans[index], references=ref[index], n=2)
rouge_l += rouge.rouge_l(summary=ans[index], references=ref[index])
bleu_score += bleu.bleu(ans[index], ref[index][0])
data_number -= empty  # exclude empty documents from the averages
print('average rouge_1 = ', rouge_1 / data_number)
print('average rouge_2 = ', rouge_2 / data_number)
print('average rouge_L = ', rouge_l / data_number)
print('average bleu = ', bleu_score / data_number)
print('选择第一句的概率: ', count_max_0 / data_number)
# Reset the accumulators for the gensim-summarizer comparison pass below.
rouge_1 = 0
rouge_2 = 0
rouge_l = 0
bleu_score = 0
from gensim.summarization.summarizer import summarize
count = 0
for story in stories:  # loop body continues beyond this chunk