def on_epoch_end(self, epoch, logs=None):
    """Print sample generations and ROUGE scores, then checkpoint the best model.

    Args:
        epoch: Index of the epoch that just finished (not used here).
        logs: Metrics dict supplied by the training loop. ``logs['loss']``
            decides whether the current weights are saved; when it is
            unavailable the checkpoint step is skipped.
    """
    # Eyeball one or two examples during training.
    print_sentence(s1, self.model)
    print_sentence(s2, self.model)

    summarization = test_df['summarization'].values
    text = test_df['text'].values
    pred = [gen_sent(t, self.model) for t in tqdm(text)]

    # Score once and reuse the result for all three metrics.
    # NOTE(review): [0] selects the scores of the first test example only,
    # not an average over the test set -- confirm this is intended.
    first_scores = rouge.Rouge().get_scores(pred, summarization.tolist())[0]
    rouge_1 = first_scores['rouge-1']['f']
    rouge_2 = first_scores['rouge-2']['f']
    rouge_l = first_scores['rouge-l']['f']
    print('rouge-1:', rouge_1)
    print('rouge-2:', rouge_2)
    print('rouge-l:', rouge_l)

    # Save the best model (lowest training loss seen so far).
    loss = (logs or {}).get('loss')
    if loss is not None and loss <= self.lowest:
        self.lowest = loss
        # NOTE(review): saves the module-level `model`, not `self.model`;
        # presumably the same object -- verify.
        model.save_weights('best_model.weights')
def evaluate_rouge(reference_path, generated_path, multi=True):
    """Print py-rouge and Pythonrouge scores for generated vs. reference text.

    Args:
        reference_path: Text file with one reference entry per line. When
            ``multi`` is true each line may hold several references
            separated by ``#``.
        generated_path: Text file with one generated summary per line.
        multi: Whether reference lines are split on ``#``.

    Returns:
        None. All results are printed.
    """
    # Local import under an alias so the Pythonrouge scorer created below
    # cannot shadow the module.
    import rouge as py_rouge

    with open(reference_path) as f:
        if multi:
            actual_word_lists = [line.strip().lower().split('#') for line in f]
        else:
            actual_word_lists = [[line.strip().lower()] for line in f]

    with open(generated_path) as f:
        generated_word_lists = [line.strip().lower() for line in f]

    # Only keep references that have a generated counterpart.
    actual_word_lists = actual_word_lists[:len(generated_word_lists)]
    if not actual_word_lists:
        # Nothing to score; indexing [0] below would raise IndexError.
        print('No summaries to evaluate.')
        return

    print(actual_word_lists[0], len(generated_word_lists))

    for aggregator in ['Avg', 'Best', 'Individual']:
        print('Evaluation with {}'.format(aggregator))
        apply_avg = aggregator == 'Avg'
        apply_best = aggregator == 'Best'

        evaluator = py_rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                                   max_n=4,
                                   limit_length=True,
                                   length_limit=100,
                                   length_limit_type='words',
                                   apply_avg=apply_avg,
                                   apply_best=apply_best,
                                   alpha=0.5,  # Default F1_score
                                   weight_factor=1.2,
                                   stemming=True)

        scores = evaluator.get_scores(generated_word_lists, actual_word_lists)

        for metric, results in sorted(scores.items(), key=lambda x: x[0]):
            if not apply_avg and not apply_best:
                # 'Individual': results is a list, one entry per hypothesis,
                # each holding per-reference p/r/f lists.
                for hypothesis_id, results_per_ref in enumerate(results):
                    nb_references = len(results_per_ref['p'])
                    for reference_id in range(nb_references):
                        print('\tHypothesis #{} & Reference #{}: '.format(hypothesis_id, reference_id))
                        print('\t' + prepare_results(metric,
                                                     results_per_ref['p'][reference_id],
                                                     results_per_ref['r'][reference_id],
                                                     results_per_ref['f'][reference_id]))
                print()
            else:
                print(prepare_results(metric, results['p'], results['r'], results['f']))

    # NOTE(review): assumed to run once after the aggregator loop -- confirm.
    perl_rouge = Pythonrouge(summary_file_exist=False,
                             summary=generated_word_lists, reference=actual_word_lists,
                             n_gram=2, ROUGE_SU4=True, ROUGE_L=False,
                             recall_only=True, stemming=True, stopwords=True,
                             word_level=True, length_limit=True, length=50,
                             use_cf=False, cf=95, scoring_formula='average',
                             resampling=True, samples=1000, favor=True, p=0.5)
    score = perl_rouge.calc_score()
    print(score)
def compute_rouge_document(string1, string2):
    """Return the ROUGE-1 F score of ``string1`` scored against ``string2``."""
    scorer = rouge.Rouge(
        metrics=['rouge-n'],
        max_n=2,
        limit_length=True,
        length_limit=1000,
    )
    rouge_1 = scorer.get_scores(string1, string2)['rouge-1']
    return rouge_1['f']
def test_rouge_metrics(candidates, references):
    """Compare the ``Rouge`` metric with py-rouge for both multi-reference modes."""
    tokenized_candidates = [text.lower().split() for text in candidates]
    tokenized_references = [
        [ref.lower().split() for ref in refs] for refs in references
    ]

    for multiref in ["average", "best"]:
        # PERL 1.5.5 reference
        expected = pyrouge.Rouge(
            metrics=["rouge-n", "rouge-l"],
            max_n=4,
            apply_avg=(multiref == "average"),
            apply_best=(multiref == "best"),
            alpha=0.5,
            stemming=False,
            ensure_compatibility=False,
        ).get_scores(candidates, references)

        m = Rouge(variants=[1, 2, 4, "L"], multiref=multiref, alpha=0.5)
        for pair in zip(tokenized_candidates, tokenized_references):
            m.update(pair)
        results = m.compute()

        for key in ["1", "2", "4", "L"]:
            reference_scores = expected[f"rouge-{key.lower()}"]
            for stat in ("R", "P", "F"):
                assert (
                    pytest.approx(results[f"Rouge-{key}-{stat}"], abs=1e-4)
                    == reference_scores[stat.lower()]
                )
def __init__(self, vocab=None, specials=None, type='n', n=1, max_length=None,
             tokenizer=None, alpha=0.5, which='f', tokenize_sent=False,
             eos='\n', dump_file=None):
    """Initialize the ROUGE metric.

    Args:
        vocab: Stored on the instance as ``self.vocab``.
        specials: Stored as ``self.specials``; a fresh empty list is used
            when omitted so instances never share one default list.
        type: Suffix appended to ``'rouge-'`` to select the metric.
        n: Passed to the scorer as ``max_n``.
        max_length: Word limit passed to the scorer; ``None`` disables
            length limiting.
        tokenizer: Stored as ``self.tokenizer``.
        alpha: Passed to the scorer as ``alpha``.
        which: Stored as ``self.which``.
        tokenize_sent: Stored as ``self.tokenize_sent``.
        eos: Stored as ``self.eos``.
        dump_file: Stored as ``self.dump_file``.
    """
    self.vocab = vocab
    self.specials = [] if specials is None else specials
    self.n = n
    self.type = type
    self.rouge = rouge.Rouge(
        metrics=['rouge-' + type],
        max_n=n,
        limit_length=max_length is not None,
        length_limit=max_length,
        length_limit_type='words',
        apply_avg=True,
        apply_best=False,
        alpha=alpha,  # Default F1_score
        weight_factor=1.0,  # Marked by the author as a bug workaround.
        stemming=True)
    self.tokenizer = tokenizer
    self.which = which
    self.tokenize_sent = tokenize_sent
    self.eos = eos
    self.dump_file = dump_file
    # Fetch the NLTK 'punkt' resource (quietly) at construction time.
    nltk.download('punkt', quiet=True)
def _test(metric_device):
    """Run an engine with ``Rouge`` attached and compare F scores with py-rouge."""
    engine = Engine(update)
    metric = Rouge(variants=[1, 2, "L"], alpha=0.5, device=metric_device)
    metric.attach(engine, "rouge")
    engine.run(data=list(range(size)), max_epochs=1)

    assert "rouge" in engine.state.metrics

    evaluator = pyrouge.Rouge(
        metrics=["rouge-n", "rouge-l"],
        max_n=4,
        apply_avg=True,
        apply_best=False,
        alpha=0.5,
        stemming=False,
        ensure_compatibility=False,
    )

    # Accumulate the per-sample reference F scores in data order.
    totals = {"rouge-1": 0, "rouge-2": 0, "rouge-l": 0}
    for candidate, references in data:
        sample_scores = evaluator.get_scores([candidate], [references])
        for rouge_key in ("rouge-1", "rouge-2", "rouge-l"):
            totals[rouge_key] += sample_scores[rouge_key]["f"]

    for metric_key, rouge_key in (
        ("Rouge-1-F", "rouge-1"),
        ("Rouge-2-F", "rouge-2"),
        ("Rouge-L-F", "rouge-l"),
    ):
        assert (
            pytest.approx(engine.state.metrics[metric_key], abs=1e-4)
            == totals[rouge_key] / len(data)
        )
def get_average_scores(hyps, refs, maxlen=400, stop_words=None):
    """Return ROUGE-1/2/L f, p and r averaged over all scored pairs.

    Args:
        hyps: Hypotheses handed to ``rouge.Rouge().get_scores``.
        refs: References handed to ``rouge.Rouge().get_scores``.
        maxlen: Unused; kept for interface compatibility.
        stop_words: Unused; kept for interface compatibility.

    Returns:
        ``{'rouge-1': {...}, 'rouge-2': {...}, 'rouge-l': {...}}`` where each
        inner dict has keys ``'f'``, ``'p'`` and ``'r'``. All values are zero
        when there is nothing to score.
    """
    metric_names = ('rouge-1', 'rouge-2', 'rouge-l')
    stat_names = ('f', 'p', 'r')
    averaged_scores = {
        metric: {stat: 0 for stat in stat_names} for metric in metric_names
    }

    # An empty batch has no average; return zeros instead of dividing by zero.
    if not isinstance(hyps, str) and len(hyps) == 0:
        return averaged_scores

    scores = rouge.Rouge().get_scores(hyps, refs)
    if not scores:
        return averaged_scores

    for values in scores:
        for metric in metric_names:
            for stat in stat_names:
                averaged_scores[metric][stat] += values[metric][stat]

    # Divide by the number of scored pairs, not len(hyps): for a single
    # string hypothesis len(hyps) would be its character count.
    pair_count = len(scores)
    for metric in metric_names:
        for stat in stat_names:
            averaged_scores[metric][stat] /= pair_count
    return averaged_scores
def get_oracle_single_paper(batch):
    """Return the candidate sentence with the highest ROUGE-1 F over all references.

    Args:
        batch: Mapping with ``'candidates'`` (iterable of strings) and
            ``'references'`` (iterable of strings).

    Returns:
        The best candidate with newlines removed and whitespace stripped, or
        ``''`` when no candidate is usable or none scores above zero.
    """
    references = batch['references']

    # Sentences with too many or too few characters break the scorer, as do
    # sentences that are empty once stripped.
    stripped = [c.strip() for c in batch['candidates'] if 5 <= len(c) <= 2500]
    candidates = [c for c in stripped if c]
    if not candidates:
        # max() on an empty score list would raise ValueError below.
        return ''

    evaluator = rouge.Rouge()
    max_r1 = 0.
    max_sent = ''
    for tgt in references:
        # Score every candidate against the same reference.
        scores = evaluator.get_scores(candidates, [tgt] * len(candidates))
        r1 = [s['rouge-1']['f'] for s in scores]
        max_idx = r1.index(max(r1))
        if max_r1 < r1[max_idx]:
            max_r1 = r1[max_idx]
            max_sent = candidates[max_idx]
    return max_sent.replace('\n', '').strip()
def main():
    """Score predicted essay prompts against the true prompts with ROUGE-L.

    Reads the split file, essays and predictions from the module-level paths
    ``SPLITS_PATH``, ``ESSAYS_PATH`` and ``PREDICTIONS_PATH`` and prints the
    averaged scores.
    """
    # Read train-test-splits and collect the test ids. A set gives O(1)
    # membership checks when filtering the essays below.
    splits = pd.read_csv(SPLITS_PATH, sep=";")
    test_ids = {int(fn[-3:]) for fn in splits[splits.SET == "TEST"].ID.values}

    # Read data files from disk.
    with open(ESSAYS_PATH, "r", encoding="utf-8") as f:
        data = json.load(f)
    with open(PREDICTIONS_PATH, "r", encoding="utf-8") as f:
        predictions = json.load(f)

    # Extract predicted prompts, sorted by the integer value of their id.
    predictions = sorted(predictions, key=lambda x: int(x["id"]))
    predicted_texts = [p["prompt"] for p in predictions]

    # Extract true prompts for the test split, sorted by id so they line up
    # with the predictions.
    test_split = sorted(
        (essay for essay in data if essay["id"] in test_ids),
        key=lambda x: x["id"])
    true_prompts = [p['prompt'] for p in test_split]

    # Instantiate the ROUGE evaluator.
    evaluator = rouge.Rouge(metrics=['rouge-l'])
    print(len(true_prompts))
    print(len(predicted_texts))

    # Calculate the averaged scores and print them.
    scores = evaluator.get_scores(predicted_texts, true_prompts, avg=True)
    print(scores)
def rougeScore(goldSummary, machineSummary, rowBuilder, averageArray):
    """Append ROUGE-1..4 and ROUGE-L F1 values to ``rowBuilder`` and return it.

    ``averageArray[0]`` and ``averageArray[1]`` are increased in place by the
    ROUGE-1 and ROUGE-L F1 values so the caller can compute averages later.
    """
    # Evaluator configured with per-reference ("individual") scoring.
    evaluator = rouge.Rouge(
        metrics=['rouge-n', 'rouge-l'],
        max_n=4,
        limit_length=False,
        apply_avg=False,
        apply_best=False,
        alpha=0.5,
        weight_factor=1.2,
        stemming=True,
    )
    # Generate the n-gram and ROUGE-L scores.
    scores = evaluator.get_scores(machineSummary, goldSummary)

    def first_f1(metric_name):
        # F1 of the first hypothesis against its first reference.
        return scores.get(metric_name)[0].get('f')[0]

    # One row entry per metric, in column order.
    for metric_name in ('rouge-1', 'rouge-2', 'rouge-3', 'rouge-4', 'rouge-l'):
        rowBuilder.append(first_f1(metric_name))

    # Add to the running totals used for the averages.
    averageArray[0] = averageArray[0] + first_f1('rouge-1')
    averageArray[1] = averageArray[1] + first_f1('rouge-l')
    return rowBuilder
def get_sorted(preds, refs, srcs):
    """Return one record per prediction, sorted ascending by mean ROUGE F.

    Each record holds ``'mean_f'``, ``'pred'`` and ``'ref'``, plus ``'src'``
    when ``srcs`` is not ``None``.
    """
    evaluator = rouge.Rouge(
        metrics=['rouge-n', 'rouge-l'],
        max_n=3,
        limit_length=True,
        length_limit=100,
        length_limit_type='words',
        apply_avg=False,
        apply_best=False,
        alpha=0.5,  # Default F1_score
        weight_factor=1.2,
        stemming=True)
    scores = evaluator.get_scores(preds, refs)

    # One row of F values per metric; averaging over axis 0 averages metrics.
    f_matrix = np.array([
        [entry['f'] for entry in scores[metric_name]]
        for metric_name in scores.keys()
    ])
    mean_f_scores = np.mean(f_matrix, axis=0)

    if srcs is None:
        res = [
            {'mean_f': f.item(), 'pred': p, 'ref': r}
            for f, p, r in zip(mean_f_scores, preds, refs)
        ]
    else:
        res = [
            {'mean_f': f.item(), 'pred': p, 'ref': r, 'src': s}
            for f, p, r, s in zip(mean_f_scores, preds, refs, srcs)
        ]

    res.sort(key=lambda x: x['mean_f'])
    return res
def calculate_rouge(eval_name, eval_set, summ_task, summ_type):
    """Evaluate generated summaries with py-rouge and log the recall values.

    ``eval_set`` holds two index-aligned lists: ``eval_set[0]`` are the
    reference summaries and ``eval_set[1]`` the generated summaries. A header
    and one line per metric are written to the module-level ``results_file``.

    Returns:
        The recall values, ordered by metric name.
    """
    scorer = rouge.Rouge(
        metrics=['rouge-n', 'rouge-l'],
        max_n=3,
        limit_length=False,
        length_limit_type='words',
        apply_avg=True,
        apply_best=False,
        alpha=1,
        weight_factor=1.2,
        stemming=False)

    ref_summaries, gen_summaries = eval_set[0], eval_set[1]
    rouge_scores = scorer.get_scores(gen_summaries, ref_summaries)

    results_file.write("\nPerformance by {e} on {l}, {t}\n".format(
        e=eval_name, l=summ_task, t=summ_type))

    metrics = []
    for metric in sorted(rouge_scores, key=lambda name: name):
        # Recall is reported for every metric, ROUGE-L included.
        result = rouge_scores[metric]['r']
        metrics.append(result)
        results_file.write("{m}\t{r}\n".format(m=metric, r=result))
    return metrics
def main():
    """Write SQuAD F1 and ROUGE scores for the 'factoid' sheet to ``args.output``."""
    args = parser.parse_args()
    sheets = pd.read_excel(
        args.pred_file,
        sheet_name=None,
        header=None,
        names=["Sentence", "Correct_answer", "Answer"],
    )
    # Select the sheet with factoid questions.
    # TODO: make the sheet name an argument.
    data = sheets["factoid"]

    evaluator = rouge.Rouge(
        metrics=["rouge-n", "rouge-l"],
        max_n=4,
        limit_length=True,
        length_limit=100,
        length_limit_type="words",
        apply_avg=True,
        apply_best=False,
        alpha=0.5,  # Default F1_score
        weight_factor=1.2,
        stemming=True,
    )

    with open(args.output, "w") as fout:
        ground_truth = list(map(str, data["Correct_answer"].values))
        predicted = list(map(str, data["Answer"].values))

        squad_f1 = squad_v1_f1([[answer] for answer in ground_truth], predicted)
        scores = evaluator.get_scores(predicted, ground_truth)

        fout.write(f"squad f1: {squad_f1:.3f}\n")
        for m, stats in scores.items():
            fout.write(
                f'{m}: p: {stats["p"]:.3f} r: {stats["r"]:.3f} f: {stats["f"]:.3f}\n'
            )
def run_rouge(generated, references, max_length):
    """Score generated text against references with ROUGE-N (N<=2) and ROUGE-L.

    ``generated`` and ``references`` are either two equally long lists or two
    single strings; strings are wrapped into one-element lists.
    ``max_length`` is the word limit applied before scoring.
    """
    if not isinstance(generated, list):
        assert isinstance(generated, str)
        assert isinstance(references, str)
        generated, references = [generated], [references]
    else:
        assert isinstance(references, list)
        assert len(generated) == len(references)

    # most args are from `run_summarization.py`
    evaluator_kwargs = dict(
        metrics=["rouge-n", "rouge-l"],
        max_n=2,
        limit_length=True,
        length_limit=max_length,
        length_limit_type="words",
        apply_avg=True,
        apply_best=False,
        alpha=0.5,  # Default F1_score
        weight_factor=1.2,
        stemming=True,
    )
    return rouge.Rouge(**evaluator_kwargs).get_scores(generated, references)
def best_rouge_score(file, article, summary, greedy, beam):
    """Return the best-scoring beam candidate and its score as a percentage.

    Three slices of the first three beam entries are scored against
    ``summary``; for each, the F value of the alphabetically first metric is
    used. ``file``, ``article`` and ``greedy`` are not used.
    """
    candidates = [beam[0][1:-1], beam[1][2:-1], beam[2][2:-2]]
    evaluator = rouge.Rouge(
        metrics=['rouge-n', 'rouge-l'],
        max_n=2,
        limit_length=True,
        length_limit=100,
        length_limit_type='words',
        apply_avg=False,
        apply_best=True,
        alpha=0.5,  # Default F1_score
        weight_factor=1.2,
        stemming=False)

    rouge_scores = []
    for candidate in candidates:
        scores = evaluator.get_scores([candidate], [summary])
        ordered = sorted(scores.items(), key=lambda item: item[0])
        if ordered:
            # Only the first metric in sorted order contributes.
            _, results = ordered[0]
            rouge_scores.append([candidate, round(results['f'] * 100.0, 2)])

    ranked = sorted(rouge_scores, key=lambda pair: pair[1], reverse=True)
    best_candidate, best_score = ranked[0]
    return best_candidate, best_score
def eval_rouge(instances: List[CFRInstance]):
    """Compute averaged ROUGE-N/L/W scores over all instances.

    Args:
        instances: Instances providing ``gold_cf_endings`` (references) and
            ``predicted_ending`` (hypothesis).

    Returns:
        ``{'rouge_all': scores}`` with the scores over all instances.
    """
    evaluator = rouge.Rouge(
        metrics=['rouge-n', 'rouge-l', 'rouge-w'],
        max_n=4,
        limit_length=True,
        length_limit=100,
        length_limit_type='words',
        apply_avg=True,
        apply_best=False,
        alpha=0.5,  # Default F1_score
        weight_factor=1.2,
        stemming=True)

    references = []
    hypotheses = []
    # Per-instance scores are collected but not currently returned.
    by_instance = []
    for instance in instances:
        _r = [_clean_text(g) for g in instance.gold_cf_endings]
        _h = _clean_text(instance.predicted_ending)
        references.append(_r)
        hypotheses.append(_h)
        try:
            by_instance.append(evaluator.get_scores(_h, _r))
        except Exception:
            # Best effort: an instance that cannot be scored gets an empty
            # result. Catching Exception (not a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            by_instance.append({})

    scores = evaluator.get_scores(hypotheses, references)
    return {'rouge_all': scores}
def __init__(self, config):
    """Initialization from the configuration"""
    # Settings whose attribute name matches the configuration field.
    mirrored_fields = (
        'model_name', 'start_epoch', 'num_epoch', 'write_output',
        'batch_size', 'gpu_id', 'drop_out', 'dec_start_id', 'dec_end_id',
        'model_path', 'output_path', 'random_seed', 'bow_pred_method',
        'target_metrics', 'lm_load_path', 'save_ckpt', 'eval_metrics_list',
        'log_metrics', 'gumbel_samples', 'is_gumbel',
    )
    for field_name in mirrored_fields:
        setattr(self, field_name, getattr(config, field_name))

    # Settings stored under a different name or derived from several fields.
    self.mode = config.controller_mode
    self.print_interval = config.train_print_interval
    self.model_name_version = config.model_name + "_" + config.model_version

    # Helper objects and state that is filled in later.
    self.train_log = TrainingLog(config)
    self.id2word = None
    self.rouge_evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l'],
                                       max_n=2)
def create_binary_sentence_classification_dataset_with_rouge(self):
    """Create a dataset for training a sentence classification model.

    Every sentence of a snippet's article gets a binary label based on its
    best ROUGE score against the sentences of the snippet: 1 when the best
    score exceeds 0.9, else 0. One record per snippet (question, article
    sentences, labels) is written as JSON to
    ``data/bioasq_abs2summ_binary_sent_classification_training.json``.
    """
    # Initiate rouge evaluator
    evaluator = rouge.Rouge(metrics=['rouge-l'],
                            max_n=3,
                            limit_length=False,
                            length_limit_type='words',
                            apply_avg=False,
                            apply_best=True,
                            alpha=1,
                            weight_factor=1.2,
                            stemming=False)
    bioasq_collection = self._load_bioasq()
    training_data_dict = {}
    # Running key for the output records; must be initialised before the loop.
    snip_id = 0
    for q in bioasq_collection:
        for snippet in bioasq_collection[q]['snippets']:
            # Sentencize snippet
            snippet_sentences = [
                s.text.strip() for s in self.nlp(snippet['snippet']).sents
            ]
            # Sentencize abstract
            abstract_sentences = [
                s.text.strip() for s in self.nlp(snippet['article']).sents
            ]

            labels = []
            for abs_sen in abstract_sentences:
                best_rouge = 0
                for snip_sen in snippet_sentences:
                    rouge_score = self.calculate_sentence_level_rouge(
                        snip_sen, abs_sen, evaluator)
                    if best_rouge < rouge_score:
                        best_rouge = rouge_score
                labels.append(1 if best_rouge > .9 else 0)

            training_data_dict[snip_id] = {
                'question': q,
                'sentences': abstract_sentences,
                'labels': labels,
            }
            snip_id += 1

    with open(
            "data/bioasq_abs2summ_binary_sent_classification_training.json",
            "w",
            encoding="utf-8") as f:
        json.dump(training_data_dict, f, indent=4)
def sentences_tokens_Rouge(sentences=None,
                           tokens=None,
                           n_gram=4,
                           metrics=None,
                           aggregator='Best'):
    """Compute ROUGE between generated captions and reference token lists.

    Args:
        sentences: Inference results; each caption is a sequence of tokens
            that is joined with spaces before scoring.
        tokens: Token sequences of the original image, joined the same way
            and used as references.
        n_gram: N-grams for ROUGE-N (passed as ``max_n``).
        metrics: ROUGE scores to compute; any combination of
            ``['rouge-n', 'rouge-l', 'rouge-w']``. Passed through unchanged.
        aggregator: ``'Avg'``, ``'Best'`` or ``'Individual'``.

    Returns:
        The scores of the last (caption, reference) pair evaluated, or an
        empty dict when there is nothing to evaluate.
    """
    print('Rouge evaluation with {}'.format(aggregator))
    apply_avg = aggregator == 'Avg'
    apply_best = aggregator == 'Best'

    evaluator = rouge.Rouge(
        metrics=metrics,
        max_n=n_gram,
        limit_length=True,
        length_limit=100,
        length_limit_type='words',
        apply_avg=apply_avg,
        apply_best=apply_best,
        alpha=0.5,  # Default F1_score
        weight_factor=1.2,
        stemming=True)

    # NOTE(review): every pair overwrites `scores`, so only the last
    # caption/reference pair is reported and returned -- confirm intended.
    scores = {}
    for hypothesis in sentences or []:
        hypothesis = " ".join(hypothesis)
        for reference in tokens or []:
            reference = " ".join(reference)
            scores = evaluator.get_scores(hypothesis, reference)

    for metric, results in sorted(scores.items(), key=lambda x: x[0]):
        if not apply_avg and not apply_best:
            # 'Individual': results is a list, one entry per hypothesis,
            # each holding per-reference p/r/f lists.
            for hypothesis_id, results_per_ref in enumerate(results):
                nb_references = len(results_per_ref['p'])
                for reference_id in range(nb_references):
                    print('\tHypothesis #{} & Reference #{}: '.format(
                        hypothesis_id, reference_id))
                    # Pass `metric` last, matching the aggregated branch.
                    print('\t' + prepare_results(
                        results_per_ref['p'][reference_id],
                        results_per_ref['r'][reference_id],
                        results_per_ref['f'][reference_id],
                        metric))
            print()
        else:
            print(
                prepare_results(results['p'], results['r'], results['f'],
                                metric))
    print()
    return scores
def __init__(self, rouge_type: str, name: str = "ROUGE") -> None:
    """Store the lower-cased ROUGE variant and create the scorer.

    Raises:
        ValueError: If ``rouge_type`` is not '1', '2' or 'L' (any case).
    """
    normalized_type = rouge_type.lower()
    if normalized_type not in ("1", "2", "l"):
        message = ("Invalid type of rouge metric '{}', "
                   "must be '1', '2' or 'L'")
        raise ValueError(message.format(rouge_type))

    self.name = name
    self.rouge_type = normalized_type
    self.rouge = rouge.Rouge()
def __init__(self, rouge_weight, sari_weight, bleu_weight, eos_idx):
    """Store the metric weights and end-of-sequence index; create the scorers."""
    import rouge as R

    scorer_options = {
        "stats": ["f"],
        "metrics": ["rouge-1", "rouge-2", "rouge-l"],
    }
    self.rouge = R.Rouge(**scorer_options)
    self.r_weight, self.s_weight, self.b_weight = (
        rouge_weight, sari_weight, bleu_weight)
    self.eos_idx = eos_idx
    self.smooth = SmoothingFunction()
def ROUGE_sentence_score(expected, actual):
    """Return ROUGE-N (N<=4) and ROUGE-L scores of ``actual`` against ``expected``.

    Args:
        expected: Reference sentence.
        actual: Hypothesis sentence.

    Returns:
        Whatever ``get_scores`` returns for the single hypothesis/reference pair.
    """
    # TODO: fix this so that it actually gives results
    evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l'],
                            max_n=4,
                            limit_length=True,
                            length_limit=100,
                            length_limit_type='words',
                            # A boolean flag; the string "Avg" only worked
                            # because non-empty strings are truthy.
                            apply_avg=True)
    return evaluator.get_scores([actual], [expected])
def get_rouge(candidate, reference):
    """Print averaged and best-match ROUGE scores for candidates vs. references.

    Args:
        candidate: Iterable of hypothesis strings.
        reference: Iterable of reference strings; each is wrapped in a
            one-element list before scoring.

    Returns:
        None. Results are printed.
    """
    candidate = list(candidate)
    reference = [[i] for i in reference]

    def prepare_results(m, p, r, f):
        return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(
            m, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)

    for aggregator in ['Avg', 'Best', 'Individual']:
        print('Evaluation with {}'.format(aggregator))
        apply_avg = aggregator == 'Avg'
        apply_best = aggregator == 'Best'

        evaluator = rouge.Rouge(
            metrics=['rouge-n', 'rouge-l', 'rouge-w'],
            max_n=4,
            limit_length=True,
            length_limit=100,
            length_limit_type='words',
            apply_avg=apply_avg,
            apply_best=apply_best,
            alpha=0.5,  # Default F1_score
            weight_factor=1.2,
            stemming=True)

        scores = evaluator.get_scores(candidate, reference)

        # Per-reference ('Individual') results are computed but deliberately
        # not printed; only the aggregated modes produce output lines.
        if apply_avg or apply_best:
            for metric, results in sorted(scores.items(), key=lambda x: x[0]):
                print(
                    prepare_results(metric, results['p'], results['r'],
                                    results['f']))
        print()
def setUp(self):
    """Set the fixture paths, load the JSON data and create both scorers."""
    self.hyp_path, self.ref_path, self.data_path = (
        'predicted_sentences_task1_ref0_2.txt',
        'task1_ref0_2.txt',
        'data2.json',
    )

    with open(self.data_path) as fixture:
        self.data = json.load(fixture)

    self.rouge, self.files_rouge = rouge.Rouge(), rouge.FilesRouge()
def rouge(self):
    """
    Computes ROUGE-1 F (scaled by 100) and stores it under "ROUGE" in
    ``self.metrics``.
    Link: https://github.com/Diego999/py-rouge
    """
    scorer = rouge_score.Rouge(metrics=["rouge-n"], max_n=1)
    unigram_scores = scorer.get_scores(self.hypothesis, self.target)["rouge-1"]
    self.metrics.update({"ROUGE": unigram_scores["f"] * 100})
def setUp(self):
    """Set the fixture paths, load the JSON data and create both scorers."""
    self.hyp_path, self.ref_path, self.data_path = (
        './tests/hyp.txt',
        './tests/ref.txt',
        './tests/data.json',
    )

    with open(self.data_path) as fixture:
        self.data = json.load(fixture)

    self.rouge, self.files_rouge = rouge.Rouge(), rouge.FilesRouge()
def on_epoch_end(self, epoch, logs=None):
    """Print samples, write a submission file, log ROUGE and save the best model.

    Args:
        epoch: Index of the epoch that just finished. From epoch 7 onwards a
            ``submission<epoch>.json`` file with predictions is written.
        logs: Metrics dict supplied by the training loop. ``logs['loss']`` is
            logged and decides whether the current weights are saved; when it
            is unavailable the checkpoint step is skipped.
    """
    # Eyeball one or two examples during training.
    print_sentence(s1, self.model)
    print_sentence(s2, self.model)

    if epoch >= 7:
        # NOTE(review): generation here uses the module-level `model` and a
        # module-level `id` sequence (which shadows the builtin) -- verify.
        predict = [gen_sent(t, model) for t in test['description'].values]
        submission = dict(zip(id, predict))
        with open("submission" + str(epoch) + ".json", "w",
                  encoding='utf-8') as f:
            json.dump(submission, f, indent=2, ensure_ascii=False)

    summarization = test_df['question'].values
    text = test_df['description'].values
    pred = [gen_sent(t, self.model) for t in tqdm(text)]

    loss = (logs or {}).get('loss')
    # str() is required: concatenating the raw numbers raises TypeError.
    print('epoch:' + str(epoch) + ' ,loss:' + str(loss))
    log.write('epoch:' + str(epoch) + ' ,loss:' + str(loss))

    # Score once and reuse the result for all three metrics.
    # NOTE(review): [0] selects the scores of the first test example only,
    # not an average over the test set -- confirm this is intended.
    first_scores = rouge.Rouge().get_scores(pred, summarization.tolist())[0]
    rouge_1 = first_scores['rouge-1']['f']
    rouge_2 = first_scores['rouge-2']['f']
    rouge_l = first_scores['rouge-l']['f']
    print('rouge-1:', rouge_1)
    print('rouge-2:', rouge_2)
    print('rouge-l:', rouge_l)
    log.write('rouge-1:' + str(rouge_1))
    log.write('rouge-2:' + str(rouge_2))
    log.write('rouge-l:' + str(rouge_l))

    # Save the best model (lowest training loss seen so far).
    if loss is not None and loss <= self.lowest:
        self.lowest = loss
        model.save_weights('best_model.weights')
def evaluation_metrics(topic_name):
    """Print ROUGE scores of the module-level ``final_summary`` for a topic.

    Args:
        topic_name: Base name of the ground-truth file; the reference is read
            from ``<cwd>/GroundTruth/<topic_name>.1``.
    """
    address = os.getcwd() + '/GroundTruth/' + topic_name + '.1'
    # Read the whole reference in one go and close the file afterwards.
    with open(address, 'r') as file_open:
        reference = file_open.read()

    hypothesis = final_summary
    rouge1 = rouge.Rouge()
    # get_scores expects the hypothesis first and the reference second.
    scores = rouge1.get_scores(hypothesis, reference)
    print(scores)
def _get_scores_python(hypothesis, references):
    """Score with py-rouge (ROUGE-1/2 and ROUGE-L, averaged).

    Note: py-rouge mixes up recall/precision
    """
    evaluator = rouge.Rouge(
        metrics=["rouge-n", "rouge-l"],
        max_n=2,
        limit_length=False,
        apply_avg=True,
        alpha=0.5,  # Default F1_score
        stemming=True,
        ensure_compatibility=True,
    )
    scores = evaluator.get_scores(hypothesis, references)
    return scores
def setup_rouge_python(metrics, N, stemming, apply_avg, apply_best, alpha,
                       limit_length, length_type, length_limit, weight_factor):
    """Build a ``rouge.Rouge`` evaluator from the given settings."""
    evaluator_kwargs = {
        'metrics': metrics,
        'max_n': N,
        'limit_length': limit_length,
        'length_limit': length_limit,
        'length_limit_type': length_type,
        'apply_avg': apply_avg,
        'apply_best': apply_best,
        'alpha': alpha,
        'weight_factor': weight_factor,
        'stemming': stemming,
    }
    return rouge.Rouge(**evaluator_kwargs)