def make_dict(self, all_gts, all_res, vids):
    # Build corpus-level IDF dictionaries over all reference and predicted sentences.
    gold = []
    pred = []
    for vid in vids:
        gold.extend(all_gts[vid]["sentences"])
        pred.extend([p["sentence"] for p in all_res[vid]])
    self.idf_dict_ref = get_idf_dict(gold)
    self.idf_dict_hyp = get_idf_dict(pred)
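
# A minimal sketch (an assumption, not part of the original class) of how the IDF
# dictionaries built by make_dict might be consumed: each predicted sentence for a video
# is scored against every ground-truth sentence and the scores are averaged, mirroring the
# multi-reference pattern used in micro_averaging below. The method name score_video, the
# batch size, and the reuse of the word_mover_score/np imports from the rest of this file
# are all hypothetical.
def score_video(self, all_gts, all_res, vid):
    refs = all_gts[vid]["sentences"]
    sent_scores = []
    for p in all_res[vid]:
        scores = word_mover_score(refs, [p["sentence"]] * len(refs),
                                  self.idf_dict_ref, self.idf_dict_hyp,
                                  stop_words=[], n_gram=1,
                                  remove_subwords=True, batch_size=64)
        sent_scores.append(np.mean(scores))
    return np.mean(sent_scores)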
def run_moverscore(self):
    '''
    Computes the mover-1 and mover-2 scores between the set of hypothesis
    and reference summaries.
    '''
    print('\n===== Moverscore =====\n')
    from moverscore import get_idf_dict, word_mover_score

    for hyps_path, refs_path in zip(self.hyps_paths, self.refs_paths):
        self.load_summs(hyps_path, refs_path)

        # Truncate hyps and refs if too long (BERT positional embeddings max = 512)
        hyps = [' '.join(h.split()[:300]) for h in self.hyps]
        refs = [' '.join(r.split()[:300]) for r in self.refs]

        idf_dict_hyp = get_idf_dict(hyps)
        idf_dict_ref = get_idf_dict(refs)

        n_grams = []
        if 'mover-1' in self.metrics:
            n_grams.append(1)
        if 'mover-2' in self.metrics:
            n_grams.append(2)

        for n in n_grams:
            scores = word_mover_score(refs, hyps, idf_dict_ref, idf_dict_hyp,
                                      stop_words=[], n_gram=n,
                                      remove_subwords=True, batch_size=64)
            self.df_scores.loc[self.df_scores['hyps_path'] == hyps_path,
                               f'mover-{n}'] = scores
            self.save_temp_csv()
            print(np.mean(scores))

    del get_idf_dict, word_mover_score, scores
    torch.cuda.empty_cache()
def micro_averaging(dataset, target, device='cuda:0'):
    references, summaries = [], []
    for topic in dataset:
        k, v = topic
        references.extend([' '.join(ref['text']) for ref in v['references']])
        summaries.extend([' '.join(annot['text']) for annot in v['annotations']])

    idf_dict_ref = get_idf_dict(references)
    idf_dict_hyp = get_idf_dict(summaries)

    correlations = []
    for topic in tqdm(dataset):
        k, v = topic
        references = [' '.join(ref['text']) for ref in v['references']]
        num_refs = len(references)

        target_scores, prediction_scores = [], []
        for annot in v['annotations']:
            if len(annot['text']) > 1:
                target_scores.append(float(annot[target]))

                scores = word_mover_score(references,
                                          [' '.join(annot['text'])] * num_refs,
                                          idf_dict_ref, idf_dict_hyp,
                                          stop_words, n_gram=1,
                                          remove_subwords=True, batch_size=48)
                prediction_scores.append(np.mean(scores))

        correlations.append([
            stats.kendalltau(target_scores, prediction_scores)[0],
            stats.pearsonr(target_scores, prediction_scores)[0],
            stats.spearmanr(target_scores, prediction_scores)[0]
        ])

    return np.array(correlations)
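
# Possible usage of micro_averaging above (a sketch): `dataset` is assumed to be an iterable
# of (topic_id, topic_dict) pairs as accessed inside the function, and the target field name
# 'responsiveness' is an assumption about the annotation schema. The returned array has one
# row per topic with [kendall, pearson, spearman] columns, so a nanmean over axis 0 gives
# the average correlation for each coefficient.
correlations = micro_averaging(dataset, target='responsiveness')
print('kendall/pearson/spearman:', np.nanmean(correlations, axis=0))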
"newstest2017-csen-ref.en": "cs-en", # "newstest2017-deen-ref.en": "de-en", # "newstest2017-ruen-ref.en": "ru-en", # "newstest2017-tren-ref.en": "tr-en", # "newstest2017-zhen-ref.en": "zh-en" }) metric = 'MoverScore' data = [] for _ in reference_list.items(): reference_path, lp = _ references = load_data(os.path.join(data_dir, reference_path)) with MosesDetokenizer('en') as detokenize: references = [detokenize(ref.split(' ')) for ref in references] idf_dict_ref = get_idf_dict(references) all_meta_data = load_metadata(os.path.join(data_dir, lp)) for i in tqdm.tqdm(range(len(all_meta_data))): path, testset, lp, system = all_meta_data[i] translations = load_data(path) with MosesDetokenizer('en') as detokenize: translations = [detokenize(hyp.split(' ')) for hyp in translations] idf_dict_hyp = get_idf_dict(translations) df_system = pd.DataFrame(columns=('metric', 'lp', 'testset', 'system', 'sid', 'score')) scores = word_mover_score(references, translations, idf_dict_ref, idf_dict_hyp,
"newstest2017-deen-ref.en": "de-en", "newstest2017-ruen-ref.en": "ru-en", "newstest2017-tren-ref.en": "tr-en", "newstest2017-zhen-ref.en": "zh-en" }) #from collections import defaultdict metric = 'MoverScore' data = [] for _ in reference_list.items(): reference_path, lp = _ references = load_data(os.path.join(data_dir, reference_path)) with MosesDetokenizer('en') as detokenize: references = [detokenize(ref.split(' ')) for ref in references] idf_dict_ref = get_idf_dict(references) #defaultdict(lambda: 1.) all_meta_data = load_metadata(os.path.join(data_dir, lp)) for i in tqdm.tqdm(range(len(all_meta_data))): path, testset, lp, system = all_meta_data[i] translations = load_data(path) with MosesDetokenizer('en') as detokenize: translations = [detokenize(hyp.split(' ')) for hyp in translations] idf_dict_hyp = get_idf_dict(translations) df_system = pd.DataFrame(columns=('metric', 'lp', 'testset', 'system', 'sid', 'score')) scores = word_mover_score(references, translations, idf_dict_ref, idf_dict_hyp,
def main(prefix_string, labels_file, large_hypos, large_refs, results_dir):
    """
    1. Generate label enums set from labels_file.
    2. Using labels and prefix, generate file names to check in results_dir.
    3. Check for each file in results_dir.
    4. Read all samples from large_hypos and large_refs and compute idf_dict.
    5. For each pair of label files in results_dir, compute Moverscore 1, 2.
    6. Save as CSV in results_dir.
    7. Finally, compute score for all combined label files.
    """
    hypos = read_clean_lines(large_hypos)
    refs = read_clean_lines(large_refs)
    assert len(hypos) == len(refs)

    idf_dict_hyp = get_idf_dict(hypos)
    idf_dict_ref = get_idf_dict(refs)

    with open(labels_file) as fp:
        labels = sorted(set([l.strip() for l in fp.readlines()]))

    check_names = []
    for l in labels:
        check_names.append(prefix_string + l + '.tgt.txt')
        check_names.append(prefix_string + l + '.hypo.txt')

    file_names = set([f for f in os.listdir(results_dir) if f.endswith('.txt')])
    for c in check_names:
        if c not in file_names:
            raise AssertionError("%s is not in %s" % (c, results_dir))
        print("Found %s" % c)

    all_target, all_preds = [], []
    all_n1, all_n2 = [], []
    for i in range(0, len(check_names), 2):
        target = os.path.join(results_dir, check_names[i])
        preds = os.path.join(results_dir, check_names[i + 1])
        output = os.path.join(results_dir, 'mvrs_%s.txt' % labels[i // 2])

        lines_target = read_clean_lines(target)
        lines_preds = read_clean_lines(preds)
        assert len(lines_target) == len(lines_preds)

        scores_1 = word_mover_score(lines_target, lines_preds, idf_dict_ref, idf_dict_hyp,
                                    stop_words=[], n_gram=1, remove_subwords=True, batch_size=16)
        scores_2 = word_mover_score(lines_target, lines_preds, idf_dict_ref, idf_dict_hyp,
                                    stop_words=[], n_gram=2, remove_subwords=True, batch_size=16)
        avg_1 = np.mean(scores_1)
        avg_2 = np.mean(scores_2)
        all_n1.extend(scores_1)
        all_n2.extend(scores_2)

        with open(output, 'w') as fp:
            fp.write('n1,n2\n')
            fp.write(str(round(avg_1, 4)) + ',' + str(round(avg_2, 4)) + '\n')

        all_target.extend(lines_target)
        all_preds.extend(lines_preds)

    assert len(all_target) == len(all_preds)
    scores_1 = word_mover_score(all_target, all_preds, idf_dict_ref, idf_dict_hyp,
                                stop_words=[], n_gram=1, remove_subwords=True, batch_size=16)
    scores_2 = word_mover_score(all_target, all_preds, idf_dict_ref, idf_dict_hyp,
                                stop_words=[], n_gram=2, remove_subwords=True, batch_size=16)
    avg_1 = np.mean(scores_1)
    avg_2 = np.mean(scores_2)
    with open(os.path.join(results_dir, 'mvrs_all.txt'), 'w') as fp:
        fp.write('n1,n2\n')
        fp.write(str(round(avg_1, 4)) + ',' + str(round(avg_2, 4)) + '\n')
    print('Done', results_dir, labels)
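
# A hypothetical command-line entry point for main() above (the flag names are assumptions
# chosen to match the function's parameters; the original invocation is not shown here).
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Per-label MoverScore evaluation')
    parser.add_argument('--prefix_string', required=True)
    parser.add_argument('--labels_file', required=True)
    parser.add_argument('--large_hypos', required=True)
    parser.add_argument('--large_refs', required=True)
    parser.add_argument('--results_dir', required=True)
    args = parser.parse_args()
    main(args.prefix_string, args.labels_file, args.large_hypos,
         args.large_refs, args.results_dir)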
def get_scores(nrows, metrics=None):
    '''
    Get correlations between metric similarity and label similarity
    '''
    df = pd.read_csv(QQP_DATA_PATH, nrows=nrows)
    start_time = time()

    if not metrics:
        metrics = [
            'mover-1',
            'mover-2',
            'bleurt',
            'bertscore',
            'bartscore',
            'rouge1',
            'rouge2',
            'rougeLsum',
        ]

    for m in tqdm(metrics):
        if m.startswith('rouge'):
            scorer = rouge_scorer.RougeScorer(
                [met for met in metrics if met.startswith('rouge')],
                use_stemmer=True)
            scores = [
                scorer.score(r, c)[m].fmeasure
                for c, r in zip(df.question1, df.question2)
            ]
        elif m == 'bertscore':
            scorer = BERTScorer(lang="en", rescale_with_baseline=True,
                                model_type='roberta-large-mnli')
            _, _, scores = scorer.score(df.question1.tolist(), df.question2.tolist())
        elif m == 'bartscore':
            scorer = BERTScorer(lang="en", model_type="facebook/bart-large-mnli",
                                num_layers=12)
            _, _, scores = scorer.score(df.question1.tolist(), df.question2.tolist())
        elif m == 'bleurt':
            checkpoint = "bleurt-large-512"
            scorer = score.BleurtScorer(checkpoint)
            scores = scorer.score(df.question1, df.question2, batch_size=50)
        elif m.startswith('mover'):
            # Truncate long questions, otherwise MoverScore runs out of memory
            q1 = df['question1'].apply(lambda s: s[:300]).tolist()
            q2 = df['question2'].apply(lambda s: s[:300]).tolist()
            idf_dict_hyp = get_idf_dict(q1)
            idf_dict_ref = get_idf_dict(q2)
            if '1' in m:
                n_gram = 1
            else:
                n_gram = 2
            scores = word_mover_score(q2, q1, idf_dict_ref, idf_dict_hyp,
                                      stop_words=[], n_gram=n_gram,
                                      remove_subwords=True, batch_size=64)
        df[m] = scores
        print('\n' * 10, m, '\n' * 10)
        df.to_csv(QQP_OUT_PATH)
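
# The docstring of get_scores mentions correlations between metric similarity and label
# similarity, but the function only writes per-pair metric scores to QQP_OUT_PATH. A sketch
# of that follow-up step, assuming the standard QQP `is_duplicate` label column is kept in
# the output CSV (an assumption about the data file); the helper name is hypothetical.
import pandas as pd
from scipy.stats import spearmanr

def correlate_with_labels(csv_path, metrics):
    df = pd.read_csv(csv_path)
    for m in metrics:
        rho, p = spearmanr(df[m], df['is_duplicate'])
        print('%s: spearman=%.3f (p=%.3g)' % (m, rho, p))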
def evaluate_metric(metric, stem, remove_stop, prompt='overall'):
    '''
    Compute the correlation between the human eval scores and the scores
    awarded by the eval metric.
    '''
    assert metric in ['ROUGE-1-F', 'ROUGE-2-F', 'ROUGE-L-F', 'bert-human', 'bert-score',
                      'bart-score', 'bleurt-base', 'bleurt-lg', 'mover-1', 'mover-2',
                      'mover-smd', 'bert-avg-score']

    stemmed_str = "_stem" if stem else ""
    stop_str = "_removestop" if remove_stop else ""
    ranks_file_path = os.path.join(
        'learned_eval/outputs',
        'wref_{}{}{}_{}_rank_correlation.csv'.format(metric, stemmed_str, stop_str, prompt))
    print('\n====={}=====\n'.format(ranks_file_path))

    ranks_file = open(ranks_file_path, 'w')
    ranks_file.write('article,summ_id, human_score, metric_score\n')

    sorted_scores = read_sorted_scores()
    input_articles, _ = read_articles()
    corr_data = np.zeros((len(sorted_scores), 3))

    stopwords_list = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    # Init the metric
    if metric == 'bert-human':
        rewarder = Rewarder(os.path.join(MODEL_WEIGHT_DIR, 'sample.model'))
    elif metric.endswith('score'):
        from bert_score import BERTScorer
        if 'bert-score' == metric:
            rewarder = BERTScorer(lang="en", rescale_with_baseline=True,
                                  model_type='roberta-large-mnli')
        elif 'bart-score' == metric:
            rewarder = BERTScorer(lang="en", model_type="facebook/bart-large-mnli",
                                  num_layers=12)
        elif 'bert-avg' in metric:
            r1 = BERTScorer(lang="en", rescale_with_baseline=False, model_type='roberta-large')
            r2 = BERTScorer(lang="en", rescale_with_baseline=False, model_type='albert-xxlarge-v2')
            r3 = BERTScorer(lang="en", rescale_with_baseline=False,
                            model_type='bart-large-mnli', num_layers=12)
    elif metric.startswith('bleurt'):
        from bleurt import score
        if 'base' in metric:
            checkpoint = "bleurt-base-512"
        elif 'lg' in metric:
            checkpoint = "bleurt-large-512"
        rewarder = score.BleurtScorer(checkpoint)
    elif metric.startswith('mover'):
        from moverscore import get_idf_dict, word_mover_score
        hyps = [s['sys_summ'] for score in sorted_scores.values()
                for s in score if s['sys_name'] != 'reference']
        refs = [s['sys_summ'] for score in sorted_scores.values()
                for s in score if s['sys_name'] == 'reference']
        idf_dict_hyp = get_idf_dict(hyps)
        idf_dict_ref = get_idf_dict(refs)
    elif 'rouge' in metric.lower():
        from rouge_score import rouge_scorer
        from rouge_score.scoring import BootstrapAggregator

    # Loop over each article and compute the correlation between human judgement
    # and the metric scores.
    for i, (article_id, scores) in tqdm(enumerate(sorted_scores.items())):
        scores_list = [s for s in scores if s['sys_name'] != 'reference']
        human_ranks = [s['scores'][prompt] for s in scores_list]
        if len(human_ranks) < 2:
            continue  # Must be at least 2 scores to compute the correlation
        ref_summ = scores_list[0]['ref']
        article = [entry['article'] for entry in input_articles if entry['id'] == article_id][0]

        # Pre-processing (if necessary)
        if stem and remove_stop:
            sys_summs = [" ".join(sent2stokens_wostop(s['sys_summ'], stemmer, stopwords_list, 'english', True))
                         for s in scores_list]
            ref_summ = " ".join(sent2stokens_wostop(ref_summ, stemmer, stopwords_list, 'english', True))
            article = " ".join(sent2stokens_wostop(article, stemmer, stopwords_list, 'english', True))
        elif not stem and remove_stop:
            sys_summs = [" ".join(sent2tokens_wostop(s['sys_summ'], stopwords_list, 'english', True))
                         for s in scores_list]
            ref_summ = " ".join(sent2tokens_wostop(ref_summ, stopwords_list, 'english', True))
            article = " ".join(sent2tokens_wostop(article, stopwords_list, 'english', True))
        elif not remove_stop and stem:
            sys_summs = [" ".join(sent2stokens(s['sys_summ'], stemmer, 'english', True))
                         for s in scores_list]
            ref_summ = " ".join(sent2stokens(ref_summ, stemmer, 'english', True))
            article = " ".join(sent2stokens(article, stemmer, 'english', True))
        else:
            sys_summs = [s['sys_summ'] for s in scores_list]

        # Clean summaries
        summ_ids = [s['summ_id'] for s in scores_list]
        sys_summs = [text_normalization(s) for s in sys_summs]
        ref_summ = text_normalization(ref_summ)
        article = text_normalization(article)

        # Compute metric scores
        if 'rouge' in metric.lower():
            auto_metric_ranks = []
            if '1' in metric:
                rouge_metric = 'rouge1'
            elif '2' in metric:
                rouge_metric = 'rouge2'
            elif 'L' in metric:
                rouge_metric = 'rougeL'
            rew_rouge = rouge_scorer.RougeScorer([rouge_metric], use_stemmer=True)
            for ss in sys_summs:
                ss = ss.replace('. ', '\n')
                ref_summ = ref_summ.replace('. ', '\n')
                score = rew_rouge.score(ref_summ, ss)
                auto_metric_ranks.append(score[rouge_metric].fmeasure)
        if metric == 'bert-human':
            auto_metric_ranks = [rewarder(ref_summ, ss) for ss in sys_summs]
        elif metric.endswith('score'):
            if 'bert-score' == metric:
                auto_metric_ranks = [rewarder.score([ref_summ], [ss])[-1].item() for ss in sys_summs]
            elif 'bart-score' == metric:
                auto_metric_ranks = [rewarder.score([ref_summ], [ss])[-1].item() for ss in sys_summs]
            elif 'bert-avg' in metric:
                rewarder_scores = []
                for rewarder in [r1, r2, r3]:
                    r_scores = np.array([rewarder.score([ref_summ], [ss])[-1].item() for ss in sys_summs])
                    r_scores = (r_scores - np.min(r_scores)) / (np.max(r_scores) - np.min(r_scores))
                    rewarder_scores.append(r_scores)
                auto_metric_ranks = list(np.mean(rewarder_scores, axis=0))
        elif metric.startswith('bleurt'):
            auto_metric_ranks = [rewarder.score([ref_summ], [ss])[0] for ss in sys_summs]
        elif metric.startswith('mover'):
            if '1' in metric:
                n_gram = 1
            elif '2' in metric:
                n_gram = 2
            else:
                raise ValueError("smd not implemented currently")
            auto_metric_ranks = [word_mover_score([ref_summ], [ss], idf_dict_ref, idf_dict_hyp,
                                                  stop_words=[], n_gram=n_gram,
                                                  remove_subwords=True)[0]
                                 for ss in sys_summs]

        for sid, amr, hr in zip(summ_ids, auto_metric_ranks, human_ranks):
            ranks_file.write('{},{},{:.2f},{:.4f}\n'.format(article_id, sid, hr, amr))

        # Compute correlations
        spearmanr_result = spearmanr(human_ranks, auto_metric_ranks)
        pearsonr_result = pearsonr(human_ranks, auto_metric_ranks)
        kendalltau_result = kendalltau(human_ranks, auto_metric_ranks)
        corr_data[i, :] = [spearmanr_result[0], pearsonr_result[0], kendalltau_result[0]]

    corr_mean_all = np.nanmean(corr_data, axis=0)
    corr_std_all = np.nanstd(corr_data, axis=0)
    print('\n====={}=====\n'.format(ranks_file_path))
    print("Correlation mean on all data spearman/pearsonr/kendall: {}".format(corr_mean_all))
    print("Correlation std on all data spearman/pearsonr/kendall: {}".format(corr_std_all))

    ranks_file.flush()
    ranks_file.close()

    return ranks_file_path
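
# Example invocation (a sketch, not from the source): evaluate both MoverScore variants on
# raw summaries, i.e. with no stemming and no stopword removal; 'overall' is the default
# prompt used by evaluate_metric above.
for m in ['mover-1', 'mover-2']:
    evaluate_metric(m, stem=False, remove_stop=False, prompt='overall')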