Example #1
    def make_dict(self, all_gts, all_res, vids):
        ''' Builds the reference and hypothesis IDF dictionaries from the ground-truth
            sentences and the predicted sentences of the given video ids.
        '''
        gold = []
        pred = []
        for vid in vids:
            gold.extend(all_gts[vid]["sentences"])
            pred.extend([res["sentence"] for res in all_res[vid]])
        self.idf_dict_ref = get_idf_dict(gold)
        self.idf_dict_hyp = get_idf_dict(pred)
    def run_moverscore(self):
        ''' Computes the mover-1 and mover-2 scores between the sets of hypothesis
            and reference summaries.
        '''
        print('\n===== Moverscore =====\n')
        from moverscore import get_idf_dict, word_mover_score

        for hyps_path, refs_path in zip(self.hyps_paths, self.refs_paths):
            self.load_summs(hyps_path, refs_path)

            # Truncate hyps and refs if too long (bert positional embeddings max=512)
            hyps = [' '.join(h.split()[:300]) for h in self.hyps]
            refs = [' '.join(r.split()[:300]) for r in self.refs]

            idf_dict_hyp = get_idf_dict(hyps)
            idf_dict_ref = get_idf_dict(refs)

            n_grams = []
            if 'mover-1' in self.metrics:
                n_grams.append(1)
            if 'mover-2' in self.metrics:
                n_grams.append(2)

            for n in n_grams:
                scores = word_mover_score(refs,
                                          hyps,
                                          idf_dict_ref,
                                          idf_dict_hyp,
                                          stop_words=[],
                                          n_gram=n,
                                          remove_subwords=True,
                                          batch_size=64)
                self.df_scores.loc[self.df_scores['hyps_path'] == hyps_path,
                                   f'mover-{n}'] = scores
                self.save_temp_csv()
                print(np.mean(scores))

        del get_idf_dict, word_mover_score, scores
        torch.cuda.empty_cache()
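
Every example on this page follows the same two-step pattern: build one IDF dictionary per corpus with get_idf_dict, then score aligned reference/hypothesis lists with word_mover_score. A minimal self-contained sketch of that pattern, with toy sentences and arbitrarily chosen parameters (not taken from the example above):

# Minimal MoverScore usage sketch (toy data; parameters are illustrative only).
import numpy as np
from moverscore import get_idf_dict, word_mover_score

refs = ["The cat sat on the mat.", "A quick brown fox jumps over the lazy dog."]
hyps = ["A cat is sitting on a mat.", "The fast brown fox leaps over a lazy dog."]

# IDF weights are computed separately over each corpus, as in the examples here.
idf_dict_ref = get_idf_dict(refs)
idf_dict_hyp = get_idf_dict(hyps)

# One score per (reference, hypothesis) pair; pass n_gram=2 for mover-2.
scores = word_mover_score(refs, hyps, idf_dict_ref, idf_dict_hyp,
                          stop_words=[], n_gram=1,
                          remove_subwords=True, batch_size=8)
print(np.mean(scores))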
Example #3
def micro_averaging(dataset, target, device='cuda:0'):
    ''' Computes, per topic, the Kendall/Pearson/Spearman correlation between the
        MoverScore of each annotated summary and its human `target` score.
    '''
    references, summaries = [], []
    for topic in dataset:
        k, v = topic
        references.extend([' '.join(ref['text']) for ref in v['references']])
        summaries.extend(
            [' '.join(annot['text']) for annot in v['annotations']])

    idf_dict_ref = get_idf_dict(references)
    idf_dict_hyp = get_idf_dict(summaries)

    correlations = []
    for topic in tqdm(dataset):
        k, v = topic
        references = [' '.join(ref['text']) for ref in v['references']]
        num_refs = len(references)
        target_scores, prediction_scores = [], []

        for annot in v['annotations']:
            if len(annot['text']) > 1:
                target_scores.append(float(annot[target]))

                scores = word_mover_score(references,
                                          [' '.join(annot['text'])] * num_refs,
                                          idf_dict_ref,
                                          idf_dict_hyp,
                                          stop_words,  # assumed to be defined at module level in the source script
                                          n_gram=1,
                                          remove_subwords=True,
                                          batch_size=48)

                prediction_scores.append(np.mean(scores))

        correlations.append([
            stats.kendalltau(target_scores, prediction_scores)[0],
            stats.pearsonr(target_scores, prediction_scores)[0],
            stats.spearmanr(target_scores, prediction_scores)[0]
        ])
    return np.array(correlations)
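
micro_averaging returns one row of [Kendall, Pearson, Spearman] correlations per topic, so a typical follow-up is to average the rows. A small sketch with dummy rows in place of the real call (which would be corrs = micro_averaging(dataset, target)):

import numpy as np

# Dummy per-topic rows in the same [kendall, pearson, spearman] order returned above;
# the values are placeholders, not real results.
corrs = np.array([[0.42, 0.55, 0.50],
                  [0.31, 0.47, 0.44],
                  [np.nan, 0.60, 0.58]])  # a topic can yield NaN (e.g. constant scores)

# nanmean keeps topics with undefined correlations from skewing the average.
kendall_avg, pearson_avg, spearman_avg = np.nanmean(corrs, axis=0)
print(kendall_avg, pearson_avg, spearman_avg)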
    "newstest2017-csen-ref.en": "cs-en",
    #        "newstest2017-deen-ref.en": "de-en",
    #        "newstest2017-ruen-ref.en": "ru-en",
    #        "newstest2017-tren-ref.en": "tr-en",
    #        "newstest2017-zhen-ref.en": "zh-en"
})

metric = 'MoverScore'

data = []
for reference_path, lp in reference_list.items():
    references = load_data(os.path.join(data_dir, reference_path))
    with MosesDetokenizer('en') as detokenize:
        references = [detokenize(ref.split(' ')) for ref in references]
    idf_dict_ref = get_idf_dict(references)

    all_meta_data = load_metadata(os.path.join(data_dir, lp))
    for i in tqdm.tqdm(range(len(all_meta_data))):
        path, testset, lp, system = all_meta_data[i]
        translations = load_data(path)
        with MosesDetokenizer('en') as detokenize:
            translations = [detokenize(hyp.split(' ')) for hyp in translations]
        idf_dict_hyp = get_idf_dict(translations)

        df_system = pd.DataFrame(columns=('metric', 'lp', 'testset', 'system',
                                          'sid', 'score'))
        scores = word_mover_score(references,
                                  translations,
                                  idf_dict_ref,
                                  idf_dict_hyp,
Example #5
    "newstest2017-deen-ref.en": "de-en",
    "newstest2017-ruen-ref.en": "ru-en",
    "newstest2017-tren-ref.en": "tr-en",
    "newstest2017-zhen-ref.en": "zh-en"
})
#from collections import defaultdict
metric = 'MoverScore'

data = []
for reference_path, lp in reference_list.items():
    references = load_data(os.path.join(data_dir, reference_path))
    with MosesDetokenizer('en') as detokenize:
        references = [detokenize(ref.split(' ')) for ref in references]

    idf_dict_ref = get_idf_dict(references)  #defaultdict(lambda: 1.)

    all_meta_data = load_metadata(os.path.join(data_dir, lp))
    for i in tqdm.tqdm(range(len(all_meta_data))):
        path, testset, lp, system = all_meta_data[i]
        translations = load_data(path)
        with MosesDetokenizer('en') as detokenize:
            translations = [detokenize(hyp.split(' ')) for hyp in translations]
        idf_dict_hyp = get_idf_dict(translations)

        df_system = pd.DataFrame(columns=('metric', 'lp', 'testset', 'system',
                                          'sid', 'score'))
        scores = word_mover_score(references,
                                  translations,
                                  idf_dict_ref,
                                  idf_dict_hyp,
Example #6
def main(prefix_string, labels_file, large_hypos, large_refs, results_dir):
    """
    1. Generate label enums set from labels_file.
    2. Using labels and prefix, generate file names to check in results_dir.
    3. Check for each file in results_dir.
    4. Read all samples from large_hypos and large_refs and compute idf_dict.
    5. For each pair of label files in results_dir, compute Moverscore 1, 2.
    6. Save as CSV in results_dir.
    7. Finally, compute score for all combined label files.
    """
    hypos = read_clean_lines(large_hypos)
    refs = read_clean_lines(large_refs)
    assert len(hypos) == len(refs)

    idf_dict_hyp = get_idf_dict(hypos)
    idf_dict_ref = get_idf_dict(refs)

    with open(labels_file) as fp:
        labels = sorted(set([l.strip() for l in fp.readlines()]))

    check_names = []
    for l in labels:
        check_names.append(prefix_string + l + '.tgt.txt')
        check_names.append(prefix_string + l + '.hypo.txt')

    file_names = set(
        [f for f in os.listdir(results_dir) if f.endswith('.txt')])

    for c in check_names:
        if c not in file_names:
            raise AssertionError("%s is not in %s" % (c, results_dir))
        print("Found %s" % c)

    all_target, all_preds = [], []
    all_n1, all_n2 = [], []

    for i in range(0, len(check_names), 2):
        target = os.path.join(results_dir, check_names[i])
        preds = os.path.join(results_dir, check_names[i + 1])
        output = os.path.join(results_dir, 'mvrs_%s.txt' % labels[i // 2])

        lines_target = read_clean_lines(target)
        lines_preds = read_clean_lines(preds)
        assert len(lines_target) == len(lines_preds)

        scores_1 = word_mover_score(lines_target, lines_preds, idf_dict_ref, idf_dict_hyp, \
                                    stop_words=[], n_gram=1, remove_subwords=True, batch_size=16)
        scores_2 = word_mover_score(lines_target, lines_preds, idf_dict_ref, idf_dict_hyp, \
                                    stop_words=[], n_gram=2, remove_subwords=True, batch_size=16)

        avg_1 = np.mean(scores_1)
        avg_2 = np.mean(scores_2)

        all_n1.extend(scores_1)
        all_n2.extend(scores_2)

        with open(output, 'w') as fp:
            fp.write('n1,n2\n')
            fp.write(str(round(avg_1, 4)) + ',' + str(round(avg_2, 4)) + '\n')

        all_target.extend(lines_target)
        all_preds.extend(lines_preds)

    assert len(all_target) == len(all_preds)
    scores_1 = word_mover_score(all_target, all_preds, idf_dict_ref, idf_dict_hyp, \
                                stop_words=[], n_gram=1, remove_subwords=True, batch_size=16)
    scores_2 = word_mover_score(all_target, all_preds, idf_dict_ref, idf_dict_hyp, \
                                stop_words=[], n_gram=2, remove_subwords=True, batch_size=16)

    avg_1 = np.mean(scores_1)
    avg_2 = np.mean(scores_2)

    with open(os.path.join(results_dir, 'mvrs_all.txt'), 'w') as fp:
        fp.write('n1,n2\n')
        fp.write(str(round(avg_1, 4)) + ',' + str(round(avg_2, 4)) + '\n')

    print('Done', results_dir, labels)
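
read_clean_lines is not defined in this snippet; a plausible stand-in, assuming it simply reads a text file and drops empty lines, could look like this (an assumption, not the author's implementation):

def read_clean_lines(path):
    # Hypothetical helper: read a UTF-8 text file, strip whitespace, skip blank lines.
    with open(path, encoding='utf-8') as fp:
        return [line.strip() for line in fp if line.strip()]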
Example #7
def get_scores(nrows, metrics=None):
    ''' Compute similarity scores from each metric for QQP question pairs; the scores
        are saved to CSV so they can later be correlated with the duplicate label.
    '''
    df = pd.read_csv(QQP_DATA_PATH, nrows=nrows)
    start_time = time()
    if not metrics:
        metrics = [
            'mover-1',
            'mover-2',
            'bleurt',
            'bertscore',
            'bartscore',
            'rouge1',
            'rouge2',
            'rougeLsum',
        ]
    for m in tqdm(metrics):
        if m.startswith('rouge'):
            scorer = rouge_scorer.RougeScorer(
                [met for met in metrics if met.startswith('rouge')],
                use_stemmer=True)
            scores = [
                scorer.score(r, c)[m].fmeasure
                for c, r in zip(df.question1, df.question2)
            ]
        elif m == 'bertscore':
            scorer = BERTScorer(lang="en",
                                rescale_with_baseline=True,
                                model_type='roberta-large-mnli')
            _, _, scores = scorer.score(df.question1.tolist(),
                                        df.question2.tolist())
        elif m == 'bartscore':
            scorer = BERTScorer(lang="en",
                                model_type="facebook/bart-large-mnli",
                                num_layers=12)
            _, _, scores = scorer.score(df.question1.tolist(),
                                        df.question2.tolist())
        elif m == 'bleurt':
            checkpoint = "bleurt-large-512"
            scorer = score.BleurtScorer(checkpoint)
            scores = scorer.score(df.question1, df.question2, batch_size=50)
        elif m.startswith('mover'):
            # Truncate long questions, otherwise MoverScore runs out of memory (OOM)
            q1 = df['question1'].apply(lambda s: s[:300]).tolist()
            q2 = df['question2'].apply(lambda s: s[:300]).tolist()
            idf_dict_hyp = get_idf_dict(q1)
            idf_dict_ref = get_idf_dict(q2)
            if '1' in m:
                n_gram = 1
            else:
                n_gram = 2
            scores = word_mover_score(q2,
                                      q1,
                                      idf_dict_ref,
                                      idf_dict_hyp,
                                      stop_words=[],
                                      n_gram=n_gram,
                                      remove_subwords=True,
                                      batch_size=64)

        df[m] = scores
        print('\n' * 10, m, '\n' * 10)
        df.to_csv(QQP_OUT_PATH)
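
The function above only writes the per-pair metric scores to CSV; the correlation with the labels mentioned in the docstring still has to be computed afterwards. A sketch of that step, assuming the standard QQP is_duplicate column is present in the saved file:

# Follow-up correlation step (not part of the original snippet); assumes the CSV at
# QQP_OUT_PATH keeps QQP's is_duplicate label column alongside the metric columns.
import pandas as pd
from scipy import stats

df = pd.read_csv(QQP_OUT_PATH)
for m in ['mover-1', 'mover-2', 'bertscore']:
    if m in df.columns:
        mask = df[m].notna() & df['is_duplicate'].notna()
        r, p = stats.pointbiserialr(df.loc[mask, 'is_duplicate'], df.loc[mask, m])
        print(f'{m}: point-biserial r={r:.3f} (p={p:.3g})')
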
def evaluate_metric(metric, stem, remove_stop, prompt='overall'):
    ''' Compute the correlation between the human eval scores and the scores awarded by the
        eval metric.
    '''
    assert metric in ['ROUGE-1-F', 'ROUGE-2-F', 'ROUGE-L-F', 'bert-human', 'bert-score', 'bart-score', 
        'bleurt-base', 'bleurt-lg', 'mover-1', 'mover-2', 'mover-smd', 'bert-avg-score']
    stemmed_str = "_stem" if stem else ""
    stop_str = "_removestop" if remove_stop else ""
    ranks_file_path = os.path.join('learned_eval/outputs', 'wref_{}{}{}_{}_rank_correlation.csv'.format(metric, stemmed_str, stop_str, prompt))
    print('\n====={}=====\n'.format(ranks_file_path))

    ranks_file = open(ranks_file_path, 'w')
    ranks_file.write('article,summ_id,human_score,metric_score\n')

    sorted_scores = read_sorted_scores()
    input_articles, _ = read_articles()
    # Use NaN for skipped articles so they don't bias the nanmean/nanstd computed below
    corr_data = np.full((len(sorted_scores), 3), np.nan)

    stopwords_list = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    # Init the metric
    if metric == 'bert-human':
        rewarder = Rewarder(os.path.join(MODEL_WEIGHT_DIR, 'sample.model'))
    elif metric.endswith('score'):   
        from bert_score import BERTScorer
        if 'bert-score' == metric:
            rewarder = BERTScorer(lang="en", rescale_with_baseline=True, model_type='roberta-large-mnli')
        elif 'bart-score' == metric:
            rewarder = BERTScorer(lang="en", model_type="facebook/bart-large-mnli", num_layers=12)
        elif 'bert-avg' in metric:
            r1 = BERTScorer(lang="en", rescale_with_baseline=False, model_type='roberta-large')
            r2 = BERTScorer(lang="en", rescale_with_baseline=False, model_type='albert-xxlarge-v2')
            r3 = BERTScorer(lang="en", rescale_with_baseline=False, model_type='bart-large-mnli', num_layers=12)
    elif metric.startswith('bleurt'):
        from bleurt import score
        if 'base' in metric: 
            checkpoint = "bleurt-base-512"
        elif 'lg' in metric: 
            checkpoint = "bleurt-large-512"
        rewarder = score.BleurtScorer(checkpoint)
    elif metric.startswith('mover'):
        from moverscore import get_idf_dict, word_mover_score
        hyps = [s['sys_summ'] for score in sorted_scores.values() for s in score if s['sys_name'] != 'reference']
        refs = [s['sys_summ'] for score in sorted_scores.values() for s in score if s['sys_name'] == 'reference']
        idf_dict_hyp = get_idf_dict(hyps)
        idf_dict_ref = get_idf_dict(refs)
    elif 'rouge' in metric.lower():
        from rouge_score import rouge_scorer
        from rouge_score.scoring import BootstrapAggregator

    # Loop over each article and compute the correlation between human judgement
    # and the metric scores. 
    for i, (article_id, scores) in tqdm(enumerate(sorted_scores.items())):
        scores_list = [s for s in scores if s['sys_name'] != 'reference']
        human_ranks = [s['scores'][prompt] for s in scores_list]
        if len(human_ranks) < 2: 
            continue    # Must be at least 2 scores to compute the correlation
        ref_summ = scores_list[0]['ref']
        article = [entry['article'] for entry in input_articles if entry['id']==article_id][0]

        # Pre-processing (if necessary)
        if stem and remove_stop:
            sys_summs = [" ".join(sent2stokens_wostop(s['sys_summ'], stemmer, stopwords_list, 'english', True)) for s in scores_list]
            ref_summ = " ".join(sent2stokens_wostop(ref_summ, stemmer, stopwords_list, 'english', True))
            article = " ".join(sent2stokens_wostop(article, stemmer, stopwords_list, 'english', True))
        elif not stem and remove_stop:
            sys_summs = [" ".join(sent2tokens_wostop(s['sys_summ'], stopwords_list, 'english', True)) for s in scores_list]
            ref_summ = " ".join(sent2tokens_wostop(ref_summ, stopwords_list, 'english', True))
            article = " ".join(sent2tokens_wostop(article, stopwords_list, 'english', True))
        elif not remove_stop and stem:
            sys_summs = [" ".join(sent2stokens(s['sys_summ'], stemmer, 'english', True)) for s in scores_list]
            ref_summ = " ".join(sent2stokens(ref_summ, stemmer, 'english', True))
            article = " ".join(sent2stokens(article, stemmer, 'english', True))
        else:
            sys_summs = [s['sys_summ'] for s in scores_list]

        # Clean summaries
        summ_ids = [s['summ_id'] for s in scores_list]
        sys_summs = [text_normalization(s) for s in sys_summs]
        ref_summ = text_normalization(ref_summ)
        article = text_normalization(article)

        # Compute metric scores
        if 'rouge' in metric.lower():
            auto_metric_ranks = []
            if '1' in metric:
                rouge_metric = 'rouge1'
            elif '2' in metric:
                rouge_metric = 'rouge2'
            elif 'L' in metric:
                rouge_metric = 'rougeL'
            rew_rouge = rouge_scorer.RougeScorer([rouge_metric], use_stemmer=True)
            for ss in sys_summs:
                ss = ss.replace('. ', '\n')
                ref_summ = ref_summ.replace('. ', '\n')
                score = rew_rouge.score(ref_summ, ss)
                auto_metric_ranks.append(score[rouge_metric].fmeasure)
        elif metric == 'bert-human':
            auto_metric_ranks = [rewarder(ref_summ,ss) for ss in sys_summs]
        elif metric.endswith('score'):   
            if 'bert-score' == metric:
                auto_metric_ranks = [rewarder.score([ref_summ], [ss])[-1].item() for ss in sys_summs]
            elif 'bart-score' == metric:
                auto_metric_ranks = [rewarder.score([ref_summ], [ss])[-1].item() for ss in sys_summs]
            elif 'bert-avg' in metric:
                rewarder_scores = []
                for rewarder in [r1, r2, r3]:
                    r_scores = np.array([rewarder.score([ref_summ], [ss])[-1].item() for ss in sys_summs])
                    r_scores = (r_scores - np.min(r_scores)) / (np.max(r_scores) - np.min(r_scores))
                    rewarder_scores.append(r_scores)
                auto_metric_ranks = list(np.mean(rewarder_scores, axis=0))
        elif metric.startswith('bleurt'):
            auto_metric_ranks = [rewarder.score([ref_summ], [ss])[0] for ss in sys_summs]
        elif metric.startswith('mover'):
            if '1' in metric: 
                n_gram = 1
            elif '2' in metric: 
                n_gram = 2
            else: 
                raise ValueError("smd not implemented currently")
            auto_metric_ranks = [word_mover_score([ref_summ], [ss], idf_dict_ref, idf_dict_hyp,
                                stop_words=[], n_gram=n_gram, remove_subwords=True)[0] for ss in sys_summs]
   
        for sid, amr, hr in zip(summ_ids, auto_metric_ranks, human_ranks):
            ranks_file.write('{},{},{:.2f},{:.4f}\n'.format(article_id, sid, hr, amr))

        # Compute correlations
        spearmanr_result = spearmanr(human_ranks, auto_metric_ranks)
        pearsonr_result = pearsonr(human_ranks, auto_metric_ranks)
        kendalltau_result = kendalltau(human_ranks, auto_metric_ranks)
        corr_data[i, :] = [spearmanr_result[0], pearsonr_result[0], kendalltau_result[0]]

    corr_mean_all = np.nanmean(corr_data, axis=0)
    corr_std_all = np.nanstd(corr_data, axis=0)
    print('\n====={}=====\n'.format(ranks_file_path))
    print("Correlation mean on all data spearman/pearsonr/kendall: {}".format(corr_mean_all))
    print("Correlation std on all data spearman/pearsonr/kendall: {}".format(corr_std_all))

    ranks_file.flush()
    ranks_file.close()

    return ranks_file_path
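
A hypothetical driver (not part of the original snippet) showing how evaluate_metric might be called over several metrics:

if __name__ == '__main__':
    # Evaluate a few metrics without stemming or stop-word removal and collect the
    # paths of the generated rank files; metric names follow the assert at the top.
    rank_files = []
    for metric in ['ROUGE-1-F', 'mover-1', 'bert-score']:
        rank_files.append(evaluate_metric(metric, stem=False, remove_stop=False,
                                          prompt='overall'))
    print(rank_files)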