Example #1
0
# Root directory of the MT data: the 'MT' entry under MOVERSCORE_DIR.
# Reference files and per-language-pair metadata are loaded from here.
data_dir = os.path.join(MOVERSCORE_DIR, 'MT')

# Reference files to evaluate, keyed by file name and mapped to their
# language-pair code. Only cs-en is enabled; the other newstest2017 pairs
# are kept below, commented out, so they can be switched back on.
# A plain dict literal is used: wrapping it in dict(...) only made a copy.
reference_list = {
    "newstest2017-csen-ref.en": "cs-en",
    # "newstest2017-deen-ref.en": "de-en",
    # "newstest2017-ruen-ref.en": "ru-en",
    # "newstest2017-tren-ref.en": "tr-en",
    # "newstest2017-zhen-ref.en": "zh-en",
}

# Label of the metric being evaluated.
metric = 'MoverScore'

# Accumulator list; starts empty (nothing is appended in the visible code).
data = []
# Process one reference file (language pair) at a time and, for each, every
# system output listed in that pair's metadata.
for _ in reference_list.items():
    # Each item is a (reference file name, language-pair code) tuple.
    reference_path, lp = _
    references = load_data(os.path.join(data_dir, reference_path))
    # Split every reference line on single spaces and let the English Moses
    # detokenizer join the tokens back into a string.
    with MosesDetokenizer('en') as detokenize:
        references = [detokenize(ref.split(' ')) for ref in references]
    # Computed once per reference file, outside the per-system loop below.
    idf_dict_ref = get_idf_dict(references)

    # The language-pair code doubles as the name of the metadata entry
    # under data_dir.
    all_meta_data = load_metadata(os.path.join(data_dir, lp))
    for i in tqdm.tqdm(range(len(all_meta_data))):
        # NOTE(review): this unpacking rebinds `lp`, replacing the value taken
        # from reference_list above -- presumably the same code; confirm.
        path, testset, lp, system = all_meta_data[i]
        translations = load_data(path)
        # Same split-and-detokenize treatment as the references.
        with MosesDetokenizer('en') as detokenize:
            translations = [detokenize(hyp.split(' ')) for hyp in translations]
        idf_dict_hyp = get_idf_dict(translations)

        # Empty frame holding only the result column names; recreated for
        # every system.
        df_system = pd.DataFrame(columns=('metric', 'lp', 'testset', 'system',
                                          'sid', 'score'))
        scores = word_mover_score(references,
Example #2
0
import torch
import truecase
from mosestokenizer import MosesDetokenizer
from mt_utils import (find_corpus, load_data, load_metadata,
                      print_sys_level_correlation, print_seg_level_correlation,
                      df_append)

# NOTE(review): `args` is not defined in this excerpt; args.dataset is passed
# to find_corpus and also used below as the root directory of the corpus.
dataset = find_corpus(args.dataset)

wmt_xmoverscores = []
# Each item of dataset is unpacked as (reference file name, language-pair code).
for pair in dataset.items():
    reference_path, lp = pair

    # The language-pair code must contain exactly one '-': "<src>-<tgt>".
    src, tgt = lp.split('-')
    references = load_data(
        os.path.join(args.dataset, 'references/', reference_path))

    # Derive the source file name from the reference file name: swap 'ref'
    # for 'src', keep everything before the first '.', and append the source
    # language as extension (e.g. 'x-ref.en' with src 'cs' -> 'x-src.cs').
    # NOTE(review): replace() rewrites every 'ref' occurrence and split('.')
    # cuts at the first dot -- fine only if file names follow that pattern.
    source_path = reference_path.replace('ref', 'src')
    source_path = source_path.split('.')[0] + '.' + src
    source = load_data(os.path.join(args.dataset, 'source', source_path))

    all_meta_data = load_metadata(
        os.path.join(args.dataset, 'system-outputs', lp))

    # Detokenize source lines with the source-language Moses detokenizer and
    # reference lines with the target-language one.
    with MosesDetokenizer(src) as detokenize:
        source = [detokenize(s.split(' ')) for s in source]
    with MosesDetokenizer(tgt) as detokenize:
        references = [detokenize(s.split(' ')) for s in references]

    # Device string is hard-coded to the first CUDA device.
    device = 'cuda:0'
    temp = np.loadtxt('mapping/europarl-v7.' + src + '-' + tgt +