Example #1
import os

from examples.common.util.download import download_from_google_drive  # path assumed; adjust to where this repo's download helper lives
from examples.wmt_2020.common.util.reader import read_annotated_file, read_test_file
from examples.wmt_2020.ru_en.transformer_config import TEMP_DIRECTORY, DRIVE_FILE_ID, GOOGLE_DRIVE, MODEL_NAME, \
    transformer_config, MODEL_TYPE, SEED, RESULT_FILE, RESULT_IMAGE, SUBMISSION_FILE
from transquest.algo.transformers.evaluation import pearson_corr, spearman_corr
from transquest.algo.transformers.run_model import QuestModel

if not os.path.exists(TEMP_DIRECTORY):
    os.makedirs(TEMP_DIRECTORY)

if GOOGLE_DRIVE:
    download_from_google_drive(DRIVE_FILE_ID, MODEL_NAME)

TRAIN_FILE = "examples/wmt_2020/ru_en/data/ru-en/train.ruen.df.short.tsv"
DEV_FILE = "examples/wmt_2020/ru_en/data/ru-en/dev.ruen.df.short.tsv"
TEST_FILE = "examples/wmt_2020/ru_en/data/ru-en/test20.ruen.df.short.tsv"

train = read_annotated_file(TRAIN_FILE, index="segid")
dev = read_annotated_file(DEV_FILE, index="segid")
test = read_test_file(TEST_FILE, index="segid")

train = train[['original', 'translation', 'z_mean']]
dev = dev[['original', 'translation', 'z_mean']]
test = test[['index', 'original', 'translation']]

index = test['index'].to_list()

train = train.rename(columns={
    'original': 'text_a',
    'translation': 'text_b',
    'z_mean': 'labels'
}).dropna()
dev = dev.rename(columns={
    'original': 'text_a',
    'translation': 'text_b',
    'z_mean': 'labels'
}).dropna()
Example #2
import os

from examples.wmt_2020.common.util.normalizer import fit, un_fit
from examples.wmt_2020.common.util.postprocess import format_submission
from examples.wmt_2020.common.util.reader import read_annotated_file, read_test_file
from examples.wmt_2020.en_zh.transformer_config import TEMP_DIRECTORY, MODEL_TYPE, MODEL_NAME, transformer_config, SEED, \
    RESULT_FILE, RESULT_IMAGE, SUBMISSION_FILE
from transquest.algo.transformers.evaluation import pearson_corr, spearman_corr
from transquest.algo.transformers.run_model import QuestModel

if not os.path.exists(TEMP_DIRECTORY):
    os.makedirs(TEMP_DIRECTORY)

TRAIN_FILE = "examples/wmt_2020/en_zh/data/en-zh/train.enzh.df.short.tsv"
DEV_FILE = "examples/wmt_2020/en_zh/data/en-zh/dev.enzh.df.short.tsv"
TEST_FILE = "examples/wmt_2020/en_zh/data/en-zh/test20.enzh.df.short.tsv"

train = read_annotated_file(TRAIN_FILE)
dev = read_annotated_file(DEV_FILE)
test = read_test_file(TEST_FILE)

train = train[['original', 'translation', 'z_mean']]
dev = dev[['original', 'translation', 'z_mean']]
test = test[['index', 'original', 'translation']]

index = test['index'].to_list()
train = train.rename(columns={
    'original': 'text_a',
    'translation': 'text_b',
    'z_mean': 'labels'
}).dropna()
dev = dev.rename(columns={
    'original': 'text_a',
    'translation': 'text_b',
    'z_mean': 'labels'
}).dropna()
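
Both snippets above stop after the column renaming. For context, here is a minimal sketch of the step that typically follows, reusing the train/dev/test frames and the config names imported in Example #2 and assuming QuestModel exposes the simpletransformers-style train_model/predict interface suggested by those imports; any argument name not imported above is an assumption.

import torch

# Sketch only: num_labels=1 because quality estimation is a regression task;
# transformer_config, MODEL_TYPE and MODEL_NAME come from the example's config module.
model = QuestModel(MODEL_TYPE, MODEL_NAME, num_labels=1,
                   use_cuda=torch.cuda.is_available(), args=transformer_config)

# Train on the renamed frames, reporting Pearson/Spearman against the dev set.
model.train_model(train, eval_df=dev,
                  pearson_corr=pearson_corr, spearman_corr=spearman_corr)

# Score the held-out test pairs; predictions line up with the saved `index` list.
test_sentence_pairs = list(map(list, zip(test['original'].to_list(),
                                         test['translation'].to_list())))
predictions, raw_outputs = model.predict(test_sentence_pairs)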
Example #3
    #
    # "SI-EN": ["examples/wmt_2020/si_en/data/si-en/train.sien.df.short.tsv",
    #           "examples/wmt_2020/si_en/data/si-en/dev.sien.df.short.tsv",
    #           "examples/wmt_2020/si_en/data/si-en/test20.sien.df.short.tsv"],
}

train_list = []
dev_list = []
test_list = []
index_list = []
test_sentence_pairs_list = []

for key, value in languages.items():

    if key == "RU-EN":
        train_temp = read_annotated_file(value[0], index="segid")
        dev_temp = read_annotated_file(value[1], index="segid")
        test_temp = read_test_file(value[2], index="segid")

    else:
        train_temp = read_annotated_file(value[0])
        dev_temp = read_annotated_file(value[1])
        test_temp = read_test_file(value[2])

    train_temp = train_temp[['original', 'translation', 'z_mean']]
    dev_temp = dev_temp[['original', 'translation', 'z_mean']]
    test_temp = test_temp[['index', 'original', 'translation']]

    index_temp = test_temp['index'].to_list()
    train_temp = train_temp.rename(columns={
        'original': 'text_a',
        'translation': 'text_b',
        'z_mean': 'labels'
    }).dropna()
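
The loop is cut off at this point; presumably each prepared frame is appended to the lists initialised before the loop. As a hedged sketch of how the per-language frames could then be combined into a single multilingual training set (the variable names train_all and dev_all are illustrative, not from the original):

import pandas as pd

# Assumes the remainder of the loop body appends the renamed frames
# to train_list and dev_list for every language pair.
train_all = pd.concat(train_list, ignore_index=True)
dev_all = pd.concat(dev_list, ignore_index=True)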