Example #1
0
# Configure logging at INFO level with timestamped messages, passing a
# LoggingHandler instance as the handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)

# Make sure the working directory exists. exist_ok=True replaces the
# check-then-create pattern, which could fail with FileExistsError if
# another process created the directory between the check and the call.
# NOTE: this now raises FileExistsError if the path exists but is not a
# directory, instead of silently continuing.
os.makedirs(TEMP_DIRECTORY, exist_ok=True)

# Locations of the RU-EN train / dev / test TSV files.
TRAIN_FILE = "examples/sentence_level/wmt_2020/ru_en/data/ru-en/train.ruen.df.short.tsv"
DEV_FILE = "examples/sentence_level/wmt_2020/ru_en/data/ru-en/dev.ruen.df.short.tsv"
TEST_FILE = "examples/sentence_level/wmt_2020/ru_en/data/ru-en/test20.ruen.df.short.tsv"

# Train and dev go through read_annotated_file, test through read_test_file;
# all three are read with index="segid".
train = read_annotated_file(TRAIN_FILE, index="segid")
dev = read_annotated_file(DEV_FILE, index="segid")
test = read_test_file(TEST_FILE, index="segid")

# Capture the test ids now: the 'index' column is dropped from `test` below.
index = test["index"].to_list()

# Train: keep source, translation and score, rename them to
# text_a / text_b / labels, and discard rows with missing values.
train = (
    train[["original", "translation", "z_mean"]]
    .rename(columns=dict(original="text_a", translation="text_b", z_mean="labels"))
    .dropna()
)

# Dev and test are narrowed to the same text columns (dev keeps the score).
dev = dev[["original", "translation", "z_mean"]]
test = test[["original", "translation"]]
dev = dev.rename(columns={
    'original': 'text_a',
    'translation': 'text_b',
Example #2
0
# Configure logging at INFO level with timestamped messages, passing a
# LoggingHandler instance as the handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)

# Make sure the working directory exists. exist_ok=True replaces the
# check-then-create pattern, which could fail with FileExistsError if
# another process created the directory between the check and the call.
# NOTE: this now raises FileExistsError if the path exists but is not a
# directory, instead of silently continuing.
os.makedirs(TEMP_DIRECTORY, exist_ok=True)

# Locations of the SI-EN train / dev / test TSV files.
TRAIN_FILE = "examples/sentence_level/wmt_2020/si_en/data/si-en/train.sien.df.short.tsv"
DEV_FILE = "examples/sentence_level/wmt_2020/si_en/data/si-en/dev.sien.df.short.tsv"
TEST_FILE = "examples/sentence_level/wmt_2020/si_en/data/si-en/test20.sien.df.short.tsv"

# Train and dev go through read_annotated_file, test through read_test_file.
train = read_annotated_file(TRAIN_FILE)
dev = read_annotated_file(DEV_FILE)
test = read_test_file(TEST_FILE)

# Capture the test ids now: the 'index' column is dropped from `test` below.
index = test["index"].to_list()

# Train: keep source, translation and score, rename them to
# text_a / text_b / labels, and discard rows with missing values.
train = (
    train[["original", "translation", "z_mean"]]
    .rename(columns=dict(original="text_a", translation="text_b", z_mean="labels"))
    .dropna()
)

# Dev and test are narrowed to the same text columns (dev keeps the score).
dev = dev[["original", "translation", "z_mean"]]
test = test[["original", "translation"]]
dev = dev.rename(columns={
    'original': 'text_a',
    'translation': 'text_b',
    'z_mean': 'labels'
Example #3
0
        "examples/sentence_level/wmt_2020/si_en/data/si-en/test20.sien.df.short.tsv"
    ],
}

# Empty accumulators, each bound to its own distinct list object
# (presumably filled by the loop over `languages` that follows).
train_list, dev_list, test_list = [], [], []
index_list, test_sentence_pairs_list = [], []

for key, value in languages.items():

    if key == "RU-EN":
        train_temp = read_annotated_file(value[0], index="segid")
        dev_temp = read_annotated_file(value[1], index="segid")
        test_temp = read_test_file(value[2], index="segid")

    else:
        train_temp = read_annotated_file(value[0])
        dev_temp = read_annotated_file(value[1])
        test_temp = read_test_file(value[2])

    train_temp = train_temp[['original', 'translation', 'z_mean']]
    dev_temp = dev_temp[['original', 'translation', 'z_mean']]
    test_temp = test_temp[['index', 'original', 'translation']]

    index_temp = test_temp['index'].to_list()
    train_temp = train_temp.rename(columns={
        'original': 'text_a',
        'translation': 'text_b',
        'z_mean': 'labels'