Beispiel #1
0
        # One column of test predictions per fold, one row per test example.
        fold_total = siamesetransquest_config["n_fold"]
        test_preds = np.zeros((len(test), fold_total))
        for i in range(fold_total):

            # Delete any existing best-model and cache directories before
            # training this fold.
            for stale_key in ('best_model_dir', 'cache_dir'):
                stale_dir = siamesetransquest_config[stale_key]
                if os.path.isdir(stale_dir):
                    shutil.rmtree(stale_dir)

            # Hold out 10% of `train` for evaluation; the split seed is
            # SEED * i, so it changes with the loop index.
            train_df, eval_df = train_test_split(
                train,
                test_size=0.1,
                random_state=SEED * i,
            )
            model = SiameseTransQuestModel(
                MODEL_NAME,
                args=siamesetransquest_config,
            )
            model.train_model(train_df, eval_df)

            # Re-create the model from the path stored under 'best_model_dir'
            # and store its predictions in column i of each matrix.
            model = SiameseTransQuestModel(
                siamesetransquest_config['best_model_dir'],
            )
            dev_preds[:, i] = model.predict(dev_sentence_pairs)
            test_preds[:, i] = model.predict(test_sentence_pairs)

        # Final prediction per row is the mean over the per-fold columns.
        dev['predictions'] = dev_preds.mean(axis=1)
        test['predictions'] = test_preds.mean(axis=1)

# Pass the dev 'labels' and 'predictions' columns, then the test
# 'predictions' column, through un_fit.
# NOTE(review): un_fit is defined elsewhere; presumably it undoes an earlier
# rescaling of these columns - confirm.
for column in ('labels', 'predictions'):
    dev = un_fit(dev, column)
test = un_fit(test, 'predictions')

# Write the dev frame as a tab-separated UTF-8 file without the index.
result_path = os.path.join(TEMP_DIRECTORY, RESULT_FILE)
dev.to_csv(result_path, header=True, sep='\t', index=False, encoding='utf-8')

# Plot and print labels-vs-predictions statistics for the dev set.
image_path = os.path.join(TEMP_DIRECTORY, RESULT_IMAGE)
draw_scatterplot(dev, 'labels', 'predictions', image_path, "English-Chinese")
print_stat(dev, 'labels', 'predictions')

# Write the test predictions in submission format for the en-zh pair.
submission_path = os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE)
format_submission(
    df=test,
    index=index,
    language_pair="en-zh",
    method="SiameseTransQuest",
    path=submission_path,
)
Beispiel #2
0
                                                 test_size=0.1,
                                                 random_state=SEED * i)
            # Build a new model from MODEL_NAME with the shared config and
            # train it on the train/eval split produced just above.
            # NOTE(review): the enclosing loop header is outside this excerpt;
            # `i` is presumably the fold index - confirm.
            model = SiameseTransQuestModel(MODEL_NAME,
                                           args=siamesetransquest_config)
            model.train_model(train_df, eval_df)

            # Re-create the model from the path stored under 'best_model_dir'
            # and write its predictions into column i of each matrix.
            model = SiameseTransQuestModel(
                siamesetransquest_config['best_model_dir'])
            dev_preds[:, i] = model.predict(dev_sentence_pairs)
            test_preds[:, i] = model.predict(test_sentence_pairs)

        # Average across columns (axis=1) to get one prediction per row.
        dev['predictions'] = dev_preds.mean(axis=1)
        test['predictions'] = test_preds.mean(axis=1)

# Pass the dev 'labels' and 'predictions' columns, then the test
# 'predictions' column, through un_fit.
# NOTE(review): un_fit is defined elsewhere; presumably it undoes an earlier
# rescaling of these columns - confirm.
dev = un_fit(un_fit(dev, 'labels'), 'predictions')
test = un_fit(test, 'predictions')

# Output locations, all inside TEMP_DIRECTORY.
dev_result_path = os.path.join(TEMP_DIRECTORY, RESULT_FILE)
scatter_image_path = os.path.join(TEMP_DIRECTORY, RESULT_IMAGE)
test_submission_path = os.path.join(TEMP_DIRECTORY, SUBMISSION_FILE)

# Write the dev frame as a tab-separated UTF-8 file without the index.
dev.to_csv(dev_result_path, header=True, sep='\t', index=False, encoding='utf-8')

# Plot and print labels-vs-predictions statistics for the dev set.
draw_scatterplot(dev, 'labels', 'predictions', scatter_image_path, "Sinhala-English")
print_stat(dev, 'labels', 'predictions')

# Write the test predictions in submission format for the si-en pair.
format_submission(df=test, index=index, language_pair="si-en",
                  method="SiameseTransQuest", path=test_submission_path)
Beispiel #3
0
def _language_path(filename, language):
    """Return the TEMP_DIRECTORY path for *filename* tagged with *language*.

    "_<language>" is inserted immediately before the final extension, e.g.
    "result.tsv" -> "result_RU-EN.tsv". os.path.splitext is used instead of
    splitting on "." so that names with several dots keep their real
    extension and names without any dot do not raise IndexError.
    """
    stem, extension = os.path.splitext(filename)
    return os.path.join(TEMP_DIRECTORY, stem + "_" + language + extension)


# For every language pair: pass the label/prediction columns through un_fit,
# write the dev results and scatter plot to per-language files, and print the
# dev statistics.
for dev, test, index, language in zip(dev_list, test_list, index_list,
                                      languages):
    dev = un_fit(dev, 'labels')
    dev = un_fit(dev, 'predictions')
    test = un_fit(test, 'predictions')

    # Tab-separated UTF-8 dump of the dev frame, without the index.
    dev.to_csv(_language_path(RESULT_FILE, language),
               header=True,
               sep='\t',
               index=False,
               encoding='utf-8')
    draw_scatterplot(dev, 'labels', 'predictions',
                     _language_path(RESULT_IMAGE, language), language)
    print_stat(dev, 'labels', 'predictions')

    # A submission file is only produced for the RU-EN pair.
    if language == "RU-EN":
        format_submission(df=test,
                          index=index,
                          language_pair=language.lower(),
                          method="TransQuest",
                          path=_language_path(SUBMISSION_FILE, language),
                          index_type="Auto")