Beispiel #1
0
# For every (dev, test) pair, allocate two zero-filled matrices with one row
# per item of the set and config["n_fold"] columns, and register them in the
# corresponding *_preds_list.
for dev, test in zip(dev_list, test_list):
    dev_preds, test_preds = (
        np.zeros((len(split), config["n_fold"])) for split in (dev, test)
    )
    dev_preds_list.append(dev_preds)
    test_preds_list.append(test_preds)

# Run config["n_fold"] independent training rounds. Each round wipes the
# output directory, trains a fresh model on its own train/eval split, and
# then reloads the model from config["best_model_dir"].
for i in range(config["n_fold"]):
    # Start every fold from a clean output directory.
    if os.path.exists(config['output_dir']) and os.path.isdir(
            config['output_dir']):
        shutil.rmtree(config['output_dir'])
    print("Started Fold {}".format(i))
    # Fresh model built from MODEL_NAME with a single output label
    # (num_labels=1); CUDA is used whenever torch reports it as available.
    model = ClassificationModel(MODEL_TYPE,
                                MODEL_NAME,
                                args=config,
                                num_labels=1,
                                use_cuda=torch.cuda.is_available())
    # Split `train` with test_size=0.2. The seed is SEED * i, so it differs
    # per fold (and is 0 for the first fold when SEED is numeric).
    train_df, eval_df = train_test_split(train,
                                         test_size=0.2,
                                         random_state=SEED * i)
    # mean_absolute_error is handed to train_model under the keyword `mae`.
    model.train_model(train_df, eval_df=eval_df, mae=mean_absolute_error)
    # Replace the in-memory model with one loaded from
    # config["best_model_dir"] -- presumably the best checkpoint written
    # during training; TODO confirm against the training configuration.
    model = ClassificationModel(MODEL_TYPE,
                                config["best_model_dir"],
                                args=config,
                                num_labels=1,
                                use_cuda=torch.cuda.is_available())

    # NOTE(review): the body of the loop below is missing from this excerpt;
    # it only shows that the sentence lists and the prediction matrices are
    # walked in lockstep for the current fold.
    for dev_sentences, test_sentences, dev_preds, test_preds in zip(
            dev_sentences_list, test_sentences_list, dev_preds_list,
            test_preds_list):
print("--> dev preprocess tokenization done!")

# Read the tab-separated test file.
test = pd.read_csv(
    os.path.join(
        "examples",
        "arabic",
        "data",
        "covid19_disinfo_binary_arabic_test_input.tsv",
    ),
    sep='\t',
)

# Dev texts are taken as they are at this point.
dev_sentences = dev['text'].tolist()

# Test texts are run through arabert_prep.preprocess before being listed.
test['text'] = test['text'].apply(arabert_prep.preprocess)
test_sentences = test['text'].tolist()

# One zero-filled matrix per split: a row per sentence, a column per fold.
dev_preds, test_preds = (
    np.zeros((len(sentences), config["n_fold"]))
    for sentences in (dev_sentences, test_sentences)
)

# Train and evaluate one model per fold; fold i writes its predictions into
# column i of dev_preds and test_preds.
for i in range(config["n_fold"]):
    # isdir() is already False for a path that does not exist, so a single
    # check is enough before clearing the previous fold's output.
    if os.path.isdir(config['output_dir']):
        shutil.rmtree(config['output_dir'])
    print("Started Fold {}".format(i))

    # Fresh model for this fold, trained on a fold-specific split
    # (test_size=0.1, seed SEED * i).
    model = ClassificationModel(
        MODEL_TYPE,
        MODEL_NAME,
        args=config,
        use_cuda=torch.cuda.is_available(),
    )
    train_df, eval_df = train_test_split(
        train,
        test_size=0.1,
        random_state=SEED * i,
    )
    model.train_model(
        train_df,
        eval_df=eval_df,
        precision=precision,
        recall=recall,
        f1=f1,
    )

    # Swap in the model stored under config["best_model_dir"] before
    # predicting.
    model = ClassificationModel(
        MODEL_TYPE,
        config["best_model_dir"],
        args=config,
        use_cuda=torch.cuda.is_available(),
    )

    # predict() returns a (predictions, raw outputs) pair; only the
    # predictions are stored.
    predictions, raw_outputs = model.predict(dev_sentences)
    dev_preds[:, i] = predictions

    test_predictions, test_raw_outputs = model.predict(test_sentences)
    test_preds[:, i] = test_predictions

    print("Completed Fold {}".format(i))

# select majority class of each instance (row)
dev_predictions = []