def test_it(MODEL_PATH='roberta-base'):
    """Run ensemble inference on ``data/test.csv`` and write ``submission.csv``.

    Every checkpoint file found under ``type/<name>/`` is loaded into a
    ``TweetModel`` constructed with ``MODEL_PATH=<name>``.  For each test
    batch the start/end softmax outputs of all models are averaged, the
    arg-max positions are decoded with ``get_selected_text`` and the
    resulting strings are written to ``submission.csv`` using the layout of
    ``data/sample_submission.csv``.

    Args:
        MODEL_PATH: passed to ``get_test_loader`` as its ``MODEL_PATH``
            argument.

    Raises:
        FileNotFoundError: if no checkpoint file exists under ``type/``.
    """
    # Use the GPU when there is one; otherwise run everything on the CPU
    # instead of failing on the first ``.cuda()`` call.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def _squash_punctuation(text):
        # Only single-token predictions are rewritten.  The replacement
        # order ('!!!!' -> '..' -> '...') is kept exactly as it was because
        # each step operates on the output of the previous one.
        if len(text.split()) != 1:
            return text
        return text.replace('!!!!', '!').replace('..', '.').replace('...', '.')

    # Sorted listings make the ensemble order reproducible across runs.
    models = []
    for model_type in sorted(os.listdir('type')):
        type_dir = os.path.join('type', model_type)
        for model_file in sorted(os.listdir(type_dir)):
            model = TweetModel(MODEL_PATH=model_type)
            # Models are parked on the CPU and moved to the device only
            # while they are evaluated (see the batch loop below).
            model.cpu()
            # map_location keeps the checkpoint tensors off the GPU and lets
            # GPU-saved checkpoints load on CPU-only hosts.
            state_dict = torch.load(os.path.join(type_dir, model_file),
                                    map_location='cpu')
            model.load_state_dict(state_dict)
            model.eval()
            models.append(model)

    if not models:
        raise FileNotFoundError("no model checkpoints found under 'type/'")

    test_df = pd.read_csv('data/test.csv')
    test_df['text'] = test_df['text'].astype(str)
    test_loader = get_test_loader(test_df, MODEL_PATH=MODEL_PATH)

    predictions = []
    for data in test_loader:
        ids = data['ids'].to(device)
        masks = data['masks'].to(device)
        tweet = data['tweet']
        offsets = data['offsets'].numpy()
        sentiment = data['sentiment']

        start_probs = []
        end_probs = []
        with torch.no_grad():
            for model in models:
                # Only one model occupies the device at a time.
                model.to(device)
                output = model(ids, masks)
                start_probs.append(
                    torch.softmax(output[0], dim=1).cpu().numpy())
                end_probs.append(
                    torch.softmax(output[1], dim=1).cpu().numpy())
                model.cpu()

        # Average the per-model probabilities over the ensemble axis.
        start_probs = np.mean(start_probs, axis=0)
        end_probs = np.mean(end_probs, axis=0)

        for i in range(len(ids)):
            start_pred = np.argmax(start_probs[i])
            end_pred = np.argmax(end_probs[i])
            original_tweet = tweet[i]

            # Neutral tweets, tweets with fewer than two words and inverted
            # spans all fall back to the full tweet, so the span decoder is
            # only called when its result is actually used.
            keep_whole_tweet = (
                sentiment[i] == "neutral"
                or len(original_tweet.split()) < 2
                or start_pred > end_pred
            )
            if keep_whole_tweet:
                pred = original_tweet
            else:
                pred = get_selected_text(original_tweet, start_pred,
                                         end_pred, offsets[i])
            predictions.append(pred)

    sub_df = pd.read_csv('data/sample_submission.csv')
    sub_df['selected_text'] = predictions
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        _squash_punctuation)
    sub_df.to_csv('submission.csv', index=False)
def main(seed=42, num_epochs=3, batch_size=32, n_splits=10):
    """Train one model per stratified fold, then ensemble them on the test set.

    Training: ``data/train.csv`` is split with ``StratifiedKFold`` on the
    ``sentiment`` column; for every fold a fresh ``TweetModel`` is handed to
    ``train_model`` together with the file name ``roberta_fold<k>.pth``.

    Inference: the ``roberta_fold<k>.pth`` checkpoints are loaded, their
    start/end softmax outputs are averaged per test batch, spans are decoded
    with ``get_selected_text`` and the result is written to
    ``submission.csv``.

    Args:
        seed: value given to ``seed_everything`` and to the fold splitter.
        num_epochs: number of epochs passed to ``train_model``.
        batch_size: batch size passed to ``get_train_val_loaders``.
        n_splits: number of stratified folds (and of ensemble members).
    """
    seed_everything(seed)

    def _squash_punctuation(text):
        # Only single-token predictions are rewritten.  The replacement
        # order ('!!!!' -> '..' -> '...') is kept exactly as it was because
        # each step operates on the output of the previous one.
        if len(text.split()) != 1:
            return text
        return text.replace('!!!!', '!').replace('..', '.').replace('...', '.')

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    train_df = pd.read_csv('data/train.csv')
    train_df['text'] = train_df['text'].astype(str)
    train_df['selected_text'] = train_df['selected_text'].astype(str)

    fold_splits = skf.split(train_df, train_df.sentiment)
    for fold, (train_idx, val_idx) in enumerate(fold_splits, start=1):
        print(f'Fold: {fold}')

        model = TweetModel()
        optimizer = optim.AdamW(model.parameters(), lr=3e-5,
                                betas=(0.9, 0.999))
        criterion = loss_fn
        dataloaders_dict = get_train_val_loaders(train_df, train_idx,
                                                 val_idx, batch_size)

        train_model(model, dataloaders_dict, criterion, optimizer,
                    num_epochs, f'roberta_fold{fold}.pth')

    # ------------------------------------------------------------------
    # Inference
    # ------------------------------------------------------------------
    # Use the GPU when there is one; otherwise run inference on the CPU
    # instead of failing on the first ``.cuda()`` call.
    # NOTE(review): this fallback covers only the inference code below;
    # whether ``train_model`` itself needs CUDA is not visible here.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    test_df = pd.read_csv('data/test.csv')
    test_df['text'] = test_df['text'].astype(str)
    test_loader = get_test_loader(test_df)

    models = []
    for fold in range(1, skf.n_splits + 1):
        model = TweetModel()
        model.to(device)
        # map_location makes the checkpoint loadable whatever device it was
        # saved from.
        model.load_state_dict(
            torch.load(f'roberta_fold{fold}.pth', map_location=device))
        model.eval()
        models.append(model)

    predictions = []
    for data in test_loader:
        ids = data['ids'].to(device)
        masks = data['masks'].to(device)
        tweet = data['tweet']
        offsets = data['offsets'].numpy()

        start_probs = []
        end_probs = []
        with torch.no_grad():
            for model in models:
                output = model(ids, masks)
                start_probs.append(
                    torch.softmax(output[0], dim=1).cpu().numpy())
                end_probs.append(
                    torch.softmax(output[1], dim=1).cpu().numpy())

        # Average the per-fold probabilities over the ensemble axis.
        start_probs = np.mean(start_probs, axis=0)
        end_probs = np.mean(end_probs, axis=0)

        for i in range(len(ids)):
            start_pred = np.argmax(start_probs[i])
            end_pred = np.argmax(end_probs[i])
            if start_pred > end_pred:
                # Inverted span: fall back to the full tweet.
                pred = tweet[i]
            else:
                pred = get_selected_text(tweet[i], start_pred, end_pred,
                                         offsets[i])
            predictions.append(pred)

    # ------------------------------------------------------------------
    # Submission
    # ------------------------------------------------------------------
    sub_df = pd.read_csv('data/sample_submission.csv')
    sub_df['selected_text'] = predictions
    sub_df['selected_text'] = sub_df['selected_text'].apply(
        _squash_punctuation)
    sub_df.to_csv('submission.csv', index=False)