import torch
import torch.nn as nn

# Pick the GPU if one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# load_file and SentimentModel are defined elsewhere in the project.
# The datasets are named *_data to avoid shadowing the train() function below.
TEXT, LABEL, train_data, valid_data, test_data, train_iter, valid_iter, test_iter = \
    load_file(filepath='data/', device=device)

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = SentimentModel(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
optimizer = torch.optim.SGD(model.parameters(), lr=3e-3)
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)


def binary_accuracy(preds, y):
    # Accuracy for binary predictions given as raw logits.
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc


def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
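    # The snippet above stops before the body of the training loop. What
    # follows is a minimal sketch of the rest of train(), plus a matching
    # evaluate(), assuming torchtext-style batches exposing .text and .label;
    # it is not taken verbatim from the project.
    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)   # raw logits, shape [batch]
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(model, iterator, criterion):
    # Same loop without gradient updates, for the validation/test iterators.
    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)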
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import BertConfig

# config, CommentData and SentimentModel are defined elsewhere in the project.


def predict():
    # k-fold style data input: wrap the test set in the same CommentData
    # dataset used for training, with dummy label/sentiment columns.
    data = pd.read_csv(config.TEST_FILE)
    data['Label_encoded'] = 0
    data['Sentiment_encoded'] = 0
    df_test = data

    test_data = CommentData(comments=df_test['Comment'],
                            labels=df_test['Label_encoded'],
                            sentiments=df_test['Sentiment_encoded'])
    test_dataloader = torch.utils.data.DataLoader(
        test_data,
        batch_size=config.TEST_BATCH_SIZE,
        # num_workers=4
    )

    # Load the five fold models and put them in eval mode.
    device = torch.device('cuda')
    model_config = BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True

    models = []
    for fold in range(5):
        model = SentimentModel(model_config, config.OUTPUT_SIZE_SENTIMENT)
        model.to(device)
        # model = nn.DataParallel(model)
        model.load_state_dict(
            torch.load(config.SAVED_MODEL_PATH + f'/model_{fold}.bin',
                       map_location=device))
        model.eval()
        models.append(model)

    # Run the ensemble over the test set and average the per-fold outputs.
    model_prediction = []
    with torch.no_grad():
        tq0 = tqdm(test_dataloader, total=len(test_dataloader))
        for bi, data in enumerate(tq0):
            # Load the batch and move it to the device.
            input_ids = data['input_ids'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            attention_mask = data['attention_mask'].to(device, dtype=torch.long)
            label = data['label'].to(device, dtype=torch.long)
            sentiment = data['sentiment'].to(device, dtype=torch.long)

            # forward(self, ids, mask, type_ids)
            outs = [m(ids=input_ids, mask=attention_mask, type_ids=token_type_ids)
                    for m in models]
            out = (outs[0] + outs[1] + outs[2] + outs[3] + outs[4]) / 5
            out = torch.softmax(out, dim=1).cpu().numpy()

            for result in out:
                model_prediction.append(np.argmax(result))

    sample = pd.read_csv(config.TEST_FILE)
    sample['sentiment_pred'] = model_prediction
    sample.to_csv(config.OUTPUT_PATH + '/pred_sentiment.csv', index=False)
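# For reference, predict() above assumes that CommentData yields dicts with the
# keys 'input_ids', 'token_type_ids', 'attention_mask', 'label' and 'sentiment'.
# The class below is only a sketch of such a dataset; the tokenizer choice and
# max_len=128 are assumptions, not the project's actual implementation.
from transformers import BertTokenizer


class CommentDataSketch(torch.utils.data.Dataset):
    def __init__(self, comments, labels, sentiments):
        self.comments = comments.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.sentiments = sentiments.reset_index(drop=True)
        self.tokenizer = BertTokenizer.from_pretrained(config.BERT_PATH)
        self.max_len = 128  # assumed maximum sequence length

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, idx):
        enc = self.tokenizer(
            str(self.comments[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt',
        )
        return {
            'input_ids': enc['input_ids'].squeeze(0),
            'token_type_ids': enc['token_type_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long),
            'sentiment': torch.tensor(self.sentiments[idx], dtype=torch.long),
        }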
import pandas as pd
import torch
from transformers import AdamW, RobertaConfig, get_linear_schedule_with_warmup

# config, engine, utils, CommentData and SentimentModel are project modules.


def run(fold=0):
    # k-fold style data input: rows tagged with this fold become the
    # validation set, everything else the training set.
    data = pd.read_csv(config.TRAIN_FOLDS_FILE)
    df_train = data[data['kfold'] != fold].reset_index(drop=True)
    df_valid = data[data['kfold'] == fold].reset_index(drop=True)

    train_data = CommentData(comments=df_train['Comment'],
                             labels=df_train['Label_encoded'],
                             sentiments=df_train['Sentiment_encoded'])
    train_dataloader = torch.utils.data.DataLoader(
        train_data,
        batch_size=config.TRAIN_BATCH_SIZE,
        # num_workers=4
    )

    valid_data = CommentData(comments=df_valid['Comment'],
                             labels=df_valid['Label_encoded'],
                             sentiments=df_valid['Sentiment_encoded'])
    valid_dataloader = torch.utils.data.DataLoader(
        valid_data,
        batch_size=config.VALID_BATCH_SIZE,
        # num_workers=4
    )

    device = torch.device('cuda')
    model_config = RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = SentimentModel(model_config, config.OUTPUT_SIZE)
    model.to(device)

    # Do not apply weight decay to bias and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001,
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]

    num_train_steps = int(len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    # train_fn(data_loader, model, device, optimizer, scheduler=None)
    train_loss_rec = []
    eval_loss_rec = []
    early_stopping = utils.EarlyStopping(patience=5, mode='min')
    for epoch in range(config.EPOCHS):
        print(f'########### fold = {fold} epoch = {epoch} ############')
        loss_train = engine.train_fn(data_loader=train_dataloader,
                                     model=model,
                                     device=device,
                                     optimizer=optimizer,
                                     scheduler=scheduler)
        train_loss_rec.append(loss_train)
        losses_eval = engine.eval_fn(valid_dataloader, model, device)
        eval_loss_rec.append(losses_eval)
        print(f'train_loss = {loss_train} eval_loss = {losses_eval}')

        # print(f'save model_{fold}.bin')
        # torch.save(model.state_dict(), config.OUTPUT_PATH + f'/model_{fold}.bin')
        early_stopping(losses_eval, model,
                       model_path=config.OUTPUT_PATH + f'/model_label_{fold}.bin')
        if early_stopping.early_stop:
            print('Early stopping')
            break
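# The file names used above imply five folds: predict() loads model_0.bin
# through model_4.bin. A driver along the lines of the sketch below would tie
# run() and predict() together; it is not code taken from the project. Note
# that run() as written saves model_label_{fold}.bin, so the checkpoint paths
# would need to be aligned before ensembling with predict().
if __name__ == '__main__':
    for fold in range(5):
        run(fold=fold)
    predict()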
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
from transformers import AdamW, get_cosine_schedule_with_warmup

# SentConfig, SentimentDL, SentimentModel, preprocess_text, train_function and
# evaluation_function are defined elsewhere in the project.


def preprocess_and_train():
    # Read the dataset and name its columns.
    data = pd.read_csv('./training.1600000.processed.noemoticon.csv',
                       encoding='latin-1', header=None)
    data.columns = ['target', 'uid', 'time', 'query', 'user', 'text']

    # Create a new dataframe with the cleaned text and the tweet length.
    sent_df = pd.DataFrame(None, columns=('target', 'text'))
    sent_df['target'] = data['target']
    sent_df['text'] = data['text'].apply(preprocess_text)
    sent_df['tweet_size'] = data['text'].apply(lambda x: len(x.split()))

    # Select a random sample of 400,000 tweets (200,000 per class) from the
    # full dataset so that training runs on a smaller, balanced dataset.
    sent_df_sample = pd.concat([
        sent_df[(sent_df['tweet_size'] > 10) & (sent_df['target'] == 0)]
            .sample(n=200000, random_state=SentConfig.SEED),
        sent_df[(sent_df['tweet_size'] > 10) & (sent_df['target'] == 4)]
            .sample(n=200000, random_state=SentConfig.SEED),
    ])

    # Split the sample into train, validation and test sets.
    train, test = train_test_split(sent_df_sample, test_size=0.1)
    train, val = train_test_split(train, test_size=0.05)

    # Create the datasets and dataloaders to take advantage of PyTorch batching.
    train_dl = SentimentDL(train)
    val_dl = SentimentDL(val)
    test_dl = SentimentDL(test)
    train_loader = DataLoader(train_dl, batch_size=SentConfig.TRAIN_BATCH_SIZE, shuffle=True)
    validation_loader = DataLoader(val_dl, batch_size=SentConfig.VALID_BATCH_SIZE, shuffle=True)
    test_loader = DataLoader(test_dl, batch_size=SentConfig.VALID_BATCH_SIZE, shuffle=True)

    # Select the CUDA device if available.
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Create the model object.
    model = SentimentModel()
    model.to(device)

    # Optimizer and scheduler: do not apply weight decay in AdamW to bias and
    # normalization parameters (taken from
    # https://huggingface.co/transformers/training.html; more named parameters
    # are listed by model.named_parameters()).
    no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    # optim = AdamW(model.parameters(), lr=5e-5)
    optim = AdamW(optimizer_grouped_parameters, lr=5e-5)

    # Learning-rate scheduling with a 5% warm-up.
    num_train_steps = int((len(train_dl) / SentConfig.TRAIN_BATCH_SIZE) * SentConfig.EPOCHS)
    num_warmup_steps = int(0.05 * num_train_steps)
    scheduler = get_cosine_schedule_with_warmup(optim, num_warmup_steps, num_train_steps)

    # Training: the checkpoint is saved whenever the F1 score on the
    # validation set improves.
    scores = []
    best_f1 = 0
    for epoch in range(SentConfig.EPOCHS):
        _ = train_function(train_loader, model, optim, scheduler, device)
        _, results = evaluation_function(validation_loader, model, device)
        validation_f1 = round(f1_score(results[:, 1], results[:, 0]), 4)
        accuracy = round(accuracy_score(results[:, 1], results[:, 0]), 4)
        scores.append((validation_f1, accuracy))
        print('epoch num:', epoch, 'f1 score:', validation_f1, 'accuracy:', accuracy)
        if validation_f1 > best_f1:
            # Save the model when the validation F1 score improves.
            torch.save(model.state_dict(), "SentimentModel.bin")
            # Update the best F1 score seen so far.
            best_f1 = validation_f1

    # Plot the validation scores at each epoch.
    scores = np.array(scores)
    fig, ax = plt.subplots(1, 2, figsize=(14, 6))
    ax[0].plot(range(SentConfig.EPOCHS), scores[:, 0], 'r')
    ax[1].plot(range(SentConfig.EPOCHS), scores[:, 1])
    ax[0].set(xlabel='Epoch num', ylabel='F1 Score')
    ax[1].set(xlabel='Epoch num', ylabel='Accuracy')
    ax[0].set_title('validation set f1 score at each epoch')
    ax[1].set_title('validation set accuracy at each epoch')

    # F1 score calculation on test predictions using the best checkpoint.
    state_dict_ = torch.load('SentimentModel.bin')
    model = SentimentModel()
    model.load_state_dict(state_dict_)
    model.to(device)
    _, results = evaluation_function(test_loader, model, device, inference=True)
    print(classification_report(results[:, 1], results[:, 0]))
    print(round(accuracy_score(results[:, 1], results[:, 0]), 4))
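# preprocess_and_train() relies on a preprocess_text() helper that is not shown
# above. The function below is only a plausible sketch of basic tweet clean-up
# (lower-casing, stripping URLs, mentions, punctuation and extra whitespace);
# the project's real implementation may differ.
import re


def preprocess_text_sketch(text):
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)  # remove URLs
    text = re.sub(r'@\w+', ' ', text)                    # remove @mentions
    text = re.sub(r'[^a-z\s]', ' ', text)                # keep letters only
    text = re.sub(r'\s+', ' ', text).strip()             # collapse whitespace
    return text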