def run(fold=0):
    """Train and validate the sentiment model on one cross-validation fold.

    Rows of ``config.TRAIN_FOLDS_FILE`` whose ``kfold`` column equals *fold*
    form the validation set; every other row is used for training. After each
    epoch the validation loss is passed to ``utils.EarlyStopping`` together
    with the model and the path ``config.OUTPUT_PATH + '/model_label_<fold>.bin'``
    (presumably where the best checkpoint is written -- confirm against
    ``utils.EarlyStopping``). Training stops once ``early_stop`` is set.

    Args:
        fold: Value of the ``kfold`` column to hold out for validation.

    Returns:
        A ``(train_losses, eval_losses)`` tuple of lists holding, per completed
        epoch, the values returned by ``engine.train_fn`` and
        ``engine.eval_fn``.
    """
    # kfold type of data input
    data = pd.read_csv(config.TRAIN_FOLDS_FILE)
    df_train = data[data['kfold'] != fold].reset_index(drop=True)
    df_valid = data[data['kfold'] == fold].reset_index(drop=True)

    train_data = CommentData(
        comments=df_train['Comment'],
        labels=df_train['Label_encoded'],
        sentiments=df_train['Sentiment_encoded'],
    )
    train_dataloader = torch.utils.data.DataLoader(
        train_data,
        batch_size=config.TRAIN_BATCH_SIZE,
    )

    valid_data = CommentData(
        comments=df_valid['Comment'],
        labels=df_valid['Label_encoded'],
        sentiments=df_valid['Sentiment_encoded'],
    )
    valid_dataloader = torch.utils.data.DataLoader(
        valid_data,
        batch_size=config.VALID_BATCH_SIZE,
    )

    # Fall back to CPU so the function does not crash on hosts without CUDA.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model_config = RobertaConfig.from_pretrained(config.ROBERTA_PATH)
    model_config.output_hidden_states = True
    model = SentimentModel(model_config, config.OUTPUT_SIZE)
    model.to(device)

    # Biases and LayerNorm parameters are excluded from weight decay; only the
    # remaining weights are regularised.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.001,
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]

    # Ceiling division: the last, partially filled batch of an epoch is still
    # one batch, so it has to be counted in the schedule length.
    steps_per_epoch = (
        (len(df_train) + config.TRAIN_BATCH_SIZE - 1) // config.TRAIN_BATCH_SIZE
    )
    num_train_steps = steps_per_epoch * config.EPOCHS

    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps,
    )

    train_loss_rec = []
    eval_loss_rec = []
    early_stopping = utils.EarlyStopping(patience=5, mode='min')
    model_path = config.OUTPUT_PATH + f'/model_label_{fold}.bin'

    for epoch in range(config.EPOCHS):
        print(f'########### fold = {fold} epoch = {epoch} ############')
        loss_train = engine.train_fn(data_loader=train_dataloader,
                                     model=model,
                                     device=device,
                                     optimizer=optimizer,
                                     scheduler=scheduler)
        train_loss_rec.append(loss_train)

        losses_eval = engine.eval_fn(valid_dataloader, model, device)
        eval_loss_rec.append(losses_eval)

        print(f'train_loss = {loss_train} eval_loss = {losses_eval}')

        early_stopping(losses_eval, model, model_path=model_path)
        if early_stopping.early_stop:
            print('Early stopping')
            break

    return train_loss_rec, eval_loss_rec
def preprocess_and_train():
    """Build the tweet dataset, fine-tune ``SentimentModel`` and report test metrics.

    Steps, in order:

    1. Read ``./training.1600000.processed.noemoticon.csv`` and keep the
       ``target`` column plus the text run through ``preprocess_text``.
    2. Draw 200,000 tweets longer than 10 whitespace-separated tokens for each
       of the targets ``0`` and ``4`` (400,000 rows in total).
    3. Split into train / validation / test sets and wrap them in
       ``SentimentDL`` datasets and ``DataLoader`` objects.
    4. Train for ``SentConfig.EPOCHS`` epochs with AdamW and a cosine schedule
       with warm-up, saving the weights to ``SentimentModel.bin`` whenever the
       validation F1 score improves.
    5. Plot the per-epoch validation F1 and accuracy, reload the best
       checkpoint and print a classification report and accuracy for the test
       set.

    Returns:
        None. Results are printed, plotted and written to ``SentimentModel.bin``.
    """
    checkpoint_path = "SentimentModel.bin"

    # read dataset
    data = pd.read_csv('./training.1600000.processed.noemoticon.csv',
                       encoding='latin-1', header=None)
    data.columns = ('target', 'uid', 'time', 'query', 'user', 'text')

    # create new dataframe; tweet_size is measured on the raw (unprocessed) text
    sent_df = pd.DataFrame({
        'target': data['target'],
        'text': data['text'].apply(preprocess_text),
        'tweet_size': data['text'].apply(lambda x: len(x.split())),
    })

    # select random sample of 400,000 tweets from total dataset
    # (training on a smaller dataset): 200,000 per target value
    long_enough = sent_df['tweet_size'] > 10
    sampled_per_target = [
        sent_df[long_enough & (sent_df['target'] == target)].sample(
            n=200000, random_state=SentConfig.SEED)
        for target in (0, 4)
    ]
    # DataFrame.append was removed in pandas 2.0; concat keeps the same rows
    # and index labels.
    sent_df_sample = pd.concat(sampled_per_target)

    # split dataset into train, test, validation set; seeded so that the
    # split is reproducible just like the sampling above
    train, test = train_test_split(sent_df_sample, test_size=0.1,
                                   random_state=SentConfig.SEED)
    train, val = train_test_split(train, test_size=0.05,
                                  random_state=SentConfig.SEED)

    # create necessary dataloaders, for advantage of batching by pytorch
    train_dl = SentimentDL(train)
    val_dl = SentimentDL(val)
    test_dl = SentimentDL(test)
    train_loader = DataLoader(train_dl,
                              batch_size=SentConfig.TRAIN_BATCH_SIZE,
                              shuffle=True)
    # Evaluation metrics do not depend on sample order, so no shuffling here.
    validation_loader = DataLoader(val_dl,
                                   batch_size=SentConfig.VALID_BATCH_SIZE,
                                   shuffle=False)
    test_loader = DataLoader(test_dl,
                             batch_size=SentConfig.VALID_BATCH_SIZE,
                             shuffle=False)

    # select the cuda device if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # create model object
    model = SentimentModel()
    model.to(device)

    # do not apply weight decay in AdamW to bias and normalization terms
    # taken from https://huggingface.co/transformers/training.html
    no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']
    named_params = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {'params': [p for n, p in named_params
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in named_params
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optim = AdamW(optimizer_grouped_parameters, lr=5e-5)

    # learning rate scheduling: len(train_loader) counts the final partial
    # batch of each epoch as a full step
    num_train_steps = len(train_loader) * SentConfig.EPOCHS
    num_warmup_steps = int(0.05 * num_train_steps)
    scheduler = get_cosine_schedule_with_warmup(optim, num_warmup_steps,
                                                num_train_steps)

    # Training: the checkpoint is chosen by the best F1 score on the
    # validation dataset
    scores = []
    # Start below any reachable score so the first epoch always writes a
    # checkpoint; otherwise an all-zero F1 run would leave nothing to reload.
    best_f1 = float('-inf')
    for epoch in range(SentConfig.EPOCHS):
        train_function(train_loader, model, optim, scheduler, device)
        _, results = evaluation_function(validation_loader, model, device)

        validation_f1 = round(f1_score(results[:, 1], results[:, 0]), 4)
        accuracy = round(accuracy_score(results[:, 1], results[:, 0]), 4)
        scores.append((validation_f1, accuracy))

        print('epoch num: ', epoch, 'f1 score: ',validation_f1 , 'accuracy: ', accuracy)

        if validation_f1 > best_f1:
            # save model whenever the validation f1 score improves
            torch.save(model.state_dict(), checkpoint_path)
            best_f1 = validation_f1

    # plotting scores
    scores = np.array(scores)
    epochs_axis = range(len(scores))
    fig, ax = plt.subplots(1, 2, figsize=(14, 6))
    ax[0].plot(epochs_axis, scores[:, 0], 'r')
    ax[1].plot(epochs_axis, scores[:, 1])
    ax[0].set(xlabel='Epoch num', ylabel='F1 Score')
    ax[1].set(xlabel='Epoch num', ylabel='Accuracy')
    ax[0].set_title('validation set f1 score at each epoch')
    ax[1].set_title('validation set accuracy at each apoch')

    # F1 score calculation on test predictions, using the best checkpoint
    # map_location lets the checkpoint load on whichever device is in use now
    state_dict_ = torch.load(checkpoint_path, map_location=device)
    model = SentimentModel()
    model.load_state_dict(state_dict_)
    model.to(device)

    _, results = evaluation_function(test_loader, model, device, inference=True)
    print(classification_report(results[:, 1], results[:, 0]))
    print(round(accuracy_score(results[:, 1], results[:, 0]), 4))