# Third-party imports used by the functions in this file. Project-local helpers
# (createDatasetSplit, combine_features, select_model, load_model, save_bert_model,
# save_normal_model, fix_the_random, format_time, get_gpu, get_annotated_data,
# get_test_data, encodeData, transform_dummy_data, dict_data_folder) are defined
# elsewhere in the repository and are expected to be imported from there.
import time
import json

import numpy as np
import torch
import neptune  # legacy neptune-client style API (create_experiment / log_metric / stop)
from tqdm import tqdm
from scipy.special import softmax
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
from transformers import BertTokenizer, get_linear_schedule_with_warmup
from transformers import AdamW  # may instead come from torch.optim in newer setups


def Eval_phase(params, which_files='test', model=None, test_dataloader=None, device=None):
    if(params['is_model'] == True):
        print("model previously passed")
        model.eval()
    else:
        return 1
        # ### Have to modify in the final run
        # model=select_model(params['what_bert'],params['path_files'],params['weights'])
        # model.cuda()
        # model.eval()

    print("Running eval on ", which_files, "...")
    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.

    # Tracking variables
    true_labels = []
    pred_labels = []
    logits_all = []

    # Evaluate data for one epoch
    for step, batch in tqdm(enumerate(test_dataloader)):
        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

        # `batch` contains four pytorch tensors:
        #   [0]: input ids
        #   [1]: attention vals
        #   [2]: attention mask
        #   [3]: labels
        b_input_ids = batch[0].to(device)
        b_att_val = batch[1].to(device)
        b_input_mask = batch[2].to(device)
        b_labels = batch[3].to(device)

        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()
        outputs = model(b_input_ids,
                        attention_vals=b_att_val,
                        attention_mask=b_input_mask,
                        labels=None, device=device)
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accumulate predictions and ground truth for this batch of test sentences.
        pred_labels += list(np.argmax(logits, axis=1).flatten())
        true_labels += list(label_ids.flatten())
        logits_all += list(logits)

    logits_all_final = []
    for logits in logits_all:
        logits_all_final.append(softmax(logits))

    testf1 = f1_score(true_labels, pred_labels, average='macro')
    testacc = accuracy_score(true_labels, pred_labels)
    if(params['num_classes'] == 3):
        testrocauc = roc_auc_score(true_labels, logits_all_final, multi_class='ovo', average='macro')
    else:
        # testrocauc=roc_auc_score(true_labels, logits_all_final,multi_class='ovo',average='macro')
        testrocauc = 0
    testprecision = precision_score(true_labels, pred_labels, average='macro')
    testrecall = recall_score(true_labels, pred_labels, average='macro')

    if(params['logging'] != 'neptune' or params['is_model'] == True):
        # Report the final metrics for this evaluation run.
        print(" Accuracy: {0:.2f}".format(testacc))
        print(" Fscore: {0:.2f}".format(testf1))
        print(" Precision: {0:.2f}".format(testprecision))
        print(" Recall: {0:.2f}".format(testrecall))
        print(" Roc Auc: {0:.2f}".format(testrocauc))
        print(" Test took: {:}".format(format_time(time.time() - t0)))
        # print(ConfusionMatrix(true_labels,pred_labels))
    else:
        bert_model = params['path_files']
        language = params['language']
        name_one = bert_model + "_" + language
        neptune.create_experiment(name_one, params=params, send_hardware_metrics=False, run_monitoring_thread=False)
        neptune.append_tag(bert_model)
        neptune.append_tag(language)
        neptune.append_tag('test')
        neptune.log_metric('test_f1score', testf1)
        neptune.log_metric('test_accuracy', testacc)
        neptune.log_metric('test_precision', testprecision)
        neptune.log_metric('test_recall', testrecall)
        neptune.log_metric('test_rocauc', testrocauc)
        neptune.stop()

    return testf1, testacc, testprecision, testrecall, testrocauc, logits_all_final
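
# Usage sketch (hedged): Eval_phase is normally driven from train_model below; the tuple
# it returns is (macro-F1, accuracy, macro-precision, macro-recall, ROC-AUC, softmaxed logits).
#
# test_f1, test_acc, test_prec, test_rec, test_auc, test_probs = Eval_phase(
#     params, 'test', model=model, test_dataloader=test_dataloader, device=device)
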
def standaloneEval_with_rational(params, test_data=None, extra_data_path=None, topk=2, use_ext_df=False):
    # device = torch.device("cpu")
    if torch.cuda.is_available() and params['device'] == 'cuda':
        # Tell PyTorch to use the GPU.
        device = torch.device("cuda")
        deviceID = get_gpu(params)
        torch.cuda.set_device(deviceID[0])
    else:
        print("Since you don't want to use GPU, using the CPU instead.")
        device = torch.device("cpu")

    embeddings = None
    if (params['bert_tokens']):
        train, val, test = createDatasetSplit(params)
        vocab_own = None
        vocab_size = 0
        padding_idx = 0
    else:
        train, val, test, vocab_own = createDatasetSplit(params)
        params['embed_size'] = vocab_own.embeddings.shape[1]
        params['vocab_size'] = vocab_own.embeddings.shape[0]
        embeddings = vocab_own.embeddings

    if (params['auto_weights']):
        y_test = [ele[2] for ele in test]
        encoder = LabelEncoder()
        encoder.classes_ = np.load('Data/classes.npy')
        params['weights'] = class_weight.compute_class_weight(
            class_weight='balanced', classes=np.unique(y_test), y=y_test).astype('float32')

    if (extra_data_path != None):
        params_dash = {}
        params_dash['num_classes'] = 3
        params_dash['data_file'] = extra_data_path
        params_dash['class_names'] = dict_data_folder[str(params['num_classes'])]['class_label']
        temp_read = get_annotated_data(params_dash)
        with open('Data/post_id_divisions.json', 'r') as fp:
            post_id_dict = json.load(fp)
        temp_read = temp_read[temp_read['post_id'].isin(post_id_dict['test'])
                              & (temp_read['final_label'].isin(['hatespeech', 'offensive']))]
        test_data = get_test_data(temp_read, params, message='text')
        test_extra = encodeData(test_data, vocab_own, params)
        test_dataloader = combine_features(test_extra, params, is_train=False)
    elif (use_ext_df):
        test_extra = encodeData(test_data, vocab_own, params)
        test_dataloader = combine_features(test_extra, params, is_train=False)
    else:
        test_dataloader = combine_features(test, params, is_train=False)

    model = select_model(params, embeddings)
    if (params['bert_tokens'] == False):
        model = load_model(model, params)
    if (params["device"] == 'cuda'):
        model.cuda()
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables
    if ((extra_data_path != None) or (use_ext_df == True)):
        post_id_all = list(test_data['Post_id'])
    else:
        post_id_all = list(test['Post_id'])

    print("Running eval on test data...")
    t0 = time.time()
    true_labels = []
    pred_labels = []
    logits_all = []
    attention_all = []
    input_mask_all = []

    # Evaluate data for one epoch
    for step, batch in tqdm(enumerate(test_dataloader), total=len(test_dataloader)):
        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

        # `batch` contains four pytorch tensors:
        #   [0]: input ids
        #   [1]: attention vals
        #   [2]: attention mask
        #   [3]: labels
        b_input_ids = batch[0].to(device)
        b_att_val = batch[1].to(device)
        b_input_mask = batch[2].to(device)
        b_labels = batch[3].to(device)

        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        # model.zero_grad()
        outputs = model(b_input_ids,
                        attention_vals=b_att_val,
                        attention_mask=b_input_mask,
                        labels=None, device=device)
        # m = nn.Softmax(dim=1)
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.detach().cpu().numpy()

        if (params['bert_tokens']):
            attention_vectors = np.mean(
                outputs[1][11][:, :, 0, :].detach().cpu().numpy(), axis=1)
        else:
            attention_vectors = outputs[1].detach().cpu().numpy()

        # Accumulate predictions, ground truth, logits and attention for this batch.
        pred_labels += list(np.argmax(logits, axis=1).flatten())
        true_labels += list(label_ids.flatten())
        logits_all += list(logits)
        attention_all += list(attention_vectors)
        input_mask_all += list(batch[2].detach().cpu().numpy())

    logits_all_final = []
    for logits in logits_all:
        logits_all_final.append(softmax(logits))

    if (use_ext_df == False):
        testf1 = f1_score(true_labels, pred_labels, average='macro')
        testacc = accuracy_score(true_labels, pred_labels)
        # testrocauc=roc_auc_score(true_labels, logits_all_final,multi_class='ovo',average='macro')
        testprecision = precision_score(true_labels, pred_labels, average='macro')
        testrecall = recall_score(true_labels, pred_labels, average='macro')

        # Report the final metrics for this evaluation run.
        print(" Accuracy: {0:.3f}".format(testacc))
        print(" Fscore: {0:.3f}".format(testf1))
        print(" Precision: {0:.3f}".format(testprecision))
        print(" Recall: {0:.3f}".format(testrecall))
        # print(" Roc Auc: {0:.3f}".format(testrocauc))
        print(" Test took: {:}".format(format_time(time.time() - t0)))

    # Keep only the attention weights that correspond to real (non-padded) tokens.
    attention_vector_final = []
    for x, y in zip(attention_all, input_mask_all):
        temp = []
        for x_ele, y_ele in zip(x, y):
            if (y_ele == 1):
                temp.append(x_ele)
        attention_vector_final.append(temp)

    list_dict = []
    for post_id, attention, logits, pred, ground_truth in zip(
            post_id_all, attention_vector_final, logits_all_final, pred_labels, true_labels):
        # if(ground_truth==1):
        #     continue
        temp = {}
        encoder = LabelEncoder()
        encoder.classes_ = np.load('Data/classes.npy')
        pred_label = encoder.inverse_transform([pred])[0]
        ground_label = encoder.inverse_transform([ground_truth])[0]
        temp["annotation_id"] = post_id
        temp["classification"] = pred_label
        temp["classification_scores"] = {
            "hatespeech": logits[0],
            "normal": logits[1],
            "offensive": logits[2]
        }

        topk_indicies = sorted(range(len(attention)), key=lambda i: attention[i])[-topk:]
        temp_hard_rationales = []
        for ind in topk_indicies:
            temp_hard_rationales.append({'end_token': ind + 1, 'start_token': ind})

        temp["rationales"] = [{
            "docid": post_id,
            "hard_rationale_predictions": temp_hard_rationales,
            "soft_rationale_predictions": attention,
            # "soft_sentence_predictions":[1.0],
            "truth": ground_truth
        }]
        list_dict.append(temp)

    return list_dict, test_data
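
# Note (hedged): each element of `list_dict` returned above follows the ERASER-style
# rationale layout that the rest of the project consumes, roughly:
#   {
#     "annotation_id": <post_id>,
#     "classification": <predicted label name>,
#     "classification_scores": {"hatespeech": p0, "normal": p1, "offensive": p2},
#     "rationales": [{"docid": <post_id>,
#                     "hard_rationale_predictions": [{"start_token": i, "end_token": i + 1}, ...],
#                     "soft_rationale_predictions": [...],  # one attention weight per non-padded token
#                     "truth": <encoded ground-truth label>}]
#   }
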
def train_model(params, device):
    embeddings = None
    if(params['bert_tokens']):
        train, val, test = createDatasetSplit(params)
    else:
        train, val, test, vocab_own = createDatasetSplit(params)
        params['embed_size'] = vocab_own.embeddings.shape[1]
        params['vocab_size'] = vocab_own.embeddings.shape[0]
        embeddings = vocab_own.embeddings

    if(params['auto_weights']):
        y_test = [ele[2] for ele in test]
        # print(y_test)
        encoder = LabelEncoder()
        encoder.classes_ = np.load(params['class_names'], allow_pickle=True)
        params['weights'] = class_weight.compute_class_weight(
            class_weight='balanced', classes=np.unique(y_test), y=y_test).astype('float32')
        # params['weights']=np.array([len(y_test)/y_test.count(encoder.classes_[0]),len(y_test)/y_test.count(encoder.classes_[1]),len(y_test)/y_test.count(encoder.classes_[2])]).astype('float32')
    print(params['weights'])

    train_dataloader = combine_features(train, params, is_train=True)
    validation_dataloader = combine_features(val, params, is_train=False)
    test_dataloader = combine_features(test, params, is_train=False)

    model = select_model(params, embeddings)
    if(params["device"] == 'cuda'):
        model.cuda()

    optimizer = AdamW(model.parameters(),
                      lr=params['learning_rate'],  # args.learning_rate - default is 5e-5, our notebook had 2e-5
                      eps=params['epsilon'])       # args.adam_epsilon - default is 1e-8.

    # Number of training epochs (authors recommend between 2 and 4).
    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * params['epochs']

    # Create the learning rate scheduler.
    if(params['bert_tokens']):
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=int(total_steps / 10),
                                                    num_training_steps=total_steps)

    # Set the seed value all over the place to make this reproducible.
    fix_the_random(seed_val=params['random_seed'])
    # Store the average loss after each epoch so we can plot them.
    loss_values = []

    if(params['bert_tokens']):
        bert_model = params['path_files']
        name_one = bert_model
    else:
        name_one = params['model_name']

    if(params['logging'] == 'neptune'):
        neptune.create_experiment(name_one, params=params, send_hardware_metrics=False, run_monitoring_thread=False)
        neptune.append_tag(name_one)
        if(params['best_params']):
            neptune.append_tag('AAAI final best')
        else:
            neptune.append_tag('AAAI final')

    best_val_fscore = 0
    best_test_fscore = 0
    best_val_roc_auc = 0
    best_test_roc_auc = 0
    best_val_precision = 0
    best_test_precision = 0
    best_val_recall = 0
    best_test_recall = 0

    for epoch_i in range(0, params['epochs']):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, params['epochs']))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()
        # Reset the total loss for this epoch.
        total_loss = 0
        model.train()

        # For each batch of training data...
        for step, batch in tqdm(enumerate(train_dataloader)):
            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)

            # `batch` contains four pytorch tensors:
            #   [0]: input ids
            #   [1]: attention vals
            #   [2]: attention mask
            #   [3]: labels
            b_input_ids = batch[0].to(device)
            b_att_val = batch[1].to(device)
            b_input_mask = batch[2].to(device)
            b_labels = batch[3].to(device)

            # Clear previously accumulated gradients before the backward pass.
            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            model.zero_grad()
            outputs = model(b_input_ids,
                            attention_vals=b_att_val,
                            attention_mask=b_input_mask,
                            labels=b_labels,
                            device=device)

            # The call to `model` always returns a tuple, so we need to pull the
            # loss value out of the tuple.
            loss = outputs[0]
            if(params['logging'] == 'neptune'):
                neptune.log_metric('batch_loss', loss.item())

            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value
            # from the tensor.
            total_loss += loss.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()

            # Update the learning rate.
            if(params['bert_tokens']):
                scheduler.step()

        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)
        if(params['logging'] == 'neptune'):
            neptune.log_metric('avg_train_loss', avg_train_loss)
        else:
            print('avg_train_loss', avg_train_loss)

        # Store the loss value for plotting the learning curve.
        loss_values.append(avg_train_loss)

        train_fscore, train_accuracy, train_precision, train_recall, train_roc_auc, _ = Eval_phase(params, 'train', model, train_dataloader, device)
        val_fscore, val_accuracy, val_precision, val_recall, val_roc_auc, _ = Eval_phase(params, 'val', model, validation_dataloader, device)
        test_fscore, test_accuracy, test_precision, test_recall, test_roc_auc, logits_all_final = Eval_phase(params, 'test', model, test_dataloader, device)

        # Report the metrics for this epoch.
        if(params['logging'] == 'neptune'):
            neptune.log_metric('test_fscore', test_fscore)
            neptune.log_metric('test_accuracy', test_accuracy)
            neptune.log_metric('test_precision', test_precision)
            neptune.log_metric('test_recall', test_recall)
            neptune.log_metric('test_rocauc', test_roc_auc)
            neptune.log_metric('val_fscore', val_fscore)
            neptune.log_metric('val_accuracy', val_accuracy)
            neptune.log_metric('val_precision', val_precision)
            neptune.log_metric('val_recall', val_recall)
            neptune.log_metric('val_rocauc', val_roc_auc)
            neptune.log_metric('train_fscore', train_fscore)
            neptune.log_metric('train_accuracy', train_accuracy)
            neptune.log_metric('train_precision', train_precision)
            neptune.log_metric('train_recall', train_recall)
            neptune.log_metric('train_rocauc', train_roc_auc)

        # Keep the checkpoint that scores best on the validation set.
        if(val_fscore > best_val_fscore):
            print(val_fscore, best_val_fscore)
            best_val_fscore = val_fscore
            best_test_fscore = test_fscore
            best_val_roc_auc = val_roc_auc
            best_test_roc_auc = test_roc_auc
            best_val_precision = val_precision
            best_test_precision = test_precision
            best_val_recall = val_recall
            best_test_recall = test_recall

            if(params['bert_tokens']):
                print('Loading BERT tokenizer...')
                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=False)
                save_bert_model(model, tokenizer, params)
            else:
                print("Saving model")
                save_normal_model(model, params)

    if(params['logging'] == 'neptune'):
        neptune.log_metric('best_val_fscore', best_val_fscore)
        neptune.log_metric('best_test_fscore', best_test_fscore)
        neptune.log_metric('best_val_rocauc', best_val_roc_auc)
        neptune.log_metric('best_test_rocauc', best_test_roc_auc)
        neptune.log_metric('best_val_precision', best_val_precision)
        neptune.log_metric('best_test_precision', best_test_precision)
        neptune.log_metric('best_val_recall', best_val_recall)
        neptune.log_metric('best_test_recall', best_test_recall)
        neptune.stop()
    else:
        print('best_val_fscore', best_val_fscore)
        print('best_test_fscore', best_test_fscore)
        print('best_val_rocauc', best_val_roc_auc)
        print('best_test_rocauc', best_test_roc_auc)
        print('best_val_precision', best_val_precision)
        print('best_test_precision', best_test_precision)
        print('best_val_recall', best_val_recall)
        print('best_test_recall', best_test_recall)

    # del model
    # torch.cuda.empty_cache()
    return model
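
# Usage sketch (hedged): a minimal way to drive train_model from a script. The params
# dict below only lists keys this file actually reads; values are illustrative
# assumptions, not the authors' settings, and the project's own config supplies the
# remaining keys consumed by the data/model helpers (batch size, max length, etc.).
#
# if __name__ == '__main__':
#     params = {
#         'bert_tokens': True, 'path_files': 'bert-base-uncased',
#         'learning_rate': 2e-5, 'epsilon': 1e-8, 'epochs': 3, 'random_seed': 42,
#         'auto_weights': True, 'class_names': 'Data/classes.npy',
#         'logging': 'local', 'device': 'cuda', 'num_classes': 3, 'is_model': True,
#         # ... remaining keys expected by createDatasetSplit / combine_features / select_model ...
#     }
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     trained_model = train_model(params, device)
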
def return_probab(self, sentences_list):
    """Input: a list of sentences.
    Output: probability values (softmax over classes) for each sentence."""
    params = self.params
    device = self.device

    if (params['auto_weights']):
        y_test = [ele[2] for ele in self.test]
        encoder = LabelEncoder()
        encoder.classes_ = np.load('Data/classes.npy')
        params['weights'] = class_weight.compute_class_weight(
            class_weight='balanced', classes=np.unique(y_test), y=y_test).astype('float32')

    temp_read = transform_dummy_data(sentences_list)
    test_data = get_test_data(temp_read, params, message='text')
    test_extra = encodeData(test_data, self.vocab, params)
    test_dataloader = combine_features(test_extra, params, is_train=False)

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.

    # Tracking variables
    post_id_all = list(test_data['Post_id'])

    print("Running eval on test data...")
    t0 = time.time()
    true_labels = []
    pred_labels = []
    logits_all = []
    # attention_all=[]
    input_mask_all = []

    # Evaluate data for one epoch
    for step, batch in enumerate(test_dataloader):
        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

        # `batch` contains four pytorch tensors:
        #   [0]: input ids
        #   [1]: attention vals
        #   [2]: attention mask
        #   [3]: labels
        b_input_ids = batch[0].to(device)
        b_att_val = batch[1].to(device)
        b_input_mask = batch[2].to(device)
        b_labels = batch[3].to(device)

        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        # model.zero_grad()
        outputs = self.model(b_input_ids,
                             attention_vals=b_att_val,
                             attention_mask=b_input_mask,
                             labels=None, device=device)
        logits = outputs[0]
        # print(logits)

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.detach().cpu().numpy()

        # Accumulate predictions, ground truth and logits for this batch.
        pred_labels += list(np.argmax(logits, axis=1).flatten())
        true_labels += list(label_ids.flatten())
        logits_all += list(logits)
        # attention_all+=list(attention_vectors)
        input_mask_all += list(batch[2].detach().cpu().numpy())

    logits_all_final = []
    for logits in logits_all:
        logits_all_final.append(list(softmax(logits)))

    return np.array(logits_all_final)
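
# Usage sketch (hedged): return_probab is written as a method of a wrapper class defined
# elsewhere in the project, one that exposes .params, .device, .model, .vocab and .test.
# Given such an instance (here called `wrapper`, a hypothetical name), the call and
# output shape would look like:
#
# probs = wrapper.return_probab(["first sentence", "second sentence"])
# # probs has shape (len(sentences_list), params['num_classes']): one softmax row per sentence.
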