def load_paragraph_data():
    """Load the clustered, sentence-split val/test sets and index their tokens.

    Reads the two JSON splits from ``p.sent_split_dir`` plus the pickled
    dictionary at ``p.dict_path``, converts token lists to index sequences
    via the dictionary's ``word2idx`` map.

    Returns:
        (X_val, y_val, X_test, y_test, dictionary)
    """
    val_df = pd.read_json(p.sent_split_dir + "val_clustered_sent_split.json")
    test_df = pd.read_json(p.sent_split_dir + "test_clustered_sent_split.json")
    dictionary = pd.read_pickle(p.dict_path)

    y_val = np.array(val_df.label.values)
    y_test = np.array(test_df.label.values)
    X_val = TextDataset._text2idx(val_df.tokens, dictionary.word2idx)
    X_test = TextDataset._text2idx(test_df.tokens, dictionary.word2idx)
    return X_val, y_val, X_test, y_test, dictionary
def check_loss_and_accuracy(grouped):
    """Evaluate the paragraph-level model over grouped documents.

    Iterates one document (group) at a time, runs the global ``model`` and
    accumulates per-document loss, prediction and gold label.

    Returns:
        (mean loss, accuracy, macro precision, macro recall, macro f1,
         confusion matrix)
    """
    losses, predictions, targets = [], [], []
    for _, group in grouped:
        tokens = TextDataset._text2idx(group.tokens, dictionary.word2idx)
        tokens, labels = process_batch(tokens, np.array(group.label.values))
        # attention pooling returns (logits, attn weights); others just logits
        if config.para_pooling == 'attn':
            y_pred, _ = model.forward(tokens)
        else:
            y_pred = model.forward(tokens)
        labels = labels.view(labels.shape[0], -1)
        losses.append(criterion(y_pred.cuda(), labels[0]).item())
        predictions.append(torch.max(y_pred, 1)[1].item())
        targets.append(labels[0].item())
    predictions = np.array(predictions)
    targets = np.array(targets)
    precision, recall, f1, _ = precision_recall_fscore_support(
        targets, predictions, average='macro')
    return (np.mean(np.array(losses)),
            accuracy_score(targets, predictions),
            precision, recall, f1,
            confusion_matrix(targets, predictions))
def get_prediction(grouped, combine):
    """Predict one document-level label per group of paragraph predictions.

    Args:
        grouped: iterable of (name, group) document groups.
        combine: 'majority' — vote over per-paragraph argmax predictions;
                 'avg' — threshold the mean softmax probability of class 0.

    Returns:
        (total_pred, total_targets) lists of document predictions and labels.
    """
    word_model.eval()
    total_pred, total_targets = [], []
    for _, group in grouped:
        tokens = TextDataset._text2idx(group.tokens, word2idx)
        tokens, labels = process_batch(tokens, np.array(group.label.values))
        logits, hidden = word_model.forward(tokens)
        soft_preds = F.softmax(logits, dim=1)
        _, prediction = torch.max(soft_preds, 1)
        if combine == 'majority':
            positives = torch.nonzero(prediction).size(0)
            document_pred = 1 if positives >= prediction.shape[0] / 2 else 0
        elif combine == 'avg':
            # mean prob of class 0 below 0.5 => predict class 1
            mean_p0 = torch.mean(soft_preds, 0)[0].item()
            document_pred = 1 if mean_p0 < 0.5 else 0
        total_pred.append(document_pred)
        total_targets.append(labels[0].item())
    return total_pred, total_targets
def get_prediction(grouped, word2idx, word_attn, sent_attn, combine):
    """Document-level prediction using the hierarchical attention models.

    Args:
        grouped: iterable of (name, group) document groups.
        word2idx: vocabulary map used to index tokens.
        word_attn, sent_attn: word- and sentence-level attention models.
        combine: 'majority' — vote over per-paragraph argmaxes;
                 'avg' — exp the scores (presumably log-probs — confirm
                 model_forward's output), average, threshold class 0 at 0.5.

    Returns:
        (total_pred, total_targets) lists.
    """
    word_attn.eval()
    sent_attn.eval()
    total_pred, total_targets = [], []
    for _, group in grouped:
        tokens = TextDataset._text2idx(group.tokens, word2idx)
        tokens, labels = process_batch(tokens, np.array(group.label.values))
        soft_preds = model_forward(config.para_pooling, word_attn, sent_attn,
                                   tokens)
        _, prediction = torch.max(soft_preds, 1)
        if combine == 'majority':
            positives = torch.nonzero(prediction).size(0)
            document_pred = 1 if positives >= prediction.shape[0] / 2 else 0
        elif combine == 'avg':
            mean_p0 = torch.mean(torch.exp(soft_preds), 0)[0].item()
            document_pred = 1 if mean_p0 < 0.5 else 0
        total_pred.append(document_pred)
        total_targets.append(labels[0].item())
    return total_pred, total_targets
def get_prediction_slength(grouped):
    """Collect per-document predictions, gold labels, and token counts.

    Returns:
        (preds, label_list, slength_list) where slength_list[i] is the
        total number of tokens across the i-th document's paragraphs.
    """
    preds, label_list, slength_list = [], [], []
    for _, group in grouped:
        slength_list.append(np.sum([len(toks) for toks in group.tokens]))
        tokens = TextDataset._text2idx(group.tokens, dictionary.word2idx)
        tokens, labels = process_batch(tokens, np.array(group.label.values))
        # attention pooling returns (logits, word attn, sent attn)
        if config.pooling == 'attn':
            y_pred, _, _ = model.forward(tokens)
        else:
            y_pred = model.forward(tokens)
        preds.append(torch.max(y_pred, 1)[1].item())
        label_list.append(labels[0].item())
    return preds, label_list, slength_list
def check_loss_and_accuracy(grouped):
    """Evaluate the global ``model`` over grouped documents.

    Returns:
        (mean loss, accuracy, per-class precision, recall, f1,
         confusion matrix)

    Bug fixes vs. the previous version:
    - the loss was appended *before* it was computed, and ``.item()`` was
      called on the accumulator list itself (AttributeError at runtime);
    - the ``labels`` accumulator list was shadowed by the per-batch
      ``labels`` array inside the loop, discarding all collected targets.
    """
    loss_list = []
    preds = []
    label_list = []
    for name, group in grouped:
        tokens = TextDataset._text2idx(group.tokens, dictionary.word2idx)
        labels = np.array(group.label.values)
        tokens, labels = process_batch(tokens, labels)
        y_pred = model.forward(tokens)
        # compute the loss first, then record it
        loss = criterion(y_pred.cuda(), labels[0])
        loss_list.append(loss.item())
        _, y_pred = torch.max(y_pred, 1)
        preds.append(np.ndarray.flatten(y_pred.data.cpu().numpy()))
        # NOTE(review): assumes labels[0] is a numpy array after
        # process_batch — confirm; a torch tensor would need .cpu().numpy()
        label_list.append(np.ndarray.flatten(labels[0]))
    preds = np.array([item for sublist in preds for item in sublist])
    label_list = np.array(
        [item for sublist in label_list for item in sublist])
    precision, recall, f1, _ = precision_recall_fscore_support(
        label_list, preds)
    return np.mean(np.array(loss_list)), accuracy_score(
        label_list, preds), precision, recall, f1, confusion_matrix(
            label_list, preds)
def check_loss_and_accuracy(grouped, model, dictionary):
    """Evaluate ``model`` on grouped documents (no loss is computed).

    Args:
        grouped: iterable of (name, group) document groups.
        model: model whose forward returns logits (plus attention weights
            when ``config.pooling == 'attn'``).
        dictionary: vocabulary object exposing ``word2idx``.

    Returns:
        (accuracy, precision, recall, f1, confusion matrix)
    """
    predictions, gold = [], []
    for _, group in grouped:
        tokens = TextDataset._text2idx(group.tokens, dictionary.word2idx)
        tokens, labels = process_batch(tokens, np.array(group.label.values))
        if config.pooling == 'attn':
            y_pred, _, _ = model.forward(tokens)
        elif config.pooling == 'ensem':
            y_pred = model.forward(tokens)
        labels = labels.view(labels.shape[0], -1)
        predictions.append(torch.max(y_pred, 1)[1].item())
        gold.append(labels[0].item())
    predictions = np.array(predictions)
    gold = np.array(gold)
    precision, recall, f1, _ = precision_recall_fscore_support(
        gold, predictions)
    return (accuracy_score(gold, predictions), precision, recall, f1,
            confusion_matrix(gold, predictions))
def train_early_stopping(epoch_number):
    """Run one training epoch, evaluate on val/test, and checkpoint.

    Side effects: updates globals ``best_val_loss`` / ``best_acc``, decays
    the learning rate (x0.2) when validation loss stops improving, and
    writes model/optimizer checkpoints under ./experiments/<exp_num>/.

    Fix: the bare ``except:`` around process_batch swallowed
    KeyboardInterrupt/SystemExit as well — narrowed to ``Exception``.
    """
    global best_val_loss, best_acc
    loss_epoch = []
    i = 1
    batch_start = time.time()
    for name, group in train_grouped:
        tokens = TextDataset._text2idx(group.tokens, dictionary.word2idx)
        labels = np.array(group.label.values)
        try:
            tokens, labels = process_batch(tokens, labels)
        except Exception:
            # dump the offending batch for debugging, then abort
            print(tokens)
            sys.exit(0)  # NOTE(review): exits 0 on failure — consider nonzero
        loss = train_data(tokens, labels)
        loss_epoch.append(loss)
        # print loss every n passes
        if i % (p.print_loss_every * 5) == 0:
            print('| epoch %d | %d/%d batches | ms/batch (%s) | loss %f' %
                  (epoch_number, i % (num_batches + 1), num_batches,
                   time_since(batch_start), np.mean(loss_epoch)))
            batch_start = time.time()
        i += 1
    model.eval()
    print('-' * 89)
    val_loss, val_acc, precision, recall, f1, conf_matrix = \
        check_loss_and_accuracy(val_grouped)
    print(
        '| val set result | valid loss (pure) {:5.4f} | Acc {:8.4f} | Precision {:8.4f} | Recall {:8.4f} '
        '| F1-score {:8.4f}'.format(val_loss, val_acc, precision, recall, f1))
    print('The confusion matrix is: ')
    print(str(conf_matrix))
    print('-' * 89)
    test_loss, test_acc, precision, recall, f1, conf_matrix = \
        check_loss_and_accuracy(test_grouped)
    print(
        '| test set result | valid loss (pure) {:5.4f} | Acc {:8.4f} | Precision {:8.4f} | Recall {:8.4f} '
        '| F1-score {:8.4f}'.format(test_loss, test_acc, precision, recall, f1))
    print('The confusion matrix is: ')
    print(str(conf_matrix))
    print('-' * 89)
    directory = "./experiments/%s/models/" % config.exp_num
    if not os.path.exists(directory):
        os.makedirs(directory)
    if not best_val_loss or val_loss < best_val_loss:
        best_val_loss = val_loss
    else:
        # if loss doesn't go down, divide the learning rate by 5.
        for param_group in optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * 0.2
    if not best_acc or val_acc > best_acc:
        with open(
                directory + 'para_{}.best_acc.pt'.format(config.para_pooling),
                'wb') as f:
            torch.save(model, f)
        best_acc = val_acc
    with open(
            directory + 'para_{}.epoch-{:02d}.pt'.format(
                config.para_pooling, epoch_number), 'wb') as f:
        torch.save(model, f)
    with open("./experiments/{}/optimizer.pt".format(config.exp_num),
              'wb') as f:
        torch.save(optimizer.state_dict(), f)
def train_early_stopping(epoch_number):
    """Train one epoch of the paragraph-ensemble model, evaluate, checkpoint.

    Side effects: updates globals ``best_val_loss`` / ``best_acc``, decays
    the learning rate (x0.2) when validation loss stops improving, and
    writes model checkpoints under ./experiments/<exp_num>/models/.

    Bug fix: the best-accuracy checkpoint previously compared and stored
    ``val_loss`` (``val_loss > best_acc`` / ``best_acc = val_loss``), so it
    saved on *worse* validation loss. It now tracks ``val_acc``, matching
    the sibling implementation of this function.
    """
    global best_val_loss, best_acc
    start = time.time()
    loss_epoch = []
    i = 1
    batch_start = time.time()
    for name, group in train_grouped:
        tokens = TextDataset._text2idx(group.tokens, dictionary.word2idx)
        labels = np.array(group.label.values)
        tokens, labels = process_batch(tokens, labels)
        loss = train_data(tokens, labels)
        loss_epoch.append(loss)
        # print loss every n passes
        if i % (p.print_loss_every * 5) == 0:
            print('| epoch %d | %d/%d batches | ms/batch (%s) | loss %f' %
                  (epoch_number, i % (num_batches + 1), num_batches,
                   time_since(batch_start), np.mean(loss_epoch)))
            batch_start = time.time()
        i += 1
    word_attn.eval()
    sent_attn.eval()
    model.eval()
    print('-' * 89)
    val_loss, val_acc, precision, recall, f1, conf_matrix = \
        check_loss_and_accuracy(val_grouped)
    print('| val set loss %f | time %s | Acc %f' %
          (val_loss, time_since(start), val_acc) + "| Precision: " +
          str(precision) + " | Recall: " + str(recall) + " | F1-score: " +
          str(f1))
    print('The confusion matrix is: ')
    print(str(conf_matrix))
    print('-' * 89)
    test_loss, test_acc, precision, recall, f1, conf_matrix = \
        check_loss_and_accuracy(test_grouped)
    print('| test set loss: %f| Acc %f ' % (test_loss, test_acc) +
          "| Precision: " + str(precision) + " | Recall: " + str(recall) +
          " | F1-score: " + str(f1))
    print('The confusion matrix is: ')
    print(str(conf_matrix))
    print('-' * 89)
    directory = "./experiments/%s/models/" % config.exp_num
    if not os.path.exists(directory):
        os.makedirs(directory)
    if not best_val_loss or val_loss < best_val_loss:
        best_val_loss = val_loss
    else:
        # if loss doesn't go down, divide the learning rate by 5.
        for param_group in model_optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * 0.2
    if not best_acc or val_acc > best_acc:
        with open(directory + p.para_ensem_path[:-3] + '.best_acc.pt',
                  'wb') as f:
            torch.save(model, f)
        best_acc = val_acc
    with open(
            directory + p.para_ensem_path[:-3] +
            '.epoch-{:02d}.pt'.format(epoch_number), 'wb') as f:
        torch.save(model, f)