def run(method="train", save_path=None, infer_texts=[]): shuffle_slicer = ShuffleSlicer() # start_time = time.time() raw_data_path = "/home/wujinjie/kesci_question_multilabel_classification/data/raw_data/baidu/nlp_db.baidu_text.csv" texts = pd.read_csv(raw_data_path) train_df, dev_df, test_df = shuffle_slicer.split(texts, dev=True) # Test_data = {'label': [0] * len(texts), 'text': test_texts} clip = 5.0 epochs = 100 # log_interval = 50 test_batch_size = 128 train_batch_size = 128 train_texts, train_labels = process_corpus_dl(train_df) Train_data = {'label': train_labels, 'text': train_texts} dev_texts, dev_labels = process_corpus_dl(dev_df) Dev_data = {'label': dev_labels, 'text': dev_texts} vocab = Vocab(Train_data) step = 0 def _eval(data): model.eval() # 不启用 BatchNormalization 和 Dropout # data = dev_data y_pred = [] y_true = [] with torch.no_grad(): for batch_data in data_iter(data, test_batch_size, shuffle=False): torch.cuda.empty_cache() batch_inputs, batch_labels = batch2tensor(batch_data) batch_outputs = model(batch_inputs) y_pred.extend(torch.max(batch_outputs, dim=1) [1].cpu().numpy().tolist()) y_true.extend(batch_labels.cpu().numpy().tolist()) score, dev_f1 = get_score(y_true, y_pred) return score, dev_f1 def _infer(data): model.eval() # data = dev_data y_pred = [] with torch.no_grad(): for batch_data in data_iter(data, test_batch_size, shuffle=False): torch.cuda.empty_cache() batch_inputs, batch_labels = batch2tensor(batch_data) batch_outputs = model(batch_inputs) y_pred.extend(torch.max(batch_outputs, dim=1) [1].cpu().numpy().tolist()) print(label_encoder.label2name(y_pred)) if method == "train": model = Model(vocab, label_encoder) # loss criterion = nn.CrossEntropyLoss() # obj # 生成模型可处理的格式 train_data = get_examples_bert(Train_data, model.word_encoder, vocab, label_encoder) dev_data = get_examples_bert(Dev_data, model.word_encoder, vocab, label_encoder) # 一个epoch的batch个数 batch_num = int(np.ceil(len(train_data) / float(train_batch_size))) optimizer = Optimizer(model.all_parameters, steps=batch_num * epochs) # 优化器 best_train_f1, best_dev_f1 = 0, 0 early_stop = -1 EarlyStopEpochs = 3 # 当多个epoch,dev的指标都没有提升,则早停 # train print("start train") for epoch in range(1, epochs + 1): optimizer.zero_grad() model.train() # 启用 BatchNormalization 和 Dropout overall_losses = 0 losses = 0 # batch_idx = 1 y_pred = [] y_true = [] for batch_data in data_iter(train_data, train_batch_size, shuffle=True): torch.cuda.empty_cache() batch_inputs, batch_labels = batch2tensor(batch_data) batch_outputs = model(batch_inputs) loss = criterion(batch_outputs, batch_labels) loss.backward() loss_value = loss.detach().cpu().item() losses += loss_value overall_losses += loss_value y_pred.extend(torch.max(batch_outputs, dim=1) [1].cpu().numpy().tolist()) y_true.extend(batch_labels.cpu().numpy().tolist()) nn.utils.clip_grad_norm_( optimizer.all_params, max_norm=clip) # 梯度裁剪 for cur_optim, scheduler in zip(optimizer.optims, optimizer.schedulers): cur_optim.step() scheduler.step() optimizer.zero_grad() step += 1 # print(step) print(epoch) overall_losses /= batch_num overall_losses = reformat(overall_losses, 4) score, train_f1 = get_score(y_true, y_pred) print("score:{}, train_f1:{}".format(train_f1, score)) # if set(y_true) == set(y_pred): # print("report") # report = classification_report(y_true, y_pred, digits=4, target_names=label_encoder.target_names) # # logging.info('\n' + report) # print(report) # eval _, dev_f1 = _eval(data=dev_data) if best_dev_f1 <= dev_f1: best_dev_f1 = dev_f1 early_stop = 0 best_train_f1 = 
train_f1 save_path = model_utils.save_checkpoint( model, epoch, save_folder="/home/wujinjie/kesci_question_multilabel_classification/data/textbert") print("save_path:{}".format(save_path)) # torch.save(model.state_dict(), save_model) else: early_stop += 1 if early_stop == EarlyStopEpochs: # 达到早停次数,则停止训练 break print("score:{}, dev_f1:{}, best_train_f1:{}, best_dev_f1:{}".format( dev_f1, score, best_train_f1, best_dev_f1)) else: model = model_utils.load_checkpoint(save_path) if method == "test": test_texts, test_labels = process_corpus_dl(train_df) Test_data = {'label': test_labels, 'text': test_texts} test_data = get_examples_bert(Test_data, model.word_encoder, vocab, label_encoder) # model.load_state_dict(torch.load(save_model)) _, dev_f1 = _eval(data=test_data) print(dev_f1) elif method == "infer": infer_texts = list(map(segment, infer_texts)) # print(infer_texts) Infer_data = {'label': [0] * len(infer_texts), 'text': infer_texts} infer_data = get_examples_bert(Infer_data, model.word_encoder, vocab, label_encoder) _infer(data=infer_data)
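# Illustration only (not part of the original script): the prediction step used in
# _eval/_infer and in the training loop is a plain argmax over the class logits.
# torch.max(outputs, dim=1) returns (values, indices); index [1] keeps the class ids.
def _demo_argmax_predictions():
    logits = torch.tensor([[0.1, 2.0, -1.0],
                           [1.5, 0.2, 0.3],
                           [-0.5, 0.0, 0.9]])
    preds = torch.max(logits, dim=1)[1].cpu().numpy().tolist()
    assert preds == torch.argmax(logits, dim=1).tolist()  # equivalent formulation
    return preds  # [1, 0, 2]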
if __name__ == "__main__":
    # Train by default; see run() above for the "test" and "infer" modes.
    run(method="train")
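# A possible inference call, assuming a checkpoint has already been produced by a
# training run (the path suffix and texts below are illustrative placeholders):
#
#     run(method="infer",
#         save_path="/home/wujinjie/kesci_question_multilabel_classification/data/textbert/<checkpoint>",
#         infer_texts=["example question one", "example question two"])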