def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    # initialize EMA (exponential moving average of the weights)
    ema = EMA(model, 0.999)
    ema.register()
    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch index of the last dev-loss improvement
    flag = False  # whether training has stalled for too long
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            ema.update()  # after each parameter update, sync the shadow weights
            if total_batch % 100 == 0:  # report train/dev metrics every 100 batches
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                ema.apply_shadow()  # apply the shadow weights for evaluation and model saving
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                ema.restore()  # restore the live weights before training continues
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # dev loss has not dropped for more than 1000 batches; stop training
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)
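The snippet above (and a later variant) relies on an EMA helper exposing register/update/apply_shadow/restore. A minimal sketch of such a helper is given below; the class body is an assumption based on a common implementation pattern, not the original code.

# Minimal EMA helper sketch (assumed interface: register/update/apply_shadow/restore).
class EMA:
    """Keeps an exponential moving average (shadow copy) of the trainable weights."""

    def __init__(self, model, decay):
        self.model = model
        self.decay = decay
        self.shadow = {}   # EMA weights
        self.backup = {}   # live weights saved while the shadow is applied

    def register(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.shadow[name] = param.data.clone()

    def update(self):
        # shadow = decay * shadow + (1 - decay) * current
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                new_average = (1.0 - self.decay) * param.data + self.decay * self.shadow[name]
                self.shadow[name] = new_average.clone()

    def apply_shadow(self):
        # swap the shadow weights in (e.g. for evaluation / saving)
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.backup[name] = param.data
                param.data = self.shadow[name]

    def restore(self):
        # swap the live weights back in before training continues
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                param.data = self.backup[name]
        self.backup = {}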
def train(config, model, train_iter, dev_iter, test_iter):
    param_optimizer = list(model.named_parameters())
    # Parameters that should not be decayed: BERT is a Transformer encoder, and the
    # bias / LayerNorm parameters of the add&norm layers are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        # parameters with weight decay
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        # parameters without weight decay
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = BertAdam(params=optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0
    dev_best_loss = float('inf')
    last_improve = 0  # batch index of the last dev-loss improvement
    flag = False  # whether training has stalled with no improvement
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 100 == 0:
                true = labels.data.cpu()
                predict = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predict)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                msg = 'Iter:{0:>6}, Train Loss:{1:>5.2}, Train Acc:{2:>6.2%}, Val Loss:{3:>5.2}, Val Acc:{4:>6.2%}, {5}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, improve))
                model.train()
            total_batch = total_batch + 1
            if total_batch - last_improve > config.require_improvement:
                print('No improvement on the dev set for a long time; stopping training automatically.')
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)
def train(self, config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch index of the last dev-loss improvement
    flag = False  # whether training has stalled for too long
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            predicts, predicts_list = model(trains, labels, mode="train")
            losses = list(map(self._get_loss, predicts, labels))
            loss = torch.mean(torch.stack(losses))
            model.zero_grad()
            # loss = F.cross_entropy(losses, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 1000 == 0:
                dev_acc_lev1, dev_acc_lev2, dev_loss = self.evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Val Loss: {2:>5.2}, Val_lev1 Acc: {3:>6.2%}, Val_lev2 Acc: {4:>6.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), dev_loss, dev_acc_lev1, dev_acc_lev2, time_dif, improve))
                model.train()
            if total_batch > 10 and total_batch % 90000 == 0:
                self.test(config, model, train_iter)
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # dev loss has not dropped for more than 1000 batches; stop training
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    self.test(config, model, test_iter)
def train(config, model, train_iter, dev_iter, test_iter):
    """
    Model training loop.
    :param config:
    :param model:
    :param train_iter:
    :param dev_iter:
    :param test_iter:
    :return:
    """
    start_time = time.time()
    # enable BatchNormalization and dropout
    model.train()
    # collect all parameters of the model
    param_optimizer = list(model.named_parameters())
    # parameters that should not be decayed
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = BertAdam(params=optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')  # best loss seen on the dev set
    last_improve = 0  # batch index of the last dev-loss improvement
    flag = False  # whether training has stalled for too long and should stop
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward(retain_graph=False)
            optimizer.step()
            if total_batch % 100 == 0:  # report train/dev metrics every 100 batches
                true = labels.data.cpu()
                predict = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predict)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = utils.get_time_dif(start_time)
                msg = 'Iter:{0:>6}, Train Loss:{1:>5.2}, Train Acc:{2:>6.2%}, Val Loss:{3:>5.2}, Val Acc:{4:>6.2%}, Time:{5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch = total_batch + 1
            if total_batch - last_improve > config.require_improvement:
                # dev loss has not dropped for more than 1000 batches; stop training
                print('No improvement on the dev set for a long time; stopping training automatically.')
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)
def train(config, model, train_iter, dev_iter, test_iter):
    """
    Model training loop.
    :param config:
    :param model:
    :param train_iter:
    :param dev_iter:
    :param test_iter:
    :return:
    """
    start_time = time.time()
    # enable BatchNormalization and dropout
    model.train()
    # collect the model's parameters
    param_optimizer = list(model.named_parameters())
    # parameters that should not be decayed
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        },
        {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        },
    ]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    # number of batches processed so far
    total_batch = 0
    # best loss seen on the dev set
    dev_best_loss = float('inf')
    # batch index of the last dev-loss improvement
    last_improve = 0
    # whether training has stalled for too long
    flag = False
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            # reset gradients
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            # report train/dev metrics every 100 batches
            if total_batch % 100 == 0:
                true = labels.data.cpu()
                predict = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predict)
                # evaluate on the dev set
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    # save the model
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter:{0:>6}, Train Loss:{1:>5.2}, Train Acc:{2:>6.2%}, Val Loss:{3:>5.2}, Val Acc:{4:>6.2%}, Time:{5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            # dev loss has not dropped for more than 1000 batches; stop training
            if total_batch - last_improve > config.require_improvement:
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    print("Starting model test")
    test(config, model, test_iter)
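Most of the snippets call an evaluate(config, model, data_iter) helper that returns (accuracy, mean loss) on the dev set (a few variants return extra metrics such as F1). A minimal sketch under that assumption is shown below; it is not the original implementation and assumes the same (inputs, labels) batch format as the training loops.

# Minimal sketch of the assumed evaluate() helper: accuracy and mean loss over a data iterator.
import numpy as np
import torch
import torch.nn.functional as F
from sklearn import metrics


def evaluate(config, model, data_iter):
    model.eval()  # disable dropout etc. while evaluating
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for inputs, labels in data_iter:
            outputs = model(inputs)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss.item()
            labels_all = np.append(labels_all, labels.data.cpu().numpy())
            predict_all = np.append(predict_all, torch.max(outputs.data, 1)[1].cpu().numpy())
    acc = metrics.accuracy_score(labels_all, predict_all)
    return acc, loss_total / len(data_iter)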
def train_(self):
    batch_size = self.arg_parser.batch_size
    epochs = self.arg_parser.num_epochs
    self.model.train()
    iter_counter = 0
    # Adadelta, Adagrad, Adam, AdamW, SGD, SparseAdam, Adamax, ASGD, RMSprop, LBFGS, Rprop
    best_dev_loss = float("inf")
    best_dev_accuracy = 0
    train_loss_info = {}  # collect loss data to draw the loss curve
    train_loss_info["num_epochs"] = epochs
    train_loss_info["batch_size"] = batch_size
    if self.arg_parser.bert:
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=self.lr,
                             warmup=0.05,
                             t_total=epochs * len(self.train_batch))
    else:
        optimizer = getattr(torch.optim, self.arg_parser.optimizer)(self.model.parameters(), lr=self.lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                           mode="min",
                                                           factor=self.lr_decay,
                                                           patience=self.lr_patience,
                                                           verbose=True)
    for epoch in range(epochs):
        logger.info("-" * 35 + ">" + f"Training {epoch}th epoch" + "<" + "-" * 35)
        # optimizer = getattr(torch.optim, self.arg_parser.optimizer)(self.model.parameters(), lr=self.lr * (self.lr_decay ** epoch))
        last_epoch_dev_loss = float("inf") if epoch == 0 else epoch_dev_loss
        last_epoch_dev_accuracy = 0 if epoch == 0 else epoch_dev_accuracy
        epoch_train_loss = 0
        epoch_train_accuracy = 0
        epoch_dev_loss = 0
        epoch_dev_accuracy = 0
        print_counter = 0
        for minibatch in self.train_batch:
            if self.arg_parser.bert:
                input_x = minibatch[0]
                masks = minibatch[2]
                input_ = (input_x, masks)
            else:
                input_ = minibatch[0]
            label_ = minibatch[1]
            output_ = self.model(input_)
            self.model.zero_grad()
            loss = F.cross_entropy(output_, label_)
            loss.backward()
            optimizer.step()
            iter_counter += 1
            if iter_counter % 100 == 0:
                predict = output_.max(1)[1].cpu()
                label_cpu = label_.cpu()
                train_loss = loss.cpu().item()
                train_loss = round(train_loss, 5)
                # train_accuracy = round(accuracy_score(label_cpu, predict), 4)
                train_accuracy = accuracy_score(label_cpu, predict)
                dev_loss, dev_accuracy, dev_f1_macro, dev_f1_micro, dev_weighted = self.evaluation(
                    self.model, self.dev_batch)
                epoch_train_loss += train_loss
                epoch_train_accuracy += train_accuracy
                epoch_dev_loss += dev_loss
                epoch_dev_accuracy += dev_accuracy
                logger.info(
                    f"Iter: {iter_counter}, train loss: {train_loss}, train accuracy: {train_accuracy}, "
                    f"val loss: {dev_loss}, val accuracy: {dev_accuracy}")
                if self.unbalanced == True:
                    logger.info(
                        f"val F1 macro: {dev_f1_macro}, val F1 micro: {dev_f1_micro}, val F1 weighted: {dev_weighted}")
                self.model.train()
                if dev_loss < best_dev_loss and dev_accuracy > best_dev_accuracy:
                    best_dev_loss = dev_loss
                    best_dev_accuracy = dev_accuracy
                    # logger.info(f"Best validation loss updated: {best_dev_loss}")
                    torch.save(self.model, self.model_save_path)
                else:
                    self.iter_patience += 1
                print_counter += 1
        epoch_train_loss = round(epoch_train_loss / print_counter, 5)
        epoch_train_accuracy = round(epoch_train_accuracy / print_counter, 5)
        epoch_dev_loss = round(epoch_dev_loss / print_counter, 5)
        epoch_dev_accuracy = round(epoch_dev_accuracy / print_counter, 5)
        scheduler.step(epoch_dev_loss)
        logger.info(
            f"{epoch}th epoch finished, val epoch loss: {epoch_dev_loss}, val epoch accuracy: {epoch_dev_accuracy}")
        self.EarlyStopping(epoch_dev_loss, epoch_dev_accuracy, last_epoch_dev_loss, last_epoch_dev_accuracy)
        if self.EarlyStopping_triggered == True:
            logger.info("=" * 70)
            logger.info(f"Early stopping triggered after {epoch + 1} epochs, calculating test accuracy...")
            break
    if self.EarlyStopping_triggered == False:
        logger.info("Training finished, all epochs completed, calculating test accuracy...")
    self.evaluate_test()
def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    ema = EMA(model, 0.999)
    ema.register()
    model.train()
    param_optimizer = list(model.named_parameters())
    # the official BERT code exempts these three parameter types from weight decay
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')  # positive infinity
    last_improve = 0  # batch index of the last dev-loss improvement
    flag = False  # whether training has stalled for too long
    dev_f1_score = []
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):  # trains, labels ==> (x, seq_len, mask), y
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss = loss / config.acc_grad
            loss.backward()
            if (i + 1) % config.acc_grad == 0:  # gradient accumulation
                optimizer.step()
                ema.update()
            if total_batch % 100 == 0:  # report train/dev metrics every 100 batches
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_f1 = metrics.f1_score(true, predic, average='macro')
                ema.apply_shadow()
                dev_f1, dev_loss = evaluate(config, model, dev_iter)
                dev_f1_score.append(dev_f1)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)  # single GPU
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                ema.restore()
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train F1: {2:>6.2%}, Val Loss: {3:>5.2}, Val F1: {4:>6.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_f1, dev_loss, dev_f1, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # dev loss has not dropped for more than 1000 batches; stop training
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        # print(logits_res)
        print('Epoch {} Average F1-Score: {}'.format(epoch + 1, np.mean(dev_f1_score)))
        if flag:
            break
    test(config, model, test_iter)
def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch index of the last dev-loss improvement
    flag = False  # whether training has stalled for too long
    model.train()
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 5 == 0:  # report train/dev metrics every 5 batches
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                train_f1 = metrics.f1_score(true, predic, average='macro')
                dev_acc, dev_loss, dev_f1 = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                print('Iter:{}, Train acc:{:.2f} , Train loss:{:.2f} , train-f1: {:.2f}'
                      .format(total_batch, train_acc, loss.item(), train_f1))
                print('dev_loss:{:.2f}, dev_acc:{:.2f},dev_f1:{:.2f}'.format(dev_loss, dev_acc, dev_f1))
                print('using Time', time_dif)
                print('----' * 10)
                print('\n')
                # msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Train f1: {3:>6.2%}, Val Loss: {4:>5.2}, Val Acc: {5:>6.2%}, Val f1: {6:>6.2%}, Time: {7} {8}'
                # print(msg.format(total_batch, loss.item(), train_acc, train_f1, dev_loss, dev_acc, dev_f1, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # dev loss has not dropped for more than 1000 batches; stop training
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)
def train(config, model, train_iter, dev_iter):
    """
    Model training loop.
    :param config:
    :param model:
    :param train_iter:
    :param dev_iter:
    :return:
    """
    start_time = time.time()
    # put the model in training mode
    model.train()
    # collect all parameters
    param_optimizer = list(model.named_parameters())
    # parameters that should not be decayed
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = BertAdam(params=optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.epochs)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')  # best dev loss so far
    last_improve = 0  # batch index of the last improvement
    flag = False  # stop flag: whether training has stalled for too long
    for epoch in range(config.epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            optimizer.step()
            if total_batch % 50 == 0:  # report train/dev metrics every 50 batches
                torch.cuda.empty_cache()
                true = labels.data.cpu()
                predict = torch.max(outputs.data, 1)[1].cpu()
                score = metrics.accuracy_score(true, predict)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_best_loss > dev_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.model_save)
                    improve = '+'
                    last_improve = total_batch
                else:
                    improve = ''
                time_idf = utils.get_time_idf(start_time)
                msg = 'Iter:{0:>6}, Train Loss:{1:>5.2}, Train ACC:{2:>6.2%}, ' \
                      'Val Loss:{3:>5.2}, Val ACC:{4:>6.2%}, Time:{5} {6}'
                print(msg.format(total_batch, loss.item(), score, dev_loss, dev_acc, time_idf, improve))
                model.train()
            total_batch = total_batch + 1
            if total_batch - last_improve > config.require_improvement:
                # dev loss has not dropped for more than 1000 batches; stop training
                print('Dev loss has not improved for more than 1000 batches; stopping training.')
                flag = True
                break
        if flag:
            break
def train(config, model, train_iter, dev_iter, test_iter):
    """ Train the model """
    # get the model's parameters
    param_optimizer = list(model.named_parameters())
    # parameters that should not be decayed
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': config.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = BertAdam(params=optimizer_grouped_parameters,
                         schedule=None,
                         lr=config.learning_rate,
                         warmup=config.warmup,
                         t_total=len(train_iter) * config.num_epochs)
    config.hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    config.hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    optimizer = config.hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
    start_time = time.time()
    # activate BatchNormalization & dropout
    model.train()
    flag = False
    # number of batches processed so far
    total_batch = 0
    # best loss on the dev set
    dev_best_loss = float('inf')
    # batch index of the last dev-loss improvement;
    # if there is no improvement for a long time, it is OK to stop training
    last_improve = 0
    for epoch in range(config.num_epochs):
        if config.hvd.rank() == 0:
            print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (token_ids, label, seq_len, mask) in enumerate(train_iter):
            token_ids = token_ids.to(config.device)
            label_gpu = label.to(config.device)
            seq_len = seq_len.to(config.device)
            mask = mask.to(config.device)
            outputs = model(token_ids, seq_len, mask)
            model.zero_grad()
            loss = F.cross_entropy(outputs, label_gpu)
            loss.backward()
            optimizer.step()
            # every n batches, evaluate on the dev set
            if total_batch % 100 == 0:
                # take the highest-scoring class from the softmax output
                predict = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(label, predict)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    if config.hvd.rank() == 0:
                        torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = utils.get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}'
                if config.hvd.rank() == 0:
                    print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch = total_batch + 1
            if total_batch - last_improve > config.require_improvement:
                # no improvement for too long (longer than config.require_improvement)
                print('No improvement for too long. Stop training automatically.')
                flag = True
                break
        if flag:
            break
def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    TIMESTAMP = "{}_bs{}_lr{}_ps{}_{:%Y_%m_%d_%H_%M_%S/}".format(
        config.model_name, config.batch_size, config.learning_rate, config.pad_size, datetime.now())
    writer = SummaryWriter('/data-output/{}'.format(TIMESTAMP))
    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    # optimizer = BertAdam(optimizer_grouped_parameters, lr=config.learning_rate, warmup=0.05)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
    # dev_best_loss = float('inf')
    dev_best_acc = 0
    last_improve = 0  # batch index of the last dev improvement
    flag = False  # whether training has stalled for too long
    model.train()
    sum_batch = 0
    for epoch in range(config.num_epochs):
        total_batch = 0
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        pbar = tqdm(total=len(train_iter))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            # loss = FocalLoss(gamma=config.gamma, num_class=config.num_classes)(outputs, labels)
            # loss.backward()
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            # if i % 2 == 1:
            #     optimizer.step()
            #     model.zero_grad()
            optimizer.step()
            # model.zero_grad()
            if total_batch % 50 == 0 and total_batch != 0:
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_acc > dev_best_acc:
                    dev_best_acc = dev_acc
                    # torch.save(model.state_dict(), config.save_path)
                    # torch.save(model.state_dict(), 'out/model/epoch{}_{}_pytorch_model.bin'.format(epoch, config.model_name))
                    torch.save(model.state_dict(),
                               'out/model/best_{}_pytorch_model.bin'.format(config.model_name))
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                # time_dif = get_time_dif(start_time)
                print('\n Iter: {}, Train Loss: {:.3f} Train Acc: {:.3f}% Val Loss: {:.3f} Val Acc: {:.3f}% {} '
                      .format(total_batch, loss.item(), train_acc * 100, dev_loss, dev_acc * 100, improve))
                writer.add_scalar('Loss/train', loss.item(), sum_batch)
                writer.add_scalar('Loss/dev', dev_loss, sum_batch)
                writer.add_scalar('Acc/train', train_acc, sum_batch)
                writer.add_scalar('Acc/dev', dev_acc, sum_batch)
                writer.flush()
                model.train()
            pbar.update(1)
            sum_batch += 1
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        pbar.close()
        true = labels.data.cpu()
        predic = torch.max(outputs.data, 1)[1].cpu()
        train_acc = metrics.accuracy_score(true, predic)
        dev_acc, dev_loss = evaluate(config, model, dev_iter)
        writer.add_scalar('Loss/train', loss.item(), sum_batch)
        writer.add_scalar('Loss/dev', dev_loss, sum_batch)
        writer.add_scalar('Acc/train', train_acc, sum_batch)
        writer.add_scalar('Acc/dev', dev_acc, sum_batch)
        writer.flush()
        print('\n Epoch{}, Train Loss: {:.3f} Train Acc: {:.3f} Val Loss: {:.3f} Val Acc: {:.3f} '
              .format(epoch + 1, loss.item(), train_acc, dev_loss, dev_acc))
        torch.save(model.state_dict(), 'out/model/{}_pytorch_model.bin'.format(config.model_name))
        if flag:
            break
    # torch.save(model.state_dict(), 'out/model/{}_pytorch_model.bin'.format(config.model_name))
    predict(config, model, test_iter, dev_acc)
    writer.close()
def train(config, model, train_iter, dev_iter, test_iter):
    start_time = time.time()
    model.train()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    # optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=config.learning_rate,
                         warmup=0.05,
                         t_total=len(train_iter) * config.num_epochs)
    total_batch = 0  # number of batches processed so far
    dev_best_loss = float('inf')
    last_improve = 0  # batch index of the last dev-loss improvement
    flag = False  # whether training has stalled for too long
    model.train()
    # initialize PGD adversarial training
    pgd = PGD(model)
    K = 3
    for epoch in range(config.num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, config.num_epochs))
        for i, (trains, labels) in enumerate(train_iter):
            outputs = model(trains)
            # model.zero_grad()
            loss = F.cross_entropy(outputs, labels)
            loss.backward()
            pgd.backup_grad()
            # adversarial training
            for t in range(K):
                # add an adversarial perturbation to the embeddings; back up param.data on the first attack
                pgd.attack(is_first_attack=(t == 0))
                if t != K - 1:
                    model.zero_grad()
                else:
                    pgd.restore_grad()
                outputs_adv = model(trains)
                loss_adv = F.cross_entropy(outputs_adv, labels)
                # backpropagate, accumulating the adversarial gradient on top of the normal gradient
                loss_adv.backward()
            pgd.restore()  # restore the embedding parameters
            optimizer.step()
            model.zero_grad()
            if total_batch % 100 == 0:  # report train/dev metrics every 100 batches
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2}, Val Acc: {4:>6.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc, time_dif, improve))
                model.train()
            total_batch += 1
            if total_batch - last_improve > config.require_improvement:
                # dev loss has not dropped for more than 1000 batches; stop training
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    test(config, model, test_iter)
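The last snippet assumes a PGD helper implementing projected-gradient-descent adversarial training on the embedding layer, exposing attack/restore/backup_grad/restore_grad. A minimal sketch of such a helper is shown below; the parameter names (e.g. emb_name) and the epsilon/alpha defaults are assumptions based on a common implementation pattern, not taken from the original code.

# Minimal PGD helper sketch (assumed interface used by the training loop above).
import torch


class PGD:
    def __init__(self, model, emb_name='word_embeddings', epsilon=1.0, alpha=0.3):
        self.model = model
        self.emb_name = emb_name  # substring identifying the embedding parameters (assumed)
        self.epsilon = epsilon    # radius of the perturbation ball
        self.alpha = alpha        # step size of each attack
        self.emb_backup = {}
        self.grad_backup = {}

    def attack(self, is_first_attack=False):
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                if is_first_attack:
                    self.emb_backup[name] = param.data.clone()  # back up the clean embeddings
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = self.alpha * param.grad / norm
                    param.data.add_(r_at)
                    param.data = self.project(name, param.data)

    def restore(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and self.emb_name in name:
                param.data = self.emb_backup[name]
        self.emb_backup = {}

    def project(self, param_name, param_data):
        # keep the accumulated perturbation inside the epsilon-ball around the clean value
        r = param_data - self.emb_backup[param_name]
        if torch.norm(r) > self.epsilon:
            r = self.epsilon * r / torch.norm(r)
        return self.emb_backup[param_name] + r

    def backup_grad(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None:
                self.grad_backup[name] = param.grad.clone()

    def restore_grad(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None:
                param.grad = self.grad_backup[name]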