def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to the start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        # Manual SGD step; the two-argument form of `add_` is deprecated,
        # so pass the learning rate via `alpha`.
        for p in model.parameters():
            p.data.add_(p.grad.data, alpha=-lr)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
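# The loop above relies on `repackage_hidden` to cut the autograd graph between
# batches. A minimal sketch matching the helper from pytorch/examples; the
# recursive tuple case covers LSTMs, whose hidden state is an (h, c) pair.
def repackage_hidden(h):
    """Wrap hidden states in new Tensors, detaching them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)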
def train():
    best_val_loss = float('inf')
    ntokens = len(corpus.dictionary)
    train_data = batchify(corpus.train, args.batch_size)  # (num_batches, batch_size)
    val_data = batchify(corpus.valid, args.batch_size)
    model = RNNModel(rnn_type=args.model,
                     ntoken=ntokens,
                     ninp=args.emsize,
                     nfeat=args.nfeat,
                     nhid=args.nhid,
                     nlayers=args.nlayers,
                     font_path=args.font_path,
                     font_size=args.font_size,
                     dropout=args.dropout,
                     tie_weights=args.tied,
                     ).to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    print('start training...')
    hidden = model.init_hidden(args.batch_size)
    epoch_start_time = time.time()
    for epoch in range(args.epochs):
        # Evaluate on the validation set.
        model.eval()
        total_loss = 0.
        with torch.no_grad():
            for idx in range(0, val_data.size(0) - 1, args.bptt):
                data, targets = get_batch(val_data, idx)
                output, hidden = model(data, hidden)
                # (seq_len, batch, ntokens) -> (seq_len * batch, ntokens)
                output_flat = output.view(-1, ntokens)
                total_loss += len(data) * criterion(output_flat, targets.view(-1)).item()
                hidden = repackage_hidden(hidden)
        val_loss = total_loss / len(val_data)
        best_val_loss = min(best_val_loss, val_loss)
        print('-' * 100)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f} | best valid ppl {:8.2f}'
              .format(epoch, (time.time() - epoch_start_time), val_loss,
                      math.exp(val_loss), math.exp(best_val_loss)))
        print('-' * 100)
        epoch_start_time = time.time()
        if val_loss == best_val_loss:
            # Save the model if the validation loss is the best so far.
            torch.save(model, os.path.join(args.save, 'model.pkl'))
        else:
            # Anneal the learning rate. Updating args.lr alone would not affect the
            # optimizer, so push the new value into its parameter groups as well.
            args.lr /= 4.0
            for group in optimizer.param_groups:
                group['lr'] = args.lr

        # Train on the training set.
        model.train()
        total_loss = 0.
        start_time = time.time()
        for i, idx in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
            data, targets = get_batch(train_data, idx)
            hidden = repackage_hidden(hidden)
            model.zero_grad()
            # Compute the loss and gradients.
            output, hidden = model(data, hidden)
            loss = criterion(output.view(-1, ntokens), targets.view(-1))
            loss.backward()
            total_loss += loss.item()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            # Update the parameters with the gradients.
            optimizer.step()
            if i % args.log_interval == 0 and i > 0:
                cur_loss = total_loss / args.log_interval
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'
                      .format(epoch + 1, i, len(train_data) // args.bptt, args.lr,
                              elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
                total_loss = 0
                start_time = time.time()
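# Both loops above assume `batchify` and `get_batch` helpers. A minimal sketch in
# the style of pytorch/examples; here `get_batch` returns unflattened targets,
# since this variant applies `targets.view(-1)` itself (the first variant and the
# one below use slightly different signatures). `device` and `args.bptt` are
# assumed to be in scope, as in the training code.
def batchify(data, bsz):
    # Trim off any elements that would not cleanly divide into bsz columns,
    # then lay the corpus out as (num_batches, batch_size).
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    return data.view(bsz, -1).t().contiguous().to(device)

def get_batch(source, i):
    # Slice a bptt-long chunk; the targets are the inputs shifted by one step.
    seq_len = min(args.bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    targets = source[i + 1:i + 1 + seq_len]
    return data, targets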
def train():
    # Load the data and configure the model.
    print("Loading data...")
    corpus = Corpus(train_dir)
    print(corpus)

    config = Config()
    config.vocab_size = len(corpus.dictionary)
    train_data = batchify(corpus.train, config.batch_size)
    train_len = train_data.size(0)
    seq_len = config.seq_len

    print("Configuring model...")
    model = RNNModel(config)
    if use_cuda:
        model.cuda()
    print(model)

    criterion = nn.CrossEntropyLoss()
    lr = config.learning_rate  # initial learning rate

    start_time = time.time()
    print("Training and generating...")
    for epoch in range(1, config.num_epochs + 1):  # train for multiple epochs
        total_loss = 0.0
        model.train()  # dropout is only active in training mode
        hidden = model.init_hidden(config.batch_size)  # initialize the hidden state
        for ibatch, i in enumerate(range(0, train_len - 1, seq_len)):
            data, targets = get_batch(train_data, i, seq_len)  # fetch one batch
            # Before each batch, detach the hidden state from how it was produced.
            # Otherwise the model would try to backpropagate to the start of the dataset.
            hidden = repackage_hidden(hidden)
            model.zero_grad()

            output, hidden = model(data, hidden)
            loss = criterion(output.view(-1, config.vocab_size), targets)
            loss.backward()  # backpropagate

            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.clip)
            # Manual SGD update; the two-argument form of `add_` is deprecated.
            for p in model.parameters():
                p.data.add_(p.grad.data, alpha=-lr)

            total_loss += loss.item()  # accumulate the loss

            if ibatch % config.log_interval == 0 and ibatch > 0:  # log every few batches
                cur_loss = total_loss / config.log_interval
                elapsed = get_time_dif(start_time)
                print("Epoch {:3d}, {:5d}/{:5d} batches, lr {:2.3f}, loss {:5.2f}, ppl {:8.2f}, time {}"
                      .format(epoch, ibatch, train_len // seq_len, lr,
                              cur_loss, math.exp(cur_loss), elapsed))
                total_loss = 0.0

        lr /= 4.0  # anneal the learning rate after each epoch

        # Save the model parameters every few epochs and print a sample.
        if epoch % config.save_interval == 0:
            torch.save(model.state_dict(), os.path.join(save_dir, model_name.format(epoch)))
            print(''.join(generate(model, corpus.dictionary.idx2word)))
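# The last variant calls `generate` after saving a checkpoint, but the function is
# not shown. A minimal sampling sketch under assumed conventions: the model takes
# input of shape (seq_len, batch) and `use_cuda` is in scope, as in train().
# The `num_words` and `temperature` parameters are illustrative, not from the original.
def generate(model, idx2word, num_words=100, temperature=1.0):
    model.eval()
    hidden = model.init_hidden(1)
    inp = torch.randint(len(idx2word), (1, 1), dtype=torch.long)
    if use_cuda:
        inp = inp.cuda()
    words = []
    with torch.no_grad():
        for _ in range(num_words):
            output, hidden = model(inp, hidden)
            # Sample the next token from the temperature-scaled output distribution.
            weights = output.squeeze().div(temperature).exp()
            word_idx = torch.multinomial(weights, 1).item()
            inp.fill_(word_idx)
            words.append(idx2word[word_idx])
    return words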