def train_model():
    vocab, embeddings = data_helper.load_embeddings(config.get('data', 'embedding_file'))
    train_data = data_helper.load_data(os.path.join(config.get('data', 'treebank_dir'), 'train.txt'))
    numeric_train_samples = data_helper.convert_to_numeric_samples(train_data, vocab, num_classes=5)
    model = RNNModel(embeddings, num_classes=5, model_config=config['model'])
    dev_data = data_helper.load_data(os.path.join(config.get('data', 'treebank_dir'), 'dev.txt'))
    numeric_dev_samples = data_helper.convert_to_numeric_samples(dev_data, vocab, num_classes=5)
    eval_func = lambda: model.eval(numeric_dev_samples)
    model.train(numeric_train_samples, eval_func)
    model.save(config.get('data', 'model_dir'))
def train():
    # Model definition
    model = RNNModel(len(word2ix), embed_size, hidden_dims)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    model.to(device)
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        count = 0
        for ii, data_ in tqdm.tqdm(enumerate(data)):
            data_ = torch.tensor(data_).long()
            x = data_.unsqueeze(1).to(device)
            optimizer.zero_grad()
            # The target is the input shifted left by one token; the last target wraps
            # around to the first input token.
            y = torch.zeros(x.shape).to(device).long()
            y[:-1], y[-1] = x[1:], x[0]
            output, _ = model(x)
            loss = criterion(output, y.view(-1))
            # Alternative per-prefix training loop (disabled):
            # hidden = None
            # for k in range(2, max_lenth):
            #     data1 = data_[:k]
            #     input_, target = data1[:-1, :], data1[1:, :]
            #     output, hidden = model(input_, hidden)
            #     loss = criterion(output, target.view(-1))
            #     optimizer.step()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            count += 1
        print(epoch, 'loss=', total_loss / count)
    torch.save(model.state_dict(), 'model.bin')
    chars = test(model)
    print(chars)
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(data, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            p.data.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
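# `repackage_hidden` is not defined in this snippet. A minimal sketch of what such a
# helper usually does (detach the hidden state from the graph so backprop stops at the
# batch boundary), modeled on the PyTorch word_language_model example; the original
# helper may differ:
import torch

def repackage_hidden(h):
    """Wrap hidden states in new Tensors, detaching them from their history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)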
def training(self):
    vocab_to_id = get_vocab_to_id(self.train_data_path, self.vocab_file, False)
    logdir = os.path.join(
        self.summary_path,
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/")
    self.vocab_size = len(vocab_to_id)
    create_path(self.log_path)
    logger = get_logger(self.logfile_path)
    with tf.Session() as sess:
        summary_writer = tf.summary.FileWriter(logdir, graph=sess.graph)
        summary_writer.flush()
        rnn_model = RNNModel(self.rnn_size, self.embedding_size, self.class_num,
                             self.vocab_size, self.learning_rate, self.model_path)
        test_data_generator = SentenceGenerator(self.test_data_path)
        testBatchManage = BatchManager(test_data_generator, 0, vocab_to_id)
        test_data = testBatchManage.get_all_data_to_batch()
        sess.run(tf.global_variables_initializer())
        current_step = 0
        for e in range(self.epoch_num):
            logger.info("Epoch num: " + str(e + 1) + "\n")
            print("Epoch num: " + str(e + 1) + "\n")
            train_data_generator = SentenceGenerator(self.train_data_path)
            trainBatchManage = BatchManager(train_data_generator, self.batch_size, vocab_to_id)
            for batchs in trainBatchManage.getBatches():
                current_step += 1
                loss, accuracy, summary_op = rnn_model.train(sess, batchs, self.dropout)
                if current_step % self.epoch_step == 0:
                    loss_test, accuracy_test, _ = rnn_model.train_test(sess, test_data, 1.0)
                    logger.info("loss:" + str(loss_test) + " accuracy:" + str(accuracy_test) + "\n")
                    print("loss:" + str(loss_test) + " accuracy:" + str(accuracy_test) + "\n")
                    summary_writer.add_summary(summary_op, current_step)
                    rnn_model.saver.save(sess, self.model_path, global_step=current_step)
def train(train_data, val_data, steps=6000, val_per_steps=300, checkpoint_per_steps=100,
          batch_size=64, learning_rate=0.01, **kwargs):
    global args
    # train_data = list(filter(SimpleLengthModel.data_filter, train_data))
    # val_data = list(filter(SimpleLengthModel.data_filter, val_data))
    model = RNNModel(feature_dims=train_data[0].feature_dim, model_dir=args.output_dir, **kwargs)
    if args.checkpoint is not None:
        model.restore(args.checkpoint)
    data_provider = batch_data_provider(train_data, batch_size=batch_size)
    for t in range(0, steps):
        x, y, length = get_feature_label(next(data_provider), length_limit=1000)
        result = model.train(x, y, length, learning_rate)
        logging.info("step = {}: {}".format(model.global_step, result))
        if model.global_step % val_per_steps == 0:
            result = val(model, val_data)
            model.init_streaming()
            logging.info("validation for step = {}: {}".format(model.global_step, result))
        if model.global_step % checkpoint_per_steps == 0:
            model.save_checkpoint()
            logging.info("save checkpoint at {}".format(model.global_step))
        if model.global_step % 2000 == 0:
            learning_rate *= 0.2
            logging.info("current learning rate = {}".format(learning_rate))
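# `batch_data_provider` is not shown above; from the `next(data_provider)` call it is
# assumed to be an endless generator that cycles over the training data in shuffled
# mini-batches. A minimal sketch under that assumption (the real provider may batch
# or pad differently):
import random

def batch_data_provider(data, batch_size=64):
    """Yield shuffled mini-batches of `data` indefinitely."""
    while True:
        indices = list(range(len(data)))
        random.shuffle(indices)
        for start in range(0, len(indices) - batch_size + 1, batch_size):
            yield [data[j] for j in indices[start:start + batch_size]]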
def loss_function(preds, labels, lens):
    # Drop padded positions before computing the loss.
    new_preds, new_labels = [], []
    for pred, label, l in zip(preds, labels, lens):
        new_preds.append(pred[:l])
        new_labels.append(label[:l])
    preds = torch.cat(new_preds, dim=0)
    labels = torch.cat(new_labels, dim=0)
    return cross_entropy(preds, labels)


earlystop_count, min_loss = 0, 100
for epoch in range(0, 10000):
    total_loss = 0
    model = model.train()
    for batch_num, batch in enumerate(train_loader):
        # input:  BOS, term0, ...
        # target: term0, term1, ...
        lm_input, lens, lm_output = batch
        predictions, _, lens = model(cudalize(lm_input), cudalize(lens))
        loss = loss_function(predictions, cudalize(lm_output), lens)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        avg_loss = total_loss / (batch_num + 1)
        ppl = math.exp(avg_loss)
        print(f'\rLM Training: epoch:{epoch} train batch:{batch_num} ' +
              f'loss:{avg_loss} ppl:{ppl}', end='')
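# `cudalize` is not defined in this snippet; it is assumed to be a small helper that
# moves a tensor to the GPU when one is available. A minimal sketch of such a helper:
import torch

def cudalize(tensor):
    """Move a tensor to CUDA if available, otherwise return it unchanged."""
    return tensor.cuda() if torch.cuda.is_available() else tensor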
class DartsTrainer():
    def __init__(self, arm):
        # Default params for eval network
        args = {
            'emsize': 850,
            'nhid': 850,
            'nhidlast': 850,
            'dropoute': 0.1,
            'wdecay': 8e-7
        }
        args['data'] = '/home/liamli4465/darts/data/penn'
        args['lr'] = 20
        args['clip'] = 0.25
        args['batch_size'] = 64
        args['search_batch_size'] = 256 * 4
        args['small_batch_size'] = 64
        args['bptt'] = 35
        args['dropout'] = 0.75
        args['dropouth'] = 0.25
        args['dropoutx'] = 0.75
        args['dropouti'] = 0.2
        args['seed'] = arm['seed']
        args['nonmono'] = 5
        args['log_interval'] = 50
        args['save'] = arm['dir']
        args['alpha'] = 0
        args['beta'] = 1e-3
        args['max_seq_length_delta'] = 20
        args['unrolled'] = True
        args['gpu'] = 0
        args['cuda'] = True
        args['genotype'] = arm['genotype']
        args = AttrDict(args)
        self.args = args
        self.epoch = 0

        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.set_device(args.gpu)
        cudnn.benchmark = True
        cudnn.enabled = True
        torch.cuda.manual_seed_all(args.seed)

        corpus = data.Corpus(args.data)
        self.corpus = corpus
        self.eval_batch_size = 10
        self.test_batch_size = 1
        self.train_data = batchify(corpus.train, args.batch_size, args)
        self.search_data = batchify(corpus.valid, args.search_batch_size, args)
        self.val_data = batchify(corpus.valid, self.eval_batch_size, args)
        self.test_data = batchify(corpus.test, self.test_batch_size, args)
        self.ntokens = len(corpus.dictionary)

    def model_save(self, fn, to_save):
        if self.epoch % 150 == 0:
            with open(os.path.join(self.args.save,
                                   "checkpoint-incumbent-%d" % self.epoch), 'wb') as f:
                torch.save(to_save, f)
        with open(fn, 'wb') as f:
            torch.save(to_save, f)

    def model_load(self, fn):
        with open(fn, 'rb') as f:
            self.model, self.optimizer, rng_state, cuda_state = torch.load(f)
            torch.set_rng_state(rng_state)
            torch.cuda.set_rng_state(cuda_state)

    def model_resume(self, filename):
        logging.info('Resuming model from %s' % filename)
        self.model_load(filename)
        self.optimizer.param_groups[0]['lr'] = self.args.lr
        for rnn in self.model.rnns:
            rnn.genotype = self.args.genotype

    def train_epochs(self, epochs):
        args = self.args
        resume_filename = os.path.join(self.args.save, "checkpoint.incumbent")
        if os.path.exists(resume_filename):
            self.model_resume(resume_filename)
            logging.info('Loaded model from checkpoint')
        else:
            self.model = RNNModel(self.ntokens, args.emsize, args.nhid, args.nhidlast,
                                  args.dropout, args.dropouth, args.dropoutx,
                                  args.dropouti, args.dropoute, genotype=args.genotype)
            self.optimizer = torch.optim.SGD(self.model.parameters(),
                                             lr=args.lr, weight_decay=args.wdecay)

        size = 0
        for p in self.model.parameters():
            size += p.nelement()
        logging.info('param size: {}'.format(size))
        logging.info('initial genotype:')
        logging.info(self.model.rnns[0].genotype)

        total_params = sum(x.data.nelement() for x in self.model.parameters())
        logging.info('Args: {}'.format(args))
        logging.info('Model total parameters: {}'.format(total_params))

        self.model = self.model.cuda()

        # Loop over epochs.
        lr = args.lr
        best_val_loss = []
        stored_loss = 100000000

        # At any point you can hit Ctrl + C to break out of training early.
        try:
            for epoch in range(epochs):
                epoch_start_time = time.time()
                self.train()
                if 't0' in self.optimizer.param_groups[0]:
                    # ASGD is active: temporarily swap in the averaged parameters for evaluation.
                    tmp = {}
                    for prm in self.model.parameters():
                        tmp[prm] = prm.data.clone()
                        prm.data = self.optimizer.state[prm]['ax'].clone()

                    val_loss2 = self.evaluate(self.val_data)
                    logging.info('-' * 89)
                    logging.info('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                                 'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                                     self.epoch, (time.time() - epoch_start_time),
                                     val_loss2, math.exp(val_loss2), val_loss2 / math.log(2)))
                    logging.info('-' * 89)

                    if val_loss2 < stored_loss:
                        self.model_save(os.path.join(args.save, 'checkpoint.incumbent'), [
                            self.model, self.optimizer,
                            torch.get_rng_state(), torch.cuda.get_rng_state()
                        ])
                        logging.info('Saving Averaged!')
                        stored_loss = val_loss2

                    for prm in self.model.parameters():
                        prm.data = tmp[prm].clone()
                else:
                    val_loss = self.evaluate(self.val_data, self.eval_batch_size)
                    logging.info('-' * 89)
                    logging.info('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                                 'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                                     self.epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss), val_loss / math.log(2)))
                    logging.info('-' * 89)

                    if val_loss < stored_loss:
                        self.model_save(os.path.join(args.save, 'checkpoint.incumbent'), [
                            self.model, self.optimizer,
                            torch.get_rng_state(), torch.cuda.get_rng_state()
                        ])
                        logging.info('Saving model (new best validation)')
                        stored_loss = val_loss

                    if (self.epoch > 75 and 't0' not in self.optimizer.param_groups[0]
                            and (len(best_val_loss) > args.nonmono
                                 and val_loss > min(best_val_loss[:-args.nonmono]))):
                        logging.info('Switching to ASGD')
                        self.optimizer = torch.optim.ASGD(self.model.parameters(), lr=args.lr,
                                                          t0=0, lambd=0., weight_decay=args.wdecay)

                    best_val_loss.append(val_loss)
        except Exception as e:
            logging.info('-' * 89)
            logging.info(e)
            logging.info('Exiting from training early')
            return 0, 10000, 10000

        # Load the best saved model.
        self.model_load(os.path.join(args.save, 'checkpoint.incumbent'))

        # Run on test data.
        val_loss = self.evaluate(self.val_data, self.eval_batch_size)
        logging.info(math.exp(val_loss))
        test_loss = self.evaluate(self.test_data, self.test_batch_size)
        logging.info('=' * 89)
        logging.info('| End of training | test loss {:5.2f} | test ppl {:8.2f} | test bpc {:8.3f}'
                     .format(test_loss, math.exp(test_loss), test_loss / math.log(2)))
        logging.info('=' * 89)
        return 0, math.exp(val_loss), math.exp(test_loss)

    def train(self):
        args = self.args
        corpus = self.corpus
        total_loss = 0
        start_time = time.time()
        hidden = [self.model.init_hidden(args.small_batch_size)
                  for _ in range(args.batch_size // args.small_batch_size)]
        batch, i = 0, 0
        while i < self.train_data.size(0) - 1 - 1:
            bptt = args.bptt if np.random.random() < 0.95 else args.bptt / 2.
            # Prevent excessively small or negative sequence lengths
            seq_len = max(5, int(np.random.normal(bptt, 5)))
            # There's a very small chance that it could select a very long sequence length resulting in OOM
            seq_len = min(seq_len, args.bptt + args.max_seq_length_delta)

            lr2 = self.optimizer.param_groups[0]['lr']
            self.optimizer.param_groups[0]['lr'] = lr2 * seq_len / args.bptt
            self.model.train()

            data, targets = get_batch(self.train_data, i, args, seq_len=seq_len)

            self.optimizer.zero_grad()

            start, end, s_id = 0, args.small_batch_size, 0
            while start < args.batch_size:
                cur_data, cur_targets = data[:, start:end], targets[:, start:end].contiguous().view(-1)

                # Starting each batch, we detach the hidden state from how it was previously produced.
                # If we didn't, the model would try backpropagating all the way to start of the dataset.
                hidden[s_id] = repackage_hidden(hidden[s_id])

                log_prob, hidden[s_id], rnn_hs, dropped_rnn_hs = self.model(
                    cur_data, hidden[s_id], return_h=True)
                raw_loss = nn.functional.nll_loss(
                    log_prob.view(-1, log_prob.size(2)), cur_targets)

                loss = raw_loss
                # Activation Regularization
                if args.alpha > 0:
                    loss = loss + sum(args.alpha * dropped_rnn_h.pow(2).mean()
                                      for dropped_rnn_h in dropped_rnn_hs[-1:])
                # Temporal Activation Regularization (slowness)
                loss = loss + sum(args.beta * (rnn_h[1:] - rnn_h[:-1]).pow(2).mean()
                                  for rnn_h in rnn_hs[-1:])
                loss *= args.small_batch_size / args.batch_size
                total_loss += raw_loss.data * args.small_batch_size / args.batch_size
                loss.backward()

                s_id += 1
                start = end
                end = start + args.small_batch_size

                gc.collect()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), args.clip)
            self.optimizer.step()

            # total_loss += raw_loss.data
            self.optimizer.param_groups[0]['lr'] = lr2
            if np.isnan(total_loss.item()):
                raise ValueError('Training loss is NaN')

            # if batch % args.log_interval == 0 and batch > 0:
            #     cur_loss = total_loss.item() / args.log_interval
            #     elapsed = time.time() - start_time
            #     logging.info('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
            #                  'loss {:5.2f} | ppl {:8.2f}'.format(
            #                      self.epoch, batch, len(self.train_data) // args.bptt,
            #                      self.optimizer.param_groups[0]['lr'],
            #                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            #     total_loss = 0
            #     start_time = time.time()

            batch += 1
            i += seq_len
        self.epoch += 1

    def evaluate(self, data_source, batch_size=10):
        # Turn on evaluation mode which disables dropout.
        self.model.eval()
        total_loss = 0
        hidden = self.model.init_hidden(batch_size)
        for i in range(0, data_source.size(0) - 1, self.args.bptt):
            data, targets = get_batch(data_source, i, self.args, evaluation=True)
            targets = targets.view(-1)
            log_prob, hidden = self.model(data, hidden)
            loss = nn.functional.nll_loss(log_prob.view(-1, log_prob.size(2)), targets).data
            total_loss += loss * len(data)
            hidden = repackage_hidden(hidden)
        return total_loss.item() / len(data_source)
def train():
    best_val_loss = 100
    ntokens = len(corpus.dictionary)
    train_data = batchify(corpus.train, args.batch_size)  # (num_batches, batch_size)
    val_data = batchify(corpus.valid, args.batch_size)
    model = RNNModel(rnn_type=args.model,
                     ntoken=ntokens,
                     ninp=args.emsize,
                     nfeat=args.nfeat,
                     nhid=args.nhid,
                     nlayers=args.nlayers,
                     font_path=args.font_path,
                     font_size=args.font_size,
                     dropout=args.dropout,
                     tie_weights=args.tied).to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    print('start training...')
    hidden = model.init_hidden(args.batch_size)
    epoch_start_time = time.time()
    for epoch in range(args.epochs):
        # Evaluate on the validation set.
        model.eval()
        total_loss = 0.
        with torch.no_grad():
            for idx in range(0, val_data.size(0) - 1, args.bptt):
                data, targets = get_batch(val_data, idx)
                output, hidden = model(data, hidden)
                # (seq_len, batch, ntokens) -> (seq_len * batch, ntokens)
                output_flat = output.view(-1, ntokens)
                total_loss += len(data) * criterion(output_flat, targets.view(-1)).item()
                hidden = repackage_hidden(hidden)
        val_loss = total_loss / len(val_data)
        best_val_loss = min(best_val_loss, val_loss)
        print('-' * 100)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | valid ppl {:8.2f} | best valid ppl {:8.2f}'
              .format(epoch, (time.time() - epoch_start_time), val_loss,
                      math.exp(val_loss), math.exp(best_val_loss)))
        print('-' * 100)
        epoch_start_time = time.time()
        if val_loss == best_val_loss:
            # Save the model if the validation loss is the best so far.
            torch.save(model, os.path.join(args.save, 'model.pkl'))
        else:
            # Otherwise anneal the learning rate.
            args.lr /= 4.0

        # Train on the training set.
        model.train()
        total_loss = 0.
        start_time = time.time()
        for i, idx in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
            data, targets = get_batch(train_data, idx)
            hidden = repackage_hidden(hidden)
            model.zero_grad()
            # Compute the loss and gradients.
            output, hidden = model(data, hidden)
            loss = criterion(output.view(-1, ntokens), targets.view(-1))
            loss.backward()
            total_loss += loss.item()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            # Update parameters with the gradients.
            optimizer.step()
            # for p in model.parameters():
            #     p.data.add_(-args.lr, p.grad.data)
            if i % args.log_interval == 0 and i > 0:
                cur_loss = total_loss / args.log_interval
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'
                      .format(epoch + 1, i, len(train_data) // args.bptt, args.lr,
                              elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
                total_loss = 0
                start_time = time.time()
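# `batchify` and `get_batch` are not defined in this snippet. A minimal sketch of the
# usual helpers from the PyTorch word_language_model example; the versions used here
# may differ (e.g. in the arguments they take or whether targets are pre-flattened):
def batchify(data, bsz):
    """Reshape a 1-D token tensor into (num_batches, bsz) columns, trimming leftovers."""
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    return data.view(bsz, -1).t().contiguous()

def get_batch(source, i, bptt=35):
    """Slice a (seq_len, bsz) chunk starting at row i and its one-step-shifted targets."""
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target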
def train():
    # Load the data and configure the model.
    print("Loading data...")
    corpus = Corpus(train_dir)
    print(corpus)

    config = Config()
    config.vocab_size = len(corpus.dictionary)
    train_data = batchify(corpus.train, config.batch_size)
    train_len = train_data.size(0)
    seq_len = config.seq_len

    print("Configuring model...")
    model = RNNModel(config)
    if use_cuda:
        model.cuda()
    print(model)

    criterion = nn.CrossEntropyLoss()
    lr = config.learning_rate  # initial learning rate
    start_time = time.time()

    print("Training and generating...")
    for epoch in range(1, config.num_epochs + 1):  # train for multiple epochs
        total_loss = 0.0
        model.train()  # dropout is only active in training mode
        hidden = model.init_hidden(config.batch_size)  # initialize the hidden state

        for ibatch, i in enumerate(range(0, train_len - 1, seq_len)):
            data, targets = get_batch(train_data, i, seq_len)  # fetch one batch
            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to the start of the dataset.
            hidden = repackage_hidden(hidden)
            model.zero_grad()

            output, hidden = model(data, hidden)
            loss = criterion(output.view(-1, config.vocab_size), targets)
            loss.backward()  # backpropagation

            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.clip)
            for p in model.parameters():  # gradient update
                p.data.add_(p.grad, alpha=-lr)

            total_loss += loss.item()  # accumulate the loss

            if ibatch % config.log_interval == 0 and ibatch > 0:  # report status every few batches
                cur_loss = total_loss / config.log_interval
                elapsed = get_time_dif(start_time)
                print("Epoch {:3d}, {:5d}/{:5d} batches, lr {:2.3f}, loss {:5.2f}, ppl {:8.2f}, time {}"
                      .format(epoch, ibatch, train_len // seq_len, lr, cur_loss,
                              math.exp(cur_loss), elapsed))
                total_loss = 0.0

        lr /= 4.0  # shrink the learning rate after each epoch

        # Save the model parameters every few epochs.
        if epoch % config.save_interval == 0:
            torch.save(model.state_dict(), os.path.join(save_dir, model_name.format(epoch)))
            print(''.join(generate(model, corpus.dictionary.idx2word)))