def make_lstm_model(device, my_words, train_data, valid_data, test_data):
    """
    Makes the LSTM model, then calls a function to save it to disk
    :param device: Stores whether to use cuda (GPU)
    :param my_words: Vocabulary wrapper providing the word_to_id mapping
    :param train_data: Training data passed to run_epoch
    :param valid_data: Validation data
    :param test_data: Test data
    :return: None
    """
    vocab_size = len(my_words.word_to_id)
    if args.bi_lstm:
        print("Making Bidirectional LSTM Model")
        models = {'forward': None}
    else:
        print("Making Forward-Backward LSTM Model")
        models = {'forward': None, 'backward': None}

    for direction in models:
        models[direction] = LM_LSTM(embedding_dim=args.embedding_size,
                                    num_steps=args.num_steps,
                                    batch_size=args.batch_size,
                                    hidden_dim=args.hidden_size,
                                    vocab_size=vocab_size,
                                    num_layers=args.num_layers,
                                    dp_keep_prob=args.dp_keep_prob,
                                    bidirectional=args.bi_lstm)
        models[direction].direction = direction
        models[direction].to(device)  # Move model to GPU if cuda is utilized

    lr = args.inital_lr
    lr_decay_base = 1 / 1.15  # decay factor for learning rate
    m_flat_lr = 14.0  # we will not touch lr for the first m_flat_lr epochs

    print("########## Training ##########################")
    for epoch in range(args.num_epochs):
        lr_decay = lr_decay_base ** max(epoch - m_flat_lr, 0)
        lr = lr * lr_decay  # decay lr if it is time
        train_p = dict()
        for model in models:
            train_p[model] = run_epoch(models[model], train_data, True, lr, device)
        if "backward" in models:
            print('Train perplexity at epoch {}: forward: {:8.2f}, backward: {:8.2f}'
                  .format(epoch, train_p["forward"], train_p["backward"]))
            # print('Validation perplexity at epoch {}: forward: {:8.2f}, backward: {:8.2f}'
            #       .format(epoch, run_epoch(models['forward'], valid_data, device=device),
            #               run_epoch(models['backward'], valid_data, device=device)))
        else:
            print('Train perplexity at epoch {}: forward: {:8.2f}'
                  .format(epoch, train_p["forward"]))
            # print('Validation perplexity at epoch {}: forward: {:8.2f}'
            #       .format(epoch, run_epoch(models['forward'], valid_data, device=device)))

    save_data(models=models)  # Save the results

    print("########## Testing ##########################")
    for direction in models:
        models[direction].batch_size = 1  # to make sure we process all the data
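# Editorial sketch: save_data() is defined elsewhere in this repo and is not
# shown here. Assuming it simply serializes each direction's model to disk
# (and that torch is already imported, as in the surrounding code), an
# equivalent helper with a hypothetical name and file names could look like:
def save_models_sketch(models, path_prefix="lstm"):
    """Illustration only: persist each direction's LM_LSTM with torch.save."""
    for direction, direction_model in models.items():
        torch.save(direction_model, "{}_{}.pt".format(path_prefix, direction))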
            p.data.add_(-lr, p.grad.data)

        if step % (epoch_size // 10) == 10:
            print("{} perplexity: {:8.2f} speed: {} wps".format(
                step * 1.0 / epoch_size,
                np.exp(costs / iters),
                iters * model.batch_size / (time.time() - start_time)))
    return np.exp(costs / iters)


if __name__ == "__main__":
    raw_data = reader.ptb_raw_data(data_path=args.data)
    train_data, valid_data, test_data, word_to_id, id_2_word = raw_data
    vocab_size = len(word_to_id)
    print('Vocabulary size: {}'.format(vocab_size))

    model = LM_LSTM(embedding_dim=args.hidden_size,
                    num_steps=args.num_steps,
                    batch_size=args.batch_size,
                    vocab_size=vocab_size,
                    num_layers=args.num_layers,
                    dp_keep_prob=args.dp_keep_prob)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    lr = args.inital_lr
    # decay factor for learning rate
    lr_decay_base = 1 / 1.15
    # we will not touch lr for the first m_flat_lr epochs
    m_flat_lr = 14.0

    print("########## Training ##########################")
    for epoch in range(args.num_epochs):
        lr_decay = lr_decay_base ** max(epoch - m_flat_lr, 0)
        lr = lr * lr_decay  # decay lr if it is time
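# Editorial sketch: the decay schedule above keeps lr untouched while
# epoch <= m_flat_lr, then multiplies it by lr_decay_base ** (epoch - m_flat_lr)
# every epoch, so the shrinkage compounds. A standalone illustration of that
# behavior (hypothetical helper name and starting lr, not part of the repo):
def lr_schedule_sketch(initial_lr=1.0, num_epochs=20, lr_decay_base=1 / 1.15, m_flat_lr=14.0):
    lr, schedule = initial_lr, []
    for epoch in range(num_epochs):
        lr_decay = lr_decay_base ** max(epoch - m_flat_lr, 0)
        lr = lr * lr_decay  # same update rule as the training loop above
        schedule.append(lr)
    return schedule  # flat for the first 15 entries, then decreasing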
if args.eval:
    model = torch.load(args.checkpoint)
    model.batch_size = args.batch_size
    optimizer = torch.optim.SGD(model.parameters(), lr=args.initial_lr)
    print('Test Perplexity: {:8.2f}'.format(
        run_epoch(model, test_data, optimizer)))
    sys.exit()

lr = args.initial_lr
# decay factor for learning rate
lr_decay_base = 1 / 1.15
# we will not touch lr for the first m_flat_lr epochs
m_flat_lr = 14.0

model = LM_LSTM(embedding_dim=args.embedding_size,
                rnn_type=args.rnn_type,
                hidden_size=args.hidden_size,
                num_steps=args.num_steps,
                batch_size=args.batch_size,
                vocab_size=vocab_size,
                num_layers=args.num_layers,
                dp_keep_prob=args.dp_keep_prob)
model.cuda()
print(model)

print("########## Training ##########################")
if args.optimizer == "sgd":
    optimizer = torch.optim.SGD(model.parameters(), lr=args.initial_lr)
else:
    optimizer = torch.optim.Adam(model.parameters(), lr=args.initial_lr)

notimprove = 0
best_val = run_epoch(model, valid_data, optimizer)
for epoch in range(args.num_epochs):
    is_best = False
    if args.lr_schedule == "default":
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), 0.25)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)
        if step % 30 == 0:
            print("{} perplexity: {:8.2f} speed: {} wps".format(
                step * 1.0 / epoch_size,
                np.exp(costs / iters),
                iters * model.batch_size / (time.time() - start_time)))
    return np.exp(costs / iters)


model = LM_LSTM(embedding_dim=HIDDEN_DIM,
                num_steps=num_steps,
                batch_size=batch_size,
                vocab_size=18280,
                num_layers=EMBEDDING_DIM,
                dp_keep_prob=0.9)
model.cuda()

lr = 20
# decay factor for learning rate
lr_decay_base = 1 / 1.15
# we will not touch lr for the first m_flat_lr epochs
m_flat_lr = 14.0

for epoch in range(300):
    prt = (epoch % 30 == 0)
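# Editorial note: the clip-then-update pattern above (clip_grad_norm followed
# by p.data.add_(-lr, p.grad.data)) is plain SGD without momentum. A sketch of
# the equivalent step written with torch.optim, assuming torch is already
# imported (hypothetical helper name; clip_grad_norm_ is the non-deprecated
# spelling in newer PyTorch):
def sgd_step_sketch(model, lr, max_norm=0.25):
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)  # in-place clipping
    optimizer.step()       # p <- p - lr * p.grad for every parameter
    optimizer.zero_grad()  # clear gradients before the next batch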