def main(args):
    # Load word/sememe vocabularies
    with open(args.word2idx_path, 'r') as fr:
        word2idx = json.loads(fr.read())
    with open(args.sememe2idx_path, 'r') as fr:
        sememe2idx = json.loads(fr.read())

    results = ResDataset(args.gen_file_path, word2idx, sememe2idx)
    res_loader = data.DataLoader(dataset=results, batch_size=1, shuffle=False)

    # Load pretrained embeddings
    if torch.cuda.is_available():
        pretrained_word_emb = torch.Tensor(
            np.load(args.pretrained_word_emb_path)).cuda()
        pretrained_sememe_emb = torch.Tensor(
            np.load(args.pretrained_sememe_emb_path)).cuda()
    else:
        pretrained_word_emb = torch.Tensor(
            np.load(args.pretrained_word_emb_path))
        pretrained_sememe_emb = torch.Tensor(
            np.load(args.pretrained_sememe_emb_path))

    # Load pretrained model
    adaptive = Encoder2Decoder(args.embed_size, args.hidden_size,
                               len(word2idx) + 1, pretrained_word_emb,
                               pretrained_sememe_emb)
    if torch.cuda.is_available():
        adaptive.cuda()
    adaptive.load_state_dict(torch.load(args.pretrained))

    # Score the generated definitions and write one score per line
    scores = gen_score(adaptive, res_loader)
    with codecs.open(args.output_path, 'w', 'utf-8') as fw:
        fw.write('\n'.join(scores))
    return 0
def main(args):
    # tb_summary_writer = SummaryWriter(args.checkpoint_path)

    if args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        # torch.cuda.set_device(args.gpu)

    torch.backends.cudnn.benchmark = True

    # To reproduce training results
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Image Preprocessing
    # For normalization, see https://github.com/pytorch/vision#models

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Load pretrained model or build from scratch
    adaptive = Encoder2Decoder(args, len(vocab))
    if torch.cuda.is_available():
        adaptive.cuda()
    # adaptive = Encoder2Decoder(args, len(vocab), args.gpu)

    if vars(args).get('start_from', None) is not None and os.path.isfile(args.start_from):
        adaptive.load_state_dict(torch.load(args.start_from))

    # cider_scores = []

    # Start Training
    # for epoch in range(start_epoch, args.num_epochs + 1):
    cider, metrics = coco_eval(adaptive, args, 0, split='test')
    print('Testing Model: CIDEr score %.2f' % (cider))
def main(self):
    print("********************Overhead Operations***************************")
    with open(self.vocab_path, 'rb') as f:
        self.vocab = pickle.load(f)

    # Image transformation
    self.transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load model
    print("Loading model {}......".format(self.pretrained))
    self.model = Encoder2Decoder(256, len(self.vocab), 512)
    self.model.load_state_dict(torch.load(self.pretrained))
    self.model.eval()
    if torch.cuda.is_available():
        self.model.cuda()
    print("Model loaded!")

    print("********************Validation Phase***************************")
    images_path = './data/resized/val2014/'
    caption_path = './data/annotations/captions_val2014.json'
    self.eval(images_path, caption_path, self.args.val_saved_name)

    print("********************Test Phase***************************")
    images_path = './data/resized/test2014/'
    caption_path = './data/annotations/image_info_test2014.json'
    self.eval(images_path, caption_path, self.args.test_saved_name)
def main(args):
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    adaptive = Encoder2Decoder(args.embed_size, len(vocab), args.hidden_size)
    adaptive.load_state_dict(torch.load(args.pretrained))
    if torch.cuda.is_available():
        adaptive.cuda()
    adaptive.eval()

    transform = transforms.Compose([
        transforms.Scale((args.crop_size, args.crop_size)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    inference_data_loader = torch.utils.data.DataLoader(
        InferenceLoader(args.image_dir, img_transform=transform),
        batch_size=args.eval_size,
        shuffle=False,
        num_workers=args.num_workers,
        drop_last=False)

    results = []
    print('---------------------Start Inference on AI-challenger dataset-----------------------')

    for i, (images, file_prefix) in enumerate(inference_data_loader):
        images = to_var(images)
        generated_captions = adaptive.sampler_beam_search(images, args.beam_size)

        sampled_caption = []
        # _generated_captions = generated_captions.cpu().data.numpy()
        for word_id in generated_captions:
            # print(word_id.int())
            word = vocab.idx2word[int(word_id.cpu().data.numpy())]
            if word == '<end>':
                break
            else:
                sampled_caption.append(word)

        sentence = ''.join(sampled_caption[1:])
        temp = {'image_id': file_prefix[0], 'caption': sentence}
        results.append(temp)

        # Disp evaluation process
        if (i + 1) % 10 == 0:
            print('[%d/%d]' % ((i + 1), len(inference_data_loader)))

    # json.dump(results, open(args.inference_output_json, "w"), ensure_ascii=False,
    #           sort_keys=True, indent=2, separators=(',', ': '))
    with io.open(args.inference_output_json, 'w', encoding='utf-8') as fd:
        fd.write(
            unicode(
                json.dumps(results,
                           ensure_ascii=False,
                           sort_keys=True,
                           indent=2,
                           separators=(',', ': '))))
def main(args):
    # To reproduce training results
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Load vocabularies
    with open(args.word2idx_path, 'r') as fr:
        word2idx = json.loads(fr.read())
    with open(args.idx2word_path, 'r') as fr:
        idx2word = json.loads(fr.read())
    with open(args.idx2sememe_path, 'r') as fr:
        idx2sememe = json.loads(fr.read())

    # Build test data loader
    test_loader = get_loader(args.test_path, args.test_size, shuffle=False,
                             num_workers=args.num_workers, mode='test')

    # Load pretrained embeddings
    if torch.cuda.is_available():
        pretrained_word_emb = torch.Tensor(
            np.load(args.pretrained_word_emb_path)).cuda()
        pretrained_sememe_emb = torch.Tensor(
            np.load(args.pretrained_sememe_emb_path)).cuda()
    else:
        pretrained_word_emb = torch.Tensor(
            np.load(args.pretrained_word_emb_path))
        pretrained_sememe_emb = torch.Tensor(
            np.load(args.pretrained_sememe_emb_path))

    # Load pretrained model
    adaptive = Encoder2Decoder(args.embed_size, args.hidden_size,
                               len(word2idx) + 1, pretrained_word_emb,
                               pretrained_sememe_emb)
    if torch.cuda.is_available():
        adaptive.cuda()
    adaptive.load_state_dict(torch.load(args.pretrained))

    # Greedy decoding for beam size 1, beam search otherwise
    if args.beam_size == 1:
        results = greedy_sampler(adaptive, test_loader, idx2word, idx2sememe)
    else:
        results = beam_sampler(adaptive, test_loader, idx2word, idx2sememe)

    # Write one '<word> ||| <sememes> ||| <definition>' triple per line
    with codecs.open(args.output_path, 'w', 'utf-8') as fw:
        for word, sememes, definition in results:
            fw.write('%s ||| %s ||| %s\n' % (word, sememes, definition))
    return 0
def __init__(self, dictionaries, model_path):
    self.model_path = model_path
    self.seg2idx, self.idx2seg = dictionaries

    # Load the pretrained encoder-decoder (weights stored on CPU) and move it to GPU
    self.model = Encoder2Decoder(256, len(self.seg2idx), 512)
    self.model.load_state_dict(torch.load(self.model_path, map_location='cpu'))
    self.model.cuda()
    self.model.eval()

    # Image preprocessing: resize to the crop size used at training time
    CROP_SIZE = 224
    transform = transforms.Compose([
        transforms.Resize((CROP_SIZE, CROP_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    self.transform = transform
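# --- Hypothetical usage sketch (not part of the original snippets) ---
# Shows how the wrapper initialized above might be driven for a single image.
# The `wrapper` argument is an instance of the class defined above; the greedy
# `sampler(images)` call is an assumption that mirrors the `sampler_beam_search`
# decoding loop used in the AI-challenger inference script earlier in this file,
# and may differ from the actual API of this particular model.
import torch
from PIL import Image


def caption_image(wrapper, image_path):
    """Decode one image with the (assumed) greedy sampler of the wrapper's model."""
    image = Image.open(image_path).convert('RGB')
    image = wrapper.transform(image).unsqueeze(0)  # shape: (1, 3, 224, 224)
    if torch.cuda.is_available():
        image = image.cuda()
    with torch.no_grad():
        seg_ids = wrapper.model.sampler(image)     # assumed decoding API
    segments = []
    for idx in seg_ids[0]:
        seg = wrapper.idx2seg[int(idx)]
        if seg == '<end>':                         # stop at the end token
            break
        segments.append(seg)
    return ' '.join(segments)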
def main(args):
    args.checkpoint_path = os.path.join(
        'log_' + args.dataset + '_' + args.pattern, args.session)
    tb_summary_writer = SummaryWriter(args.checkpoint_path)

    if args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        # torch.cuda.set_device(args.gpu)

    torch.backends.cudnn.benchmark = True

    # To reproduce training results
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        print('### CUDA is available!')
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.checkpoint_path):
        os.makedirs(args.checkpoint_path)
    if not os.path.exists('data'):
        os.mkdir('data')

    # Image Preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    args.vocab = vocab

    # Build training data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Load pretrained model or build from scratch
    adaptive = Encoder2Decoder(args, len(vocab))
    # adaptive = Encoder2Decoder(args, len(vocab), args.gpu)

    infos = {}
    if args.start_from is not None:
        with open(os.path.join(args.start_from,
                               'infos_' + args.dataset + '.pkl')) as f:
            infos = cPickle.load(f)
            # saved_model_opt = infos['args']
            # need_be_same = ["caption_model", "rnn_type", "rnn_size", "num_layers"]
            # for checkme in need_be_same:
            #     assert vars(saved_model_opt)[checkme] == vars(args)[checkme], \
            #         "Command line argument and saved model disagree on '%s' " % checkme

    if vars(args).get('start_from', None) is not None and os.path.isfile(
            os.path.join(args.start_from, "model.pth")):
        adaptive.load_state_dict(
            torch.load(os.path.join(args.start_from, 'model.pth')))

    epoch = infos.get('epoch', 1)

    # Constructing CNN parameters for optimization, only fine-tuning higher layers
    cnn_subs = list(adaptive.encoder.vgg_conv.children())
    cnn_params = [list(sub_module.parameters()) for sub_module in cnn_subs]
    cnn_params = [item for sublist in cnn_params for item in sublist]
    cnn_optimizer = torch.optim.Adam(cnn_params, lr=args.learning_rate_cnn,
                                     betas=(args.alpha, args.beta))
    if vars(args).get('start_from', None) is not None and os.path.isfile(
            os.path.join(args.start_from, "cnn_optimizer.pth")):
        cnn_optimizer.load_state_dict(
            torch.load(os.path.join(args.start_from, 'cnn_optimizer.pth')))

    # Other parameter optimization
    params = list(adaptive.decoder.parameters())

    # Will decay later
    learning_rate = args.learning_rate

    # Language Modeling Loss, Optimizers
    LMcriterion = nn.CrossEntropyLoss()

    # Change to GPU mode if available
    if torch.cuda.is_available():
        adaptive.cuda()
        LMcriterion.cuda()

    # Train the Models
    total_step = len(data_loader)

    cider_scores = []
    best_cider = 0.0
    best_epoch = 0
    best_cider_test = 0.0
    best_epoch_test = 0

    optimizer = torch.optim.Adam(params, lr=learning_rate)
    if vars(args).get('start_from', None) is not None and os.path.isfile(
            os.path.join(args.start_from, "optimizer.pth")):
        optimizer.load_state_dict(
            torch.load(os.path.join(args.start_from, 'optimizer.pth')))

    # Start Training
    # for epoch in range(start_epoch, args.num_epochs + 1):
    update_lr_flag = True
    while True:
        if update_lr_flag:
            if epoch > args.lr_decay:
                frac = (epoch - args.cnn_epoch) / args.learning_rate_decay_every
                decay_factor = math.pow(0.5, frac)
                # Decay the learning rate
                learning_rate = learning_rate * decay_factor
                for group in optimizer.param_groups:
                    group['lr'] = learning_rate
            update_lr_flag = False

        # Language Modeling Training
        print('------------------Training for Epoch %d----------------' % (epoch))

        cur_time = time.time()
        for i, (images, captions, lengths) in enumerate(data_loader):
            start_time = time.time()
            # print('### images:', images.size())
            # print('### captions:', captions.size())
            # print('### lengths:', len(lengths))

            # Set mini-batch dataset
            images = to_var(images)
            captions = to_var(captions)
            lengths = [cap_len - 1 for cap_len in lengths]
            targets = pack_padded_sequence(captions[:, 1:], lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            adaptive.train()
            adaptive.zero_grad()

            packed_scores = adaptive(images, captions, lengths, args.pattern)

            # Compute loss and backprop
            loss = LMcriterion(packed_scores[0], targets)
            loss.backward()

            # Gradient clipping for gradient exploding problem in LSTM
            for p in adaptive.decoder.lstm_cell.parameters():
                p.data.clamp_(-args.clip, args.clip)

            optimizer.step()

            # Start CNN fine-tuning
            if epoch > args.cnn_epoch:
                cnn_optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], CrossEntropy Loss: %.4f, Perplexity: %5.4f, Elapsed: %.2fs' %
                      (epoch, args.num_epochs, i, total_step, loss.item(),
                       np.exp(loss.item()), time.time() - start_time))
                add_summary_value(tb_summary_writer, 'loss', loss.item(), epoch)

        print('##### Per Epoch Cost time: %.2fs' % (time.time() - cur_time))

        # Save checkpoint after each epoch
        infos['epoch'] = epoch
        infos['vocab'] = vocab
        infos['args'] = args
        with open(os.path.join(args.checkpoint_path, 'infos.pkl'), 'wb') as f:
            cPickle.dump(infos, f)
        torch.save(optimizer.state_dict(),
                   os.path.join(args.checkpoint_path, 'optimizer.pth'))
        torch.save(cnn_optimizer.state_dict(),
                   os.path.join(args.checkpoint_path, 'cnn_optimizer.pth'))
        torch.save(adaptive.state_dict(),
                   os.path.join(args.checkpoint_path, 'model.pkl'))
        # with open(os.path.join(args.checkpoint_path, 'histories.pkl'), 'wb') as f:
        #     cPickle.dump(infos, f)

        # Evaluation on validation set
        cider, metrics = coco_eval(adaptive, args, epoch, split='val')
        cider_scores.append(cider)
        add_summary_dict(tb_summary_writer, 'metrics', metrics, epoch)

        if cider > best_cider:
            best_cider = cider
            best_epoch = epoch
            # Save the Adaptive Attention model after each epoch
            # name = str(args.yml).split('.')[0].split('/')[-1]
            torch.save(adaptive.state_dict(),
                       os.path.join(args.checkpoint_path, 'model-best.pkl'))
            with open(os.path.join(args.checkpoint_path, 'infos-best.pkl'), 'wb') as f:
                cPickle.dump(infos, f)
        print('Model of best epoch #: %d with CIDEr score %.2f' % (best_epoch, best_cider))

        # Test on test set
        caption_val_path = args.caption_val_path
        args.caption_val_path = args.caption_val_path.replace('val', 'test')
        cider_test, metrics_test = coco_eval(adaptive, args, epoch, split='test')
        args.caption_val_path = caption_val_path

        if cider_test > best_cider_test:
            best_cider_test = cider_test
            best_epoch_test = epoch
        print('Test Phase: Model of best epoch #: %d with CIDEr score %.2f' %
              (best_epoch_test, best_cider_test))

        epoch += 1
        if epoch > 80:
            break
def main(args):
    # To reproduce training results
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image Preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build training data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Load pretrained model or build from scratch
    adaptive = Encoder2Decoder(args.embed_size, len(vocab), args.hidden_size)

    if args.pretrained:
        adaptive.load_state_dict(torch.load(args.pretrained))
        # Get starting epoch #, note that model is named as
        # '...your path to model/algoname-epoch#.pkl'
        # A little messy here.
        start_epoch = int(
            args.pretrained.split('/')[-1].split('-')[1].split('.')[0]) + 1
    else:
        start_epoch = 1

    # Constructing CNN parameters for optimization, only fine-tuning higher layers
    cnn_subs = list(
        adaptive.encoder.resnet_conv.children())[args.fine_tune_start_layer:]
    cnn_params = [list(sub_module.parameters()) for sub_module in cnn_subs]
    cnn_params = [item for sublist in cnn_params for item in sublist]
    cnn_optimizer = torch.optim.Adam(cnn_params, lr=args.learning_rate_cnn,
                                     betas=(args.alpha, args.beta))

    # Other parameter optimization
    params = list(adaptive.encoder.affine_a.parameters()) \
        + list(adaptive.encoder.affine_b.parameters()) \
        + list(adaptive.decoder.parameters())

    # Will decay later
    learning_rate = args.learning_rate

    # Language Modeling Loss, Optimizers
    LMcriterion = nn.CrossEntropyLoss()

    # Change to GPU mode if available
    if torch.cuda.is_available():
        adaptive.cuda()
        LMcriterion.cuda()

    # Train the Models
    total_step = len(data_loader)

    cider_scores = []
    best_cider = 0.0
    best_epoch = 0

    # Start Training
    for epoch in range(start_epoch, args.num_epochs + 1):
        optimizer = torch.optim.Adam(params, lr=learning_rate)

        # Language Modeling Training
        print('------------------Training for Epoch %d----------------' % (epoch))
        for i, (images, captions, lengths, _) in enumerate(data_loader):
            # Set mini-batch dataset
            images = to_var(images)
            captions = to_var(captions)
            lengths = [cap_len - 1 for cap_len in lengths]
            targets = pack_padded_sequence(captions[:, 1:], lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            adaptive.train()
            adaptive.zero_grad()

            packed_scores = adaptive(images, captions, lengths)

            # Compute loss and backprop
            loss = LMcriterion(packed_scores[0], targets)
            loss.backward()

            # Gradient clipping for gradient exploding problem in LSTM
            for p in adaptive.decoder.LSTM.parameters():
                p.data.clamp_(-args.clip, args.clip)

            optimizer.step()

            # Start learning rate decay
            if epoch > args.lr_decay:
                frac = (epoch - args.cnn_epoch) / args.learning_rate_decay_every
                decay_factor = math.pow(0.5, frac)
                # Decay the learning rate
                learning_rate = learning_rate * decay_factor

            # Start CNN fine-tuning
            if epoch > args.cnn_epoch:
                cnn_optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], CrossEntropy Loss: %.4f, Perplexity: %5.4f' %
                      (epoch, args.num_epochs, i, total_step,
                       loss.data[0], np.exp(loss.data[0])))

        # Save the Adaptive Attention model after each epoch
        torch.save(adaptive.state_dict(),
                   os.path.join(args.model_path, 'adaptive-%d.pkl' % (epoch)))

        # Evaluation on validation set
        cider = coco_eval(adaptive, args, epoch)
        cider_scores.append(cider)

        if cider > best_cider:
            best_cider = cider
            best_epoch = epoch

        if len(cider_scores) > 5:
            last_6 = cider_scores[-6:]
            last_6_max = max(last_6)

            # Test if there is improvement, if not do early stopping
            if last_6_max != best_cider:
                print('No improvement with CIDEr in the last 6 epochs...Early stopping triggered.')
                print('Model of best epoch #: %d with CIDEr score %.2f' %
                      (best_epoch, best_cider))
                break
def main(args):
    # To reproduce training results
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image Preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build training data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             args.det_file, transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Load pretrained model or build from scratch
    adaptive = Encoder2Decoder(args.embed_size, len(vocab), args.hidden_size)

    if args.pretrained:
        adaptive.load_state_dict(torch.load(args.pretrained))
        # Get starting epoch #, note that model is named as
        # '...your path to model/algoname-epoch#.pkl'
        # A little messy here.
        start_epoch = int(
            args.pretrained.split('/')[-1].split('-')[1].split('.')[0]) + 1
    else:
        start_epoch = 1

    # Constructing CNN parameters for optimization, only fine-tuning higher layers
    ch = list(adaptive.encoder.resnet_conv.children())
    # for i in range(len(ch)):
    #     print i, 'th:', ch[i]
    # cnn_subs = list(adaptive.encoder.resnet_conv.children())[args.fine_tune_start_layer:]
    cnn_subs = list(adaptive.encoder.resnet_conv.children())
    cnn_params = [list(sub_module.parameters()) for sub_module in cnn_subs]
    cnn_params = [item for sublist in cnn_params for item in sublist]
    cnn_optimizer = torch.optim.Adam(cnn_params, lr=args.learning_rate_cnn,
                                     betas=(args.alpha, args.beta))

    # Other parameter optimization
    params = list(adaptive.encoder.affine_a.parameters()) + list(
        adaptive.encoder.affine_b.parameters()) + list(
            adaptive.decoder.parameters())

    # Will decay later
    learning_rate = args.learning_rate

    # Language Modeling Loss
    LMcriterion = nn.CrossEntropyLoss()

    # Change to GPU mode if available
    if torch.cuda.is_available():
        adaptive.cuda()
        LMcriterion.cuda()

    # Train the Models
    total_step = len(data_loader)

    cider_scores = []
    best_cider = 0.0
    best_epoch = 0

    adaptive.load_state_dict(torch.load(args.checkpoint_file))

    # Start Training
    for epoch in range(start_epoch, args.num_epochs + 1):
        if epoch > start_epoch:
            break
        # Evaluation on validation set
        Flickr_visual(adaptive, args, epoch)
def main(args):
    # To reproduce training results
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load word2idx
    with open(args.word2idx_path, 'r') as fr:
        word2idx = json.loads(fr.read())

    # Build training data loader
    data_loader = get_loader(args.train_path, args.batch_size,
                             shuffle=True, num_workers=args.num_workers,
                             mode='train')

    # Load pretrained embeddings
    if torch.cuda.is_available():
        pretrained_word_emb = torch.Tensor(
            np.load(args.pretrained_word_emb_path)).cuda()
        pretrained_sememe_emb = torch.Tensor(
            np.load(args.pretrained_sememe_emb_path)).cuda()
    else:
        pretrained_word_emb = torch.Tensor(
            np.load(args.pretrained_word_emb_path))
        pretrained_sememe_emb = torch.Tensor(
            np.load(args.pretrained_sememe_emb_path))

    # Load pretrained model or build from scratch
    adaptive = Encoder2Decoder(args.embed_size, args.hidden_size,
                               len(word2idx) + 1, pretrained_word_emb,
                               pretrained_sememe_emb)

    if args.pretrained:
        adaptive.load_state_dict(torch.load(args.pretrained))
        # Get starting epoch #,
        # note that model is named as
        # '...your path to model/algoname-epoch#.pkl'
        # A little messy here.
        start_epoch = int(
            args.pretrained.split('/')[-1].split('-')[1].split('.')[0]) + 1
    else:
        start_epoch = 1

    # Will decay later
    # learning_rate = args.learning_rate

    # Language Modeling Loss
    LMcriterion = nn.CrossEntropyLoss(ignore_index=0)

    # Change to GPU mode if available
    if torch.cuda.is_available():
        adaptive.cuda()
        LMcriterion.cuda()

    # Train the Models
    total_step = len(data_loader)

    ppl_scores = []
    best_ppl = 0.0
    best_epoch = 0

    # Start Learning Rate Decay
    # if epoch > args.lr_decay:
    #     frac = float(epoch - args.lr_decay) / args.learning_rate_decay_every
    #     decay_factor = math.pow(0.5, frac)
    #     # Decay the learning rate
    #     learning_rate = args.learning_rate * decay_factor
    # print('Learning Rate for Epoch %d: %.6f' % (epoch, learning_rate))

    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        adaptive.parameters()),
                                 lr=args.learning_rate,
                                 betas=(args.alpha, args.beta))

    # Start Training
    for epoch in range(start_epoch, args.num_epochs + 1):
        epoch_loss = []

        # Language Modeling Training
        print('------------------Training for Epoch %d----------------' % (epoch))
        for i, (word, sememes, definition) in enumerate(data_loader):
            # Set mini-batch dataset
            word = to_var(word)
            sememes = to_var(sememes)
            definition = to_var(definition)
            targets = definition[:, 1:]

            # Forward, Backward and Optimize
            adaptive.train()
            adaptive.zero_grad()

            scores, _ = adaptive(word, sememes, definition)
            scores = scores[:, :-1, :].transpose(1, 2)

            # Compute loss and backprop
            loss = LMcriterion(scores, targets)
            epoch_loss.append(loss.data[0])
            loss.backward()

            # Gradient clipping for gradient exploding problem in LSTM
            # for p in adaptive.decoder.LSTM.parameters():
            #     p.data.clamp_(-args.clip, args.clip)
            clip_grad_norm(
                filter(lambda p: p.requires_grad, adaptive.parameters()),
                args.clip)

            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], CrossEntropy Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, args.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))

        train_loss = np.mean(epoch_loss)
        train_ppl = np.exp(train_loss)

        # Save the Adaptive Attention model after each epoch
        torch.save(adaptive.state_dict(),
                   os.path.join(args.model_path, 'adaptive-%d.pkl' % (epoch)))

        # Evaluation on validation set
        valid_ppl = defseq_eval(adaptive, args, epoch)
        ppl_scores.append(valid_ppl)
        print('Epoch [%d/%d], Train Loss: %.4f, Train PPL: %5.4f, Valid PPL: %5.4f'
              % (epoch, args.num_epochs, train_loss, train_ppl, valid_ppl))

        if valid_ppl < best_ppl or best_ppl == 0.0:
            best_ppl = valid_ppl
            best_epoch = epoch

        if len(ppl_scores) > 5:
            last_6 = ppl_scores[-6:]
            last_6_min = min(last_6)

            # Test if there is improvement, if not do early stopping
            if last_6_min != best_ppl:
                print('No improvement with ppl in the last 6 epochs...Early stopping triggered.')
                print('Model of best epoch #: %d with ppl score %.2f' %
                      (best_epoch, best_ppl))
                break