def init_model(self, Model, config_path):
    """Build, load and prepare a beam-search inference model.

    Reads the JSON config at ``config_path``, picks the section keyed by
    the model class name, loads the vocabulary and trained weights, wraps
    the decoder in a ``TopKDecoder`` for beam search, and returns the
    model in eval mode (on GPU when available).

    Args:
        Model: model class to instantiate (e.g. VQAModel / VQGModel).
        config_path: path to a JSON file with one section per model class;
            each section provides vocab_path, max_len, hidden_size,
            model_path and beam_size.

    Returns:
        The initialized model, ready for inference.
    """
    with open(config_path) as c:
        config = json.load(c)

    # Per-model settings live under the class name.
    model_config = config[Model.__name__]
    # Lazy %-args so the formatting cost is only paid when DEBUG is on.
    self.logger.debug('%s config:', Model.__name__)
    self.logger.debug(model_config)

    # Load vocab
    self.logger.debug('Loading vocab from %s...', model_config['vocab_path'])
    self.vocab = load_vocab(model_config['vocab_path'])

    # Initialize model
    self.logger.debug('Initializing model...')
    model = Model(len(self.vocab),
                  model_config['max_len'],
                  model_config['hidden_size'],
                  self.vocab(self.vocab.sos),
                  self.vocab(self.vocab.eos))

    # Map checkpoints to CPU when no GPU is present, so weights saved on
    # a GPU machine still load on a CPU-only host.
    map_location = None if torch.cuda.is_available() else 'cpu'
    model.load_state_dict(
        torch.load(model_config['model_path'], map_location=map_location))

    # Wrap the decoder for beam search and switch to inference mode.
    self.beam_size = model_config['beam_size']
    model.decoder = TopKDecoder(model.decoder, self.beam_size)
    model.eval()
    if torch.cuda.is_available():
        model.cuda()
    self.logger.debug('Done')
    return model
def main(args):
    """Answer ``args.question`` about the image at ``args.image``.

    Loads the vocabulary and a trained VQA model, wraps its decoder for
    beam search, runs one (image, question) pair through the model and
    prints the top-k decoded answer word sequences.
    """
    # Image preprocessing (Scale is the pre-0.4 torchvision name of Resize).
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    # Load vocabulary wrapper
    vocab = load_vocab(args.vocab_path)

    # Build model
    vqa = VQAModel(len(vocab), args.max_length, args.hidden_size,
                   vocab(vocab.sos), vocab(vocab.eos),
                   rnn_cell=args.rnn_cell)

    # Load the trained model parameters and enable beam search.
    vqa.load_state_dict(torch.load(args.model_path))
    vqa.decoder = TopKDecoder(vqa.decoder, args.beam_size)
    vqa.eval()

    # Prepare image
    image = Image.open(args.image)
    image_tensor = Variable(transform(image).unsqueeze(0))

    # Tokenize the question and map each token to its vocabulary id.
    tokens = nltk.tokenize.word_tokenize(str(args.question).lower())
    print(tokens)
    question = [vocab(token) for token in tokens]
    question_tensor = Variable(torch.Tensor(question).long().unsqueeze(0))

    # If use gpu
    if torch.cuda.is_available():
        vqa.cuda()
        image_tensor = image_tensor.cuda()
        question_tensor = question_tensor.cuda()

    # Run model
    softmax_list, _, other = vqa(image_tensor, question_tensor,
                                 [len(question)])

    # Decode each of the k beams back into word sequences.
    # NOTE(review): 'topk_length' is indexed [0] (single example in batch);
    # the sequence tensors are indexed [k][0] accordingly.
    topk_length = other['topk_length'][0]
    topk_sequence = other['sequence']
    for k in range(args.beam_size):
        length = topk_length[k]
        sequence = [seq[k] for seq in topk_sequence]
        tgt_id_seq = [sequence[di][0].data[0] for di in range(length)]
        tgt_seq = [vocab.idx2word[tok] for tok in tgt_id_seq]
        print(tgt_seq)
def main(args):
    """Generate questions for the image at ``args.image``.

    Loads the vocabulary and a trained VQG model, wraps its decoder for
    beam search, runs the image through the model and prints the top-k
    decoded question word sequences for each image in the batch.
    """
    # Image preprocessing (Scale is the pre-0.4 torchvision name of Resize).
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    # Load vocabulary wrapper
    vocab = load_vocab(args.vocab_path)

    # Build model
    vqg = VQGModel(len(vocab), args.max_length, args.hidden_size,
                   vocab(vocab.sos), vocab(vocab.eos),
                   rnn_cell=args.rnn_cell)

    # Load the trained model parameters and enable beam search.
    vqg.load_state_dict(torch.load(args.model_path))
    vqg.decoder = TopKDecoder(vqg.decoder, args.beam_size)
    vqg.eval()

    # Prepare image
    image = Image.open(args.image)
    image_tensor = Variable(transform(image).unsqueeze(0))

    # If use gpu
    if torch.cuda.is_available():
        vqg.cuda()
        image_tensor = image_tensor.cuda()

    # Run model
    softmax_list, _, other = vqg(image_tensor)

    # Decode the k beams for every image in the (size-1) batch.
    topk_length = other['topk_length']
    topk_sequence = other['topk_sequence']
    for i in range(image_tensor.size(0)):
        print('image %d' % i)
        for k in range(args.beam_size):
            length = topk_length[i][k]
            sequence = [seq[i, k] for seq in topk_sequence]
            tgt_id_seq = [sequence[di].data[0] for di in range(length)]
            tgt_seq = [vocab.idx2word[tok] for tok in tgt_id_seq]
            print(tgt_seq)
def main(args):
    """Train the VQG model with teacher forcing and periodic checkpoints.

    Builds the data loader, model, padding-masked NLL loss and Adam
    optimizer, then runs ``args.num_epochs`` epochs, logging the loss
    every ``args.log_step`` batches and saving weights every
    ``args.save_step`` batches to ``args.model_path``.
    """
    # Config logging
    logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
    logger = logging.getLogger()

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing: random crop/flip as train-time augmentation.
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    # Load vocabulary wrapper.
    vocab = load_vocab(args.vocab_path)

    # Build data loader
    logger.info("Building data loader...")
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)
    logger.info("Done")

    # Build the models
    logger.info("Building image captioning models...")
    vqg = VQGModel(len(vocab), args.max_length, args.hidden_size,
                   vocab(vocab.sos), vocab(vocab.eos),
                   rnn_cell=args.rnn_cell)
    logger.info("Done")
    if torch.cuda.is_available():
        vqg.cuda()

    # Loss: NLL with the <pad> class weighted 0 so padding positions
    # contribute nothing to the gradient.
    weight = torch.ones(len(vocab))
    pad = vocab(vocab.pad)
    loss = NLLLoss(weight, pad)
    if torch.cuda.is_available():
        loss.cuda()

    # Parameters to train
    params = vqg.params_to_train()
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, questions, answers) in enumerate(data_loader):
            # Set mini-batch dataset
            images = Variable(images)
            questions = Variable(questions)
            answers = Variable(answers)
            if torch.cuda.is_available():
                images = images.cuda()
                questions = questions.cuda()
                answers = answers.cuda()

            # Forward, Backward and Optimize
            vqg.zero_grad()
            outputs, hiddens, other = vqg(images, questions,
                                          teacher_forcing_ratio=1.0)

            # Accumulate per-step loss. The target for decoder step t is
            # token t+1 of the question (next-token prediction).
            loss.reset()
            # Hoisted out of the step loop: batch size is invariant per batch.
            batch_size = questions.size(0)
            for step, step_output in enumerate(outputs):
                loss.eval_batch(
                    step_output.contiguous().view(batch_size, -1),
                    questions[:, step + 1])
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                logger.info('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' %
                            (epoch, args.num_epochs, i, total_step,
                             loss.get_loss()))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(vqg.state_dict(),
                           os.path.join(args.model_path,
                                        'vqg-%d-%d.pkl' % (epoch + 1, i + 1)))