Example #1
    def init_model(self, Model, config_path):

        with open(config_path) as c:
            config = json.load(c)

        # Get model config
        model_config = config[Model.__name__]
        self.logger.debug('%s config:' % (Model.__name__))
        self.logger.debug(model_config)

        # Load vocab
        self.logger.debug('Loading vocab from %s...' %
                          (model_config['vocab_path']))
        self.vocab = load_vocab(model_config['vocab_path'])

        # Initialize model
        self.logger.debug('Initializing model...')
        model = Model(len(self.vocab), model_config['max_len'],
                      model_config['hidden_size'], self.vocab(self.vocab.sos),
                      self.vocab(self.vocab.eos))

        # Load model weights
        model.load_state_dict(torch.load(model_config['model_path']))

        self.beam_size = model_config['beam_size']
        model.decoder = TopKDecoder(model.decoder, self.beam_size)
        model.eval()

        if torch.cuda.is_available():
            model.cuda()

        self.logger.debug('Done')

        return model
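init_model reads its settings from a JSON file keyed by the model class name. A minimal sketch of the expected file, generated from Python for concreteness; only the key names (vocab_path, model_path, max_len, hidden_size, beam_size) come from the code above, and every path and value is illustrative:

import json

# Hypothetical config entry for a model class named VQGModel; all paths
# and numbers below are placeholders, not values from the source.
config = {
    "VQGModel": {
        "vocab_path": "data/vocab.pkl",
        "model_path": "models/vqg.pkl",
        "max_len": 20,
        "hidden_size": 512,
        "beam_size": 5
    }
}

with open("config.json", "w") as f:
    json.dump(config, f, indent=2)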
Example #2
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),  # Scale was renamed Resize in newer torchvision
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Load vocabulary wrapper
    vocab = load_vocab(args.vocab_path)

    # Build Models
    vqa = VQAModel(len(vocab),
                   args.max_length,
                   args.hidden_size,
                   vocab(vocab.sos),
                   vocab(vocab.eos),
                   rnn_cell=args.rnn_cell)

    # Load the trained model parameters
    vqa.load_state_dict(torch.load(args.model_path))

    vqa.decoder = TopKDecoder(vqa.decoder, args.beam_size)
    vqa.eval()

    # Prepare Image
    image = Image.open(args.image)
    image_tensor = Variable(transform(image).unsqueeze(0))

    # Prepare question
    question = args.question
    tokens = nltk.tokenize.word_tokenize(str(question).lower())
    print(tokens)
    question = [vocab(token) for token in tokens]
    question_tensor = Variable(torch.Tensor(question).long().unsqueeze(0))

    # If use gpu
    if torch.cuda.is_available():
        vqa.cuda()
        image_tensor = image_tensor.cuda()
        question_tensor = question_tensor.cuda()

    # Run model
    softmax_list, _, other = vqa(image_tensor, question_tensor,
                                 [len(question)])
    topk_length = other['topk_length'][0]
    topk_sequence = other['sequence']

    for k in range(args.beam_size):
        length = topk_length[k]
        sequence = [seq[k] for seq in topk_sequence]
        tgt_id_seq = [sequence[di][0].data[0] for di in range(length)]
        tgt_seq = [vocab.idx2word[tok] for tok in tgt_id_seq]
        print(tgt_seq)
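This example reads a number of attributes off args. A minimal sketch of the command-line parser it assumes; the flag names match the attributes used in main, while every default value is illustrative rather than taken from the source:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--image', type=str, required=True)
    parser.add_argument('--question', type=str, required=True)
    parser.add_argument('--model_path', type=str, required=True)
    parser.add_argument('--vocab_path', type=str, required=True)
    parser.add_argument('--crop_size', type=int, default=224)    # illustrative default
    parser.add_argument('--max_length', type=int, default=20)    # illustrative default
    parser.add_argument('--hidden_size', type=int, default=512)  # illustrative default
    parser.add_argument('--beam_size', type=int, default=5)      # illustrative default
    parser.add_argument('--rnn_cell', type=str, default='lstm')  # illustrative default
    main(parser.parse_args())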
Example #3
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),  # Scale was renamed Resize in newer torchvision
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    
    # Load vocabulary wrapper
    vocab = load_vocab(args.vocab_path)

    # Build Models
    vqg = VQGModel(len(vocab), args.max_length, args.hidden_size,
                   vocab(vocab.sos), vocab(vocab.eos),
                   rnn_cell=args.rnn_cell)

    # Load the trained model parameters
    vqg.load_state_dict(torch.load(args.model_path))

    vqg.decoder = TopKDecoder(vqg.decoder, args.beam_size)
    vqg.eval()

    # Prepare Image       
    image = Image.open(args.image)
    image_tensor = Variable(transform(image).unsqueeze(0))

    # If use gpu
    if torch.cuda.is_available():
        vqg.cuda()
        image_tensor = image_tensor.cuda()

    # Run model
    softmax_list, _, other = vqg(image_tensor)
    topk_length = other['topk_length']
    topk_sequence = other['topk_sequence']

    for i in range(image_tensor.size(0)):
        print('image %d' % i)
        for k in range(args.beam_size):
            length = topk_length[i][k]
            sequence = [seq[i, k] for seq in topk_sequence]
            tgt_id_seq = [sequence[di].data[0] for di in range(length)]
            tgt_seq = [vocab.idx2word[tok] for tok in tgt_id_seq]
            print(tgt_seq)
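Both inference examples map beam-search token ids back to words via vocab.idx2word. A small helper in the same spirit, assuming vocab.eos holds the end-of-sequence token string as the constructor calls above suggest:

def ids_to_words(vocab, id_seq):
    # Convert token ids to words, stopping at the end-of-sequence token.
    # Assumes vocab.idx2word and vocab.eos as used in the examples above.
    words = []
    for idx in id_seq:
        word = vocab.idx2word[idx]
        if word == vocab.eos:
            break
        words.append(word)
    return ' '.join(words)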
Example #4
def main(args):
    # Config logging
    logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
    logger = logging.getLogger()

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Load vocabulary wrapper.
    vocab = load_vocab(args.vocab_path)

    # Build data loader
    logger.info("Building data loader...")
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)
    logger.info("Done")

    # Build the models
    logger.info("Building VQG model...")
    vqg = VQGModel(len(vocab),
                   args.max_length,
                   args.hidden_size,
                   vocab(vocab.sos),
                   vocab(vocab.eos),
                   rnn_cell=args.rnn_cell)

    logger.info("Done")

    if torch.cuda.is_available():
        vqg.cuda()

    # Loss and Optimizer
    weight = torch.ones(len(vocab))
    pad = vocab(vocab.pad)  # Set loss weight for 'pad' symbol to 0
    loss = NLLLoss(weight, pad)
    if torch.cuda.is_available():
        loss.cuda()

    # Parameters to train
    params = vqg.params_to_train()
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, questions, answers) in enumerate(data_loader):

            # Set mini-batch dataset
            images = Variable(images)
            questions = Variable(questions)
            answers = Variable(answers)
            if torch.cuda.is_available():
                images = images.cuda()
                questions = questions.cuda()
                answers = answers.cuda()

            # Forward, Backward and Optimize
            vqg.zero_grad()
            outputs, hiddens, other = vqg(images,
                                          questions,
                                          teacher_forcing_ratio=1.0)

            # Get loss
            loss.reset()
            for step, step_output in enumerate(outputs):
                batch_size = questions.size(0)
                loss.eval_batch(step_output.contiguous().view(batch_size, -1),
                                questions[:, step + 1])
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                logger.info(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f' %
                    (epoch, args.num_epochs, i, total_step, loss.get_loss()))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    vqg.state_dict(),
                    os.path.join(args.model_path,
                                 'vqg-%d-%d.pkl' % (epoch + 1, i + 1)))
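The training loss above pairs each decoder step with the question token one position ahead (questions[:, step + 1], i.e. teacher forcing) and masks out the pad symbol. For reference, a plain-PyTorch sketch of the same per-step computation using ignore_index instead of the NLLLoss wrapper; the function name and signature are illustrative:

import torch.nn.functional as F

def masked_step_loss(outputs, questions, pad_idx):
    # outputs: one (batch, vocab) log-probability tensor per decoder step.
    # questions: (batch, seq_len) gold token ids, with pad_idx marking padding.
    total = 0.0
    for step, step_output in enumerate(outputs):
        batch_size = questions.size(0)
        log_probs = step_output.contiguous().view(batch_size, -1)
        # Target is the next token; pad positions contribute no gradient.
        total = total + F.nll_loss(log_probs, questions[:, step + 1],
                                   ignore_index=pad_idx)
    return total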