def getCaption(self,
                   imgs,
                   output_path='',
                   vocab_path='data/vocab.pkl',
                   decoder_path='models/decoder-5-3000.pkl',
                   encoder_path='models/encoder-5-3000.pkl',
                   embed_size=256,
                   hidden_size=512,
                   num_layers=1):
        if (output_path == ''):
            output_path = self.DEFAULT_OUTPUT_PATH
        device = self.device
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

        # Load vocabulary wrapper
        with open(vocab_path, 'rb') as f:
            vocab = pickle.load(f)

        # Build models
        encoder = EncoderCNN(embed_size).eval(
        )  # eval mode (batchnorm uses moving mean/variance)
        decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
        encoder = encoder.to(device)
        decoder = decoder.to(device)

        # Load the trained model parameters
        encoder.load_state_dict(torch.load(encoder_path))
        decoder.load_state_dict(torch.load(decoder_path))

        CAPTIONS = []

        for img in imgs:
            # Prepare an image
            image = self.load_image(img, transform=transform)
            image_tensor = image.to(device)

            # Generate an caption from the image
            feature = encoder(image_tensor)
            sampled_ids = decoder.sample(feature)
            sampled_ids = sampled_ids[0].cpu().numpy(
            )  # (1, max_seq_length) -> (max_seq_length)

            # Convert word_ids to words
            sampled_caption = []
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                if word == '<end>':
                    break
            sentence = ' '.join(sampled_caption)

            # Print out the image and the generated caption
            CAPTIONS.append(self.prune_caption(sentence))

        json_captions = self.writeJSON(imgs, CAPTIONS, output_path=output_path)

        return json_captions
Esempio n. 2
0
def test(args):
    transform = transforms.Compose([
        transforms.ToTensor(),
    ])

    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(args.embed_size).eval()
    decoder = DecoderRNN(args.embed_size, len(vocab), args.hidden_size,
                         args.num_layers)

    # 加载训练好的模型的参数
    encoder.load_state_dict(torch.load(args.encoder_path, map_location='cpu'))
    decoder.load_state_dict(torch.load(args.decoder_path, map_location='cpu'))

    image = load_img(args.img_path, transform)

    feature = encoder(image)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()

    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    print(sentence)
    image = Image.open(args.img_path)
    plt.imshow(np.asarray(image))
    plt.show()
Esempio n. 3
0
def main(args):
    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))
    
    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, 
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers) 

    total_step = len(data_loader)

    # List to score the BLEU scores
    bleu_scores = []

    for i, (images, captions, lengths) in enumerate(data_loader):
        
        # Set mini-batch dataset
        images = images.to(device)
        # captions = captions.to(device)
        
        # Generate an caption from the image
        feature = encoder(images)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        score = sentence_bleu(captions, sentence, args.bleu_weights)
        bleu_scores.append(score)

        # Print log info
        if i % args.log_step == 0:
            print('Finish [{}/{}], Current BLEU Score: {:.4f}'
                  .format(i, total_step, np.mean(bleu_scores)))

    np.save('test_results.npy', [bleu_scores, np.mean(bleu_scores)])
Esempio n. 4
0
def main(args):
    vectore_dir = '/root/server/best_model/'

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    # encoder = EncoderCNN(args.embed_size)
    qvecs_pca = np.load(
        os.path.join(vectore_dir, "q_2{}.npy".format(args.embed_size)))
    # encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    # encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    #image = load_image(args.image, transform)
    #image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        # encoder.cuda()
        decoder.cuda()

    data = []
    # img_path = args.image
    # # Prepare Image
    # image = load_image(img_path, transform)
    # image_tensor = to_var(image, volatile=True)
    # Generate caption from image
    # feature = encoder(image_tensor)
    num = 29
    feature = torch.from_numpy(qvecs_pca[num:num + 1, :]).cuda()
    #pdb.set_trace()
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        if word == '<start>':
            continue
        if word == '<end>':
            break
        sampled_caption.append(word)
    sentence = ' '.join(sampled_caption)
    # Print out image and generated caption.
    print(sentence)
Esempio n. 5
0
def run_inference(image_path,
                  encoder_path,
                  decoder_path,
                  vocab_path,
                  embed_size=256,
                  hidden_size=512,
                  num_layers=1):
    print(f'sample.py running ... ')
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        print("using " + vocab_path)
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(
        embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(
        torch.load(encoder_path, map_location=torch.device('cpu')))
    decoder.load_state_dict(
        torch.load(decoder_path, map_location=torch.device('cpu')))

    # Prepare an image
    image = load_image(image_path, transform)
    image_tensor = image.to(device)

    # Generate an caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy(
    )  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        print(word)
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption).replace('<start>', '')
    sentence = sentence.replace('<end>', '')
    sentence = sentence.replace('_', ' ')

    # Print out the image and the generated caption
    print(sentence)

    print(f'debug: chay xong roi ne')
    return sentence.strip().capitalize()
Esempio n. 6
0
def main(args):
    # Val images folder
    filepath = '/scratch/ys2542/pytorch-tutorial/tutorials/03-advanced/image_captioning/data/resizedval2014'
    onlyfiles = [fl for fl in listdir(filepath) if isfile(join(filepath, fl))]

    # image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # load vocabulary wrapper pickle file
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(args.embed_size)  # build encoder
    encoder.eval()  # evaluation mode by moving mean and variance
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)  # build decoder

    # load the trained CNN and RNN parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Load all images in val folder
    for i in onlyfiles:

        badsize = 0  # count the unload images
        args_image = filepath + '/'  # val folder path with image names
        args_image = args_image + i

        # transform image and wrap it to tensor
        image = load_image(args_image, transform)
        image_tensor = to_var(image, volatile=True)

        if torch.cuda.is_available():  # load GPU
            encoder.cuda()
            decoder.cuda()

            # generate caption from image
            try:
                feature = encoder(image_tensor)
                sampled_ids = decoder.sample(feature)
                sampled_ids = sampled_ids.cpu().data.numpy()

                # decode word_ids to words
                sampled_caption = []
                for word_id in sampled_ids:
                    word = vocab.idx2word[word_id]
                    sampled_caption.append(word)
                    if word == '<end>':
                        break
                sentence = ' '.join(sampled_caption)

                # print out image and generated caption without start and end
                print('beam_size_1' + '\t' + i + '\t' + sentence[8:-8])

            except:
                badsize = badsize + 1  # count some wrong images
Esempio n. 7
0
def main(image):
    # Configuration for hyper-parameters
    config = Config()

    # Image Preprocessing
    transform = config.test_transform

    # Load vocabulary
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(config.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(config.embed_size, config.hidden_size, len(vocab),
                         config.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(
        torch.load(
            os.path.join(config.teacher_cnn_path, config.trained_encoder)))
    decoder.load_state_dict(
        torch.load(
            os.path.join(config.teacher_lstm_path, config.trained_decoder)))
    # Prepare Image
    image = Image.open(image)
    image_tensor = Variable(transform(image).unsqueeze(0))

    # Set initial states
    state = (Variable(torch.zeros(config.num_layers, 1, config.hidden_size)),
             Variable(torch.zeros(config.num_layers, 1, config.hidden_size)))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        state = [s.cuda() for s in state]
        image_tensor = image_tensor.cuda()

    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature, state)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word_id == 96:
            sampled_caption.append('<end>')
            break
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out image and generated caption.
    print(sentence)
    return sentence
Esempio n. 8
0
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out image and generated caption.
    sentence = sentence.replace('<start> ',
                                '').replace(' <end>', '').replace('.',
                                                                  '').strip()
    translator = Translator()
    sentence_indo = translator.translate(sentence, dest='id').text
    print('This is an image of: ' + sentence_indo)
    tts = gTTS(sentence_indo, 'id')
    tts.save('result.mp3')
    playsound('result.mp3')

    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
    plt.show()
Esempio n. 9
0
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    alexnet = models.alexnet(pretrained=True)
    alexnet2 = AlexNet2(alexnet)
    # Build Models
    encoder = EncoderCNN(4096, args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = Image.open(args.image)
    image_tensor = Variable(transform(image).unsqueeze(0))

    # Set initial states
    state = (Variable(torch.zeros(args.num_layers, 1, args.hidden_size)),
             Variable(torch.zeros(args.num_layers, 1, args.hidden_size)))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        alexnet2.cuda()
        state = [s.cuda() for s in state]
        image_tensor = image_tensor.cuda()

    # Generate caption from image
    alexnet2(image_tensor)
    feature = encoder(alexnet2.fc7_value)
    sampled_ids = decoder.sample(feature, state)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out image and generated caption.
    print(sentence)
Esempio n. 10
0
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    #image = load_image(args.image, transform)
    #image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    data = []
    try:
        img_path = args.image
        # Prepare Image
        image = load_image(img_path, transform)
        image_tensor = to_var(image, volatile=True)
        # Generate caption from image
        feature = encoder(image_tensor)
        #pdb.set_trace()
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids.cpu().data.numpy()

        # Decode word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            if word == '<start>':
                continue
            if word == '<end>':
                break
            sampled_caption.append(word)
        sentence = ' '.join(sampled_caption)
        # Print out image and generated caption.
        print(sentence)
        data.append({'key': img_path.split('/')[-1], 'sentence': sentence})
    except:
        print(img_path)
Esempio n. 11
0
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)
    

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))
    
    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    
    # Prepare Image
    image_dir = args.image
    images = os.listdir(image_dir)
    for image_id in images:
        if not image_id.endswith('.jpg'):
            continue
        image = os.path.join(image_dir, image_id)
        image = load_image(image, transform)
        image_tensor = to_var(image, volatile=True)
        
        # Generate caption from image
        try:
            feature, cnn_features = encoder(image_tensor)
            sampled_ids = decoder.sample(feature, cnn_features)
            sampled_ids = sampled_ids.cpu().data.numpy()
        except:
            continue
        
        # Decode word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)
        
        # Print out image and generated caption.
        print (image_id + '\t' + sentence)
Esempio n. 12
0
def main(args):
    # Image preprocessing
    prediction = []
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    dirname=''

    fnames = listdir(os.getcwd)
    #with open(dirname)
    for fname in fnames:
        
        #print(fname)
        # Prepare an image
        image = load_image(''+fname, transform)
        image_tensor = image.to(device)
        
        # Generate an caption from the image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)
        
        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)
        current_pred = [fname, sentence]
        predictions.append(current_pred)
        # Print out the image and the generated caption
        print(fname)
        print (sentence)
        #image = Image.open(args.image)
        #plt.imshow(np.asarray(image))

    df = pd.DataFrame(predictions, columns=['File Name', 'Caption'])
    df.to_excel('output.xls')
Esempio n. 13
0
class Neuraltalk2:

  def __init__(self):
    print("Defining I.A")
    # Device configuration
    self.device = torch.device('cpu')

    #vars
    embed_size = 256
    hidden_size = 512
    num_layers = 1
    encoder_path = 'models/encoder-5-3000.pkl'
    decoder_path = 'models/decoder-5-3000.pkl'
    vocab_path = 'data/vocab.pkl'

    # Image preprocessing
    self.transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    with open(vocab_path, 'rb') as f:
      self.vocab = pickle.load(f)

    print("Building Model")
    # Build models
    self.encoder = EncoderCNN(embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    self.decoder = DecoderRNN(embed_size, hidden_size, len(self.vocab), num_layers)
    self.encoder = self.encoder.to(self.device)
    self.decoder = self.decoder.to(self.device)

    print("loading checkpoint")
    # Load the trained model parameters
    self.encoder.load_state_dict(torch.load(encoder_path))
    self.decoder.load_state_dict(torch.load(decoder_path))

  def eval_image(self, image_path):
    # Prepare an image
    image = load_image(image_path, self.transform)
    image_tensor = image.to(self.device)
    
    # Generate an caption from the image
    feature = self.encoder(image_tensor)
    sampled_ids = self.decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()
    
    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
      word = self.vocab.idx2word[word_id]
      if word == '<end>':
        break
      if word == '<start>':
        continue
      sampled_caption.append(word)
        
    sentence = ' '.join(sampled_caption)
    return sentence
Esempio n. 14
0
def main(args):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    vocab = Vocabulary.load_vocab(args['data_dir'])
    args['vocab_size'] = len(vocab)
    encoder = EncoderCNN(args).eval()
    decoder = DecoderRNN(args)
    encoder.to(device)
    decoder.to(device)

    encoder.load_state_dict(
        torch.load(os.path.join(args['model_dir'], args['encoder_name'])))
    decoder.load_state_dict(
        torch.load(os.path.join(args['model_dir'], args['decoder_name'])))

    test_caption_list = []
    for file_name in os.listdir(
            os.path.join(args['data_dir'], args['image_dir'])):
        if os.path.isfile(
                os.path.join(args['data_dir'], args['image_dir'], file_name)):
            image = load_image(
                os.path.join(args['data_dir'], args['image_dir'], file_name),
                transform)
            image_tensor = image.to(device)
        else:
            continue

        feature = encoder(image_tensor)
        sample_ids = decoder.sample(feature)
        sample_ids = sample_ids[0].cpu().numpy()

        sample_caption = []
        for word_id in sample_ids:
            word = vocab.idx2word[word_id]
            sample_caption.append(word)
            if word == '<end>':
                break

        sentence = ' '.join(sample_caption)
        print(sentence)
        test_caption_list.append((file_name, sentence))


#        image=Image.open(os.path.join(args['data_dir'],args['image_dir'],file_name))
#        plt.imshow(np.asarray(image))

    with open(os.path.join(args['data_dir'], 'test_caption.txt'), 'w') as f:
        for item in test_caption_list:
            f.write('image_name:{} ---- generated_caption:{}\n'.format(
                item[0], item[1]))
            f.write('\n')
Esempio n. 15
0
    def predict(self, args):
        print('predict..start')
        device = torch.device('cpu')

        # Image preprocessing
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

        # Load vocabulary wrapper
        # Load vocabulary wrapper
        #vocab = Vocabulary()

        with open(args.vocab_path, 'rb') as f:
            vocab = pickle.load(f)

        # Build models
        encoder = EncoderCNN(args.embed_size).eval(
        )  # eval mode (batchnorm uses moving mean/variance)
        decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                             args.num_layers)
        encoder = encoder.to(device)
        decoder = decoder.to(device)

        # Load the trained model parameters
        encoder.load_state_dict(
            torch.load(args.encoder_path, map_location=device))
        decoder.load_state_dict(
            torch.load(args.decoder_path, map_location=device))

        # Prepare an image
        image = self.load_image(args.image, transform)
        image_tensor = image.to(device)

        # Generate an caption from the image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy(
        )  # (1, max_seq_length) -> (max_seq_length)

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        # Print out the image and the generated caption
        print(sentence)
        return sentence
Esempio n. 16
0
def main2(image,
          encoder_path='models/encoder-5-3000.pkl',
          decoder_path='models/decoder-5-3000.pkl',
          vocab_path="data/vocab.pkl",
          embed_size=256,
          hidden_size=512,
          num_layers=1):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(
        embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(encoder_path))
    decoder.load_state_dict(torch.load(decoder_path))

    # Prepare an image
    image = load_image(image, transform)
    image_tensor = image.to(device)

    # Generate an caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy(
    )  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the image and the generated caption
    print(sentence)
    #image = Image.open(args.image)
    #plt.imshow(np.asarray(image))
    return sentence
Esempio n. 17
0
def main():
    st.title('Image Captioning App')
    st.markdown(STYLE, unsafe_allow_html=True)

    file = st.file_uploader("Upload file", type=["png", "jpg", "jpeg"])
    show_file = st.empty()

    if not file:
        show_file.info("Please upload a file of type: " +
                       ", ".join(["png", "jpg", "jpeg"]))
        return

    content = file.getvalue()

    show_file.image(file)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder_file = 'encoder-5-batch-128-hidden-256-epochs-5.pkl'
    decoder_file = 'decoder-5-batch-128-hidden-256-epochs-5.pkl'

    embed_size = 300
    hidden_size = 256

    vocab_size, word2idx, idx2word = get_vocab()

    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    encoder.load_state_dict(torch.load(os.path.join('./models', encoder_file)))
    decoder.load_state_dict(torch.load(os.path.join('./models', decoder_file)))

    encoder.to(device)
    decoder.to(device)

    transform_test = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    PIL_image = Image.open(file).convert('RGB')
    orig_image = np.array(PIL_image)
    image = transform_test(PIL_image)
    image = image.to(device).unsqueeze(0)
    features = encoder(image).unsqueeze(1)
    output = decoder.sample(features)

    sentence = clean_sentence(output, idx2word)
    st.info("Generated caption --> " + sentence)

    file.close()
Esempio n. 18
0
def sample(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval(
    )  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    dissection.replace_layers(encoder, [
        ('resnet.7.2.bn3', 'final_layer'),
    ])
    vec = torch.zeros(2048).to(device)
    vec[0] = 100
    encoder.replacement['final_layer'] = vec

    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate an caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy(
    )  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the image and the generated caption
    print(sentence)
Esempio n. 19
0
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    bar = Bar('Processing', max=100)
    for i in range(100):
        bar.next()

# Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids.cpu().data.numpy()
    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    bar.finish()
    # Print out image and generated caption.
    print("\n")
    print(sentence)
    image = Image.open(args.image)
    imgplot = plt.imshow(np.asarray(image))
    plt.show()
Esempio n. 20
0
def main(cfg):

    # print(cfg.pretty())
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(cfg.image.mean, cfg.image.std)
    ])
    print(hydra.utils.to_absolute_path(cfg.train.vocab_path))
    with open(hydra.utils.to_absolute_path(cfg.train.vocab_path), 'rb') as f:
        vocab = pickle.load(f)

    # モデルの構築
    encoder = EncoderCNN(cfg.train.embed_size).eval()
    decoder = DecoderRNN(cfg.train.embed_size, cfg.train.hidden_size,
                         len(vocab), cfg.train.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # 学習済みモデルのパラメータを読み込む
    encoder.load_state_dict(
        torch.load(hydra.utils.to_absolute_path(cfg.train.encoder_path)))
    decoder.load_state_dict(
        torch.load(hydra.utils.to_absolute_path(cfg.train.decoder_path)))

    with open(json_dir, encoding='utf-8') as f:
        data = json.loads(f.read())

        for key in data['images']:
            img_file_name = key['file_name']
            img_file_path = base_dir + '/data/val2014/' + img_file_name

            # 画像の準備
            image = load_image(hydra.utils.to_absolute_path(img_file_path),
                               transform)
            image_tensor = image.to(device)

            # 入力した画像からキャプションを生成する
            feature = encoder(image_tensor)
            sampled_ids = decoder.sample(feature)
            sampled_ids = sampled_ids[0].cpu().numpy()

            # word_idsをwordに変換する
            sampled_caption = []
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                if word == '<end>':
                    break
            # <start>,<end>,"."を取り除いて処理を行う
            sentence = ' '.join(sampled_caption[1:-2])

            print(sentence)
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval(
    )  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate an caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy(
    )  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sampled_caption.remove('<end>')
    sampled_caption.remove('<start>')
    sentence = ' '.join(sampled_caption)
    # Print out the image and the generated caption
    print(sentence)
    f = open("demofile3.txt", "w")
    f.truncate(0)
    f.write(sentence)
    f.close()
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
Esempio n. 22
0
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        #transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        image_tensor = image_tensor.cuda()
    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature, args.length)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        if word != '<start>' and word != '<end>':
            sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ''.join(sampled_caption)

    # Print out image and generated caption.
    print(sentence)
Esempio n. 23
0
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)
    

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)
    
    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    
    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids.cpu().data.numpy()
    
    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    
    # Print out image and generated caption.
    print (sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
def main(args):

    # Build models
    encoder = EncoderCNN(args.embed_size).eval(
    )  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(
        args.encoder_path))  #path for dumbing output of encoder model
    decoder.load_state_dict(torch.load(
        args.decoder_path))  #path for dumbing output of decoder model

    test_data = generate_training_data(5)
    textures_test = generate_textures(test_data)
    transforms_test = generate_transforms(test_data)
    for i in range(len(textures_test)):
        plt.imsave('predictions/texture4_0%i.png' % i,
                   textures_test[i],
                   cmap="gray")

    print(transforms_test)
    predicted_progs = []

    for texture in textures_test:
        texture = torch.tensor(texture, device=device)
        texture = texture.unsqueeze(0)
        texture = texture.unsqueeze(
            0)  #for EncoderCNN ought to unsqueeze twice
        feature = encoder(texture)
        sampled_seq = decoder.sample(feature)
        sampled_seq = sampled_seq[0].cpu().numpy(
        )  # (1, max_seq_length) -> (max_seq_length)

        # Convert sampled sequence of transforms to words
        prog = []
        for int_word in sampled_seq:
            word = int_to_word(int_word)
            prog.append(word)
            if word == '<end>':
                break
        trans_seq = '-->'.join(prog)
        predicted_progs.append([trans_seq])

    # Print out the sequence of generated transform sequences
    print(predicted_progs)
Esempio n. 25
0
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    '''
    image = load_image(args.image, transform)
    image_tensor = image.to(device)
    '''
    data_loader, _ = get_loader(transforms=False)
    inp, targets = next(iter(data_loader))
    audio = inp_transform(inp)
    audio = audio.to(device)
    
    # Generate an caption from the image
    feature = encoder(audio)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)
    
    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    
    # Print out the image and the generated caption
    print("Logits : {}\nTarget : {}".format(sentence, targets))
    '''
Esempio n. 26
0
        def generatecaption(image):
            # Image preprocessing
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406),
                                     (0.229, 0.224, 0.225))])

            # Load vocabulary wrapper
            with open('/root/ImageCaptioning/data/vocab.pkl', 'rb') as f:
                vocab = pickle.load(f)

            # Build models
            encoder = EncoderCNN(256).eval()  # eval mode (batchnorm uses moving mean/variance)
            decoder = DecoderRNN(256, 512, len(vocab), 1)
            encoder = encoder.to(device)
            decoder = decoder.to(device)

            # Load the trained model parameters
            encoder.load_state_dict(torch.load('models/encoder-5-3000.pkl', map_location='cpu'))
            decoder.load_state_dict(torch.load('models/decoder-5-3000.pkl', map_location='cpu'))

            encoder.eval()
            decoder.eval()
            # Prepare an image
            image = load_image(image, transform)
            image_tensor = image.to(device)

            # Generate an caption from the image
            feature = encoder(image_tensor)
            sampled_ids = decoder.sample(feature)
            sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)

            # Convert word_ids to words
            sampled_caption = []
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                if word == '<end>':
                    break
            self.sentence = ' '.join(sampled_caption)

            # Print out the image and the generated caption


            self.Entry1.delete(0, END)
            self.Entry1.insert(0,self.sentence[7:-5])
Esempio n. 27
0
def img2txt(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args['vocab_path'], 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args['embed_size']).eval(
    )  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args['embed_size'], args['hidden_size'], len(vocab),
                         args['num_layers'])
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args['encoder_path']))
    decoder.load_state_dict(torch.load(args['decoder_path']))

    # Prepare an image
    image = load_image(args['image'], transform)
    image_tensor = image.to(device)

    # Generate an caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy(
    )  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption).lstrip('<start>').rstrip(
        '<end>').strip()

    # Print out the image and the generated caption
    print(sentence)
    return f'[这个图翻译如下]:\n{sentence}\n{translate(sentence)}'
Esempio n. 28
0
def main(args):
    # Goruntu on isleme
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Kelime sarmalayiciyi yukle
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Model olusturma
    encoder = EncoderCNN(args.embed_size).eval(
    )  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Egitilmis model parametrelerini yukleyin
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Bir goruntu hazirla
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Resimden bir resim yazisi olusturun
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy(
    )  # (1, max_seq_length) -> (max_seq_length)

    # Word_ids kelimelerini kelimelere donustur
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Resmi ve olusturulan basligi yazdirin
    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
Esempio n. 29
0
def main(args):

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval(
    )  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    images = glob.glob('../data/flickr/train/*.jpg')
    for i in images:
        # Prepare an image
        image = load_image(i, transform)
        image_tensor = image.to(device)

        # Generate an caption from the image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy(
        )  # (1, max_seq_length) -> (max_seq_length)

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        print("{} : {}".format(i[21:], sentence))
Esempio n. 30
0
def get_text_caption(image):

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(
        args.embed_size, args.model_type,
        args.mode)  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(
        torch.load(args.model_path + "_" + args.model_type + "/encoder.pt"))
    encoder.eval()
    decoder.load_state_dict(
        torch.load(args.model_path + "_" + args.model_type + "/decoder.pt"))
    decoder.eval()

    # Prepare an image
    image_tensor = image.to(device)

    # Generate an caption from the image
    feature = encoder(image_tensor)

    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy(
    )  # (1, max_seq_length) -> (max_seq_length)
    print(sampled_ids)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    return (sentence.split("<start> ")[1].split(" <end>")[0]
            [:-2].capitalize().replace(" , ", ", "))
    def get_caption(self, img_tensor):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(device)
        print("running")

        # Models
        encoder_file = 'legit_model/encoder_1.pkl'
        decoder_file = 'legit_model/decoder_1.pkl'

        # Embed and hidden
        embed_size = 512
        hidden_size = 512

        # The size of the vocabulary.
        vocab_size = 8856

        # Initialize the encoder and decoder, and set each to inference mode.
        encoder = EncoderCNN(embed_size)
        encoder.eval()

        decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
        decoder.eval()

        # Load the trained weights.
        encoder.load_state_dict(
            torch.load(os.path.join('./models', encoder_file)))
        decoder.load_state_dict(
            torch.load(os.path.join('./models', decoder_file)))

        # Move models to GPU if CUDA is available.
        encoder.to(device)
        decoder.to(device)

        img_d = img_tensor.to(device)

        # Obtain the embedded image features.
        features = encoder(img_d).unsqueeze(1)

        # Pass the embedded image features through the model to get a predicted caption.
        img_output = decoder.sample(features)

        sentence = self.clean_sentence(img_output)

        return sentence
Esempio n. 32
0
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)
    
    # Generate an caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)
    
    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    
    # Print out the image and the generated caption
    print (sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
Esempio n. 33
0
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

device = torch.device("cpu")
# encoder.to(device)
# decoder.to(device)

# Load the pretrained model
encoder.load_state_dict(torch.load(PRETRAINED_MODEL_PATH.format('encoder')))
decoder.load_state_dict(torch.load(PRETRAINED_MODEL_PATH.format('decoder')))

encoder.eval()
decoder.eval()

images, conv_images = next(iter(data_loader))
features = encoder(conv_images).unsqueeze(1)
output = decoder.sample(features, max_len=max_len)

word_list = []
for word_idx in output:
    if word_idx == vocab.word2idx[vocab.start_word]:
        continue
    if word_idx == vocab.word2idx[vocab.end_word]:
        break
    word_list.append(vocab.idx2word[word_idx])

print(' '.join(word_list))
plt.imshow(np.squeeze(images))
# plt.title(' '.join(word_list))
# plt.axis([])
plt.show()