Ejemplo n.º 1
0
def main(args):
    """Run the captioning test routine and optionally store COCO scores.

    Args:
        args: Parsed options. This function reads ``encoder`` and ``decoder``
            (checkpoint paths), ``num_samples``, ``num_hints``, ``debug``,
            ``c_step`` and ``no_avg`` (forwarded to ``test``), ``msm``
            (metric selector) and ``filepath`` (pickle output path).
    """
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(256)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(256, 512, len(vocab), 1)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder))
    decoder.load_state_dict(torch.load(args.decoder))

    # The value returned by test() is not consumed by this function.
    test(encoder, decoder, vocab, args.num_samples, args.num_hints,
         args.debug, args.c_step, args.no_avg)

    if args.msm == "co":
        scores = cocoEval()
        scores_u = cocoEval(res='data/captions_val2014_results_u.json')
        print(scores)
        print(scores_u)

        # pickle writes bytes, so the target must be opened in binary mode;
        # a text-mode handle raises TypeError on Python 3.
        with open(args.filepath, 'wb') as f:
            pickle.dump((scores, scores_u), f)
Ejemplo n.º 2
0
def main():
    """Fine-tune the caption encoder towards the frozen image embedding.

    Configuration comes from module-level names (``vocab_path``,
    ``embed_dim``, ``num_layers_rnn``, ``image_data_file``,
    ``image_feature_file``). Both networks are initialised from
    'searchimage.pkl' / 'searchword.pkl' and written back to those files
    after every epoch.
    """
    use_cuda = torch.cuda.is_available()

    # Load vocabulary wrapper (pickle data is binary, so open with 'rb').
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(4096, embed_dim)
    encoder.load_state_dict(torch.load('searchimage.pkl'))
    for p in encoder.parameters():
        p.requires_grad = False  # the image branch is not trained here

    word_encoder = EncoderRNN(embed_dim, embed_dim, len(vocab), num_layers_rnn)
    word_encoder.load_state_dict(torch.load('searchword.pkl'))
    if use_cuda:
        encoder.cuda()
        word_encoder.cuda()

    # Only the caption encoder's parameters are handed to the optimizer.
    params = list(word_encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=2e-6, weight_decay=0.001)

    # Load data
    with open(image_data_file, 'rb') as f:
        image_data = pickle.load(f)
    image_features = si.loadmat(image_feature_file)
    img_features = np.concatenate(image_features['fc7'][0])

    print('here')
    iteration = 0

    for i in range(10):  # epoch
        use_caption = i % 5
        print('Epoch %s' % i)
        losses = []
        for x, y in make_mini_batch(img_features,
                                    image_data,
                                    use_caption=use_caption):
            encoder.zero_grad()
            word_encoder.zero_grad()

            word_padding, lengths = make_word_padding(y, vocab)
            x = torch.from_numpy(x)
            word_index = torch.from_numpy(word_padding)
            # Follow the models: the batch goes to the GPU only when the
            # networks were moved there, so CPU-only hosts do not crash.
            if use_cuda:
                x = x.cuda()
                word_index = word_index.cuda()
            x = Variable(x)
            word_index = Variable(word_index)

            features = encoder(x)
            outputs = word_encoder(word_index, lengths)
            loss = torch.mean((features - outputs).pow(2))
            loss.backward()
            optimizer.step()
            # NOTE(review): ``loss.data[0]`` is the pre-0.4 PyTorch idiom;
            # newer releases need ``loss.item()`` - confirm target version.
            losses.append(loss.data[0])
            if iteration % 100 == 0:
                # Mean loss since the previous report, then start over.
                print('loss %s' % (sum(losses) / float(len(losses))))
                losses = []

            iteration += 1

        torch.save(word_encoder.state_dict(), 'searchword.pkl')
        torch.save(encoder.state_dict(), 'searchimage.pkl')
Ejemplo n.º 3
0
def test(args):
    """Caption a single image, print the caption and display the image.

    Reads ``vocab_path``, ``embed_size``, ``hidden_size``, ``num_layers``,
    ``encoder_path``, ``decoder_path`` and ``img_path`` from ``args``.
    """
    to_tensor = transforms.Compose([
        transforms.ToTensor(),
    ])

    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    cnn = EncoderCNN(args.embed_size).eval()
    rnn = DecoderRNN(args.embed_size, len(vocab), args.hidden_size,
                     args.num_layers)

    # Load the trained model parameters (checkpoints are mapped to the CPU).
    cnn.load_state_dict(torch.load(args.encoder_path, map_location='cpu'))
    rnn.load_state_dict(torch.load(args.decoder_path, map_location='cpu'))

    # Encode the image, then let the decoder sample a sequence of word ids.
    picture = load_img(args.img_path, to_tensor)
    token_ids = rnn.sample(cnn(picture))[0].cpu().numpy()

    # Translate ids into words, stopping right after the '<end>' marker.
    words = []
    for token_id in token_ids:
        words.append(vocab.idx2word[token_id])
        if words[-1] == '<end>':
            break
    print(' '.join(words))

    # Show the image that was captioned.
    plt.imshow(np.asarray(Image.open(args.img_path)))
    plt.show()
    def getCaption(self,
                   imgs,
                   output_path='',
                   vocab_path='data/vocab.pkl',
                   decoder_path='models/decoder-5-3000.pkl',
                   encoder_path='models/encoder-5-3000.pkl',
                   embed_size=256,
                   hidden_size=512,
                   num_layers=1):
        """Caption every image in ``imgs`` and pass the result to ``writeJSON``.

        Args:
            imgs: Iterable of images; each item is handed to
                ``self.load_image``.
            output_path: Destination forwarded to ``self.writeJSON``; an empty
                string selects ``self.DEFAULT_OUTPUT_PATH``.
            vocab_path: Pickled vocabulary file.
            decoder_path: Decoder checkpoint file.
            encoder_path: Encoder checkpoint file.
            embed_size: Embedding size passed to both models.
            hidden_size: Hidden size passed to the decoder.
            num_layers: Layer count passed to the decoder.

        Returns:
            The value returned by ``self.writeJSON``.
        """
        if output_path == '':
            output_path = self.DEFAULT_OUTPUT_PATH
        device = self.device

        preprocess = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

        # Vocabulary wrapper
        with open(vocab_path, 'rb') as vocab_file:
            vocab = pickle.load(vocab_file)

        # Models: the encoder is put in eval mode, both go to self.device
        cnn = EncoderCNN(embed_size).eval()
        rnn = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
        cnn = cnn.to(device)
        rnn = rnn.to(device)

        # Trained parameters
        cnn.load_state_dict(torch.load(encoder_path))
        rnn.load_state_dict(torch.load(decoder_path))

        captions = []
        for img in imgs:
            batch = self.load_image(img, transform=preprocess).to(device)

            # Keep only the first sampled sequence
            token_ids = rnn.sample(cnn(batch))[0].cpu().numpy()

            # Collect words up to and including the '<end>' marker
            words = []
            for token_id in token_ids:
                words.append(vocab.idx2word[token_id])
                if words[-1] == '<end>':
                    break

            captions.append(self.prune_caption(' '.join(words)))

        return self.writeJSON(imgs, captions, output_path=output_path)
Ejemplo n.º 5
0
def main(args):
    """Score a trained captioning model with BLEU over a data loader.

    For every batch, the caption sampled for the first image is compared with
    that image's reference caption; the per-batch scores and their mean are
    saved to 'test_results.npy'.

    Args:
        args: Parsed options. Reads ``vocab_path``, ``embed_size``,
            ``hidden_size``, ``num_layers``, ``encoder_path``,
            ``decoder_path``, ``image_dir``, ``caption_path``, ``batch_size``,
            ``num_workers``, ``bleu_weights`` and ``log_step``.
    """
    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    total_step = len(data_loader)

    # List to store the BLEU scores
    bleu_scores = []

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        for i, (images, captions, lengths) in enumerate(data_loader):
            images = images.to(device)

            # Generate a caption; only the first sequence of the batch is used
            feature = encoder(images)
            sampled_ids = decoder.sample(feature)
            sampled_ids = sampled_ids[0].cpu().numpy()

            # Convert word ids to a token list ending at '<end>'
            hypothesis = []
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                hypothesis.append(word)
                if word == '<end>':
                    break

            # sentence_bleu expects a list of reference token lists and a
            # hypothesis token list, not an id tensor and a raw string.
            # Assumes captions[0] is the padded id sequence of images[0] and
            # lengths[0] its true length - TODO confirm against get_loader.
            reference = [vocab.idx2word[int(word_id)]
                         for word_id in captions[0][:int(lengths[0])]]
            score = sentence_bleu([reference], hypothesis, args.bleu_weights)
            bleu_scores.append(score)

            # Print log info
            if i % args.log_step == 0:
                print('Finish [{}/{}], Current BLEU Score: {:.4f}'
                      .format(i, total_step, np.mean(bleu_scores)))

    # [list, scalar] is ragged; build the object array explicitly because
    # recent NumPy refuses to infer one from a nested sequence.
    results = np.empty(2, dtype=object)
    results[0] = bleu_scores
    results[1] = np.mean(bleu_scores)
    np.save('test_results.npy', results)
Ejemplo n.º 6
0
def run_inference(image_path,
                  encoder_path,
                  decoder_path,
                  vocab_path,
                  embed_size=256,
                  hidden_size=512,
                  num_layers=1):
    """Generate a caption for one image and return it as a tidy sentence.

    Args:
        image_path: Image file handed to ``load_image``.
        encoder_path: Encoder checkpoint file.
        decoder_path: Decoder checkpoint file.
        vocab_path: Pickled vocabulary file.
        embed_size: Embedding size passed to both models.
        hidden_size: Hidden size passed to the decoder.
        num_layers: Layer count passed to the decoder.

    Returns:
        The caption with the '<start>' / '<end>' markers removed, underscores
        turned into spaces, surrounding whitespace stripped and the first
        letter capitalised.
    """
    print('sample.py running ... ')

    # Image preprocessing
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Vocabulary wrapper
    with open(vocab_path, 'rb') as vocab_file:
        print("using " + vocab_path)
        vocab = pickle.load(vocab_file)

    # Models: encoder in eval mode, both moved to the module-level device
    cnn = EncoderCNN(embed_size).eval()
    rnn = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
    cnn = cnn.to(device)
    rnn = rnn.to(device)

    # Trained parameters, mapped to the CPU while loading
    cpu = torch.device('cpu')
    cnn.load_state_dict(torch.load(encoder_path, map_location=cpu))
    rnn.load_state_dict(torch.load(decoder_path, map_location=cpu))

    # Encode the image and sample word ids; keep the first sequence
    batch = load_image(image_path, transform).to(device) if False else \
        load_image(image_path, preprocess).to(device)
    token_ids = rnn.sample(cnn(batch))[0].cpu().numpy()

    # Ids -> words (each one echoed), up to and including '<end>'
    words = []
    for token_id in token_ids:
        token = vocab.idx2word[token_id]
        print(token)
        words.append(token)
        if token == '<end>':
            break

    sentence = ' '.join(words)
    for marker, replacement in (('<start>', ''), ('<end>', ''), ('_', ' ')):
        sentence = sentence.replace(marker, replacement)

    # Print out the generated caption
    print(sentence)

    print('debug: chay xong roi ne')
    return sentence.strip().capitalize()
Ejemplo n.º 7
0
def main(args):
    """Caption every file in the validation folder, one output line per image.

    Each line has the form ``beam_size_1<TAB>file name<TAB>caption``. Images
    whose captioning fails are skipped and counted; the count is reported on
    stderr so the tab-separated stdout stays clean.

    Args:
        args: Parsed options. Reads ``vocab_path``, ``embed_size``,
            ``hidden_size``, ``num_layers``, ``encoder_path`` and
            ``decoder_path``.
    """
    import sys  # local import: only used for the skipped-image summary

    # Val images folder
    filepath = '/scratch/ys2542/pytorch-tutorial/tutorials/03-advanced/image_captioning/data/resizedval2014'
    onlyfiles = [fl for fl in listdir(filepath) if isfile(join(filepath, fl))]

    # image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # load vocabulary wrapper pickle file
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(args.embed_size)  # build encoder
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)  # build decoder

    # load the trained CNN and RNN parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Move the models once, not once per image.
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    badsize = 0  # images that could not be captioned (kept across the loop)
    for i in onlyfiles:
        # transform image and wrap it to tensor
        image = load_image(join(filepath, i), transform)
        image_tensor = to_var(image, volatile=True)

        # Captioning runs whether or not a GPU is present.
        try:
            feature = encoder(image_tensor)
            sampled_ids = decoder.sample(feature)
            sampled_ids = sampled_ids.cpu().data.numpy()

            # decode word_ids to words
            sampled_caption = []
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                if word == '<end>':
                    break
            sentence = ' '.join(sampled_caption)

            # print image name and caption; the slice drops the leading
            # 8 and trailing 8 characters of the joined sentence
            print('beam_size_1' + '\t' + i + '\t' + sentence[8:-8])
        except Exception:
            # Best effort per image, but Ctrl-C / SystemExit still propagate.
            badsize = badsize + 1

    if badsize:
        sys.stderr.write('skipped %d image(s) that could not be captioned\n'
                         % badsize)
Ejemplo n.º 8
0
def main(image):
    """Caption the image file ``image`` using the settings in ``Config``.

    Args:
        image: Path (or file object) accepted by ``Image.open``.

    Returns:
        The generated caption; it is also printed.
    """
    # Hyper-parameters and preprocessing come from the project Config
    config = Config()
    transform = config.test_transform

    # Vocabulary
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # Models
    encoder = EncoderCNN(config.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(config.embed_size, config.hidden_size, len(vocab),
                         config.num_layers)

    # Trained parameters
    encoder_ckpt = os.path.join(config.teacher_cnn_path, config.trained_encoder)
    encoder.load_state_dict(torch.load(encoder_ckpt))
    decoder_ckpt = os.path.join(config.teacher_lstm_path,
                                config.trained_decoder)
    decoder.load_state_dict(torch.load(decoder_ckpt))

    # Image -> batch of one
    picture = Image.open(image)
    image_tensor = Variable(transform(picture).unsqueeze(0))

    # Zero-filled initial states
    state_shape = (config.num_layers, 1, config.hidden_size)
    state = (Variable(torch.zeros(*state_shape)),
             Variable(torch.zeros(*state_shape)))

    # Use the GPU when there is one
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        state = [s.cuda() for s in state]
        image_tensor = image_tensor.cuda()

    # Sample word ids for the encoded image
    token_ids = decoder.sample(encoder(image_tensor), state)
    token_ids = token_ids.cpu().data.numpy()

    # Ids -> words. Id 96 also terminates the caption: its word is kept and
    # an explicit '<end>' is appended after it.
    words = []
    for token_id in token_ids:
        token = vocab.idx2word[token_id]
        words.append(token)
        if token_id == 96:
            words.append('<end>')
        if token_id == 96 or token == '<end>':
            break

    sentence = ' '.join(words)
    print(sentence)
    return sentence
Ejemplo n.º 9
0
def inference_coco(encoder_file: str, decoder_file: str, embed_size: int,
                   hidden_size: int, from_cpu: bool) -> None:
    """
    Loads a trained encoder/decoder pair and runs ``get_prediction`` on the
    test-mode data loader.

    encoder_file:   File name of the encoder weights inside ./models.
    decoder_file:   File name of the decoder weights inside ./models.
    embed_size:     Embedding size passed to the encoder and decoder.
    hidden_size:    Hidden size passed to the decoder.
    from_cpu:       When true, checkpoints are loaded with map_location='cpu'.
    """
    # Test-time transform
    preprocess = transforms.Compose([
        transforms.Resize(256),  # smaller edge of image resized to 256
        transforms.RandomCrop(224),  # 224x224 crop from a random location
        transforms.ToTensor(),  # PIL Image -> tensor
        transforms.Normalize(
            (0.485, 0.456, 0.406),  # normalize image for pre-trained model
            (0.229, 0.224, 0.225))
    ])

    # Device used for inference
    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Data loader; one batch is drawn from it here, as before
    loader = get_loader(transform=preprocess, mode='test')
    _, _sample = next(iter(loader))

    # Models, both in inference mode
    cnn = EncoderCNN(embed_size)
    cnn.eval()
    rnn = DecoderRNN(embed_size, hidden_size, len(loader.dataset.vocab))
    rnn.eval()

    # Trained weights; None is torch.load's default map_location
    location = 'cpu' if from_cpu else None
    for model, file_name in ((cnn, encoder_file), (rnn, decoder_file)):
        checkpoint = os.path.join('./models', file_name)
        model.load_state_dict(torch.load(checkpoint, map_location=location))

    # Move models to the selected device
    cnn.to(target)
    rnn.to(target)

    get_prediction(cnn, rnn, loader, target)
Ejemplo n.º 10
0
def main(args):
    """Caption an image, translate the caption to Indonesian and speak it.

    The translated caption is printed, synthesised to 'result.mp3', played
    back, and finally the image itself is displayed.

    Args:
        args: Parsed options. Reads ``vocab_path``, ``embed_size``,
            ``hidden_size``, ``num_layers``, ``encoder_path``,
            ``decoder_path`` and ``image``.
    """
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Drop the sequence markers and full stops before translating
    sentence = (sentence.replace('<start> ', '')
                .replace(' <end>', '')
                .replace('.', '')
                .strip())
    translator = Translator()
    sentence_indo = translator.translate(sentence, dest='id').text
    print('This is an image of: ' + sentence_indo)

    # Pass the language by keyword: the second positional parameter of gTTS
    # is ``tld`` in current releases, so a positional 'id' would not select
    # the Indonesian voice there.
    tts = gTTS(sentence_indo, lang='id')
    tts.save('result.mp3')
    playsound('result.mp3')

    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
    plt.show()
Ejemplo n.º 11
0
def main(args):
    """Caption one image using AlexNet fc7 features as the encoder input.

    Args:
        args: Parsed options. Reads ``crop_size``, ``vocab_path``,
            ``embed_size``, ``hidden_size``, ``num_layers``,
            ``encoder_path``, ``decoder_path`` and ``image``.
    """
    # Image preprocessing
    # NOTE(review): transforms.Scale is the legacy name of transforms.Resize;
    # kept as-is for the torchvision version this script targets.
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    alexnet = models.alexnet(pretrained=True)
    # A freshly built module is in training mode, which keeps the Dropout
    # layers of AlexNet's classifier active; switch to eval so the extracted
    # fc7 features are deterministic at inference time.
    alexnet.eval()
    alexnet2 = AlexNet2(alexnet)

    # Build Models
    encoder = EncoderCNN(4096, args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = Image.open(args.image)
    image_tensor = Variable(transform(image).unsqueeze(0))

    # Set initial states
    state = (Variable(torch.zeros(args.num_layers, 1, args.hidden_size)),
             Variable(torch.zeros(args.num_layers, 1, args.hidden_size)))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        alexnet2.cuda()
        state = [s.cuda() for s in state]
        image_tensor = image_tensor.cuda()

    # Generate caption from image: the forward pass is run for its side
    # effect of populating ``alexnet2.fc7_value``, which feeds the encoder.
    alexnet2(image_tensor)
    feature = encoder(alexnet2.fc7_value)
    sampled_ids = decoder.sample(feature, state)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the generated caption.
    print(sentence)
Ejemplo n.º 12
0
def main(args):
    """Caption ``args.image`` and print the caption without sequence markers.

    If anything fails while loading or captioning the image, the image path
    is printed instead of a caption.

    Args:
        args: Parsed options. Reads ``vocab_path``, ``embed_size``,
            ``hidden_size``, ``num_layers``, ``encoder_path``,
            ``decoder_path`` and ``image``.
    """
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Collected here but not consumed anywhere in this function.
    data = []
    # Bound before the try block so the handler can always report it.
    img_path = args.image
    try:
        # Prepare Image
        image = load_image(img_path, transform)
        image_tensor = to_var(image, volatile=True)

        # Generate caption from image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids.cpu().data.numpy()

        # Decode word_ids to words, dropping '<start>' and ending at '<end>'
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            if word == '<start>':
                continue
            if word == '<end>':
                break
            sampled_caption.append(word)
        sentence = ' '.join(sampled_caption)

        # Print out the generated caption.
        print(sentence)
        data.append({'key': img_path.split('/')[-1], 'sentence': sentence})
    except Exception:
        # Report the failing image; KeyboardInterrupt/SystemExit propagate.
        print(img_path)
Ejemplo n.º 13
0
def main(args):
    """Caption every entry of the current directory and export to a sheet.

    Each file name and its caption are printed and collected; the collection
    is written to 'output.xls' with the columns 'File Name' and 'Caption'.

    Args:
        args: Parsed options. Reads ``vocab_path``, ``embed_size``,
            ``hidden_size``, ``num_layers``, ``encoder_path`` and
            ``decoder_path``.
    """
    predictions = []

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # os.getcwd must be *called*; passing the function object to listdir
    # raises TypeError.
    # NOTE(review): every directory entry is treated as an image - confirm
    # the working directory holds images only.
    fnames = listdir(os.getcwd())
    for fname in fnames:
        # Prepare an image
        image = load_image(fname, transform)
        image_tensor = image.to(device)

        # Generate a caption from the image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()  # keep the first sequence

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)
        predictions.append([fname, sentence])

        # Print out the file name and the generated caption
        print(fname)
        print(sentence)

    df = pd.DataFrame(predictions, columns=['File Name', 'Caption'])
    df.to_excel('output.xls')
Ejemplo n.º 14
0
def main(args):
    """Caption every '.jpg' file in the directory ``args.image``.

    One line ``file name<TAB>caption`` is printed per image. Images whose
    captioning fails are skipped with a note on stderr.

    Args:
        args: Parsed options. Reads ``vocab_path``, ``embed_size``,
            ``hidden_size``, ``num_layers``, ``encoder_path``,
            ``decoder_path`` and ``image`` (a directory).
    """
    import sys  # local import: only used to report skipped images

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size,
                         len(vocab), args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Prepare Image
    image_dir = args.image
    for image_id in os.listdir(image_dir):
        if not image_id.endswith('.jpg'):
            continue
        image = load_image(os.path.join(image_dir, image_id), transform)
        image_tensor = to_var(image, volatile=True)

        # Generate caption from image; the encoder returns a pair here and
        # both parts are forwarded to the decoder.
        try:
            feature, cnn_features = encoder(image_tensor)
            sampled_ids = decoder.sample(feature, cnn_features)
            sampled_ids = sampled_ids.cpu().data.numpy()
        except Exception as err:
            # Still best effort per image, but no longer silent, and
            # Ctrl-C / SystemExit are allowed to stop the run.
            sys.stderr.write('skipping %s: %s\n' % (image_id, err))
            continue

        # Decode word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        # Print out image name and generated caption.
        print(image_id + '\t' + sentence)
Ejemplo n.º 15
0
class Neuraltalk2:
  """Image captioner that keeps the vocabulary and both models in memory.

  The models are built and their checkpoints loaded once in ``__init__``;
  ``eval_image`` then captions one image per call. Everything runs on the
  CPU device.
  """

  def __init__(self):
    """Load the vocabulary, build the models and load their checkpoints."""
    print("Defining I.A")
    # Device configuration
    self.device = torch.device('cpu')

    # Model hyper-parameters and file locations
    embed_size = 256
    hidden_size = 512
    num_layers = 1
    encoder_path = 'models/encoder-5-3000.pkl'
    decoder_path = 'models/decoder-5-3000.pkl'
    vocab_path = 'data/vocab.pkl'

    # Image preprocessing
    self.transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])

    with open(vocab_path, 'rb') as f:
      self.vocab = pickle.load(f)

    print("Building Model")
    # Build models
    self.encoder = EncoderCNN(embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    self.decoder = DecoderRNN(embed_size, hidden_size, len(self.vocab), num_layers)
    self.encoder = self.encoder.to(self.device)
    self.decoder = self.decoder.to(self.device)

    print("loading checkpoint")
    # Load the trained model parameters. The device is fixed to the CPU, so
    # remap storages while loading; a checkpoint saved from a GPU would
    # otherwise fail to deserialize on a host without CUDA.
    self.encoder.load_state_dict(
        torch.load(encoder_path, map_location=self.device))
    self.decoder.load_state_dict(
        torch.load(decoder_path, map_location=self.device))

  def eval_image(self, image_path):
    """Return the caption generated for the image at ``image_path``.

    The '<start>' marker is dropped and decoding stops at '<end>', which is
    not included in the result.
    """
    # Prepare an image
    image = load_image(image_path, self.transform)
    image_tensor = image.to(self.device)

    # Generate a caption from the image; keep the first sampled sequence
    feature = self.encoder(image_tensor)
    sampled_ids = self.decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
      word = self.vocab.idx2word[word_id]
      if word == '<end>':
        break
      if word == '<start>':
        continue
      sampled_caption.append(word)

    return ' '.join(sampled_caption)
Ejemplo n.º 16
0
def main(args):
    """Caption an image, then let the user steer the caption word by word.

    After the first caption is printed the user is asked whether it makes
    sense. On "n", the ground-truth caption is read from
    'data/step_1/caption_1.txt' and an interactive loop starts: it shows the
    ground truth and the current BLEU score, asks for the next word, and
    re-decodes with all words entered so far. The loop has no exit condition
    of its own. This is a Python 2 script (it relies on ``raw_input``).

    Args:
        args: Parsed options. Reads ``vocab_path``, ``embed_size``,
            ``hidden_size``, ``num_layers``, ``encoder_path``,
            ``decoder_path`` and ``image``.
    """
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size,
                         len(vocab), args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Generate caption from image
    feature = encoder(image_tensor)
    sentence = decode(feature, [], decoder, vocab)

    print(sentence)
    user_input = raw_input("Does it make sense to you?(y/n)\n")

    if str(user_input) == "n":
        # Read the ground truth with a context manager so the handle is
        # closed before the (unbounded) interactive loop starts.
        with open('data/step_1/caption_1.txt', 'r') as f:
            ground_true = f.read()
        reference = ground_true.split()  # constant for the whole session

        teach_wordid = [vocab.word2idx["<start>"]]
        while True:
            print("This is the ground true:\n" + ground_true + "\n" +
                  "###################################################\n")
            hypothesis = sentence.split()
            BLEUscore = nltk.translate.bleu_score.sentence_bleu(
                [reference], hypothesis)
            print("Current BLEU score is " + str(BLEUscore))
            word = raw_input("next word:\n")
            try:
                word_idx = vocab.word2idx[word]
            except KeyError:
                # A typo should not end the session; ask again instead.
                print("'" + word + "' is not in the vocabulary, try again\n")
                continue
            teach_wordid.append(word_idx)
            sentence = decode(feature, teach_wordid, decoder, vocab)
            print("###################################################\n")
            print("Current Translated sentence is: \n" + sentence + "\n")
Ejemplo n.º 17
0
def main(args):
    """Caption every file of the image directory and write a text report.

    ``args`` is a dict. This function reads ``data_dir``, ``image_dir``,
    ``model_dir``, ``encoder_name`` and ``decoder_name`` from it and stores
    ``vocab_size`` into it before the models are built. Every caption is
    printed, and all of them are written to 'test_caption.txt' inside
    ``data_dir``.
    """
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    vocab = Vocabulary.load_vocab(args['data_dir'])
    args['vocab_size'] = len(vocab)

    cnn = EncoderCNN(args).eval()
    rnn = DecoderRNN(args)
    cnn.to(device)
    rnn.to(device)

    encoder_ckpt = os.path.join(args['model_dir'], args['encoder_name'])
    cnn.load_state_dict(torch.load(encoder_ckpt))
    decoder_ckpt = os.path.join(args['model_dir'], args['decoder_name'])
    rnn.load_state_dict(torch.load(decoder_ckpt))

    image_root = os.path.join(args['data_dir'], args['image_dir'])
    report = []
    for file_name in os.listdir(image_root):
        image_path = os.path.join(image_root, file_name)
        # Anything that is not a regular file is ignored.
        if not os.path.isfile(image_path):
            continue

        batch = load_image(image_path, preprocess).to(device)

        # Keep the first sampled sequence.
        token_ids = rnn.sample(cnn(batch))[0].cpu().numpy()

        # Ids -> words, up to and including '<end>'.
        words = []
        for token_id in token_ids:
            words.append(vocab.idx2word[token_id])
            if words[-1] == '<end>':
                break

        caption = ' '.join(words)
        print(caption)
        report.append((file_name, caption))

    report_path = os.path.join(args['data_dir'], 'test_caption.txt')
    with open(report_path, 'w') as out:
        for name, caption in report:
            out.write('image_name:{} ---- generated_caption:{}\n'.format(
                name, caption))
            out.write('\n')
Ejemplo n.º 18
0
    def predict(self, args):
        """Caption ``args.image`` on the CPU; print and return the caption.

        Args:
            args: Parsed options. Reads ``vocab_path``, ``embed_size``,
                ``hidden_size``, ``num_layers``, ``encoder_path``,
                ``decoder_path`` and ``image``.

        Returns:
            The caption, including every word up to and including '<end>'.
        """
        print('predict..start')
        cpu = torch.device('cpu')

        # Image preprocessing
        preprocess = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

        # Vocabulary wrapper
        with open(args.vocab_path, 'rb') as vocab_file:
            vocab = pickle.load(vocab_file)

        # Models: encoder in eval mode, both placed on the CPU
        cnn = EncoderCNN(args.embed_size).eval()
        rnn = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
        cnn = cnn.to(cpu)
        rnn = rnn.to(cpu)

        # Trained parameters, mapped to the CPU while loading
        cnn.load_state_dict(torch.load(args.encoder_path, map_location=cpu))
        rnn.load_state_dict(torch.load(args.decoder_path, map_location=cpu))

        # Encode the image and sample word ids; keep the first sequence
        batch = self.load_image(args.image, preprocess).to(cpu)
        token_ids = rnn.sample(cnn(batch))[0].cpu().numpy()

        # Ids -> words, stopping right after '<end>'
        words = []
        for token_id in token_ids:
            words.append(vocab.idx2word[token_id])
            if words[-1] == '<end>':
                break
        sentence = ' '.join(words)

        print(sentence)
        return sentence
Ejemplo n.º 19
0
def main():
    """Streamlit page: upload an image and display a generated caption.

    Renders the title and an upload widget. When no file has been uploaded it
    shows a hint and returns. Otherwise it displays the image, restores the
    encoder/decoder weights from ``./models``, captions the image and shows
    the caption in an info box.

    Changes from the previous version:
      * checkpoints are loaded with ``map_location=device`` so that weights
        saved on a GPU can still be restored when the CPU fallback is used;
      * inference runs under ``torch.no_grad()``;
      * the unused ``content`` and ``orig_image`` locals were removed.
    """
    allowed_types = ["png", "jpg", "jpeg"]

    st.title('Image Captioning App')
    st.markdown(STYLE, unsafe_allow_html=True)

    file = st.file_uploader("Upload file", type=allowed_types)
    show_file = st.empty()

    if not file:
        show_file.info("Please upload a file of type: " +
                       ", ".join(allowed_types))
        return

    show_file.image(file)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder_file = 'encoder-5-batch-128-hidden-256-epochs-5.pkl'
    decoder_file = 'decoder-5-batch-128-hidden-256-epochs-5.pkl'

    embed_size = 300
    hidden_size = 256

    vocab_size, word2idx, idx2word = get_vocab()

    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    # map_location keeps the load working on hosts without the device the
    # checkpoint was saved from.
    encoder.load_state_dict(
        torch.load(os.path.join('./models', encoder_file),
                   map_location=device))
    decoder.load_state_dict(
        torch.load(os.path.join('./models', decoder_file),
                   map_location=device))

    encoder.to(device)
    decoder.to(device)

    transform_test = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    PIL_image = Image.open(file).convert('RGB')
    # Add a leading batch dimension of size 1.
    image = transform_test(PIL_image).to(device).unsqueeze(0)

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        features = encoder(image).unsqueeze(1)
        output = decoder.sample(features)

    sentence = clean_sentence(output, idx2word)
    st.info("Generated caption --> " + sentence)

    file.close()
Ejemplo n.º 20
0
def sample(args):
    """Caption ``args.image`` with a layer-replaced encoder and print it.

    Reads ``vocab_path``, ``embed_size``, ``hidden_size``, ``num_layers``,
    ``encoder_path``, ``decoder_path`` and ``image`` from ``args``. Relies on
    the module-level ``device``. Nothing is returned.
    """
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Vocabulary wrapper: supplies len() and the idx2word lookup.
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # The encoder is put in eval mode (batchnorm uses moving mean/variance).
    encoder = EncoderCNN(args.embed_size).eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Register 'resnet.7.2.bn3' under the key 'final_layer' and install a
    # 2048-element vector that is zero except for element 0 (= 100).
    # NOTE(review): presumably this overrides that layer's output during the
    # forward pass -- confirm against the dissection module.
    dissection.replace_layers(encoder, [('resnet.7.2.bn3', 'final_layer')])
    override = torch.zeros(2048).to(device)
    override[0] = 100
    encoder.replacement['final_layer'] = override

    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Restore the trained weights.
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    pixels = load_image(args.image, preprocess).to(device)

    # Encode, sample, then keep the first row of the sampled ids:
    # (1, max_seq_length) -> (max_seq_length)
    token_ids = decoder.sample(encoder(pixels))[0].cpu().numpy()

    # Map ids to words up to and including the first '<end>'.
    words = []
    for token_id in token_ids:
        words.append(vocab.idx2word[token_id])
        if words[-1] == '<end>':
            break

    print(' '.join(words))
Ejemplo n.º 21
0
def main2(image,
          encoder_path='models/encoder-5-3000.pkl',
          decoder_path='models/decoder-5-3000.pkl',
          vocab_path="data/vocab.pkl",
          embed_size=256,
          hidden_size=512,
          num_layers=1):
    """Generate, print and return a caption for one image.

    Args:
        image: Passed as the first argument to ``load_image``.
        encoder_path: Checkpoint holding the encoder state dict.
        decoder_path: Checkpoint holding the decoder state dict.
        vocab_path: Pickled vocabulary wrapper (needs ``len()`` and
            ``idx2word``).
        embed_size: Embedding size used to build both models.
        hidden_size: Hidden size used to build the decoder.
        num_layers: Number of layers used to build the decoder.

    Returns:
        The decoded tokens joined by single spaces. Decoding stops after the
        first ``'<end>'`` token, which is kept in the result.

    Relies on the module-level ``device``. Checkpoints are loaded with
    ``map_location=device`` so weights saved on another device (e.g. a GPU)
    can be restored on this one, and inference runs under
    ``torch.no_grad()`` because no gradients are needed.
    """
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(
        embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters onto the device the models live on
    encoder.load_state_dict(torch.load(encoder_path, map_location=device))
    decoder.load_state_dict(torch.load(decoder_path, map_location=device))

    # Prepare an image
    image_tensor = load_image(image, transform).to(device)

    # Generate a caption from the image
    with torch.no_grad():
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy(
    )  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words, stopping after the first '<end>'
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    print(sentence)
    return sentence
Ejemplo n.º 22
0
def main(args):
    """Caption ``args.image``, print the caption and show the image.

    Reads ``vocab_path``, ``embed_size``, ``hidden_size``, ``num_layers``,
    ``encoder_path``, ``decoder_path`` and ``image`` from ``args``.

    Note that the progress bar is advanced through all 100 steps before the
    models are run, so it does not track the inference itself.
    """
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Vocabulary wrapper: supplies len() and the idx2word lookup.
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Restore the trained weights.
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # NOTE(review): to_var is defined elsewhere; presumably it wraps the
    # tensor for inference (and handles device placement) -- confirm.
    image_tensor = to_var(load_image(args.image, preprocess), volatile=True)

    # Move the models to the GPU when one is present.
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    bar = Bar('Processing', max=100)
    for _ in range(100):
        bar.next()

    # Generate caption from image
    token_ids = decoder.sample(encoder(image_tensor))
    token_ids = token_ids.cpu().data.numpy()

    # Map ids to words up to and including the first '<end>'.
    words = []
    for token_id in token_ids:
        words.append(vocab.idx2word[token_id])
        if words[-1] == '<end>':
            break
    sentence = ' '.join(words)
    bar.finish()

    # Print the caption, then display the source image.
    print("\n")
    print(sentence)
    picture = Image.open(args.image)
    plt.imshow(np.asarray(picture))
    plt.show()
Ejemplo n.º 23
0
def main(cfg):
    """Print a generated caption for every image listed in the JSON file.

    Reads ``cfg.image.mean``, ``cfg.image.std`` and, under ``cfg.train``,
    ``vocab_path``, ``embed_size``, ``hidden_size``, ``num_layers``,
    ``encoder_path`` and ``decoder_path``. Relies on the module-level
    ``device``, ``json_dir`` and ``base_dir``.
    """
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(cfg.image.mean, cfg.image.std)
    ])

    vocab_path = hydra.utils.to_absolute_path(cfg.train.vocab_path)
    print(vocab_path)
    with open(vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # Build the models
    encoder = EncoderCNN(cfg.train.embed_size).eval().to(device)
    decoder = DecoderRNN(cfg.train.embed_size, cfg.train.hidden_size,
                         len(vocab), cfg.train.num_layers).to(device)

    # Load the parameters of the trained models
    encoder_ckpt = hydra.utils.to_absolute_path(cfg.train.encoder_path)
    decoder_ckpt = hydra.utils.to_absolute_path(cfg.train.decoder_path)
    encoder.load_state_dict(torch.load(encoder_ckpt))
    decoder.load_state_dict(torch.load(decoder_ckpt))

    with open(json_dir, encoding='utf-8') as json_file:
        data = json.load(json_file)

    for entry in data['images']:
        img_file_path = base_dir + '/data/val2014/' + entry['file_name']

        # Prepare the image
        pixels = load_image(hydra.utils.to_absolute_path(img_file_path),
                            preprocess).to(device)

        # Generate a caption from the input image
        token_ids = decoder.sample(encoder(pixels))[0].cpu().numpy()

        # Convert word ids to words, up to and including the first '<end>'
        words = []
        for token_id in token_ids:
            words.append(vocab.idx2word[token_id])
            if words[-1] == '<end>':
                break

        # Drop the first token and the last two tokens; the original author
        # intends this to strip <start>, "." and <end>.
        print(' '.join(words[1:-2]))
Ejemplo n.º 24
0
class BFM(nn.Module):
    """Encoder plus sampling decoder wrapped as a single CPU module.

    Both sub-models are put in eval mode, moved to the CPU and restored from
    ``encoder.ckpt`` / ``decoder.ckpt`` in the working directory. The
    decoder's ``forward`` is rebound to its ``sample`` method, so calling the
    decoder runs sampling.
    """

    def __init__(self, args, vocab_len):
        super(BFM, self).__init__()
        cpu = torch.device('cpu')

        encoder = EncoderCNN(args.embed_size).eval().cpu()
        encoder.load_state_dict(torch.load('encoder.ckpt', map_location=cpu))
        self.encoder = encoder

        decoder = DecoderRNN(args.embed_size, args.hidden_size, vocab_len,
                             args.num_layers).eval().cpu()
        # Route decoder(...) calls through sample() instead of forward().
        decoder.forward = decoder.sample
        decoder.load_state_dict(torch.load('decoder.ckpt', map_location=cpu))
        self.decoder = decoder

    def forward(self, image):
        """Return the ids the decoder samples for ``image``."""
        return self.decoder(self.encoder(image))
def main(args):
    """Caption ``args.image``, print it and write it to ``demofile3.txt``.

    Reads ``vocab_path``, ``embed_size``, ``hidden_size``, ``num_layers``,
    ``encoder_path``, ``decoder_path`` and ``image`` from ``args``. Relies on
    the module-level ``device``.

    The ``'<start>'`` and ``'<end>'`` markers are left out of the caption.
    The previous version removed them with ``list.remove``, which raises
    ``ValueError`` when the decoder reaches its length limit without
    emitting ``'<end>'`` (or never emits ``'<start>'``); the markers are now
    skipped while decoding, so that case yields the words produced so far.
    """
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval(
    )  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters; map_location lets checkpoints saved
    # on another device be restored onto the one the models live on.
    encoder.load_state_dict(
        torch.load(args.encoder_path, map_location=device))
    decoder.load_state_dict(
        torch.load(args.decoder_path, map_location=device))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image (inference only, no gradients)
    with torch.no_grad():
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy(
    )  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words: stop at '<end>', skip '<start>'
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        if word == '<end>':
            break
        if word != '<start>':
            sampled_caption.append(word)
    sentence = ' '.join(sampled_caption)

    # Print out the generated caption and persist it
    print(sentence)
    # Mode "w" already truncates; the context manager closes the file even
    # if the write fails.
    with open("demofile3.txt", "w") as f:
        f.write(sentence)

    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
Ejemplo n.º 26
0
    def load_coco_encoder(self):
        """Build the pretrained encoder on CUDA and set ``self.transform``.

        The embedding size (256) is hard-coded to match the checkpoint at the
        fixed path below. ``self.transform`` is assigned a pipeline that
        center-crops to ``self.imsize``, converts to a tensor and normalizes.

        Returns:
            The encoder, in eval mode, with the checkpoint weights loaded.
        """
        weights = torch.load('/data/rxdh/conventions_data/encoder-5-3000.pkl')

        encoder = EncoderCNN(256).to('cuda').eval()
        encoder.load_state_dict(weights)

        pipeline = [
            transforms.CenterCrop(self.imsize),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ]
        self.transform = transforms.Compose(pipeline)

        return encoder
Ejemplo n.º 27
0
def main(args):
    """Caption ``args.image`` and print the words joined without spaces.

    Reads ``crop_size``, ``vocab_path``, ``embed_size``, ``hidden_size``,
    ``num_layers``, ``encoder_path``, ``decoder_path``, ``image`` and
    ``length`` from ``args``. ``'<start>'`` tokens are skipped and decoding
    stops at the first ``'<end>'``, which is not included.
    """
    # NOTE(review): transforms.Scale is the legacy name of transforms.Resize
    # in torchvision -- confirm the installed version still provides it.
    preprocess = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Vocabulary wrapper: supplies len() and the idx2word lookup.
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Restore the trained weights.
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # NOTE(review): to_var is defined elsewhere; presumably it wraps the
    # tensor for inference -- confirm.
    image_tensor = to_var(load_image(args.image, preprocess), volatile=True)

    # Move models and input to the GPU when one is present.
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        image_tensor = image_tensor.cuda()

    # Sample at most args.length... (length is forwarded to decoder.sample).
    token_ids = decoder.sample(encoder(image_tensor), args.length)
    token_ids = token_ids.cpu().data.numpy()

    # Collect words, dropping the markers and stopping at '<end>'.
    words = []
    for token_id in token_ids:
        word = vocab.idx2word[token_id]
        if word == '<end>':
            break
        if word != '<start>':
            words.append(word)

    print(''.join(words))
Ejemplo n.º 28
0
def main(args):
    """Caption ``args.image``, print the caption and plot the image.

    Reads ``vocab_path``, ``embed_size``, ``hidden_size``, ``num_layers``,
    ``encoder_path``, ``decoder_path`` and ``image`` from ``args``.
    """
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),
    ])

    # Vocabulary wrapper: supplies len() and the idx2word lookup.
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size,
                         len(vocab), args.num_layers)

    # Restore the trained weights.
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # NOTE(review): to_var is defined elsewhere; presumably it wraps the
    # tensor for inference (and handles device placement) -- confirm.
    image_tensor = to_var(load_image(args.image, preprocess), volatile=True)

    # Move the models to the GPU when one is present.
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    token_ids = decoder.sample(encoder(image_tensor))
    token_ids = token_ids.cpu().data.numpy()

    # Map ids to words up to and including the first '<end>'.
    words = []
    for token_id in token_ids:
        words.append(vocab.idx2word[token_id])
        if words[-1] == '<end>':
            break

    print(' '.join(words))
    picture = Image.open(args.image)
    plt.imshow(np.asarray(picture))
Ejemplo n.º 29
0
def main(args):
    """Predict a transform sequence for each generated test texture.

    Reads ``embed_size``, ``hidden_size``, ``num_layers``, ``encoder_path``
    and ``decoder_path`` from ``args``. Relies on the module-level ``vocab``
    and ``device``. Saves every texture as a PNG under ``predictions/``,
    prints the generated transforms, then prints the list of predicted
    sequences (each wrapped in a one-element list).
    """
    # The encoder is put in eval mode (batchnorm uses moving mean/variance).
    encoder = EncoderCNN(args.embed_size).eval().to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Restore the trained weights from the given checkpoint paths.
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    test_data = generate_training_data(5)
    textures_test = generate_textures(test_data)
    transforms_test = generate_transforms(test_data)

    # Write each texture out as a grayscale image.
    for index, texture in enumerate(textures_test):
        plt.imsave('predictions/texture4_0%i.png' % index,
                   texture,
                   cmap="gray")

    print(transforms_test)
    predicted_progs = []

    for texture in textures_test:
        # EncoderCNN needs two leading singleton dimensions added.
        batch = torch.tensor(texture, device=device).unsqueeze(0).unsqueeze(0)

        # Keep the first row: (1, max_seq_length) -> (max_seq_length)
        sampled_seq = decoder.sample(encoder(batch))[0].cpu().numpy()

        # Translate ids to words up to and including the first '<end>'.
        prog = []
        for int_word in sampled_seq:
            prog.append(int_to_word(int_word))
            if prog[-1] == '<end>':
                break
        predicted_progs.append(['-->'.join(prog)])

    print(predicted_progs)
Ejemplo n.º 30
0
def main(args):
    """Decode the first batch from the data loader and print it with its target.

    Reads ``vocab_path``, ``embed_size``, ``hidden_size``, ``num_layers``,
    ``encoder_path`` and ``decoder_path`` from ``args``. Relies on the
    module-level ``device``, ``get_loader`` and ``inp_transform``.

    NOTE(review): this snippet is truncated in the source -- an unterminated
    string delimiter follows the final print -- so anything after that point
    is unknown.
    """
    # Image preprocessing. The only consumer of `transform` in the visible
    # code is the load_image call that is disabled inside the string literal
    # below, so it is currently unused.
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Single-image input path, disabled by wrapping it in a string literal.
    '''
    image = load_image(args.image, transform)
    image_tensor = image.to(device)
    '''
    # Take the first batch from the loader instead; the loader's second
    # return value is ignored.
    data_loader, _ = get_loader(transforms=False)
    inp, targets = next(iter(data_loader))
    # NOTE(review): presumably converts the raw input into the tensor layout
    # the encoder expects -- confirm against inp_transform.
    audio = inp_transform(inp)
    audio = audio.to(device)
    
    # Generate a token sequence from the encoded input batch; only the first
    # row of the sampled ids is decoded.
    feature = encoder(audio)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)
    
    # Convert word_ids to words, up to and including the first '<end>'
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    
    # Print the decoded sentence (labelled "Logits" in the output) next to
    # the whole batch of targets.
    print("Logits : {}\nTarget : {}".format(sentence, targets))
    '''
Ejemplo n.º 31
0
def encode(img,vocab):
    """Return the encoder's feature for the image ``img``.

    A fresh ``EncoderCNN(256)`` is built and restored from
    ``../models/encoder-4-3000.pkl`` on every call. ``vocab`` is accepted
    but not used.
    """
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),
    ])

    model = EncoderCNN(256)
    model.eval()  # evaluation mode (BN uses moving mean/variance)
    model.load_state_dict(torch.load('../models/encoder-4-3000.pkl'))

    # NOTE(review): to_var is defined elsewhere; presumably it wraps the
    # tensor for inference (and handles device placement) -- confirm.
    image_tensor = to_var(load_image(img, preprocess), volatile=True)

    # Move the model to the GPU when one is present.
    if torch.cuda.is_available():
        model.cuda()

    return model(image_tensor)
Ejemplo n.º 32
0
def main(args):
    """Caption ``args.image``, print the caption and plot the image.

    Reads ``vocab_path``, ``embed_size``, ``hidden_size``, ``num_layers``,
    ``encoder_path``, ``decoder_path`` and ``image`` from ``args``. Relies on
    the module-level ``device``.
    """
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225)),
    ])

    # Vocabulary wrapper: supplies len() and the idx2word lookup.
    with open(args.vocab_path, 'rb') as vocab_file:
        vocab = pickle.load(vocab_file)

    # The encoder is put in eval mode (batchnorm uses moving mean/variance).
    encoder = EncoderCNN(args.embed_size).eval().to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # Restore the trained weights.
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    pixels = load_image(args.image, preprocess).to(device)

    # Encode, sample, then keep the first row of the sampled ids:
    # (1, max_seq_length) -> (max_seq_length)
    token_ids = decoder.sample(encoder(pixels))[0].cpu().numpy()

    # Map ids to words up to and including the first '<end>'.
    words = []
    for token_id in token_ids:
        words.append(vocab.idx2word[token_id])
        if words[-1] == '<end>':
            break

    print(' '.join(words))
    picture = Image.open(args.image)
    plt.imshow(np.asarray(picture))
Ejemplo n.º 33
0
                         cocoapi_loc=COCOPATH)

vocab = data_loader.dataset.vocab
# The size of the vocabulary.
vocab_size = len(vocab)

# Initialize the encoder and decoder. 
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

device = torch.device("cpu")
# encoder.to(device)
# decoder.to(device)

# Load the pretrained model
encoder.load_state_dict(torch.load(PRETRAINED_MODEL_PATH.format('encoder')))
decoder.load_state_dict(torch.load(PRETRAINED_MODEL_PATH.format('decoder')))

encoder.eval()
decoder.eval()

images, conv_images = next(iter(data_loader))
features = encoder(conv_images).unsqueeze(1)
output = decoder.sample(features, max_len=max_len)

word_list = []
for word_idx in output:
    if word_idx == vocab.word2idx[vocab.start_word]:
        continue
    if word_idx == vocab.word2idx[vocab.end_word]:
        break