Example #1
def main(args):
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    encoder = EncoderCNN(256)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(256, 512, len(vocab), 1)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder))
    decoder.load_state_dict(torch.load(args.decoder))

    measurement_score = test(encoder, decoder, vocab, args.num_samples,
                             args.num_hints, args.debug, args.c_step,
                             args.no_avg)
    if args.msm == "co":
        scores = cocoEval()
        scores_u = cocoEval(res='data/captions_val2014_results_u.json')
        print(scores)
        print(scores_u)

        with open(args.filepath, 'w+') as f:
            pickle.dump((scores, scores_u), f)
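
Most of the snippets on this page assume the model and vocabulary classes from the yunjey pytorch-tutorial image-captioning project; the block below is a minimal sketch of the imports they rely on (module names such as model and build_vocab are assumptions and may differ in your checkout).

# Sketch of the shared imports (assumed layout, not part of any single example).
import os
import pickle

import torch
from torchvision import transforms
from PIL import Image

from model import EncoderCNN, DecoderRNN   # assumed local model definitions
from build_vocab import Vocabulary         # assumed vocabulary wrapper
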
Example #2
def main(image):
    # Configuration for hyper-parameters
    config = Config()

    # Image Preprocessing
    transform = config.test_transform

    # Load vocabulary
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(config.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(config.embed_size, config.hidden_size, len(vocab),
                         config.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(
        torch.load(
            os.path.join(config.teacher_cnn_path, config.trained_encoder)))
    decoder.load_state_dict(
        torch.load(
            os.path.join(config.teacher_lstm_path, config.trained_decoder)))
    # Prepare Image
    image = Image.open(image)
    image_tensor = Variable(transform(image).unsqueeze(0))

    # Set initial states
    state = (Variable(torch.zeros(config.num_layers, 1, config.hidden_size)),
             Variable(torch.zeros(config.num_layers, 1, config.hidden_size)))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        state = [s.cuda() for s in state]
        image_tensor = image_tensor.cuda()

    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature, state)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word_id == 96:  # hard-coded vocabulary id treated as the end of the caption
            sampled_caption.append('<end>')
            break
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out image and generated caption.
    print(sentence)
    return sentence
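
The id-to-word loop above reappears in almost every example below; a small helper that captures the pattern, sketched under the assumption that the vocabulary exposes an idx2word mapping and uses <start>/<end> tokens.

def ids_to_sentence(sampled_ids, vocab, start_token='<start>', end_token='<end>'):
    # Turn a sequence of predicted word ids into a caption string,
    # skipping the start token and stopping at the end token.
    words = []
    for word_id in sampled_ids:
        word = vocab.idx2word[int(word_id)]
        if word == start_token:
            continue
        if word == end_token:
            break
        words.append(word)
    return ' '.join(words)
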
Example #3
def main(args):
    # Val images folder
    filepath = '/scratch/ys2542/pytorch-tutorial/tutorials/03-advanced/image_captioning/data/resizedval2014'
    onlyfiles = [fl for fl in listdir(filepath) if isfile(join(filepath, fl))]

    # image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # load vocabulary wrapper pickle file
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(args.embed_size)  # build encoder
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)  # build decoder

    # load the trained CNN and RNN parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Load all images in val folder
    badsize = 0  # count images that could not be loaded or captioned
    for i in onlyfiles:

        # val folder path joined with the image file name
        args_image = filepath + '/' + i

        # transform image and wrap it to tensor
        image = load_image(args_image, transform)
        image_tensor = to_var(image, volatile=True)

        if torch.cuda.is_available():  # load GPU
            encoder.cuda()
            decoder.cuda()

            # generate caption from image
            try:
                feature = encoder(image_tensor)
                sampled_ids = decoder.sample(feature)
                sampled_ids = sampled_ids.cpu().data.numpy()

                # decode word_ids to words
                sampled_caption = []
                for word_id in sampled_ids:
                    word = vocab.idx2word[word_id]
                    sampled_caption.append(word)
                    if word == '<end>':
                        break
                sentence = ' '.join(sampled_caption)

                # print out image and generated caption without start and end
                print('beam_size_1' + '\t' + i + '\t' + sentence[8:-8])

            except:
                badsize = badsize + 1  # count images that could not be captioned
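
This example calls load_image and to_var without defining them; the sketch below shows what these helpers typically look like in the old Variable-based PyTorch API (the 224x224 resize is an assumption).

from PIL import Image
import torch
from torch.autograd import Variable

def load_image(image_path, transform=None):
    # Open and resize an image, then add a batch dimension.
    image = Image.open(image_path).convert('RGB')
    image = image.resize((224, 224), Image.LANCZOS)
    if transform is not None:
        image = transform(image).unsqueeze(0)
    return image

def to_var(x, volatile=False):
    # Wrap a tensor in a Variable, moving it to the GPU when available.
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x, volatile=volatile)
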
Example #4
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    alexnet = models.alexnet(pretrained=True)
    alexnet2 = AlexNet2(alexnet)
    # Build Models
    encoder = EncoderCNN(4096, args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = Image.open(args.image)
    image_tensor = Variable(transform(image).unsqueeze(0))

    # Set initial states
    state = (Variable(torch.zeros(args.num_layers, 1, args.hidden_size)),
             Variable(torch.zeros(args.num_layers, 1, args.hidden_size)))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        alexnet2.cuda()
        state = [s.cuda() for s in state]
        image_tensor = image_tensor.cuda()

    # Generate caption from image
    alexnet2(image_tensor)
    feature = encoder(alexnet2.fc7_value)
    sampled_ids = decoder.sample(feature, state)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out image and generated caption.
    print(sentence)
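
Example #4 relies on an AlexNet2 wrapper that exposes the fc7 activations of a pretrained AlexNet; a minimal sketch of such a wrapper follows (only the fc7_value attribute is taken from the call site, everything else is an assumption).

import torch.nn as nn

class AlexNet2(nn.Module):
    # Wrap a pretrained AlexNet and cache its 4096-d fc7 activations,
    # which the EncoderCNN above consumes as input features.
    def __init__(self, alexnet):
        super(AlexNet2, self).__init__()
        self.features = alexnet.features
        # keep every classifier layer except the final class-score layer
        self.classifier = nn.Sequential(*list(alexnet.classifier.children())[:-1])
        self.fc7_value = None

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        self.fc7_value = self.classifier(x)
        return self.fc7_value
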
def inference_coco(encoder_file: str, decoder_file: str, embed_size: int,
                   hidden_size: int, from_cpu: bool) -> None:
    """
    Displays an original image from coco test dataset and prints its associated caption.

    encoder_file:   Name of the encoder to load.
    decoder_file:   Name of the decoder to load.
    embed_size:     Word embedding size for the encoder.
    hidden_size:    Hidden layer of the LSTM size.
    from_cpu:       Whether the model has been saved on CPU.
    """
    # Define transform
    transform_test = transforms.Compose([
        transforms.Resize(256),  # smaller edge of image resized to 256
        transforms.RandomCrop(224),  # get 224x224 crop from random location
        transforms.ToTensor(),  # convert the PIL Image to a tensor
        transforms.Normalize(
            (0.485, 0.456, 0.406),  # normalize image for pre-trained model
            (0.229, 0.224, 0.225))
    ])

    # Device to use for inference
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Create the data loader.
    data_loader = get_loader(transform=transform_test, mode='test')

    # Obtain sample image
    _, image = next(iter(data_loader))

    # The size of the vocabulary.
    vocab_size = len(data_loader.dataset.vocab)

    # Initialize the encoder and decoder, and set each to inference mode.
    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    # Load the trained weights.
    if from_cpu:
        encoder.load_state_dict(
            torch.load(os.path.join('./models', encoder_file),
                       map_location='cpu'))
        decoder.load_state_dict(
            torch.load(os.path.join('./models', decoder_file),
                       map_location='cpu'))
    else:
        encoder.load_state_dict(
            torch.load(os.path.join('./models', encoder_file)))
        decoder.load_state_dict(
            torch.load(os.path.join('./models', decoder_file)))

    # Move models to GPU if CUDA is available.
    encoder.to(device)
    decoder.to(device)

    get_prediction(encoder, decoder, data_loader, device)
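
A hedged usage sketch for inference_coco; the checkpoint names and sizes below are placeholders for whatever you trained.

inference_coco(encoder_file='encoder-3.pkl',    # placeholder file names
               decoder_file='decoder-3.pkl',
               embed_size=256,
               hidden_size=512,
               from_cpu=True)
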
Example #6
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out image and generated caption.
    sentence = sentence.replace('<start> ',
                                '').replace(' <end>', '').replace('.',
                                                                  '').strip()
    translator = Translator()
    sentence_indo = translator.translate(sentence, dest='id').text
    print('This is an image of: ' + sentence_indo)
    tts = gTTS(sentence_indo, 'id')
    tts.save('result.mp3')
    playsound('result.mp3')

    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
    plt.show()
Example #7
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    #image = load_image(args.image, transform)
    #image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    data = []
    try:
        img_path = args.image
        # Prepare Image
        image = load_image(img_path, transform)
        image_tensor = to_var(image, volatile=True)
        # Generate caption from image
        feature = encoder(image_tensor)
        #pdb.set_trace()
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids.cpu().data.numpy()

        # Decode word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            if word == '<start>':
                continue
            if word == '<end>':
                break
            sampled_caption.append(word)
        sentence = ' '.join(sampled_caption)
        # Print out image and generated caption.
        print(sentence)
        data.append({'key': img_path.split('/')[-1], 'sentence': sentence})
    except:
        print(img_path)
Example #8
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)
    

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))
    
    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    
    # Prepare Image
    image_dir = args.image
    images = os.listdir(image_dir)
    for image_id in images:
        if not image_id.endswith('.jpg'):
            continue
        image = os.path.join(image_dir, image_id)
        image = load_image(image, transform)
        image_tensor = to_var(image, volatile=True)
        
        # Generate caption from image
        try:
            feature, cnn_features = encoder(image_tensor)
            sampled_ids = decoder.sample(feature, cnn_features)
            sampled_ids = sampled_ids.cpu().data.numpy()
        except:
            continue
        
        # Decode word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)
        
        # Print out image and generated caption.
        print (image_id + '\t' + sentence)
Example #9
def main(args):   
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    
    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)
    

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)
    
    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    
    # Generate caption from image
    feature = encoder(image_tensor)
    sentence = decode(feature,[],decoder,vocab)

    print (sentence)
    user_input = raw_input("Does it make sense to you?(y/n)\n")

    if str(user_input) == "n":
        f = open('data/step_1/caption_1.txt','r')
        ground_true = f.read()
        teach_wordid = []
        teach_wordid.append(vocab.word2idx["<start>"])
        while True:
            print("This is the ground truth:\n" + ground_true + "\n" +
                  "###################################################\n")
            reference = ground_true.split()
            hypothesis = sentence.split()
            BLEUscore = nltk.translate.bleu_score.sentence_bleu([reference], hypothesis)
            print("Current BLEU score is " + str(BLEUscore))
            word = raw_input("next word:\n")
            word_idx = vocab.word2idx[word]
            teach_wordid.append(word_idx)
            sentence = decode(feature, teach_wordid, decoder, vocab)
            print("###################################################\n")
            print("Current Translated sentence is: \n" + sentence + "\n")
Example #10
def main():
    st.title('Image Captioning App')
    st.markdown(STYLE, unsafe_allow_html=True)

    file = st.file_uploader("Upload file", type=["png", "jpg", "jpeg"])
    show_file = st.empty()

    if not file:
        show_file.info("Please upload a file of type: " +
                       ", ".join(["png", "jpg", "jpeg"]))
        return

    content = file.getvalue()

    show_file.image(file)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder_file = 'encoder-5-batch-128-hidden-256-epochs-5.pkl'
    decoder_file = 'decoder-5-batch-128-hidden-256-epochs-5.pkl'

    embed_size = 300
    hidden_size = 256

    vocab_size, word2idx, idx2word = get_vocab()

    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    encoder.load_state_dict(torch.load(os.path.join('./models', encoder_file)))
    decoder.load_state_dict(torch.load(os.path.join('./models', decoder_file)))

    encoder.to(device)
    decoder.to(device)

    transform_test = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    PIL_image = Image.open(file).convert('RGB')
    orig_image = np.array(PIL_image)
    image = transform_test(PIL_image)
    image = image.to(device).unsqueeze(0)
    features = encoder(image).unsqueeze(1)
    output = decoder.sample(features)

    sentence = clean_sentence(output, idx2word)
    st.info("Generated caption --> " + sentence)

    file.close()
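
The clean_sentence helper used above is not shown on this page; the sketch below is one plausible implementation, assuming output is a list of word ids and idx2word maps ids to tokens.

def clean_sentence(output, idx2word):
    # Map predicted ids to words and drop the special tokens.
    words = [idx2word[int(idx)] for idx in output]
    words = [w for w in words if w not in ('<start>', '<end>', '<pad>')]
    return ' '.join(words)
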
Example #11
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    bar = Bar('Processing', max=100)
    for i in range(100):  # purely cosmetic progress bar
        bar.next()

    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids.cpu().data.numpy()
    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    bar.finish()
    # Print out image and generated caption.
    print("\n")
    print(sentence)
    image = Image.open(args.image)
    imgplot = plt.imshow(np.asarray(image))
    plt.show()
    def load_coco_encoder(self):
        # Hard-code embedding size to that used in pretrained model at path
        init = torch.load('/data/rxdh/conventions_data/encoder-5-3000.pkl')
        encoder = EncoderCNN(256).to('cuda')
        encoder.eval()
        encoder.load_state_dict(init)

        self.transform = transforms.Compose([
            transforms.CenterCrop(self.imsize),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

        return encoder
Example #13
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        #transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        image_tensor = image_tensor.cuda()
    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature, args.length)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        if word != '<start>' and word != '<end>':
            sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ''.join(sampled_caption)

    # Print out image and generated caption.
    print(sentence)
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)
    

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)
    
    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    
    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids.cpu().data.numpy()
    
    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    
    # Print out image and generated caption.
    print (sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
Example #15
def encode(img,vocab):
    transform = transforms.Compose([
            transforms.ToTensor(), 
            transforms.Normalize((0.485, 0.456, 0.406), 
                                 (0.229, 0.224, 0.225))])
    encoder = EncoderCNN(256)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    encoder.load_state_dict(torch.load('../models/encoder-4-3000.pkl'))
    image = load_image(img, transform)
    image_tensor = to_var(image, volatile=True)
    
    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
    feature = encoder(image_tensor)
    return feature
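
A hedged usage sketch for encode; the image path is a placeholder, and note that the vocab argument is currently unused inside the function.

feature = encode('example.jpg', vocab)   # hypothetical image path
print(feature.size())                    # expected shape: [1, 256]
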
Example #16
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(
        args.embed_size)  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    encoder.eval()
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy(
    )  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the image and the generated caption
    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
Example #17
        def generatecaption(image):
            # Image preprocessing
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.485, 0.456, 0.406),
                                     (0.229, 0.224, 0.225))])

            # Load vocabulary wrapper
            with open('/root/ImageCaptioning/data/vocab.pkl', 'rb') as f:
                vocab = pickle.load(f)

            # Build models
            encoder = EncoderCNN(256).eval()  # eval mode (batchnorm uses moving mean/variance)
            decoder = DecoderRNN(256, 512, len(vocab), 1)
            encoder = encoder.to(device)
            decoder = decoder.to(device)

            # Load the trained model parameters
            encoder.load_state_dict(torch.load('models/encoder-5-3000.pkl', map_location='cpu'))
            decoder.load_state_dict(torch.load('models/decoder-5-3000.pkl', map_location='cpu'))

            encoder.eval()
            decoder.eval()
            # Prepare an image
            image = load_image(image, transform)
            image_tensor = image.to(device)

            # Generate a caption from the image
            feature = encoder(image_tensor)
            sampled_ids = decoder.sample(feature)
            sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)

            # Convert word_ids to words
            sampled_caption = []
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                if word == '<end>':
                    break
            self.sentence = ' '.join(sampled_caption)

            # Print out the image and the generated caption


            self.Entry1.delete(0, END)
            self.Entry1.insert(0,self.sentence[7:-5])
Example #18
def get_text_caption(image):

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(
        args.embed_size, args.model_type,
        args.mode)  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(
        torch.load(args.model_path + "_" + args.model_type + "/encoder.pt"))
    encoder.eval()
    decoder.load_state_dict(
        torch.load(args.model_path + "_" + args.model_type + "/decoder.pt"))
    decoder.eval()

    # Prepare an image
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)

    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy(
    )  # (1, max_seq_length) -> (max_seq_length)
    print(sampled_ids)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    return (sentence.split("<start> ")[1].split(" <end>")[0]
            [:-2].capitalize().replace(" , ", ", "))
Example #19
class Annotator():
    def __init__(self):
        self.transform = transforms.Compose([ 
            transforms.Resize(256),                          # smaller edge of image resized to 256
            transforms.CenterCrop(224),                      # get 224x224 crop from the center
            transforms.ToTensor(),                           # convert the PIL Image to a tensor
            transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
                                 (0.229, 0.224, 0.225))])
        
        # Load checkpoint with the best model
        self.checkpoint = torch.load(os.path.join('./models', 'best-model.pkl'), 'cpu')
        # Specify values for embed_size and hidden_size - we use the same values as in training step
        self.embed_size = 512
        self.hidden_size = 512

        # Get the vocabulary and its size
        self.vocab = Vocabulary(None, './vocab.pkl', "<start>", "<end>", "<unk>", "<pad>", "", "", True)
        self.vocab_size = len(self.vocab)

        # Initialize the encoder and decoder, and set each to inference mode
        self.encoder = EncoderCNN(self.embed_size)
        self.encoder.eval()
        self.decoder = DecoderRNN(self.embed_size, self.hidden_size, self.vocab_size)
        self.decoder.eval()

        # Load the pre-trained weights
        self.encoder.load_state_dict(self.checkpoint['encoder'])
        self.decoder.load_state_dict(self.checkpoint['decoder'])

        # Move models to GPU if CUDA is available.
        #if torch.cuda.is_available():
        #   encoder.cuda()
        #   decoder.cuda()

    def annotate(self, image):
        transformed = self.transform(image).unsqueeze(0)
        features = self.encoder(transformed).unsqueeze(1)

        # Pass the embedded image features through the model to get a predicted caption.
        output = self.decoder.sample_beam_search(features)
        print('example output:', output)
        sentence = clean_sentence(output[0], self.vocab)
        print('example sentence:', sentence)
        return sentence
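
A hedged usage sketch for the Annotator class; the image path is a placeholder.

from PIL import Image

annotator = Annotator()
caption = annotator.annotate(Image.open('example.jpg').convert('RGB'))
print(caption)
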
    def get_caption(self, img_tensor):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(device)
        print("running")

        # Models
        encoder_file = 'legit_model/encoder_1.pkl'
        decoder_file = 'legit_model/decoder_1.pkl'

        # Embed and hidden
        embed_size = 512
        hidden_size = 512

        # The size of the vocabulary.
        vocab_size = 8856

        # Initialize the encoder and decoder, and set each to inference mode.
        encoder = EncoderCNN(embed_size)
        encoder.eval()

        decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
        decoder.eval()

        # Load the trained weights.
        encoder.load_state_dict(
            torch.load(os.path.join('./models', encoder_file)))
        decoder.load_state_dict(
            torch.load(os.path.join('./models', decoder_file)))

        # Move models to GPU if CUDA is available.
        encoder.to(device)
        decoder.to(device)

        img_d = img_tensor.to(device)

        # Obtain the embedded image features.
        features = encoder(img_d).unsqueeze(1)

        # Pass the embedded image features through the model to get a predicted caption.
        img_output = decoder.sample(features)

        sentence = self.clean_sentence(img_output)

        return sentence
def initialize():
    checkpoint = torch.load(os.path.join('./models', 'best-model.pkl'), map_location=torch.device('cpu'))

    embed_size = 256
    hidden_size = 512

    with open('./vocab.pkl', "rb") as f:
        vocab = pickle.load(f)
    vocab_size = len(vocab)

    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    encoder.eval()
    decoder.eval()

    encoder.load_state_dict(checkpoint['encoder'])
    decoder.load_state_dict(checkpoint['decoder'])

    return encoder, decoder, vocab
Example #22
def get_model(device,vocab_size):
    # model weights file
    encoder_file = "models/encoder-3.pkl" 
    decoder_file = "models/decoder-3.pkl"

    embed_size = 512
    hidden_size = 512

    # Initialize the encoder and decoder, and set each to inference mode.
    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    # Load the trained weights.
    #print(torch.load(encoder_file))
    encoder.load_state_dict(torch.load(encoder_file))
    decoder.load_state_dict(torch.load(decoder_file))

    # Move models to GPU if CUDA is available.
    encoder.to(device)
    decoder.to(device)

    return encoder,decoder
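
A hedged usage sketch for get_model; the vocabulary size must match the one the checkpoints were trained with (8856 is just the value used elsewhere on this page).

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder, decoder = get_model(device, vocab_size=8856)   # placeholder vocab size
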
Example #23
# The size of the vocabulary.
vocab_size = len(vocab)

# Initialize the encoder and decoder. 
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

device = torch.device("cpu")
# encoder.to(device)
# decoder.to(device)

# Load the pretrained model
encoder.load_state_dict(torch.load(PRETRAINED_MODEL_PATH.format('encoder')))
decoder.load_state_dict(torch.load(PRETRAINED_MODEL_PATH.format('decoder')))

encoder.eval()
decoder.eval()

images, conv_images = next(iter(data_loader))
features = encoder(conv_images).unsqueeze(1)
output = decoder.sample(features, max_len=max_len)

word_list = []
for word_idx in output:
    if word_idx == vocab.word2idx[vocab.start_word]:
        continue
    if word_idx == vocab.word2idx[vocab.end_word]:
        break
    word_list.append(vocab.idx2word[word_idx])

print(' '.join(word_list))
def main():
    # Configuration for hyper-parameters
    config = Config()
    
    # Image preprocessing
    transform = config.train_transform
    
    # Load vocabulary wrapper
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    image_path = os.path.join(config.image_path, 'train2014')
    json_path = os.path.join(config.caption_path, 'captions_train2014.json')
    train_loader = get_data_loader(image_path, json_path, vocab, 
                                   transform, config.batch_size,
                                   shuffle=True, num_workers=config.num_threads) 
    total_step = len(train_loader)

    # Build Models
    teachercnn = EncoderCNN(config.embed_size)
    teachercnn.eval()
    studentcnn = StudentCNN_Model1(config.embed_size)
    #Load the best teacher model
    teachercnn.load_state_dict(torch.load(os.path.join('../TrainedModels/TeacherCNN', config.trained_encoder))) 
    studentlstm = DecoderRNN(config.embed_size, config.hidden_size/2, 
                         len(vocab), config.num_layers/2)

    if torch.cuda.is_available():
        teachercnn.cuda()
        studentcnn.cuda()
        studentlstm.cuda()

    # Loss and Optimizer
    criterion_lstm = nn.CrossEntropyLoss()
    criterion_cnn = nn.MSELoss()
    params = list(studentlstm.parameters()) + list(studentcnn.parameters())
    optimizer_lstm = torch.optim.Adam(params, lr=config.learning_rate)    
    optimizer_cnn = torch.optim.Adam(studentcnn.parameters(), lr=config.cnn_learningrate)    
    
    print('entering into the training loop')
    # Train the Models

    for epoch in range(config.num_epochs):
        for i, (images, captions, lengths, img_ids) in enumerate(train_loader):
            images = Variable(images)
            captions = Variable(captions)
            if torch.cuda.is_available():
                images = images.cuda()
                captions = captions.cuda()
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            # Forward, Backward and Optimize
            optimizer_lstm.zero_grad()
            optimizer_cnn.zero_grad()
            features_tr = teachercnn(images)
            features_st = studentcnn(images)
            outputs = studentlstm(features_st, captions, lengths)
            loss = criterion_cnn(features_st, features_tr.detach()) + criterion_lstm(outputs, targets)
            loss.backward()
            optimizer_cnn.step()
            optimizer_lstm.step()

            # Print log info
            if i % config.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      %(epoch, config.num_epochs, i, total_step, 
                        loss.data[0], np.exp(loss.data[0]))) 
                
            # Save the Model
            if (i+1) % config.save_step == 0:
                torch.save(studentlstm.state_dict(),
                           os.path.join(config.student_lstm_path,
                                        'decoder-%d-%d.pkl' % (epoch+1, i+1)))
                torch.save(studentcnn.state_dict(),
                           os.path.join(config.student_cnn_path,
                                        'encoder-%d-%d.pkl' % (epoch+1, i+1)))
Example #25
    if checkpoints:
        for cp in checkpoints:
            name, num = cp[:-4].split('_')
            num = int(num)
            if name == model_name and model_idx == num:
                state_dict = torch.load(
                    'checkpoint/{}_{}.tar'.format(model_name, num))
                encoder.load_state_dict(state_dict['encoder_state_dict'])
                decoder.load_state_dict(state_dict['decoder_state_dict'])
                #optimizer.load_state_dict(state_dict['optimizer_state_dict'])
                print('model_{}_{} is being used'.format(name,state_dict['epoch']))
                break 

    # test
    decoder.eval()
    encoder.eval()

    with torch.no_grad():
        all_ref = []
        all_pred = []
        #print('to device finish')
        for i, (images, batch_captions) in enumerate(BLEU4loader):
            if i >= 40:
                continue
            all_ref.extend(batch_captions)
            images = images.to(device)
            #all_ref.extend(batch_captions)
            
            # Generate a caption from the image
            feature = encoder(images)
            all_pred.extend(decoder.beam_search(feature))
Example #26
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    val_loader = get_loader('./data/val_resized2014/',
                            './data/annotations/captions_val2014.json', vocab,
                            transform, 1, False, 1)

    start_epoch = 0

    encoder_state = args.encoder
    decoder_state = args.decoder

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    if not args.train_encoder:
        encoder.eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    if args.restart:
        encoder_state, decoder_state = 'new', 'new'

    if encoder_state == '': encoder_state = 'new'
    if decoder_state == '': decoder_state = 'new'

    if decoder_state != 'new':
        start_epoch = int(decoder_state.split('-')[1])

    print("Using encoder: {}".format(encoder_state))
    print("Using decoder: {}".format(decoder_state))

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)
    """ Make logfile and log output """
    with open(args.model_path + args.logfile, 'a+') as f:
        f.write("Training on vanilla loss (using new model). Started {} .\n".
                format(str(datetime.now())))
        f.write("Using encoder: new\nUsing decoder: new\n\n")

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    batch_loss = []
    batch_acc = []

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(start_epoch, args.num_epochs):
        for i, (images, captions, lengths, _, _) in enumerate(data_loader):

            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            out = decoder(features, captions, lengths)
            loss = criterion(out, targets)
            batch_loss.append(loss.data[0])

            loss.backward()
            optimizer.step()

            # # Print log info
            # if i % args.log_step == 0:
            #     print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f, Val: %.5f, %.5f'
            #           %(epoch, args.num_epochs, i, total_step,
            #             loss.data[0], np.exp(loss.data[0]), acc, gt_acc))

            #     with open(args.model_path + args.logfile, 'a') as f:
            #         f.write('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f, Val: %.5f, %.5f\n'
            #               %(epoch, args.num_epochs, i, total_step,
            #                 loss.data[0], np.exp(loss.data[0]), acc, gt_acc))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                with open(args.model_path + 'training_loss.pkl', 'w+') as f:
                    pickle.dump(batch_loss, f)
                with open(args.model_path + 'training_val.pkl', 'w+') as f:
                    pickle.dump(batch_acc, f)
    with open(args.model_path + args.logfile, 'a') as f:
        f.write("Training finished at {} .\n\n".format(str(datetime.now())))
Example #27
def _main():
    parser = argparse.ArgumentParser()
    parser.add_argument("filename", help="(optional) path to photograph, for which a caption will be generated", nargs = "?")
    parser.add_argument("--host", help="(optional) host to start a webserver on. Default: 0.0.0.0", nargs = "?", default = "0.0.0.0")
    parser.add_argument("--port", help="(optional) port to start a webserver on. http://hostname:port/query", nargs = "?", type = int, default = 1985)
    parser.add_argument("--verbose", "-v", help="print verbose query information", action="store_true")
   
    global _args
    _args = parser.parse_args()

    if not _args.filename and not _args.port:
        parser.print_help()
        sys.exit(-1)

    global _device
    _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("PyTorch device = ", _device)

    # Load the vocabulary dictionary
    vocab_threshold = None
    vocab_file = "./vocab.pkl"
    start_word = "<start>"
    end_word   = "<end>"
    unk_word   = "<unk>"
    load_existing_vocab = True
    #annotations_file = "/opt/cocoapi/annotations/captions_train2014.json"
    annotations_file = None

    print("Loading vocabulary...")
    global _vocab
    _vocab = Vocabulary(vocab_threshold, vocab_file, start_word, end_word, unk_word, annotations_file, load_existing_vocab)
    vocab_size = len (_vocab)
    print("Vocabulary contains %d words" % vocab_size)

    # Load pre-trained models: 
    # encoder (Resnet + embedding layers)
    # decoder (LSTM)
    global _encoder
    global _decoder
    encoder_path = os.path.join("./models/", _encoder_file)
    decoder_path = os.path.join("./models/", _decoder_file)
    print("Loading ", encoder_path)
    _encoder = EncoderCNN(_embed_size)
    _encoder.load_state_dict(torch.load(encoder_path))
    _encoder.eval()
    _encoder.to(_device)

    print("Loading ", decoder_path)
    _decoder = DecoderRNN(_embed_size, _hidden_size, vocab_size, _num_layers)
    _decoder.load_state_dict(torch.load(decoder_path))
    _decoder.eval()
    _decoder.to(_device)

    # Caption the photo, or start a web server if no photo specified
    if _args.filename:
        _get_prediction_from_file(_args.filename)
    else:
        global _app
        global _api

        _app = Flask(__name__)
        _api = Api(_app)

        _api.add_resource(ImageCaptionResource,
                "/v1/caption",
                "/v1/caption/")
        _app.run(host = _args.host, port = _args.port)
Example #28
def extract(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.Resize(SIZE),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval(
    )  # eval mode (batchnorm uses moving mean/variance)
    # decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)

    dissection.retain_layers(encoder, [
        ('resnet.7.2.relu', 'final_layer'),
    ])

    encoder = encoder.to(device)
    # decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    # decoder.load_state_dict(torch.load(args.decoder_path))

    encoder.eval()

    # Load data
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    # Run the models
    with torch.no_grad():
        total_step = len(data_loader)
        os.makedirs(os.path.join(PARENT_DIR, 'results', 'activations'),
                    exist_ok=True)
        path = os.path.join(PARENT_DIR, 'results', 'samples.txt')
        with open(path, 'w') as results_file:
            start = time.time()
            for batch, (images, captions, lengths) in enumerate(data_loader):

                # Set mini-batch dataset
                images = images.to(device)
                # captions = captions.to(device)
                # targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

                # Forward, backward and optimize
                features = encoder(images)
                # outputs = decoder(features, captions, lengths)
                # loss = criterion(outputs, targets)
                # decoder.zero_grad()
                # encoder.zero_grad()
                # loss.backward()
                # optimizer.step()

                activations = encoder.retained['final_layer']

                images = dissection.ReverseNormalize(
                    (0.485, 0.456, 0.406), (0.229, 0.224, 0.225))(images)
                images = images.cpu().numpy().transpose([0, 2, 3, 1])
                activations = activations.cpu().numpy()

                scores = np.max(activations, axis=(-1, -2))
                samples = np.argmax(scores, axis=-1)
                gathered = activations[np.arange(len(samples)),
                                       samples].transpose([1, 2, 0])
                mask = cv2.resize(gathered, SIZE).transpose([2, 0, 1])
                k = int(0.8 * mask.size)
                threshold = np.partition(mask, k, axis=None)[k]
                mask = mask >= threshold
                mask = np.expand_dims(mask, axis=-1)
                outimg = np.concatenate((images, (1 + mask) / 2.), axis=-1)
                # outimg = outimg * mask
                activations = outimg

                for i, sample in enumerate(samples):
                    i += args.batch_size * batch
                    results_file.write('{} {}\n'.format(i, sample))
                for i, activation in enumerate(activations):
                    i += args.batch_size * batch
                    path = os.path.join(PARENT_DIR, 'results', 'activations',
                                        '{}.png'.format(i))
                    outactivation = skimage.img_as_ubyte(activation)
                    imageio.imwrite(path, outactivation)
                clock = time.time()
                delay = clock - start
                start = clock
                max_batch = 100
                # print('Step {}/{}: Time = {:.2f}'.format(batch, len(data_loader), delay))
                print('Step {}/{}: Time = {:.2f}'.format(
                    batch, max_batch, delay))
                if batch == max_batch:
                    break
def main():

    #write predicted caption
    if not os.path.exists(args['generate_caption_path']):
        os.makedirs(args['generate_caption_path'])

    caption_string = os.path.join(args['generate_caption_path'], "caption_ncrt_class5.txt")   
    #mode = "a" if os.path.exists(caption_string) else "w"
    fp = open(caption_string, "w+")
    
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.9638, 0.9638, 0.9638), 
                             (0.1861, 0.1861, 0.1861))])
    
    # Load vocabulary wrapper
    with open(args['vocab_path'], 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args['embed_size'])
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args['embed_size'], args['hidden_size'], 
                         len(vocab), args['num_layers'], max_seq_length=50)
    decoder.eval()
    

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args['encoder_path']))
    decoder.load_state_dict(torch.load(args['decoder_path']))
    
    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    
    # Prepare Image
    image_dir = args['image_path']
    images = os.listdir(image_dir)
    i = 1
    for image_id in images:
        #print('i->',i)
        #i = i+1  
        if not image_id.endswith('.jpg'):
            continue
        image = os.path.join(image_dir, image_id)
        image = load_image(image, transform)
        image_tensor = image.cuda()
        
        # Generate caption from image
        try:
            feature, cnn_features = encoder(image_tensor)
            sampled_ids = decoder.sample(feature, cnn_features)
            sampled_ids = sampled_ids.cpu().data.numpy()
        except:
              continue
        #print('image_ids->',image_id)      
        # Decode word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)
        print ('i->', i, image_id + '\t' + sentence)
        fp.write(image_id)
        fp.write('\t')
        fp.write(sentence)
        if i<398:
           fp.write("\n")
        i = i+1         
        
    fp.close()
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    '''
    # Load vocabulary wrapper
    with open(args.inverse_object_id_mapping, 'rb') as f:
        inverse_object_id_mapping = pickle.load(f)
    num_objects = len(inverse_object_id_mapping.keys())
    '''

    # Build models
    encoderCNN = EncoderCNN(args.embed_size).eval(
    )  # eval mode (batchnorm uses moving mean/variance)
    encoderRNN = EncoderRNN(num_objects, args.embed_size, args.hidden_size)
    model = Model(num_objects, args.embed_size)
    encoderCNN = encoderCNN.to(device)
    encoderRNN = encoderRNN.to(device)
    model = model.to(device)
    encoderCNN.eval()
    encoderRNN.eval()
    model.eval()

    # Load the trained model parameters
    encoderCNN.load_state_dict(torch.load(args.encoderCNN_path))
    encoderRNN.load_state_dict(torch.load(args.encoderRNN_path))
    model.load_state_dict(torch.load(args.model_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    image_features = encoderCNN(image_tensor)
    input = torch.LongTensor([[[1]]]).to(device)
    h0 = torch.zeros((1, 1, args.hidden_size)).to(device)
    c0 = torch.zeros((1, 1, args.hidden_size)).to(device)
    max_seqlen = 10
    result = []
    K = 3
    all_candidates = [([1], 1.0, h0, c0) for i in range(K)]
    for i in range(max_seqlen):
        Q = []
        for _k in range(K):
            if i == 0 and _k == 1:  # first word
                break

            hashtag_features, (h0,
                               c0), Ul = encoderRNN(input[_k],
                                                    all_candidates[_k][2],
                                                    all_candidates[_k][3])
            outputs = model(image_features, hashtag_features, Ul)
            prob, topk = torch.topk(outputs, 20, dim=1)
            tup = list(zip(topk[0].cpu().tolist(), prob[0].cpu().tolist()))
            topk = [a for a in tup if a[0] not in all_candidates[_k][0]]
            try:
                topk.remove(1)
                topk.remove(0)
            except:
                pass

            for _k_ in range(K):
                Q.append((all_candidates[_k][0] + [topk[_k_][0]],
                          abs(all_candidates[_k][1] * topk[_k_][1]), h0, c0))

        all_candidates = sorted(Q, key=lambda x: x[1], reverse=True)[:K]
        input = []
        for _k in range(K):
            input.append([[all_candidates[_k][0][-1]]])
        input = torch.LongTensor(input).to(device)
        #result.append(top1.cpu().numpy()[0][0])
    result = sorted(all_candidates, key=lambda x: x[1], reverse=True)
    result = [i[0] for i in result]
    print(result)
    for i in range(K):
        tmp = [inverse_object_id_mapping[j] for j in result[i]]
        final = zip([j['name'] for j in tmp],
                    [j['supercategory'] for j in tmp])
        for j in final:
            print(j)
        print("-" * 50)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
def main(args):
    with open('./data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    encoder = EncoderCNN(256)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(256, 512, len(vocab), 1)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder))
    decoder.load_state_dict(torch.load(args.decoder))

    if args.test_prop0:
        decoder.test_h_from_c()
        return

    if args.test_c_step:
        data_points = test(encoder, decoder, vocab, args.num_samples,
                           args.num_hints)

        with open(args.filepath, 'w+') as f:
            pickle.dump(data_points, f)

        print("Done sampling for c_step evaluation. Data saved to {}".format(
            args.filepath))

        return

    measurement_score = test(encoder, decoder, vocab, args.num_samples,
                             args.num_hints, args.debug, args.c_step,
                             args.no_avg)
    if args.msm == "ps":
        if not args.no_avg:
            print "ground truth prediction score without update\n" + str(
                measurement_score[0])
            print "ground truth prediction score with update\n" + str(
                measurement_score[1])
            print "Difference\n" + str(measurement_score[1] -
                                       measurement_score[0])
        else:
            with open(args.filepath, 'w+') as f:
                pickle.dump(measurement_score, f)
            print "Done. Data saved to {}".format(args.filepath)
    elif args.msm == "ce":
        if not args.no_avg:
            print "Cross Entropy Loss without update\n" + str(
                measurement_score[0])
            print "Cross Entropy Loss with update\n" + str(
                measurement_score[1])
            print "Difference\n" + str(measurement_score[1] -
                                       measurement_score[0])
        else:
            with open(args.filepath, 'w+') as f:
                pickle.dump(measurement_score, f)
            print "Done. Data saved to {}".format(args.filepath)
    elif args.msm == "co":
        scores = cocoEval()
        scores_u = cocoEval(res='data/captions_val2014_results_u.json')
        print(scores)
        print(scores_u)
Example #32
def main():
    # Configuration for hyper-parameters

    torch.cuda.set_device(0)
    config = Config()
    # Image preprocessing
    transform = config.train_transform
    # Load vocabulary wrapper
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)
    # Build data loader
    train_image_path = os.path.join(config.image_path, 'train2017')
    json_path = os.path.join(config.caption_path, 'captions_train2017.json')
    train_loader = get_data_loader(train_image_path,
                                   json_path,
                                   vocab,
                                   transform,
                                   config.batch_size,
                                   shuffle=False,
                                   num_workers=config.num_threads)

    val_image_path = os.path.join(config.image_path, 'val2017')
    json_path = os.path.join(config.caption_path, 'captions_val2017.json')
    val_loader = get_data_loader(val_image_path,
                                 json_path,
                                 vocab,
                                 transform,
                                 config.batch_size,
                                 shuffle=False,
                                 num_workers=config.num_threads)

    total_step = len(train_loader)

    # Build Models
    encoder = EncoderCNN(config.embed_size)
    encoder.eval()
    decoder = DecoderRNN(config.embed_size, config.hidden_size, len(vocab),
                         config.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
    optimizer = torch.optim.Adam(params, lr=config.learning_rate)

    print('entering into the training loop')
    # Train the Models

    with open('train1_log.txt', 'w') as logfile:
        logfile.write('Validation Error,Training Error')
        for epoch in range(0, 25):
            for i, (images, captions, lengths,
                    img_ids) in enumerate(train_loader):
                images = Variable(images)
                captions = Variable(captions)
                if torch.cuda.is_available():
                    images = images.cuda()
                    captions = captions.cuda()
                targets = pack_padded_sequence(captions,
                                               lengths,
                                               batch_first=True)[0]
                # Forward, Backward and Optimize
                optimizer.zero_grad()
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                # Print log info
                if i % config.log_step == 0:
                    print(
                        'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                        % (epoch, config.num_epochs, i, total_step,
                           loss.data[0], np.exp(loss.data[0])))

                # Save the Model
                if (i + 1) % config.save_step == 0:
                    torch.save(
                        encoder.state_dict(),
                        os.path.join(config.teacher_cnn_path,
                                     'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                    torch.save(
                        decoder.state_dict(),
                        os.path.join(config.teacher_lstm_path,
                                     'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))

            print('Just Completed an Epoch, Initiate Validation Error Test')
            avgvalloss = 0
            for j, (images, captions, lengths,
                    img_ids) in enumerate(val_loader):
                images = Variable(images)
                captions = Variable(captions)
                if torch.cuda.is_available():
                    images = images.cuda()
                    captions = captions.cuda()
                targets = pack_padded_sequence(captions,
                                               lengths,
                                               batch_first=True)[0]
                optimizer.zero_grad()
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                valloss = criterion(outputs, targets)
                if j == 0:
                    avgvalloss = valloss.data[0]
                avgvalloss = (avgvalloss + valloss.data[0]) / 2
                if ((j + 1) % 1000 == 0):
                    print('Average Validation Loss: %.4f' % (avgvalloss))
                    logfile.write(
                        str(avgvalloss) + ',' + str(loss.data[0]) + str('\n'))
                    break