Example 1
def main(args):
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
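    # Build the encoder and decoder with the sizes used at training time
    # (embed size 256, hidden size 512, one layer)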
    encoder = EncoderCNN(256)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(256, 512, len(vocab), 1)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder))
    decoder.load_state_dict(torch.load(args.decoder))

    measurement_score = test(encoder, decoder, vocab, args.num_samples,
                             args.num_hints, args.debug, args.c_step,
                             args.no_avg)
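    # For the COCO metric option, score both result files and pickle the scores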
    if args.msm == "co":
        scores = cocoEval()
        scores_u = cocoEval(res='data/captions_val2014_results_u.json')
        print(scores)
        print(scores_u)

        with open(args.filepath, 'wb') as f:  # binary mode: pickle writes bytes
            pickle.dump((scores, scores_u), f)
Example 2
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    # encoder = EncoderCNN(args.embed_size)
    # encoder.eval()  # evaluation mode (BN uses moving mean/variance)
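    # A layout encoder stands in for the image CNN (commented out above)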
    layout_encoder = LayoutEncoder(args.layout_embed_size, args.embed_size, 100, args.num_layers)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)
    

    # Load the trained model parameters
    layout_encoder.load_state_dict(torch.load(args.layout_encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    
    # If use gpu
    if torch.cuda.is_available():
        layout_encoder.cuda()
        decoder.cuda()

    # validation(layout_encoder, decoder, args, vocab, transform, args.batch_size)
    out = save_output(layout_encoder, decoder, args, vocab, transform, args.batch_size)
    with open('bsl_output.txt', 'w') as outfile:
        json.dump(out, outfile)
Example 3
def main(args):
    # Val images folder
    filepath = '/scratch/ys2542/pytorch-tutorial/tutorials/03-advanced/image_captioning/data/resizedval2014'
    onlyfiles = [fl for fl in listdir(filepath) if isfile(join(filepath, fl))]

    # image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # load vocabulary wrapper pickle file
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(args.embed_size)  # build encoder
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)  # build decoder

    # load the trained CNN and RNN parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Move the models to the GPU once, if available
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Caption every image in the val folder
    badsize = 0  # count images that fail to caption
    for i in onlyfiles:
        args_image = filepath + '/' + i  # full path to this image

        # transform image and wrap it in a tensor
        image = load_image(args_image, transform)
        image_tensor = to_var(image, volatile=True)

        # generate caption from image
        try:
            feature = encoder(image_tensor)
            sampled_ids = decoder.sample(feature)
            sampled_ids = sampled_ids.cpu().data.numpy()

            # decode word_ids to words
            sampled_caption = []
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                if word == '<end>':
                    break
            sentence = ' '.join(sampled_caption)

            # print the image name and caption without <start>/<end>
            print('beam_size_1' + '\t' + i + '\t' + sentence[8:-8])

        except Exception:
            badsize = badsize + 1  # this image could not be captioned
Example 4
def main():
    # Load vocabulary wrapper.
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(4096, embed_dim)
    decoder = DecoderRNN(embed_dim, hidden_size, len(vocab), num_layers_rnn)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters())
    optimizer = torch.optim.Adam(params, lr=0.001)

    #load data
    with open(image_data_file, 'rb') as f:
        image_data = pickle.load(f)
    image_features = si.loadmat(image_feature_file)

    img_features = image_features['fc7'][0]
    img_features = np.concatenate(img_features)
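    # img_features now holds the fc7 activations for all images, stacked row-wise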

    print('here')
    iteration = 0
    save_loss = []
    for i in range(10):  # epoch
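        # i % 5 cycles through the reference captions, one caption index per epoch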
        use_caption = i % 5
        print('Epoch', i)
        for x, y in make_mini_batch(img_features,
                                    image_data,
                                    use_caption=use_caption):
            word_padding, lengths = make_word_padding(y, vocab)

            x = Variable(torch.from_numpy(x).cuda())
            word_index = Variable(torch.from_numpy(word_padding).cuda())

            encoder.zero_grad()
            decoder.zero_grad()

            features = encoder(x)
            targets = pack_padded_sequence(word_index,
                                           lengths,
                                           batch_first=True)[0]
            outputs = decoder(features, word_index, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            if iteration % 100 == 0:
                print('loss', loss.data[0])
                save_loss.append(loss.data[0])

            iteration += 1

        torch.save(decoder.state_dict(), 'decoder.pkl')
        torch.save(encoder.state_dict(), 'encoder.pkl')
        with open('losses.txt', 'w') as f:
            print(save_loss, file=f)  # 'losses' was undefined; log the collected save_loss
Example 5
def main(image):
    # Configuration for hyper-parameters
    config = Config()

    # Image Preprocessing
    transform = config.test_transform

    # Load vocabulary
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(config.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(config.embed_size, config.hidden_size, len(vocab),
                         config.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(
        torch.load(
            os.path.join(config.teacher_cnn_path, config.trained_encoder)))
    decoder.load_state_dict(
        torch.load(
            os.path.join(config.teacher_lstm_path, config.trained_decoder)))
    # Prepare Image
    image = Image.open(image)
    image_tensor = Variable(transform(image).unsqueeze(0))

    # Set initial states
    state = (Variable(torch.zeros(config.num_layers, 1, config.hidden_size)),
             Variable(torch.zeros(config.num_layers, 1, config.hidden_size)))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        state = [s.cuda() for s in state]
        image_tensor = image_tensor.cuda()

    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature, state)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word_id == 96:  # hard-coded id, presumably the <end> token in this vocab
            sampled_caption.append('<end>')
            break
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out image and generated caption.
    print(sentence)
    return sentence
Example 6
def main(args):
    vector_dir = '/root/server/best_model/'

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    # encoder = EncoderCNN(args.embed_size)
    qvecs_pca = np.load(
        os.path.join(vector_dir, "q_2{}.npy".format(args.embed_size)))
    # encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    # encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    #image = load_image(args.image, transform)
    #image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        # encoder.cuda()
        decoder.cuda()

    data = []
    # img_path = args.image
    # # Prepare Image
    # image = load_image(img_path, transform)
    # image_tensor = to_var(image, volatile=True)
    # Generate caption from image
    # feature = encoder(image_tensor)
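    # Hard-coded index: caption a single pre-computed (PCA-reduced) feature vector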
    num = 29
    feature = torch.from_numpy(qvecs_pca[num:num + 1, :])
    if torch.cuda.is_available():
        feature = feature.cuda()
    #pdb.set_trace()
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        if word == '<start>':
            continue
        if word == '<end>':
            break
        sampled_caption.append(word)
    sentence = ' '.join(sampled_caption)
    # Print out image and generated caption.
    print(sentence)
Example 7
def main(args):

    # Image preprocessing
    transform = transforms.Compose([ 
        transforms.ToTensor(), 
        transforms.Normalize((0.033, 0.032, 0.033), 
                             (0.027, 0.027, 0.027))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    #encoder = AttnEncoder(ResidualBlock, [3, 3, 3])
    encoder = ResNet(ResidualBlock, [3, 3, 3], args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    # decoder = AttnDecoderRnn(args.feature_size, args.hidden_size, 
    #                     len(vocab), args.num_layers)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)

    print('loading model parameters')

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    print('model parameters loaded')

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda(1)
        decoder.cuda(1)


    trg_bitmap_dir = args.root_path + 'bitmap/'
    save_directory = 'predict_base/'
    svg_from_out = args.root_path + save_directory + 'svg/'   # svg from output caption 
    bitmap_from_out = args.root_path + save_directory + 'bitmap/'   #bitmap from out caption 

    if not os.path.exists(bitmap_from_out):
        os.makedirs(bitmap_from_out)
    if not os.path.exists(svg_from_out):
        os.makedirs(svg_from_out)
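    # Caption every target bitmap, emit an SVG from the predicted tokens,
    # then rasterize that SVG back to a bitmap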

    test_list = os.listdir(trg_bitmap_dir)
    for i, fname in enumerate(test_list): 
        print(fname)
        test_path = trg_bitmap_dir + fname
        test_image = load_image(test_path, transform)
        image_tensor = to_var(test_image)
        in_sentence = gen_caption_from_image(image_tensor, encoder, decoder, vocab)
        print(in_sentence)
        image_matrix = cv2.imread(test_path)
        doc = gen_svg_from_predict(in_sentence.split(' '), image_matrix)

        with open(os.path.join(svg_from_out, fname.split('.')[0]+'.svg'), 'w+') as f:
            f.write(doc)
        cairosvg.svg2png(url=svg_from_out + fname.split('.')[0] + '.svg',
                         write_to=bitmap_from_out + fname)
Example 8
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Strip the <start>/<end> tokens and periods from the caption
    sentence = (sentence.replace('<start> ', '').replace(' <end>', '')
                .replace('.', '').strip())
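    # Translate the caption to Indonesian, synthesize speech with gTTS, and play it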
    translator = Translator()
    sentence_indo = translator.translate(sentence, dest='id').text
    print('This is an image of: ' + sentence_indo)
    tts = gTTS(sentence_indo, 'id')
    tts.save('result.mp3')
    playsound('result.mp3')

    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
    plt.show()
Example 9
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    alexnet = models.alexnet(pretrained=True)
    alexnet2 = AlexNet2(alexnet)
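    # AlexNet2 exposes the fc7 activations that EncoderCNN consumes below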
    # Build Models
    encoder = EncoderCNN(4096, args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = Image.open(args.image)
    image_tensor = Variable(transform(image).unsqueeze(0))

    # Set initial states
    state = (Variable(torch.zeros(args.num_layers, 1, args.hidden_size)),
             Variable(torch.zeros(args.num_layers, 1, args.hidden_size)))

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        alexnet2.cuda()
        state = [s.cuda() for s in state]
        image_tensor = image_tensor.cuda()

    # Generate caption from image
    alexnet2(image_tensor)
    feature = encoder(alexnet2.fc7_value)
    sampled_ids = decoder.sample(feature, state)
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out image and generated caption.
    print(sentence)
Example 10
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    #image = load_image(args.image, transform)
    #image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    data = []
    try:
        img_path = args.image
        # Prepare Image
        image = load_image(img_path, transform)
        image_tensor = to_var(image, volatile=True)
        # Generate caption from image
        feature = encoder(image_tensor)
        #pdb.set_trace()
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids.cpu().data.numpy()

        # Decode word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            if word == '<start>':
                continue
            if word == '<end>':
                break
            sampled_caption.append(word)
        sentence = ' '.join(sampled_caption)
        # Print out image and generated caption.
        print(sentence)
        data.append({'key': img_path.split('/')[-1], 'sentence': sentence})
    except Exception:  # report which image failed
        print(img_path)
Example 11
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)
    

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))
    
    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    
    # Prepare Image
    image_dir = args.image
    images = os.listdir(image_dir)
    for image_id in images:
        if not image_id.endswith('.jpg'):
            continue
        image = os.path.join(image_dir, image_id)
        image = load_image(image, transform)
        image_tensor = to_var(image, volatile=True)
        
        # Generate caption from image
        try:
            feature, cnn_features = encoder(image_tensor)
            sampled_ids = decoder.sample(feature, cnn_features)
            sampled_ids = sampled_ids.cpu().data.numpy()
        except Exception:  # skip images the encoder/decoder cannot process
            continue
        
        # Decode word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)
        
        # Print out image and generated caption.
        print(image_id + '\t' + sentence)
Example 12
def main(args):   
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    
    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)
    

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)
    
    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    
    # Generate caption from image
    feature = encoder(image_tensor)
    sentence = decode(feature,[],decoder,vocab)

    print (sentence)
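    # If the user rejects the caption, enter a teaching loop: feed ground-truth
    # words one at a time and track the BLEU score of each re-decoded sentence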
    user_input = input("Does it make sense to you? (y/n)\n")

    if str(user_input) == "n":
        with open('data/step_1/caption_1.txt', 'r') as f:
            ground_truth = f.read()
        teach_wordid = []
        teach_wordid.append(vocab.word2idx["<start>"])
        while True:
            print("This is the ground truth:\n" + ground_truth + "\n" +
                  "###################################################\n")
            reference = ground_truth.split()
            hypothesis = sentence.split()
            BLEUscore = nltk.translate.bleu_score.sentence_bleu([reference], hypothesis)
            print("Current BLEU score is " + str(BLEUscore))
            word = input("next word:\n")
            word_idx = vocab.word2idx[word]
            teach_wordid.append(word_idx)
            sentence = decode(feature, teach_wordid, decoder, vocab)
            print("###################################################\n")
            print("Current translated sentence is: \n" + sentence + "\n")
Example 13
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    # Purely cosmetic progress bar: it fills to 100 immediately
    bar = Bar('Processing', max=100)
    for i in range(100):
        bar.next()

    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids.cpu().data.numpy()
    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    bar.finish()
    # Print out image and generated caption.
    print("\n")
    print(sentence)
    image = Image.open(args.image)
    imgplot = plt.imshow(np.asarray(image))
    plt.show()
Example 14
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)
    

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)
    
    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    
    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids.cpu().data.numpy()
    
    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    
    # Print out image and generated caption.
    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
    plt.show()
Example 15
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        #transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare Image
    image = load_image(args.image, transform)
    image_tensor = to_var(image, volatile=True)

    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
        image_tensor = image_tensor.cuda()
    # Generate caption from image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature, args.length)  # args.length bounds the sampled caption length
    sampled_ids = sampled_ids.cpu().data.numpy()

    # Decode word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        if word != '<start>' and word != '<end>':
            sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ''.join(sampled_caption)

    # Print out image and generated caption.
    print(sentence)
Example 16
def runTest(n_layers, hidden_size, reverse, modelFile, beam_size, input,
            corpus):

    voc, pairs, valid_pairs, test_pairs = loadPrepareData(corpus)

    print('Building encoder and decoder ...')
    '''# attribute embeddings
    attr_size = 64
    attr_num = 2
    with open(os.path.join(save_dir, 'user_item.pkl'), 'rb') as fp:
        user_dict, item_dict = pickle.load(fp)
    num_user = len(user_dict)
    num_item = len(item_dict)
    attr_embeddings = []
    attr_embeddings.append(nn.Embedding(num_user, attr_size))    
    attr_embeddings.append(nn.Embedding(num_item, attr_size)) 
    if USE_CUDA:
        for attr_embedding in attr_embeddings:
            attr_embedding = attr_embedding.cuda()
   
    encoder = AttributeEncoder(attr_size, attr_num, hidden_size, attr_embeddings, n_layers)
    '''
    embedding = nn.Embedding(voc.n_words, hidden_size,
                             padding_idx=0)  # word embedding
    encoder = EncoderRNN(voc.n_words, hidden_size, embedding, n_layers)
    attn_model = 'concat'
    decoder = DecoderRNN(embedding, hidden_size, voc.n_words, n_layers)

    checkpoint = torch.load(modelFile)
    encoder.load_state_dict(checkpoint['en'])
    decoder.load_state_dict(checkpoint['de'])
    # train mode off: affects only dropout and batch norm layers
    encoder.train(False)
    decoder.train(False)

    if USE_CUDA:
        encoder = encoder.cuda()
        decoder = decoder.cuda()

    evaluateRandomly(encoder, decoder, voc, pairs, reverse, beam_size, 2)
Example 17
def main(args):

    #setup tensorboard
    if args.tensorboard:
        cc = CrayonClient(hostname="localhost")
        print(cc.get_experiment_names())
        #if args.name in cc.get_experiment_names():
        try:
            cc.remove_experiment(args.name)
        except Exception:
            print("experiment didn't exist")
        cc_server = cc.create_experiment(args.name)

    # Create model directory
    full_model_path = args.model_path + "/" + args.name
    if not os.path.exists(full_model_path):
        os.makedirs(full_model_path)
    with open(full_model_path + "/parameters.json", 'w') as f:
        f.write((json.dumps(vars(args))))

    # Image preprocessing

    transform = transforms.Compose([
        transforms.Scale(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    mini_transform = transforms.Compose(
        [transforms.ToPILImage(),
         transforms.Scale(20),
         transforms.ToTensor()])

    # Load vocabulary wrapper.
    if args.vocab_path is not None:
        with open(args.vocab_path, 'rb') as f:
            vocab = pickle.load(f)
    else:
        print("building new vocab")
        vocab = build_vocab(args.image_dir, 1, None)
        with open((full_model_path + "/vocab.pkl"), 'wb') as f:
            pickle.dump(vocab, f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)
    code_data_set = ProcessingDataset(root=args.image_dir,
                                      vocab=vocab,
                                      transform=transform)
    train_ds, val_ds = validation_split(code_data_set)
    train_loader = torch.utils.data.DataLoader(train_ds, collate_fn=collate_fn)
    test_loader = torch.utils.data.DataLoader(val_ds, collate_fn=collate_fn)
    train_size = len(train_loader)
    test_size = len(test_loader)
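    # data_loader drives training; train/test loaders back the (disabled) eval pass below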

    # Build the models
    encoder = EncoderCNN(args.embed_size, args.train_cnn)
    print(encoder)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    print(decoder)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    #params = list(decoder.parameters()) #+ list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    start_time = time.time()
    add_log_entry(args.name, start_time, vars(args))

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            decoder.train()
            encoder.train()
            # Set mini-batch dataset
            image_ts = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]
            count = images.size()[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(image_ts)
            outputs = decoder(features, captions, lengths)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total = targets.size(0)
            max_index = outputs.max(dim=1)[1]
            #correct = (max_index == targets).sum()
            _, predicted = torch.max(outputs.data, 1)
            correct = predicted.eq(targets.data).cpu().sum()
            accuracy = 100. * correct / total
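            # token-level accuracy over the packed targets in this batch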

            if args.tensorboard:
                cc_server.add_scalar_value("train_loss", loss.data[0])
                cc_server.add_scalar_value("perplexity", np.exp(loss.data[0]))
                cc_server.add_scalar_value("accuracy", accuracy)

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, accuracy: %2.2f Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       accuracy, np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(full_model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(full_model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
            if 1 == 2 and i % int(train_size / 10) == 0:  # eval pass intentionally disabled ('1 == 2' is never True)
                encoder.eval()
                #decoder.eval()
                correct = 0
                for ti, (timages, tcaptions,
                         tlengths) in enumerate(test_loader):
                    timage_ts = to_var(timages, volatile=True)
                    tcaptions = to_var(tcaptions)
                    ttargets = pack_padded_sequence(tcaptions,
                                                    tlengths,
                                                    batch_first=True)[0]
                    tfeatures = encoder(timage_ts)
                    toutputs = decoder(tfeatures, tcaptions, tlengths)
                    print(ttargets)
                    print(toutputs)
                    print(ttargets.size())
                    print(toutputs.size())
                    #correct = (ttargets.eq(toutputs[0].long())).sum()

                accuracy = 100 * correct / test_size
                print('accuracy: %.4f' % (accuracy))
                if args.tensorboard:
                    cc_server.add_scalar_value("accuracy", accuracy)

    torch.save(
        decoder.state_dict(),
        os.path.join(full_model_path,
                     'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
    torch.save(
        encoder.state_dict(),
        os.path.join(full_model_path,
                     'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
    end_time = time.time()
    print("finished training, runtime: %d", [(end_time - start_time)])
def main():
    # Configuration for hyper-parameters
    config = Config()
    
    # Image preprocessing
    transform = config.train_transform
    
    # Load vocabulary wrapper
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    image_path = os.path.join(config.image_path, 'train2014')
    json_path = os.path.join(config.caption_path, 'captions_train2014.json')
    train_loader = get_data_loader(image_path, json_path, vocab, 
                                   transform, config.batch_size,
                                   shuffle=True, num_workers=config.num_threads) 
    total_step = len(train_loader)

    # Build Models
    teachercnn = EncoderCNN(config.embed_size)
    teachercnn.eval()
    studentcnn = StudentCNN_Model1(config.embed_size)
    #Load the best teacher model
    teachercnn.load_state_dict(torch.load(os.path.join('../TrainedModels/TeacherCNN', config.trained_encoder))) 
    studentlstm = DecoderRNN(config.embed_size, config.hidden_size // 2,
                             len(vocab), config.num_layers // 2)  # // keeps the sizes integral

    if torch.cuda.is_available():
        teachercnn.cuda()
        studentcnn.cuda()
        studentlstm.cuda()

    # Loss and Optimizer
    criterion_lstm = nn.CrossEntropyLoss()
    criterion_cnn = nn.MSELoss()
    params = list(studentlstm.parameters()) + list(studentcnn.parameters())
    optimizer_lstm = torch.optim.Adam(params, lr=config.learning_rate)    
    optimizer_cnn = torch.optim.Adam(studentcnn.parameters(), lr=config.cnn_learningrate)    
    
    print('entering the training loop')
    # Train the Models
    for epoch in range(config.num_epochs):
        for i, (images, captions, lengths, img_ids) in enumerate(train_loader):
            images = Variable(images)
            captions = Variable(captions)
            if torch.cuda.is_available():
                images = images.cuda()
                captions = captions.cuda()
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            # Forward, Backward and Optimize
            optimizer_lstm.zero_grad()
            optimizer_cnn.zero_grad()
            features_tr = teachercnn(images)
            features_st = studentcnn(images)
            outputs = studentlstm(features_st, captions, lengths)
            # distillation loss (MSE to teacher features) + caption cross-entropy
            loss = criterion_cnn(features_st, features_tr.detach()) + \
                criterion_lstm(outputs, targets)
            loss.backward()
            optimizer_cnn.step()
            optimizer_lstm.step()
     
            # Print log info
            if i % config.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      % (epoch, config.num_epochs, i, total_step,
                         loss.data[0], np.exp(loss.data[0])))
                
            # Save the Model
            if (i + 1) % config.save_step == 0:
                torch.save(studentlstm.state_dict(),
                           os.path.join(config.student_lstm_path,
                                        'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(studentcnn.state_dict(),
                           os.path.join(config.student_cnn_path,
                                        'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
Example 19
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    val_loader = get_loader('./data/val_resized2014/',
                            './data/annotations/captions_val2014.json', vocab,
                            transform, 1, False, 1)

    start_epoch = 0

    encoder_state = args.encoder
    decoder_state = args.decoder

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    if not args.train_encoder:
        encoder.eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    if args.restart:
        encoder_state, decoder_state = 'new', 'new'

    if encoder_state == '': encoder_state = 'new'
    if decoder_state == '': decoder_state = 'new'

    if decoder_state != 'new':
        start_epoch = int(decoder_state.split('-')[1])

    print("Using encoder: {}".format(encoder_state))
    print("Using decoder: {}".format(decoder_state))

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)
    """ Make logfile and log output """
    with open(args.model_path + args.logfile, 'a+') as f:
        f.write("Training on vanilla loss. Started {} .\n".format(
            str(datetime.now())))
        f.write("Using encoder: {}\nUsing decoder: {}\n\n".format(
            encoder_state, decoder_state))

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    batch_loss = []
    batch_acc = []

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(start_epoch, args.num_epochs):
        for i, (images, captions, lengths, _, _) in enumerate(data_loader):

            # Set mini-batch dataset
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            out = decoder(features, captions, lengths)
            loss = criterion(out, targets)
            batch_loss.append(loss.data[0])

            loss.backward()
            optimizer.step()

            # # Print log info
            # if i % args.log_step == 0:
            #     print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f, Val: %.5f, %.5f'
            #           %(epoch, args.num_epochs, i, total_step,
            #             loss.data[0], np.exp(loss.data[0]), acc, gt_acc))

            #     with open(args.model_path + args.logfile, 'a') as f:
            #         f.write('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f, Val: %.5f, %.5f\n'
            #               %(epoch, args.num_epochs, i, total_step,
            #                 loss.data[0], np.exp(loss.data[0]), acc, gt_acc))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                with open(args.model_path + 'training_loss.pkl', 'wb') as f:
                    pickle.dump(batch_loss, f)
                with open(args.model_path + 'training_val.pkl', 'wb') as f:
                    pickle.dump(batch_acc, f)
    with open(args.model_path + args.logfile, 'a') as f:
        f.write("Training finished at {} .\n\n".format(str(datetime.now())))
Example 20
# Specify values for embed_size and hidden_size - we use the same values as in training step
embed_size = 256
hidden_size = 512

# Get the vocabulary and its size
vocab = data_loader.dataset.vocab
vocab_size = len(vocab)

# Initialize the encoder and decoder, and set each to inference mode
encoder = EncoderCNN(embed_size)
encoder.eval()
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
decoder.eval()

# Load the pre-trained weights
encoder.load_state_dict(checkpoint['encoder'])
decoder.load_state_dict(checkpoint['decoder'])

# Move models to GPU if CUDA is available.
if torch.cuda.is_available():
    encoder.cuda()
    decoder.cuda()


x = get_prediction(data_loader, encoder, decoder, vocab)
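# Run the loaded models on a sample from the data loader to get a caption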


print(x)
Example 21
def main():

    #write predicted caption
    if not os.path.exists(args['generate_caption_path']):
        os.makedirs(args['generate_caption_path'])

    caption_string = os.path.join(args['generate_caption_path'], "caption_ncrt_class5.txt")   
    #mode = "a" if os.path.exists(caption_string) else "w"
    fp = open(caption_string, "w+")
    
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.9638, 0.9638, 0.9638), 
                             (0.1861, 0.1861, 0.1861))])
    
    # Load vocabulary wrapper
    with open(args['vocab_path'], 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args['embed_size'])
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args['embed_size'], args['hidden_size'], 
                         len(vocab), args['num_layers'], max_seq_length=50)
    decoder.eval()
    

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args['encoder_path']))
    decoder.load_state_dict(torch.load(args['decoder_path']))
    
    # If use gpu
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    
    # Prepare Image
    image_dir = args['image_path']
    images = os.listdir(image_dir)
    i = 1
    for image_id in images:
        #print('i->',i)
        #i = i+1  
        if not image_id.endswith('.jpg'):
            continue
        image = os.path.join(image_dir, image_id)
        image = load_image(image, transform)
        image_tensor = image.cuda() if torch.cuda.is_available() else image
        
        # Generate caption from image
        try:
            feature, cnn_features = encoder(image_tensor)
            sampled_ids = decoder.sample(feature, cnn_features)
            sampled_ids = sampled_ids.cpu().data.numpy()
        except Exception:
            continue
        #print('image_ids->',image_id)      
        # Decode word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)
        print('i->', i, image_id + '\t' + sentence)
        fp.write(image_id)
        fp.write('\t')
        fp.write(sentence)
        if i < 398:  # hard-coded image count: no trailing newline after the last caption
            fp.write("\n")
        i = i + 1
        
    fp.close()
Example 22
def main(args):

    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        # transforms.RandomCrop(args.crop_size),
        # transforms.RandomHorizontalFlip(),
        transforms.Scale(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             args.MSCOCO_result,
                             args.coco_detection_result,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers,
                             dummy_object=99,
                             yolo=False)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    # the layout encoder hidden-state size must match the decoder input size
    layout_encoder = LayoutEncoder(args.layout_embed_size, args.embed_size,
                                   100, args.num_layers)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        layout_encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(layout_encoder.parameters()) + list(decoder.parameters()) + \
      list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths, label_seqs, location_seqs,
                visual_seqs, layout_lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = to_var(images)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, Backward and Optimize
            # decoder.zero_grad()
            # layout_encoder.zero_grad()
            # encoder.zero_grad()

            # Modify This part for using visual features or not

            # features = encoder(images)
            layout_encoding = layout_encoder(label_seqs, location_seqs,
                                             layout_lengths)
            # comb_features = features + layout_encoding
            comb_features = layout_encoding
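            # captions are conditioned on the layout encoding alone; the CNN
            # features are commented out above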

            outputs = decoder(comb_features, captions, lengths)

            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))

                # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))

                torch.save(
                    layout_encoder.state_dict(),
                    os.path.join(
                        args.model_path,
                        'layout_encoding-%d-%d.pkl' % (epoch + 1, i + 1)))
Example 23
def train(batch_size=32,
          vocab_threshold=5,
          vocab_from_file=True,
          embed_size=256,
          hidden_size=512,
          num_epochs=10,
          latest_model=None,
          cocoapi_dir="./Coco/"):
    # Keep track of training losses by epoch (validation losses, if any, are restored from the checkpoint)
    train_losses = []

    # Define a transform to pre-process the training images
    transform_train = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Build data loader, applying the transforms
    train_loader = get_loader(transform=transform_train,
                              mode='train',
                              batch_size=batch_size,
                              vocab_threshold=vocab_threshold,
                              vocab_from_file=vocab_from_file,
                              cocoapi_loc=cocoapi_dir)

    # The size of the vocabulary
    vocab_size = len(train_loader.dataset.vocab)

    # Initialize the encoder and decoder
    checkpoint = None
    if latest_model:
        checkpoint = torch.load(latest_model)
    start_epoch = 1
    if checkpoint:
        train_losses = checkpoint['train_losses']
        val_losses = checkpoint['val_losses']
        start_epoch = checkpoint['epoch']
    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    if checkpoint:
        encoder.load_state_dict(checkpoint['encoder'])
        decoder.load_state_dict(checkpoint['decoder'])

    # Move models to GPU if CUDA is available
    if torch.cuda.is_available():
        torch.cuda.set_device(1)
        encoder.cuda()
        decoder.cuda()

    # Define the loss function
    loss = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available(
    ) else nn.CrossEntropyLoss()

    # Specify the learnable parameters of the model
    params = list(decoder.parameters()) + list(
        encoder.embed.parameters()) + list(encoder.bn.parameters())

    # Define the optimizer
    optimizer = torch.optim.Adam(params=params, lr=0.001)
    if checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer'])

    # Set the total number of training and validation steps per epoch
    total_train_step = math.ceil(
        len(train_loader.dataset.caption_lengths) /
        train_loader.batch_sampler.batch_size)
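    # steps per epoch = ceil(number of captions / batch size)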

    start_time = time.time()
    for epoch in range(start_epoch, num_epochs + 1):
        train_loss = train_one(train_loader, encoder, decoder, loss, optimizer,
                               vocab_size, epoch, total_train_step)
        train_losses.append(train_loss)
        # Save the entire model anyway, regardless of being the best model so far or not
        filename = os.path.join("./models", "model-{}.pkl".format(epoch))
        save_epoch(filename, encoder, decoder, optimizer, train_losses, epoch)
        print("Epoch [%d/%d] took %ds" %
              (epoch, num_epochs, time.time() - start_time))
        start_time = time.time()
Example 24
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.033, 0.032, 0.033), (0.027, 0.027, 0.027))
    ])

    # Build vocab
    vocab = build_vocab(args.root_path, threshold=0)
    vocab_path = args.vocab_path
    with open(vocab_path, 'wb') as f:
        pickle.dump(vocab, f)
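    # vocab.idx is the next free word index, i.e. the vocabulary size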
    len_vocab = vocab.idx
    print(vocab.idx2word)

    # Build data loader
    data_loader = get_loader(args.root_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder = ResNet(ResidualBlock, [3, 3, 3], args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    #Build atten models
    if torch.cuda.is_available():
        encoder.cuda(1)
        decoder.cuda(1)

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):

            # make one hot
            # cap_ = torch.unsqueeze(captions,2)
            # one_hot_ = torch.FloatTensor(captions.size(0),captions.size(1),len_vocab).zero_()
            # one_hot_caption = one_hot_.scatter_(2, cap_, 1)

            # Set mini-batch dataset
            images = to_var(images)
            captions = to_var(captions)
            #captions_ = to_var(one_hot_caption)

            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]
            # Forward, Backward and Optimize
            optimizer.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)

            captions = captions.view(-1)
            outputs = outputs.view(-1, len_vocab)

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            #print(targets)
            #print(outputs)

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))

                #test set accuracy
                #print(outputs.max(1)[1])
                outputs_np = outputs.max(1)[1].cpu().data.numpy()
                targets_np = targets.cpu().data.numpy()

                print(outputs_np)
                print(targets_np)

                location_match = 0
                size_match = 0
                shape_match = 0
                exact_match = 0
                # use j (not i) so the outer batch index survives for the save_step check below
                for j in range(len(targets_np)):
                    if outputs_np[j] == targets_np[j]:
                        exact_match += 1
                    if args.batch_size <= j < args.batch_size * 2 and \
                            outputs_np[j] == targets_np[j]:
                        shape_match += 1
                    elif args.batch_size * 2 <= j < args.batch_size * 3 and \
                            outputs_np[j] == targets_np[j]:
                        location_match += 1
                    elif args.batch_size * 3 <= j < args.batch_size * 4 and \
                            outputs_np[j] == targets_np[j]:
                        size_match += 1

                print(
                    'location match : %.4f, shape match : %.4f, exact_match: %.4f'
                    % (location_match / (args.batch_size), shape_match /
                       args.batch_size, exact_match / len(targets_np)))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
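A note on the pack_padded_sequence(...)[0] idiom used throughout these snippets: indexing the PackedSequence at [0] yields its .data field, which holds one row per valid (non-padded) timestep, so the loss never sees padding. A minimal, self-contained illustration (the token ids are made up):

import torch
from torch.nn.utils.rnn import pack_padded_sequence

# Two captions padded to length 4, sorted by descending length; 0 = <pad>.
captions = torch.tensor([[5, 7, 2, 9],
                         [3, 8, 2, 0]])
lengths = [4, 3]

packed = pack_padded_sequence(captions, lengths, batch_first=True)
# Valid timesteps interleaved time-major: t0 of both, t1 of both, ...
print(packed.data)  # tensor([5, 3, 7, 8, 2, 2, 9]) -- 4 + 3 = 7 targets

This is also why the outputs.view(-1, len_vocab) reshape above only lines up with the packed targets when every caption in the batch has the same length.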
Example no. 25
def main():
    # Configuration for hyper-parameters

    torch.cuda.set_device(0)
    config = Config()
    # Image preprocessing
    transform = config.train_transform
    # Load vocabulary wrapper
    with open(os.path.join(config.vocab_path, 'vocab.pkl'), 'rb') as f:
        vocab = pickle.load(f)
    # Build data loader
    train_image_path = os.path.join(config.image_path, 'train2017')
    json_path = os.path.join(config.caption_path, 'captions_train2017.json')
    train_loader = get_data_loader(train_image_path,
                                   json_path,
                                   vocab,
                                   transform,
                                   config.batch_size,
                                   shuffle=False,
                                   num_workers=config.num_threads)

    val_image_path = os.path.join(config.image_path, 'val2017')
    json_path = os.path.join(config.caption_path, 'captions_val2017.json')
    val_loader = get_data_loader(val_image_path,
                                 json_path,
                                 vocab,
                                 transform,
                                 config.batch_size,
                                 shuffle=False,
                                 num_workers=config.num_threads)

    total_step = len(train_loader)

    # Build Models
    encoder = EncoderCNN(config.embed_size)
    encoder.eval()
    decoder = DecoderRNN(config.embed_size, config.hidden_size, len(vocab),
                         config.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
    optimizer = torch.optim.Adam(params, lr=config.learning_rate)

    print('entering the training loop')
    # Train the Models

    with open('train1_log.txt', 'w') as logfile:
        logfile.write('Validation Error,Training Error\n')
        for epoch in range(0, 25):
            for i, (images, captions, lengths,
                    img_ids) in enumerate(train_loader):
                images = Variable(images)
                captions = Variable(captions)
                if torch.cuda.is_available():
                    images = images.cuda()
                    captions = captions.cuda()
                targets = pack_padded_sequence(captions,
                                               lengths,
                                               batch_first=True)[0]
                # Forward, Backward and Optimize
                optimizer.zero_grad()
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                # Print log info
                if i % config.log_step == 0:
                    print(
                        'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                        % (epoch, config.num_epochs, i, total_step,
                           loss.data[0], np.exp(loss.data[0])))

                # Save the Model
                if (i + 1) % config.save_step == 0:
                    torch.save(
                        encoder.state_dict(),
                        os.path.join(config.teacher_cnn_path,
                                     'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                    torch.save(
                        decoder.state_dict(),
                        os.path.join(config.teacher_lstm_path,
                                     'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))

            print('Just completed an epoch, initiating validation error test')
            avgvalloss = 0
            for j, (images, captions, lengths,
                    img_ids) in enumerate(val_loader):
                images = Variable(images)
                captions = Variable(captions)
                if torch.cuda.is_available():
                    images = images.cuda()
                    captions = captions.cuda()
                targets = pack_padded_sequence(captions,
                                               lengths,
                                               batch_first=True)[0]
                # No optimizer step here; validation only measures the loss.
                features = encoder(images)
                outputs = decoder(features, captions, lengths)
                valloss = criterion(outputs, targets)
                # True running mean over the j + 1 validation batches so far.
                avgvalloss = (avgvalloss * j + valloss.data[0]) / (j + 1)
                if (j + 1) % 1000 == 0:
                    print('Average Validation Loss: %.4f' % avgvalloss)
                    logfile.write(
                        str(avgvalloss) + ',' + str(loss.data[0]) + '\n')
                    break
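On PyTorch 0.4+, the validation pass above can skip autograd bookkeeping entirely with torch.no_grad(); a hedged sketch of the same loop in the newer API (the snippet itself uses the pre-0.4 Variable/.data[0] style):

import torch
from torch.nn.utils.rnn import pack_padded_sequence

def validate(encoder, decoder, criterion, val_loader):
    """Mean validation loss without building autograd graphs (PyTorch 0.4+)."""
    total, batches = 0.0, 0
    with torch.no_grad():
        for images, captions, lengths, img_ids in val_loader:
            if torch.cuda.is_available():
                images, captions = images.cuda(), captions.cuda()
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]
            outputs = decoder(encoder(images), captions, lengths)
            total += criterion(outputs, targets).item()
            batches += 1
    return total / max(batches, 1)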
Example no. 26
def main():
    # load vocab Data here!

    with open('VocabData.pkl', 'rb') as f:
        VocabData = pickle.load(f)
    with open('FullImageCaps.pkl', 'rb') as f:
        FullImageCaps = pickle.load(f)
    # FullImageCaps_sub = loadData("full_image_descriptions.json")
    coco = loadCoco('captions_train2017.json')

    data = FullImageCaps + coco
    print(len(data) / 128)  # approximate number of batches at batch size 128
    recovery = sys.argv[2]
    mode = sys.argv[1]

    lmdata = LMDataset(VocabData, data)
    lmloader = lmdata.getLoader(batchSize=128, shuffle=True)
    testloader = lmdata.getLoader(batchSize=1, shuffle=False)
    embedding = torch.Tensor(lmdata.embedding)
    vocab_size = len(lmdata.wordDict)
    max_len = 100
    hidden_size = 1024
    embedding_size = 300
    max_epoch = 10
    sos_id = lmdata.sos_id
    eos_id = lmdata.eos_id
    pad_id = lmdata.pad_id

    wordDict = VocabData['word_dict']
    rev_vocab = [''] * vocab_size
    for word in wordDict:
        rev_vocab[wordDict[word]] = word

    they = torch.zeros(1, vocab_size)
    are = torch.zeros(1, vocab_size)
    students = torch.zeros(1, vocab_size)
    _from = torch.zeros(1, vocab_size)
    that = torch.zeros(1, vocab_size)
    school = torch.zeros(1, vocab_size)
    they_id = wordDict['they']
    are_id = wordDict['are']
    students_id = wordDict['students']
    from_id = wordDict['from']
    that_id = wordDict['that']
    school_id = wordDict['school']

    they[0, they_id] = 1
    are[0, are_id] = 1
    students[0, students_id] = 1
    _from[0, from_id] = 1
    that[0, that_id] = 1
    school[0, school_id] = 1

    strange_sentence = torch.cat([they, are, are, are, are, are],
                                 0).unsqueeze(0)
    regular_sentence = torch.cat([they, are, students, _from, that, school],
                                 0).unsqueeze(0)

    PATH = 'LMcheckpoint(1)'

    model = DecoderRNN(vocab_size,
                       max_len,
                       hidden_size,
                       embedding_size,
                       sos_id,
                       eos_id,
                       embedding_parameter=embedding,
                       rnn_cell='lstm')
    if recovery == '1':
        model = loadCheckpoint(PATH, model)
    optimizer = optim.Adam(model.parameters(), lr=0.0002)
    criterion = nn.NLLLoss(ignore_index=pad_id)
    if torch.cuda.is_available():
        model = model.cuda()
        criterion = criterion.cuda()
    if mode == 'train':
        train_LM(lmloader, model, optimizer, criterion, pad_id, max_epoch,
                 max_len)

    if mode == 'test':
        lm_loss = LanguageModelLoss(PATH,
                                    vocab_size,
                                    max_len,
                                    hidden_size,
                                    embedding_size,
                                    sos_id,
                                    eos_id,
                                    use_prob_vector=True)
        loss1 = lm_loss(strange_sentence)
        loss2 = lm_loss(regular_sentence)
        print(loss1.item(), loss2.item())

    sampleSentence(model, testloader, rev_vocab)
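The six hand-built one-hot rows above (they/are/students/from/that/school) can be produced generically with scatter_; a small sketch, where one_hot_rows is a hypothetical helper, not part of the snippet:

import torch

def one_hot_rows(word_ids, vocab_size):
    """Return a (len(word_ids), vocab_size) float tensor of one-hot rows."""
    ids = torch.tensor(word_ids).unsqueeze(1)  # shape (N, 1)
    return torch.zeros(len(word_ids), vocab_size).scatter_(1, ids, 1.0)

# e.g., assuming wordDict as above:
# ids = [wordDict[w] for w in ('they', 'are', 'students', 'from', 'that', 'school')]
# regular_sentence = one_hot_rows(ids, vocab_size).unsqueeze(0)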
Example no. 27
def main(args):
    with open('./data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    encoder = EncoderCNN(256)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(256, 512, len(vocab), 1)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder))
    decoder.load_state_dict(torch.load(args.decoder))

    if args.test_prop0:
        decoder.test_h_from_c()
        return

    if args.test_c_step:
        data_points = test(encoder, decoder, vocab, args.num_samples,
                           args.num_hints)

        with open(args.filepath, 'wb') as f:
            pickle.dump(data_points, f)

        print("Done sampling for c_step evaluation. Data saved to {}".format(
            args.filepath))

        return

    measurement_score = test(encoder, decoder, vocab, args.num_samples,
                             args.num_hints, args.debug, args.c_step,
                             args.no_avg)
    if args.msm == "ps":
        if not args.no_avg:
            print("ground truth prediction score without update\n" +
                  str(measurement_score[0]))
            print("ground truth prediction score with update\n" +
                  str(measurement_score[1]))
            print("Difference\n" + str(measurement_score[1] -
                                       measurement_score[0]))
        else:
            with open(args.filepath, 'wb') as f:
                pickle.dump(measurement_score, f)
            print("Done. Data saved to {}".format(args.filepath))
    elif args.msm == "ce":
        if not args.no_avg:
            print("Cross Entropy Loss without update\n" +
                  str(measurement_score[0]))
            print("Cross Entropy Loss with update\n" +
                  str(measurement_score[1]))
            print("Difference\n" + str(measurement_score[1] -
                                       measurement_score[0]))
        else:
            with open(args.filepath, 'wb') as f:
                pickle.dump(measurement_score, f)
            print("Done. Data saved to {}".format(args.filepath))
    elif args.msm == "co":
        scores = cocoEval()
        scores_u = cocoEval(res='data/captions_val2014_results_u.json')
        print(scores)
        print(scores_u)
Example no. 28
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    #Load vocab_list for uniskip
    vocab_list = pd.read_csv("./data/vocab_list.csv", header=None)
    vocab_list = vocab_list.values.tolist()[0]

    #Build data loader
    data_loader = get_loader(args.image_dir,
                             args.img_embeddings_dir,
                             args.data_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    #im_encoder = preprocess_get_model.model()
    attention = T_Att()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers, args.dropout)
    uniskip = UniSkip('./data/skip-thoughts', vocab_list)
    decoder.eval()

    if torch.cuda.is_available():
        #im_encoder.cuda()
        attention.cuda()
        decoder.cuda()
        uniskip.cuda()

    attention.load_state_dict(torch.load(args.attention_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    for i, (images, captions, cap_lengths, qa, qa_lengths,
            vocab_words) in enumerate(data_loader):

        # Set mini-batch dataset
        img_embeddings = to_var(images.data, volatile=True)
        captions = to_var(captions)
        # qa = to_var(qa)
        # targets = pack_padded_sequence(qa, qa_lengths, batch_first=True)[0]

        # Training-only steps, kept commented for reference:
        # decoder.zero_grad()
        # attention.zero_grad()
        # features = encoder(images)
        # img_embeddings = im_encoder(images)
        cap_embeddings = uniskip(captions, cap_lengths)
        cap_embeddings = cap_embeddings.data
        img_embeddings = img_embeddings.data
        ctx_vec = attention(img_embeddings, cap_embeddings)
        outputs = decoder.sample(ctx_vec)
        output_ids = outputs.cpu().data.numpy()
        qa = qa.numpy()
        qa = qa[0]

        # predicted_q = []
        # predicted_a = []
        sample = []
        # flag = -1
        for word_id in output_ids:
            word = vocab.idx2word[word_id]
            sample.append(word)
            # if word == '<end>':
            #     if flag == -1:
            #         predicted_q = sample
            #         sample = []
            #         flag = 0
            #     else:
            #         predicted_a = sample
        # predicted_q = ' '.join(predicted_q[1:])
        # predicted_a = ' '.join(predicted_a[1:])
        sample = ' '.join(sample)
        actual = []
        # print("predicted q was : " + predicted_q)
        for word_id in qa:
            word = vocab.idx2word[word_id]
            actual.append(word)
        actual = ' '.join(actual)
        #print(im_id)
        print("actual_qa : " + actual + " | predicted_qa : " + sample)
Example no. 29
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)
    
    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([ 
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(), 
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    
    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab, 
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers) 

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, 
                         len(vocab), args.num_layers)
    
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    
    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            
            # Set mini-batch dataset. volatile=True skips graph-building through
            # the frozen ResNet; EncoderCNN is assumed to re-wrap the features
            # so gradients still reach its trainable linear/bn layers.
            images = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
            
            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                      %(epoch, args.num_epochs, i, total_step, 
                        loss.data[0], np.exp(loss.data[0]))) 
                
            # Save the models
            if (i+1) % args.save_step == 0:
                torch.save(decoder.state_dict(), 
                           os.path.join(args.model_path, 
                                        'decoder-%d-%d.pkl' %(epoch+1, i+1)))
                torch.save(encoder.state_dict(), 
                           os.path.join(args.model_path, 
                                        'encoder-%d-%d.pkl' %(epoch+1, i+1)))
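Note that params above deliberately excludes the pretrained ResNet weights, so only the new projection and batch-norm layers learn. The same intent can be made explicit by freezing the backbone; a sketch with plain torchvision (the embedding size 256 is illustrative, not taken from the snippet):

import torch
import torchvision.models as models

resnet = models.resnet152(pretrained=True)
for param in resnet.parameters():
    param.requires_grad = False  # freeze the pretrained backbone

# A fresh head; newly created layers default to requires_grad=True.
resnet.fc = torch.nn.Linear(resnet.fc.in_features, 256)

trainable = [p for p in resnet.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable, lr=0.001)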
Example no. 30
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing
    # Compose all preprocessing; ToTensor yields a (C, H, W) tensor with values in [0, 1]
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.resnet.fc.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):

            # Set mini-batch dataset
            images = Variable(images)
            captions = Variable(captions)
            print("cap size %s" % str(captions.size()))
            if torch.cuda.is_available():
                images = images.cuda()
                captions = captions.cuda()
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]
            print(targets)
            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(images)
            print("cnn feats %s" % str(features.size()))
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
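Unlike the ImageNet statistics used in the other snippets, Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) simply maps ToTensor's [0, 1] output to [-1, 1] per channel, since (x - 0.5) / 0.5 = 2x - 1. A quick check:

import torch
from torchvision import transforms

normalize = transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
img = torch.rand(3, 224, 224)  # stand-in for a ToTensor output in [0, 1]
out = normalize(img)
print(out.min().item() >= -1.0, out.max().item() <= 1.0)  # True True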
Example no. 31
File: train.py Project: afcarl/sn
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    worker_thread_count = 1
    retry_for_failed = 2

    # Image preprocessing
    # For normalization, see https://github.com/pytorch/vision#models
    transform = transforms.Compose([
        #     transforms.RandomCrop(args.crop_size),
        #     transforms.RandomHorizontalFlip(),
        transforms.Scale(args.crop_size),  # renamed transforms.Resize in newer torchvision
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    #transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper.
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()

    # Loss and Optimizer
    criterion = nn.L1Loss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the Models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            processed_items = []
            threads = []
            has_data_to_process = True

            def do_request(item):
                position = item['position']
                #print(position)
                #print(item)
                retry = retry_for_failed
                while retry:
                    r = requests.post('http://localhost:4567/', data=item)
                    if r.status_code == 200:
                        pil = Image.open(io.BytesIO(r.content)).convert('RGB')
                        processed_items[position] = transform(pil)
                        #print(position, processed_items[position])
                        break
                    else:
                        print("request failed (status %d); retrying" %
                              r.status_code)
                        time.sleep(2)
                        retry -= 1

            # Set mini-batch dataset
            image_tensors = to_var(images, volatile=True)
            captions = to_var(captions)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]
            #print(images.size())
            #print(torch.equal(images[0] ,images[1]))

            # Forward, Backward and Optimize
            decoder.zero_grad()
            encoder.zero_grad()
            features = encoder(image_tensors)
            outputs = decoder(features, captions, lengths)
            codes = []

            def worker():
                while items_to_process.qsize() > 0 or has_data_to_process:
                    item = items_to_process.get()
                    if item is None:
                        break
                    do_request(item)
                    items_to_process.task_done()
                print("ended thread processing")

            for j in range(worker_thread_count):
                t = threading.Thread(target=worker)
                t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
                t.start()
                threads.append(t)
            for ii, image in enumerate(images):
                image_tensor = to_var(image.unsqueeze(0), volatile=True)
                feature = encoder(image_tensor)
                sampled_ids = decoder.sample(feature)
                sampled_ids = sampled_ids.cpu().data.numpy()
                sampled_caption = []
                for word_id in sampled_ids:
                    word = vocab.idx2word[word_id]
                    sampled_caption.append(word)
                    if word == '<end>':
                        break
                sentence = ' '.join(sampled_caption)
                data = {'position': ii, 'code': sentence}
                items_to_process.put(data)
                processed_items.append('failed')
                codes.append(sentence)
            has_data_to_process = False
            print(codes)
            print(items_to_process.qsize())
            print(image.size())
            print("waiting for threads")
            for t in threads:
                t.join()
            print("done reassembling images")
            for t in threads:
                t.shutdown = True
                t.join()
            bad_value = False
            for pi in processed_items:
                if isinstance(pi, str) and pi == "failed":
                    bad_value = True
            if bad_value == True:
                print("failed conversion,skipping batch")
                continue
            output_tensor = torch.FloatTensor(len(processed_items), 3,
                                              images.size()[2],
                                              images.size()[3])
            for ii, image_tensor in enumerate(processed_items):
                output_tensor[ii] = processed_items[ii]
            output_var = to_var(output_tensor, False)
            target_var = to_var(images, False)
            loss = criterion(output_var, target_var)
            print("loss")
            print(loss)

            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f'
                    % (epoch, args.num_epochs, i, total_step, loss.data[0],
                       np.exp(loss.data[0])))

            # Save the models
            if (i + 1) % args.save_step == 0:
                torch.save(
                    decoder.state_dict(),
                    os.path.join(args.model_path,
                                 'decoder-%d-%d.pkl' % (epoch + 1, i + 1)))
                torch.save(
                    encoder.state_dict(),
                    os.path.join(args.model_path,
                                 'encoder-%d-%d.pkl' % (epoch + 1, i + 1)))
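The threading in the snippet above depends on a module-level items_to_process queue and shuts workers down with the None sentinels the worker loop checks for. A self-contained sketch of that producer/consumer shape (process stands in for do_request):

import queue
import threading

def run_workers(items, process, worker_count=2):
    """Run process(item) on worker threads; None is the shutdown sentinel."""
    q = queue.Queue()

    def worker():
        while True:
            item = q.get()
            if item is None:  # sentinel: no more work
                break
            process(item)

    threads = [threading.Thread(target=worker) for _ in range(worker_count)]
    for t in threads:
        t.start()
    for item in items:
        q.put(item)
    for _ in threads:
        q.put(None)  # one sentinel per worker
    for t in threads:
        t.join()

# run_workers(range(10), print)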
Example no. 32
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build Models
    encoder = EncoderCNN(args.embed_size)
    encoder.eval()  # evaluation mode (BN uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size,
                         len(vocab), args.num_layers)

    # Load the trained model parameters
    # print(args.encoder_path)
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    total_num = len(data_loader) * args.batch_size
    print(total_num)
    num_correct = 0
    tested = 0

    hypotheses = []
    references = []

    for i, (images, captions, lengths) in enumerate(data_loader):
        if i == 1:  # evaluate a single batch only
            break
        tested += args.batch_size

        # If use gpu
        if torch.cuda.is_available():
            encoder.cuda()
            decoder.cuda()

        # Prepare Image
        images = to_var(images, volatile=True)
        captions = to_var(captions)
        #targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]
        max_sent_length = captions[-1].size(0)
        print(max_sent_length, 'length')
        print(captions.size(), 'caption_size')
        #print captions[0].size()
        #print captions[0]
        #print targets.size()


        # Generate caption from image
        features=encoder(images)

        sampled_captions = decoder.sample(features,max_sent_length)
        targets = torch.transpose(sampled_captions.view(max_sent_length, -1), 0, 1)
        print(targets.size(), 'ans')
        #print targets
        #print captions
        ref_sents=translate(captions,vocab)
        hypo_sents=translate(targets,vocab)

        references.extend(ref_sents)
        hypotheses.extend(hypo_sents)
        num_correct_t = targets.data.eq(captions.data).sum()
        print(num_correct_t, 'num correct')
        num_correct += num_correct_t


        #feature = encoder(image_tensor)
        #sampled_ids = decoder.sample(feature)
        #sampled_ids = sampled_ids.cpu().data.numpy()

        # Decode word_ids to words
        #sampled_caption = []
        #for word_id in sampled_ids:
        #    word = vocab.idx2word[word_id]
        #    sampled_caption.append(word)
        #    if word == '<end>':
        #        break
        #sentence = ' '.join(sampled_caption)

        # Print out image and generated caption.
        #print (sentence)


    hypo_ref_out = (hypotheses, references)
    with open('hypo_out.txt', 'wb') as handle:
        pickle.dump(hypo_ref_out, handle)
    print(len(hypotheses))
    print(hypotheses[0:10])
    print(references[0:10])
    bleu_score = bleu.BLEU(hypotheses, [references])
    print(bleu_score)

    print('num_correct', num_correct, 'total', tested, total_num)
    # NLTK's corpus_bleu needs a list of tokenized references per hypothesis
    # (see the note after this snippet).
    score = BLEU.corpus_bleu(references, hypotheses)
    score1 = BLEU.corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0))
    score2 = BLEU.corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0, 0))
    score3 = BLEU.corpus_bleu(references, hypotheses, weights=(0.3, 0.3, 0.3, 0))
    score4 = BLEU.corpus_bleu(references, hypotheses,
                              weights=(0.25, 0.25, 0.25, 0.25))
    print(score, score1, score2, score3, score4)
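A note on the corpus_bleu calls above: NLTK's corpus_bleu(list_of_references, hypotheses, weights) expects, for every hypothesis, a list of tokenized reference sentences; passing a flat list of strings silently produces wrong scores, since strings iterate as characters. A sketch of the expected shapes (assuming BLEU refers to nltk.translate.bleu_score):

from nltk.translate.bleu_score import corpus_bleu

hypotheses = [['a', 'man', 'rides', 'a', 'horse'],
              ['a', 'dog', 'runs']]
# One list of tokenized references per hypothesis.
references = [[['a', 'man', 'is', 'riding', 'a', 'horse']],
              [['a', 'dog', 'is', 'running'], ['the', 'dog', 'runs']]]

print(corpus_bleu(references, hypotheses))
print(corpus_bleu(references, hypotheses, weights=(1, 0, 0, 0)))  # BLEU-1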