def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)
    
    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)
    
    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    
    # Print out the image and the generated caption
    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
Example #2
def train_caption_model(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    if torch.cuda.device_count() > 1:
        print("{} GPUs are in use.".format(torch.cuda.device_count()))
        encoder = nn.DataParallel(encoder)
        decoder = nn.DataParallel(decoder)
    encoder.to(device)
    decoder.to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(data_loader):
            # Set mini-batch dataset
            images = images.to(device)
            captions = captions.to(device)
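            # pack_padded_sequence flattens the padded captions into one tensor of valid
            # tokens, aligned with the packed outputs the decoder returns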
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()

            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, len(data_loader), loss.item(), np.exp(loss.item())))

            # Save the model checkpoints
            if (i+1) % args.save_step == 0:
                torch.save(decoder.state_dict(), os.path.join(
                    args.model_path, 'decoder-{}-{}.ckpt'.format(epoch+1, i+1)))
                torch.save(encoder.state_dict(), os.path.join(
                    args.model_path, 'encoder-{}-{}.ckpt'.format(epoch+1, i+1)))
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    if args.with_glove == 'True':
        # Get glove pickles
        glove_path = args.glove_path

        vectors = bcolz.open(f'{glove_path}/6B.{args.embed_size}.dat')[:]
        words = pickle.load(
            open(f'{glove_path}/6B.{args.embed_size}_words.pkl', 'rb'))
        word2idx = pickle.load(
            open(f'{glove_path}/6B.{args.embed_size}_idx.pkl', 'rb'))
        glove = {w: vectors[word2idx[w]] for w in words}

        # Get weights matrix
        weights_matrix = np.zeros((len(vocab), args.embed_size))
        words_found = 0

        # We compare the vocabulary from the built vocab, and the glove word vectors
        for i in range(len(vocab)):
            try:
                word = vocab.idx2word[i]
                weights_matrix[i] = glove[word]
                words_found += 1
            except KeyError:
                weights_matrix[i] = np.random.normal(scale=0.6,
                                                     size=(args.embed_size, ))

        # Build models
        encoder = EncoderCNN(args.embed_size).eval(
        )  # eval mode (batchnorm uses moving mean/variance)
        decoder = DecoderRNNGlove(args.hidden_size, weights_matrix,
                                  args.num_layers)
    else:
        encoder = EncoderCNN(args.embed_size).eval(
        )  # eval mode (batchnorm uses moving mean/variance)
        decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                             args.num_layers)

    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy(
    )  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        if word != '<start>' and word != '<end>':
            sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the image and the generated caption
    print(sentence)
    pickle.dump(sentence, open("save.p", "wb"))
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
Example #4
# Build the data loader (assumes transform_train is defined as in the training transform above).
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file,
                         cocoapi_loc=COCOPATH)

# The size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the encoder and decoder. 
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Move models to GPU if CUDA is available. 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Define the loss function. 
criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()

# TODO #3: Specify the learnable parameters of the model.
params = list(decoder.parameters()) +\
         list(encoder.embed.parameters())  # We don't want to retrain the resnet

# TODO #4: Define the optimizer.
optimizer = torch.optim.RMSprop(params)

# Set the total number of training steps per epoch.
total_step = math.ceil(len(data_loader.dataset.caption_lengths) / data_loader.batch_sampler.batch_size)
hidden_size = 512

vocab_size = len(data_loader.dataset.vocab)

encoder = EncoderCNN(embed_size)
encoder.eval()
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
decoder.eval()

encoder_file = 'encoder-3.pkl'
decoder_file = 'decoder-3.pkl'

encoder.load_state_dict(torch.load(os.path.join('./models', encoder_file)))
decoder.load_state_dict(torch.load(os.path.join('./models', decoder_file)))

encoder.to(device)
decoder.to(device)


# prediction
def clean_sentence(output):
    sentence = ""

    for idx in output:
        if idx == 0:
            continue
        elif idx == 1:
            break
        else:
            word = data_loader.dataset.vocab.idx2word[idx]
            sentence = sentence + word + " "

    return sentence.strip()
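
# Usage sketch (illustrative; assumes the test-mode loader yields (orig_image, image)
# and that DecoderRNN.sample returns a list of word ids):
# orig_image, image = next(iter(data_loader))
# features = encoder(image.to(device)).unsqueeze(1)
# output = decoder.sample(features)
# print(clean_sentence(output))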
Example #6
def extract(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.Resize(SIZE),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval(
    )  # eval mode (batchnorm uses moving mean/variance)
    # decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)

    dissection.retain_layers(encoder, [
        ('resnet.7.2.relu', 'final_layer'),
    ])

    encoder = encoder.to(device)
    # decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    # decoder.load_state_dict(torch.load(args.decoder_path))
    encoder.eval()

    # Load data
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    # Run the models
    with torch.no_grad():
        total_step = len(data_loader)
        os.makedirs(os.path.join(PARENT_DIR, 'results', 'activations'),
                    exist_ok=True)
        path = os.path.join(PARENT_DIR, 'results', 'samples.txt')
        with open(path, 'w') as results_file:
            start = time.time()
            for batch, (images, captions, lengths) in enumerate(data_loader):

                # Set mini-batch dataset
                images = images.to(device)
                # captions = captions.to(device)
                # targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

                # Forward, backward and optimize
                features = encoder(images)
                # outputs = decoder(features, captions, lengths)
                # loss = criterion(outputs, targets)
                # decoder.zero_grad()
                # encoder.zero_grad()
                # loss.backward()
                # optimizer.step()

                activations = encoder.retained['final_layer']

                images = dissection.ReverseNormalize(
                    (0.485, 0.456, 0.406), (0.229, 0.224, 0.225))(images)
                images = images.cpu().numpy().transpose([0, 2, 3, 1])
                activations = activations.cpu().numpy()

                scores = np.max(activations, axis=(-1, -2))
                samples = np.argmax(scores, axis=-1)
                gathered = activations[np.arange(len(samples)),
                                       samples].transpose([1, 2, 0])
                mask = cv2.resize(gathered, SIZE).transpose([2, 0, 1])
                # keep the top 20% of activation values as the overlay mask
                k = int(0.8 * mask.size)
                threshold = np.partition(mask, k, axis=None)[k]
                mask = mask >= threshold
                mask = np.expand_dims(mask, axis=-1)
                outimg = np.concatenate((images, (1 + mask) / 2.), axis=-1)
                # outimg = outimg * mask
                activations = outimg

                for i, sample in enumerate(samples):
                    i += args.batch_size * batch
                    results_file.write('{} {}\n'.format(i, sample))
                for i, activation in enumerate(activations):
                    i += args.batch_size * batch
                    path = os.path.join(PARENT_DIR, 'results', 'activations',
                                        '{}.png'.format(i))
                    outactivation = skimage.img_as_ubyte(activation)
                    imageio.imwrite(path, outactivation)
                clock = time.time()
                delay = clock - start
                start = clock
                max_batch = 100
                # print('Step {}/{}: Time = {:.2f}'.format(batch, len(data_loader), delay))
                print('Step {}/{}: Time = {:.2f}'.format(
                    batch, max_batch, delay))
                if batch == max_batch:
                    break
Example #7
def evaluate(args):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    dataset = Dataset({
        'data_dir': args['data_dir'],
        'exp_dir': args['exp_dir'],
        'raw_data_dir': args['raw_data_dir'],
        'transform': transform,
        'mode': 'test'
    })
    args['vocab_size'] = len(dataset.vocab)

    encoder = EncoderCNN(args).eval()
    decoder = DecoderRNN(args).eval()

    encoder = encoder.to(device)
    decoder = decoder.to(device)

    encoder.load_state_dict(
        torch.load(os.path.join(args['model_path'], 'encoder.pt')))
    decoder.load_state_dict(
        torch.load(os.path.join(args['model_path'], 'decoder.pt')))

    generated_captions = []
    image_ids = []
    target_captions = []

    for idx in range(len(dataset.ids)):
        image_id, image, captions = dataset.get_test_item(idx)
        image = image.to(device)
        print(idx)

        features = encoder(image)
        word_ids = decoder.sample(features)
        word_ids = word_ids[0].cpu().tolist()

        words = []
        for word_id in word_ids:
            if dataset.vocab.idx2word[word_id] == '<start>':
                continue
            if dataset.vocab.idx2word[word_id] != '<end>':
                words.append(dataset.vocab.idx2word[word_id])
            else:
                break
        image_ids.append(image_id)
        generated_captions.append(words)
        target_captions.append(captions)
        print(words)

    image_captions = [{
        'image_id': image_ids[idx],
        'caption': ' '.join(generated_captions[idx])
    } for idx in range(len(image_ids))]

    captions_path = os.path.join(args['exp_dir'], args['caption_file'])
    image_caption_path = os.path.join(args['exp_dir'], args['evaluation_file'])

    with open(captions_path, 'w') as f:
        for idx in range(len(generated_captions)):
            f.write('*' * 50 + '\n')
            f.write('-' * 20 + 'generated_captions' + '-' * 20 + '\n')
            f.write(' '.join(generated_captions[idx]) + '\n')
            f.write('-' * 20 + 'target_captions' + '-' * 20 + '\n')
            for words in target_captions[idx]:
                f.write(' '.join(words) + '\n')
            f.write('*' * 50 + '\n')
            f.write('\n')

    # TODO: compute BLEU_score and define bleu_score_path before writing it out
    # with open(bleu_score_path, 'w') as f:
    #     f.write('BLEU_score: {}'.format(str(BLEU_score)))

    with open(image_caption_path, 'w') as f:
        json.dump(image_captions, f)
# data_loader = get_loader(args.image_dir, args.caption_path, vocab,
#                              transform, args.batch_size,
#                              shuffle=True, num_workers=args.num_workers)

trainloader = get_loader(train_image_dir, train_caption_path,
                         vocab, transform_train, batch_size, shuffle=True, num_workers=8)

testloader = get_loader(test_image_dir, test_caption_path, vocab,
                        transform_test, batch_size, shuffle=False, num_workers=8)

checkpoints = os.listdir('checkpoint')

encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers=1)
encoder = encoder.to(device)
decoder = decoder.to(device)
params = list(decoder.parameters()) + list(encoder.linear.parameters())
optimizer = torch.optim.Adam(params, lr=learning_rate)
cur_epoch = 0

if checkpoints:
    num_checkpoint = -1
    for cp in checkpoints:
        name, num = cp[:-4].split('_')
        num = int(num)
        if name == model_name and num_checkpoint < num:
            num_checkpoint = num
    if num_checkpoint > -1:
        state_dict = torch.load('checkpoint/{}_{}.tar'.format(model_name,num_checkpoint))
        encoder.load_state_dict(state_dict['encoder_state_dict'])
Example #9
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval(
    )  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    words = [
        'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
        'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
        'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
        'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
        'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
        'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
        'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork',
        'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
        'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
        'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
        'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
        'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
        'scissors', 'teddy bear', 'hair drier', 'toothbrush'
    ]

    # Prepare images
    if args.images:

        # inputs path
        input_path = os.listdir(args.images)
        sentences = []

        # folders in inputs
        for path in input_path:
            file_path = args.images + path + '/'
            if os.path.isdir(file_path):
                files = os.listdir(file_path)
                # files in folders
                for file in files:

                    image = load_image(file_path + file, transform)
                    image_tensor = image.to(device)

                    # Generate a caption from the image
                    feature = encoder(image_tensor)
                    sampled_ids = decoder.sample(feature)
                    sampled_ids = sampled_ids[0].cpu().numpy(
                    )  # (1, max_seq_length) -> (max_seq_length)

                    # Convert word_ids to words
                    sampled_caption = []
                    for word_id in sampled_ids:
                        word = vocab.idx2word[word_id]
                        sampled_caption.append(word)
                        if word == '<end>':
                            break
                    caption = ' '.join(sampled_caption)[8:-6]  # strip '<start> ' and ' <end>'
                    sentences.append(caption)
                    for word in words:
                        if word in caption:
                            f = open('captions2.csv',
                                     'a',
                                     encoding='utf-8',
                                     newline="")
                            writer = csv.writer(f)
                            writer.writerow([file_path + file, word, caption])
                            f.close()

    # Print out the image and the generated caption
    #     for s in sentences:
    #         print(s)

    # Prepare an image
    else:
        image = load_image(args.image, transform)
        image_tensor = image.to(device)

        found_words = []

        # Generate a caption from the image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy(
        )  # (1, max_seq_length) -> (max_seq_length)

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        for word in words:
            if word in sentence:
                found_words.append(word)

        if 'hot dog' in sentence:
            found_words.remove('dog')

        return sentence, found_words
Example #10
def main(args):

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # get test ids
    ids = []
    with open('TestImageIds.csv', 'r') as f:
        reader = csv.reader(f)
        testIds = list(reader)
    testIds = [int(i) for i in testIds[0]]
    coco = COCO(args.caption_path)
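    # Collect the annotation ids belonging to the test images so the loader serves only those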
    for img_id in testIds:
        for entry in coco.imgToAnns[img_id]:
            ids.append(entry['id'])

    # create data loader
    test_loader = get_loader(args.image_dir,
                             args.caption_path,
                             ids,
                             vocab,
                             transform,
                             1,
                             shuffle=False,
                             num_workers=0)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()

    # Build models
    encoder = EncoderCNN(args.embed_size).eval(
    )  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).eval()
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # evaluate loss
    running_loss = 0.0
    num_imgs = len(ids)
    for i, (images, captions, lengths) in enumerate(test_loader):
        sys.stdout.write("\rEvaluating Caption: %d/%d" % (i, num_imgs))
        images = images.to(device)
        captions = captions.to(device)
        targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

        features = encoder(images)
        outputs = decoder(features,
                          captions,
                          lengths,
                          pretrained=args.pretrained)
        loss = criterion(outputs, targets)

        running_loss += loss.item() * images.size(0)

    test_loss = running_loss / num_imgs
    print("Test Loss : %.2f" % (test_loss))

    print("\rWriting captions to json file...")
    # write to json file
    anns = []
    for img_id in tqdm(testIds):
        # Prepare an image
        image = load_image(
            args.image_dir + '/' + coco.loadImgs(img_id)[0]['file_name'],
            transform)
        image_tensor = image.to(device)

        # Generate a caption from the image
        feature = encoder(image_tensor)
        if args.stochastic:
            sampled_ids = decoder.stochastic_sample(
                feature,
                temperature=args.temperature,
                pretrained=args.pretrained)
        else:
            sampled_ids = decoder.sample(feature, pretrained=args.pretrained)
        sampled_ids = sampled_ids[0].cpu().numpy(
        )  # (1, max_seq_length) -> (max_seq_length)

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        # Print out the image and the generated caption
        ann = {'image_id': img_id, 'id': 0, 'caption': sentence}
        anns.append(ann)

        # print(sentence, img_id)

    pred_annotations_file = "./results/{}.json".format(args.model_name)
    with open(pred_annotations_file, 'w') as f:
        json.dump(anns, f)

    true_annotations_file = args.caption_path
    BLEU1, BLEU4 = evaluate_captions(true_annotations_file,
                                     pred_annotations_file)
    print("Test Loss : %.2f" % (test_loss))
    print("BLEU1 score : %.2f" % (BLEU1))
    print("BLEU4 score : %.2f" % (BLEU4))
Example #11
def train(
        num_epochs: int,
        lr: float,
        batch_size: int,
        vocab_threshold: int,
        vocab_from_file: bool,
        embed_size: int,
        hidden_size: int,
        save_every: int,
        print_every: int,
        log_file: str
) -> None:
    """
    Train the captioning network with the required parameters.
    The training logs are saved in log_file.

    num_epochs:         Number of epochs to train the model.
    batch_size:         Mini-batch size for training.
    vocab_threshold:    Minimum word count threshold for vocabulary initialisation. A word that appears
                        fewer than vocab_threshold times in the dataset is discarded and does not appear
                        in the vocabulary dictionary, so the smaller the threshold, the larger the
                        vocabulary.
    vocab_from_file:    Whether to load the vocabulary from a pre-initialized file.
    embed_size:         Dimensionality of image and word embeddings.
    hidden_size:        Number of features in hidden state of the RNN decoder.
    save_every:         Number of epochs between each checkpoint saving.
    print_every:        Number of batches for printing average loss.
    log_file:           Name of the training log file. Saves loss and perplexity.

    """

    transform_train = transforms.Compose([
        transforms.Resize(256),                          # smaller edge of image resized to 256
        transforms.RandomCrop(224),                      # get 224x224 crop from random location
        transforms.RandomHorizontalFlip(),               # horizontally flip image with probability=0.5
        transforms.ToTensor(),                           # convert the PIL Image to a tensor
        transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
                             (0.229, 0.224, 0.225))])

    # Build data loader.
    data_loader = get_loader(transform=transform_train,
                             mode='train',
                             batch_size=batch_size,
                             vocab_threshold=vocab_threshold,
                             vocab_from_file=vocab_from_file)

    # The size of the vocabulary.
    vocab_size = len(data_loader.dataset.vocab)

    # Initialize the encoder and decoder.
    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

    # Move models to GPU if CUDA is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder.to(device)
    decoder.to(device)

    # Define the loss function.
    criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() else nn.CrossEntropyLoss()

    # Parameters to update. We do not re-train the CNN here.
    params = list(encoder.embed.parameters()) + list(decoder.parameters())

    # TODO: add learning rate scheduler
    # Optimizer for minimum search.
    optimizer = optim.Adam(params, lr=lr)

    # Set the total number of training steps per epoch.
    total_step = math.ceil(len(data_loader.dataset.caption_lengths) / data_loader.batch_sampler.batch_size)

    # Open the training log file.
    f = open(log_file, 'w')

    for epoch in range(1, num_epochs + 1):
        for i_step in range(1, total_step + 1):

            # Randomly sample a caption length, and sample indices with that length.
            indices = data_loader.dataset.get_train_indices()
            # Create and assign a batch sampler to retrieve a batch with the sampled indices.
            new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
            data_loader.batch_sampler.sampler = new_sampler

            # Obtain the batch.
            images, captions = next(iter(data_loader))

            # Move batch of images and captions to GPU if CUDA is available.
            images = images.to(device)
            captions = captions.to(device)

            # Zero the gradients.
            decoder.zero_grad()
            encoder.zero_grad()

            # Pass the inputs through the CNN-RNN model.
            features = encoder(images)
            outputs = decoder(features, captions)

            # for i in range(10):
            #     print(torch.argmax(outputs[0,i, :]).item())

            # Calculate the batch loss.
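            # outputs has shape (batch, seq_len, vocab_size) and captions (batch, seq_len);
            # both are flattened so CrossEntropyLoss scores each predicted word
            # distribution against its target word id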
            loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

            # Backward pass.
            loss.backward()

            # Update the parameters in the optimizer.
            optimizer.step()

            # Get training statistics.
            stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (
                epoch, num_epochs, i_step, total_step, loss.item(), np.exp(loss.item()))

            # Print training statistics (on same line).
            print('\r' + stats, end="")
            sys.stdout.flush()

            # Print training statistics to file.
            f.write(stats + '\n')
            f.flush()

            # Print training statistics (on different line).
            if i_step % print_every == 0:
                print('\r' + stats)

        # Save the weights.
        if epoch % save_every == 0:
            torch.save(decoder.state_dict(), os.path.join('./models', f"{device}_{hidden_size}_decoder-{epoch}.pkl"))
            torch.save(encoder.state_dict(), os.path.join('./models', f"{device}_{hidden_size}_encoder-{epoch}.pkl"))

    # Close the training log file.
    f.close()
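
# A typical invocation, with illustrative hyperparameter values:
# train(num_epochs=3, lr=0.001, batch_size=64, vocab_threshold=5,
#       vocab_from_file=True, embed_size=256, hidden_size=512,
#       save_every=1, print_every=100, log_file='training_log.txt')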
Example #12
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval(
    )  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    actual_captions = []
    predicted_captions = []
    annotation_path = '../data/annotations/captions_val2014.json'
    with open(annotation_path) as f:
        anns = json.load(f)
    anns = anns["annotations"]
    for index, _ in enumerate(anns):
        anns[index]['image_id'] = str(anns[index]['image_id']).rjust(12, '0')
        if anns[index]['caption'][-1] == '.':
            #print (anns[index]['caption'])
            anns[index]['caption'] = str(anns[index]['caption'])[:-1]

    anns = pd.DataFrame(anns)
    #print (anns.head())

    for index, image_name in enumerate(os.listdir("../data/val2014/")):
        try:
            print(index)
            image = load_image("../data/val2014/" + image_name, transform)
            image_tensor = image.to(device)

            feature = encoder(image_tensor)
            sampled_ids = decoder.sample(feature)
            sampled_ids = sampled_ids[0].cpu().numpy(
            )  # (1, max_seq_length) -> (max_seq_length)

            sampled_caption = []
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                if word == '<end>':
                    break
            sentence = ' '.join(sampled_caption)
            #print (sampled_caption)
            sampled_caption = sampled_caption[1:-2]  # drop '<start>' and the last two tokens (trailing '.' and '<end>')
            #print (sampled_caption)
            predicted_captions.append(sampled_caption)
            #print (image_name)
            image_id = image_name[-16:-4]
            #print (image_id)
            temp = anns[anns['image_id'] == image_id]
            actual = [i.split(' ') for i in temp['caption']]
            #print (actual)
            actual_captions.append(actual)
        except RuntimeError:
            print(image_name + " errored out")
            pass

    pickle.dump(predicted_captions, open("predicted_captions.p", 'wb'))
    pickle.dump(actual_captions, open("actual_captions.p", 'wb'))
    one_reference = [cap[0] for cap in actual_captions]
    pickle.dump(one_reference, open("one_reference_actual.p", 'wb'))
    # corpus_bleu expects one list of reference sentences per hypothesis
    print(corpus_bleu([[ref] for ref in one_reference], predicted_captions))
Example #13
def main(args):

    # Image preprocessing
    transform = transforms.Compose([
        transforms.Resize(224),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval(
    )  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Loss
    criterion = nn.CrossEntropyLoss()

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)
    print('val_loader length = {}'.format(len(data_loader)))

    val_loss = 0
    start = time.time()
    with torch.no_grad():
        for i, (images, captions, lengths) in enumerate(data_loader):
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]
            references = [idx2word2list(vocab, targets)]

            # Forward
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            val_loss += criterion(outputs, targets).item()

            # Print log info
            if i % args.log_step == 0:
                print('step {}/{}, time {}'.format(i, len(data_loader),
                                                   timeSince(start)))

    val_loss = val_loss / len(data_loader)
    print('val_loss = {:.3f}'.format(val_loss))
Example #14
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(), 
        transforms.Normalize((0.485, 0.456, 0.406), 
                             (0.229, 0.224, 0.225))])
    
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(os.path.join('./Models', args.encoder_path)))
    decoder.load_state_dict(torch.load(os.path.join('./Models', args.decoder_path)))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()          # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    
    # Print out the image and the generated caption
    print(sentence)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_title(sentence)

    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
    
    plt.show()


def cal_bleu_score(dataset, model, source_vocab, target_vocab):
    targets = []
    predictions = []
 
    for i in range(len(dataset)):
        target = vars(dataset.examples[i])['trg']
        predicted_words = predict(i, model, source_vocab, target_vocab, dataset)
        predictions.append(predicted_words[1:-1])
        targets.append([target])
 
    print(f'BLEU Score: {round(bleu_score(predictions, targets) * 100, 2)}')

# source_vocab = args.vocab_path
# target_vocab
#
# cal_bleu_score(dataset, model, source_vocab, target_vocab)
    
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--image', type=str, default='/home/khaaq/Documents/COCOTorch_Yunjey/ResizeTest2014/COCO_val2014_000000000536.jpg', help='input image for generating caption')
    parser.add_argument('--encoder_path', type=str, default='/home/khaaq/Documents/COCOTorch_Yunjey/Models/encoder-10-3000.ckpt', help='path for trained encoder')
    parser.add_argument('--decoder_path', type=str, default='/home/khaaq/Documents/COCOTorch_Yunjey/Models/decoder-10-3000.ckpt', help='path for trained decoder')
    parser.add_argument('--vocab_path', type=str, default='/home/khaaq/Documents/COCOTorch_Yunjey/vocab.pkl', help='path for vocabulary wrapper')
    # parser.add_argument('--caption_path', type=str, default='/home/khaaq/Documents/COCO_KarepathyData2014/annotations/captions_val2014.json', help='path for train annotation json file')
    # Model parameters (should be same as paramters in train.py)
    parser.add_argument('--embed_size', type=int , default=256, help='dimension of word embedding vectors')
    parser.add_argument('--hidden_size', type=int , default=512, help='dimension of lstm hidden states')
    parser.add_argument('--num_layers', type=int , default=1, help='number of layers in lstm')
    args = parser.parse_args()
    main(args)
Example #15
def main(args):
    global best_bleu4
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(reso=args.reso)
    decoder = AttnDecoderRNN(attention_dim=args.attention_dim,
                             embed_dim=args.embed_dim,
                             decoder_dim=args.decoder_dim,
                             vocab_size=len(vocab),
                             dropout=args.dropout)
    encoder.to(device)
    decoder.to(device)
    decoder_optimizer = torch.optim.Adam(params=filter(
        lambda p: p.requires_grad, decoder.parameters()),
                                         lr=args.decoder_lr)

    encoder_optimizer = torch.optim.Adam(
        params=filter(lambda p: p.requires_grad, encoder.parameters()),
        lr=args.encoder_lr) if args.fine_tune_encoder else None

    criterion = nn.CrossEntropyLoss().to(device)
    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Build data loader
    train_loader = get_loader(args.image_dir,
                              args.caption_path,
                              vocab,
                              transform,
                              args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    val_loader = get_loader(args.image_dir_val,
                            args.caption_path_val,
                            vocab,
                            transform,
                            args.batch_size,
                            shuffle=True,
                            num_workers=args.num_workers)
    TrainStdout = Logger('train.txt')
    ValStdout = Logger('val.txt')
    for epoch in range(args.start_epoch, args.epochs):
        if args.epochs_since_improvement == 20:
            break
        if args.epochs_since_improvement > 0 and args.epochs_since_improvement % 8 == 0:
            adjust_learning_rate(decoder_optimizer, 0.8)
            if args.fine_tune_encoder:
                adjust_learning_rate(encoder_optimizer, 0.8)
        train(train_loader=train_loader,
              encoder=encoder,
              decoder=decoder,
              criterion=criterion,
              encoder_optimizer=encoder_optimizer,
              decoder_optimizer=decoder_optimizer,
              epoch=epoch,
              stdout=TrainStdout)
        recent_bleu4 = validate(val_loader=val_loader,
                                encoder=encoder,
                                decoder=decoder,
                                criterion=criterion,
                                word_map=vocab.word2idx,
                                stdout=ValStdout)

        is_best = recent_bleu4 > best_bleu4
        best_bleu4 = max(recent_bleu4, best_bleu4)
        if not is_best:
            args.epochs_since_improvement += 1
            print("\nEpoch since last improvement: %d\n" %
                  (args.epochs_since_improvement, ))
        else:
            args.epochs_since_improvement = 0

        save_checkpoint(args.data_name, epoch, args.epochs_since_improvement,
                        encoder, decoder, encoder_optimizer, decoder_optimizer,
                        recent_bleu4, is_best)
Example #16
def main(args):

    configure(os.path.join(args['exp_dir'], 'log_dir'))

    transform = transforms.Compose([
        transforms.RandomCrop(args['crop_size']),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    data_loader = get_loader({
        'data_dir': args['data_dir'],
        'exp_dir': args['exp_dir'],
        'raw_data_dir': args['raw_data_dir'],
        'batch_size': args['batch_size'],
        'transform': transform,
        'num_workers': args['num_workers'],
        'shuffle': args['shuffle'],
        'mode': 'train'
    })

    #    valid_data_loader=get_loader({'data_dir' : args['data_dir'],
    #                             'raw_data_dir' : args['raw_data_dir'],
    #                             'batch_size' : int(args['batch_size']/4),
    #                             'transform' : transform,
    #                             'num_workers' : args['num_workers'],
    #                             'shuffle' : args['shuffle'],
    #                             'mode':'validate'})

    args['vocab_size'] = len(Vocabulary.load_vocab(args['exp_dir']))

    encoder = EncoderCNN(args).train()
    decoder = DecoderRNN(args).train()

    if args['pretrained']:
        checkpoint_path = Checkpoint.get_latest_checkpoint(args['exp_dir'])
        checkpoint = Checkpoint.load(checkpoint_path)
        encoder.load_state_dict(checkpoint.encoder)
        decoder.load_state_dict(checkpoint.decoder)
        step = checkpoint.step
        epoch = checkpoint.epoch
        omit = True

    else:
        step = 0
        epoch = 0
        omit = False

    encoder.to(device)
    decoder.to(device)

    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(
        encoder.linear.parameters()) + list(encoder.bn.parameters())
    #    params=list(decoder.parameters()) + list(encoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args['lr'])
    scheduler = StepLR(optimizer, step_size=40, gamma=0.1)
    #    optimizer=YFOptimizer(params)

    total_step = len(data_loader)
    min_valid_loss = float('inf')

    for epoch in range(epoch, args['num_epochs']):
        scheduler.step()
        for idx, (images, captions, leng) in enumerate(data_loader):

            if omit:
                if idx < (step - total_step * epoch):
                    logger.info(
                        'idx:{}, step:{}, epoch:{}, total_step:{}, diff:{}'.
                        format(idx, step, epoch, total_step,
                               step - total_step * epoch))
                    continue
                else:
                    omit = False

            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, leng, batch_first=True)[0]

            features = encoder(images)
            outputs = decoder(features, captions, leng)
            loss = criterion(outputs, targets)
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(decoder.parameters(), 5)
            optimizer.step()

            log_value('loss', loss.item(), step)
            step += 1

            if step % args['log_step'] == 0:
                logger.info(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args['num_epochs'], idx, total_step,
                            loss.item(), np.exp(loss.item())))

            if step % args['valid_step'] == 0:
                #                valid_loss=validate(encoder.eval(),decoder,criterion,valid_data_loader)
                #                if valid_loss<min_valid_loss:
                #                    min_valid_loss=valid_loss
                Checkpoint(encoder, decoder, optimizer, epoch,
                           step).save(args['exp_dir'])
Example #17
def uploaded_file(filename):
    print("####Entry File Name", filename)
    PATH_TO_TEST_IMAGES_DIR = app.config['UPLOAD_FOLDER']
    TEST_IMAGE_PATHS = [
        os.path.join(PATH_TO_TEST_IMAGES_DIR, filename.format(i))
        for i in range(1, 2)
    ]
    print("*******PRINT******", TEST_IMAGE_PATHS)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    '''Load vocabulary wrapper'''
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    '''created instance to build models'''
    encoder = EncoderCNN(
        256).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(256, 512, len(vocab), 1)
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    '''
    Load the trained model parameters:
    EncoderCNN pickle - image feature extraction
    DecoderRNN pickle (pretrained) - caption sequence prediction
    '''
    encoder.load_state_dict(torch.load('models/encoder-5-3000.pkl'))
    decoder.load_state_dict(torch.load('models/decoder-5-3000.pkl'))

    for img in TEST_IMAGE_PATHS:
        '''Prepare an image'''
        image = load_image(img, transform)
        image_tensor = image.to(device)
        '''Generate a caption from the image'''
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()
        '''Convert word_ids to words'''
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            if word == '<start>':  #or word == '<end>':# or word == '.':
                continue
            if word == '<end>':
                break
            sampled_caption.append(word)
        sentence = ' '.join(sampled_caption)
        '''
        Print the sentence in the console
        read the image and overlay the predicted text on the image
        save the result image
        route/return the saved image result location as output
        '''
        print(sentence)
        print("FileName", img)
        image = Image.open(img)
        draw = ImageDraw.Draw(image)
        font = ImageFont.truetype(
            '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf', size=15)
        (x, y) = (10, 10)
        color = 'rgb(244,208,63)'
        draw.text((x, y), sentence, fill=color, font=font)
        image.save('uploads/' + filename)

    return send_from_directory(app.config['UPLOAD_FOLDER'], filename)
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    # Load the inverse object-id mapping (needed below for num_objects)
    with open(args.inverse_object_id_mapping, 'rb') as f:
        inverse_object_id_mapping = pickle.load(f)
    num_objects = len(inverse_object_id_mapping.keys())

    # Build models
    encoderCNN = EncoderCNN(args.embed_size).eval(
    )  # eval mode (batchnorm uses moving mean/variance)
    encoderRNN = EncoderRNN(num_objects, args.embed_size, args.hidden_size)
    model = Model(num_objects, args.embed_size)
    encoderCNN = encoderCNN.to(device)
    encoderRNN = encoderRNN.to(device)
    model = model.to(device)
    encoderCNN.eval()
    encoderRNN.eval()
    model.eval()

    # Load the trained model parameters
    encoderCNN.load_state_dict(torch.load(args.encoderCNN_path))
    encoderRNN.load_state_dict(torch.load(args.encoderRNN_path))
    model.load_state_dict(torch.load(args.model_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    image_features = encoderCNN(image_tensor)
    input = torch.LongTensor([[[1]]]).to(device)
    h0 = torch.zeros((1, 1, args.hidden_size)).to(device)
    c0 = torch.zeros((1, 1, args.hidden_size)).to(device)
    max_seqlen = 10
    result = []
    K = 3
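    # Beam search with beam width K: each candidate is a tuple of
    # (token sequence, cumulative score, LSTM hidden state, LSTM cell state)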
    all_candidates = [([1], 1.0, h0, c0) for i in range(K)]
    for i in range(max_seqlen):
        Q = []
        for _k in range(K):
            if i == 0 and _k == 1:  # first word
                break

            hashtag_features, (h0,
                               c0), Ul = encoderRNN(input[_k],
                                                    all_candidates[_k][2],
                                                    all_candidates[_k][3])
            outputs = model(image_features, hashtag_features, Ul)
            prob, topk = torch.topk(outputs, 20, dim=1)
            tup = list(zip(topk[0].cpu().tolist(), prob[0].cpu().tolist()))
            topk = [a for a in tup if a[0] not in all_candidates[_k][0]]
            # drop token ids 0 and 1 (the special start/end tokens) from the candidates
            topk = [a for a in topk if a[0] not in (0, 1)]

            for _k_ in range(K):
                Q.append((all_candidates[_k][0] + [topk[_k_][0]],
                          abs(all_candidates[_k][1] * topk[_k_][1]), h0, c0))

        all_candidates = sorted(Q, key=lambda x: x[1], reverse=True)[:K]
        input = []
        for _k in range(K):
            input.append([[all_candidates[_k][0][-1]]])
        input = torch.LongTensor(input).to(device)
        #result.append(top1.cpu().numpy()[0][0])
    result = sorted(all_candidates, key=lambda x: x[1], reverse=True)
    result = [i[0] for i in result]
    print(result)
    for i in range(K):
        tmp = [inverse_object_id_mapping[j] for j in result[i]]
        final = zip([j['name'] for j in tmp],
                    [j['supercategory'] for j in tmp])
        for j in final:
            print(j)
        print("-" * 50)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
Example #19
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval(
    )  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy(
    )  # (1, max_seq_length) -> (max_seq_length)
    print(sampled_ids)
    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        #print(word_id)
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the image and the generated caption
    print(sentence)

    #add
    #message = raw_input("Enter message to encode: ")

    #print("Decoded string (in ASCII):")
    #for ch in sentence:
    # print(ord(ch))
    #    print("\t")

    sen = list(sentence.split(" "))
    # print(sen)
    sen1 = sen[1:-1]
    #print([sen1])
    #end

    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
    #print(args.image)

    # Reference captions for the bundled test images, keyed by image path.
    reference_captions = {
        "png/ex1.jpg": ['a', 'picture', 'of', 'an', 'elephant', 'on', 'a', 'road', '.'],
        "png/ex2.jpg": ['a', 'man', 'holding', 'tennis', 'racket', 'in', 'a', 'tennis', 'court'],
        "png/ex3.jpg": ['a', 'man', 'and', 'woman', 'are', 'standing', 'near', 'a', 'beach', '.'],
        "png/ex4.jpg": ['a', 'man', 'is', 'sitting', 'at', 'a', 'table', 'with', 'a', 'laptop', 'on', 'it'],
        "png/ex5.jpg": ['a', 'group', 'of', 'people', 'sitting', 'in', 'a', 'room', 'working'],
        "png/ex6.jpg": ['a', 'man', 'playing', 'tennis', 'in', 'a', 'court'],
        "png/ex7.jpg": ['a', 'fire', 'hydrant', 'is', 'on', 'a', 'snowy', 'streets', 'with', 'trees', '.'],
        "png/ex8.jpg": ['an', 'indoor', 'court', 'with', 'table', 'tennis', 'tables'],
        "png/ex9.jpg": ['a', 'man', 'sitting', 'at', 'a', 'table', 'talking', 'to', 'another', 'man', '.'],
        "png/ex10.jpg": ['a', 'cat', 'is', 'sitting', 'on', 'floor', 'with', 'a', 'man', 'standing', 'behind', 'it'],
        "png/ex12.jpg": ['motocycles', 'parked', 'in', 'a', 'parking', 'lot', '.'],
        "png/ex13.jpg": ['a', 'zebra', 'standing', 'next', 'to', 'a', 'zebra', 'on', 'an', 'ice', 'road', '.'],
        "png/ex14.jpg": ['a', 'black', 'dog', 'and', 'two', 'cats', 'laying', 'on', 'a', 'bed', '.'],
        "png/ex16.jpg": ['a', 'woman', 'is', 'cutting', 'apples', 'at', 'a', 'table', '.'],
        "png/ex19.jpg": ['a', 'black', 'bear', 'is', 'walking', 'through', 'a', 'stony', 'road', '.'],
        "png/ex21.jpg": ['a', 'table', 'with', 'many', 'plates', 'of', 'food', '.'],
        "png/ex22.jpg": ['a', 'brown', 'bear', 'is', 'sitting', 'in', 'the', 'graph', '.'],
        "png/ex24.jpg": ['a', 'group', 'of', 'people', 'playing', 'in', 'a', 'field', 'with', 'a', 'frisbee', '.'],
        "png/ex25.jpg": ['a', 'group', 'of', 'sheep', 'standing', 'in', 'a', 'field', '.'],
        "png/example2.jpg": ['a', 'truck', 'and', 'a', 'car', 'parked', 'in', 'a', 'parking', 'lot', '.'],
        "png/img5.jpg": ['a', 'vase', 'filled', 'with', 'flowers', 'on', 'a', 'table', '.'],
        "png/img10.jpg": ['a', 'woman', 'is', 'sitting', 'at', 'a', 'table', 'with', 'a', 'cake', 'on', 'it', '.'],
        "png/img18.jpg": ['a', 'person', 'holding', 'a', 'coconut', '.'],
        "png/puppy.jpg": ['a', 'dog', 'is', 'laying', 'on', 'the', 'floor', 'with', 'a', 'pillow', 'at', 'its', 'side', '.'],
    }

    # If a reference caption exists for this image, print it and report the
    # BLEU score of the generated caption against it.
    if args.image in reference_captions:
        caption = reference_captions[args.image]
        print(caption)
        score1 = bluescore([sen1], caption)
        print(score1)
Beispiel #20
0
def evaluate_with_beam_search(args):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    dataset = Dataset({
        'data_dir': args['data_dir'],
        'exp_dir': args['exp_dir'],
        'raw_data_dir': args['raw_data_dir'],
        'transform': transform,
        'mode': 'test'
    })
    args['vocab_size'] = len(dataset.vocab)

    encoder = EncoderCNN(args).eval()
    decoder = DecoderRNN(args).eval()

    encoder = encoder.to(device)
    decoder = decoder.to(device)

    encoder.load_state_dict(
        torch.load(os.path.join(args['model_path'], 'encoder.pt')))
    decoder.load_state_dict(
        torch.load(os.path.join(args['model_path'], 'decoder.pt')))

    generated_captions = []
    image_ids = []
    target_captions = []

    for idx in range(len(dataset.ids)):
        image_id, image, captions = dataset.get_test_item(idx)
        image = image.to(device)
        print(idx)

        features = encoder(image)
        generated_sents = decoder.decode_with_beam_search(features)
        #        print(generated_sents)
        sents = []
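        # Each beam entry is assumed to be a (word_ids, score) pair; the score is
        # divided by the caption length below so that longer beams are not
        # penalised when the candidates are ranked.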
        for sent_id in generated_sents:
            words = []
            for word_id in sent_id[0]:
                if dataset.vocab.idx2word[word_id] == '<start>':
                    continue
                elif dataset.vocab.idx2word[word_id] != '<end>':
                    words.append(dataset.vocab.idx2word[word_id])
                else:
                    break

            sents.append((' '.join(words), sent_id[1] / len(sent_id[0])))
        sents = sorted(sents, key=lambda x: x[1], reverse=True)
        generated_captions.append(sents)
        image_ids.append(image_id)
        target_captions.append(captions)

    image_captions = [{
        'image_id': image_ids[idx],
        'caption': generated_captions[idx][0][0]
    } for idx in range(len(image_ids))]

    captions_path = os.path.join(args['exp_dir'], args['model_dir'],
                                 args['caption_fils'])
    image_caption_path = os.path.join(args['exp_dir'], args['model_dir'],
                                      args['evaluation_file'])

    with open(captions_path, 'w') as f:
        for idx in range(len(generated_captions)):
            f.write('*' * 50 + '\n')
            f.write('-' * 20 + 'generated_captions' + '-' * 20 + '\n')
            for sent in generated_captions[idx]:
                f.write(sent[0] + '\n')
            f.write('-' * 20 + 'target_captions' + '-' * 20 + '\n')
            for words in target_captions[idx]:
                f.write(' '.join(words) + '\n')
            f.write('*' * 50 + '\n')
            f.write('\n')

    with open(image_caption_path, 'w') as f:
        json.dump(image_captions, f)
Beispiel #21
0
def main(args):
    threshold = 20
    captions_dict = load_captions(train_dir)
    vocab = Vocabulary(captions_dict, threshold)
    vocab_size = vocab.index
    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    dataloader = DataLoader(val_dir, vocab, transform)
    imagenumbers, captiontotal, imagetotal = dataloader.gen_data()

    # Build data loader
    data_loader = get_loader(imagenumbers,
                             captiontotal,
                             imagetotal,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, vocab_size,
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(encoder_path))
    decoder.load_state_dict(torch.load(decoder_path))

    total_step = len(data_loader)

    # List to store the BLEU scores
    bleu_scores = []

    for i, (images, captions, lengths) in enumerate(data_loader):

        # Set mini-batch dataset
        images = images.to(device)
        # captions = captions.to(device)

        # Generate a caption from the image
        feature = encoder(images)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.get_word(word_id)
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        # sentence_bleu expects a list of tokenised references and a tokenised
        # hypothesis, so convert the reference caption ids to words and compare
        # them against the sampled tokens.
        reference = [vocab.get_word(int(idx)) for idx in captions[0][:lengths[0]]]
        score = sentence_bleu([reference], sampled_caption, args.bleu_weights)
        bleu_scores.append(score)

        # Print log info
        if i % args.log_step == 0:
            print('Finish [{}/{}], Current BLEU Score: {:.4f}'.format(
                i, total_step, np.mean(bleu_scores)))
            print(sentence)
            print(captions)

    np.save('test_results.npy', [bleu_scores, np.mean(bleu_scores)])
Beispiel #22
0
def _main():
    parser = argparse.ArgumentParser()
    parser.add_argument("filename", help="(optional) path to photograph, for which a caption will be generated", nargs="?")
    parser.add_argument("--host", help="(optional) host to start a webserver on. Default: 0.0.0.0", nargs="?", default="0.0.0.0")
    parser.add_argument("--port", help="(optional) port to start a webserver on. http://hostname:port/query", nargs="?", type=int, default=1985)
    parser.add_argument("--verbose", "-v", help="print verbose query information", action="store_true")

    global _args
    _args = parser.parse_args()

    if not _args.filename and not _args.port:
        parser.print_help()
        sys.exit(-1)

    global _device
    _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("PyTorch device = ", _device)

    # Load the vocabulary dictionary
    vocab_threshold = None
    vocab_file = "./vocab.pkl"
    start_word = "<start>"
    end_word   = "<end>"
    unk_word   = "<unk>"
    load_existing_vocab = True
    #annotations_file = "/opt/cocoapi/annotations/captions_train2014.json"
    annotations_file = None

    print("Loading vocabulary...")
    global _vocab
    _vocab = Vocabulary(vocab_threshold, vocab_file, start_word, end_word, unk_word, annotations_file, load_existing_vocab)
    vocab_size = len(_vocab)
    print("Vocabulary contains %d words" % vocab_size)

    # Load pre-trained models: 
    # encoder (Resnet + embedding layers)
    # decoder (LSTM)
    global _encoder
    global _decoder
    encoder_path = os.path.join("./models/", _encoder_file)
    decoder_path = os.path.join("./models/", _decoder_file)
    print("Loading ", encoder_path)
    _encoder = EncoderCNN(_embed_size)
    _encoder.load_state_dict(torch.load(encoder_path))
    _encoder.eval()
    _encoder.to(_device)

    print("Loading ", decoder_path)
    _decoder = DecoderRNN(_embed_size, _hidden_size, vocab_size, _num_layers)
    _decoder.load_state_dict(torch.load(decoder_path))
    _decoder.eval()
    _decoder.to(_device)

    # Caption the photo, or start a web server if no photo specified
    if _args.filename:
        _get_prediction_from_file(_args.filename)
    else:
        global _app
        global _api

        _app = Flask(__name__)
        _api = Api(_app)

        _api.add_resource(ImageCaptionResource,
                "/v1/caption",
                "/v1/caption/")
        _app.run(host=_args.host, port=_args.port)
Beispiel #23
0
def main():
    ####################################################
    # config
    ####################################################
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    config = {}
    config['dataset'] = 'COCO'
    config['vocab_word2idx_path'] = './vocab/save/COCO/vocab/thre5_word2idx.pkl'
    config['vocab_idx2word_path'] = './vocab/save/COCO/vocab/thre5_idx2word.pkl'
    config['vocab_idx_path'] = './vocab/save/COCO/vocab/thre5_idx.pkl'
    config['crop_size'] = 224
    config['images_root'] = './data/COCO/train2014_resized'
    config['json_file_path_train'] = './data/COCO/annotations/captions_mini100.json'
    config['json_file_path_val'] = './data/COCO/annotations/captions_val2014.json'
    config['batch_size'] = 128
    config['embed_size'] = 256
    config['hidden_size'] = 512
    config['learning_rate'] = 1e-4
    config['epoch_num'] = 20
    config['save_step'] = 10
    config['model_save_root'] = './save/'

    config['encoder_path'] = './save/'
    config['decoder_path'] = './save/'
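    # NOTE: torch.load() below expects the actual checkpoint files; pointing
    # 'encoder_path' and 'decoder_path' at the './save/' directory alone will
    # fail, so append the file names written during training.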

    ####################################################
    # load vocabulary
    ####################################################
    vocab = Vocabulary()
    with open(config['vocab_word2idx_path'], 'rb') as f:
        vocab.word2idx = pickle.load(f)
    with open(config['vocab_idx2word_path'], 'rb') as f:
        vocab.idx2word = pickle.load(f)
    with open(config['vocab_idx_path'], 'rb') as f:
        vocab.idx = pickle.load(f)

    ####################################################
    # create data_loader
    ####################################################
    normalize = {
        'Flickr8k': [(0.4580, 0.4464, 0.4032), (0.2318, 0.2229, 0.2269)],
        'Flickr30k': None,
        'COCO': [(0.485, 0.456, 0.406), (0.229, 0.224, 0.225)]
    }
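    # Only Flickr8k and COCO have normalisation statistics here; selecting
    # 'Flickr30k' would leave normalize[config['dataset']] as None and break
    # the transform below.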

    transform = transforms.Compose([
        transforms.RandomCrop(config['crop_size']),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(normalize[config['dataset']][0],
                             normalize[config['dataset']][1])
    ])

    loader_train = get_loader(dataset_name=config['dataset'],
                              images_root=config['images_root'],
                              json_file_path=config['json_file_path_train'],
                              vocab=vocab,
                              transform=transform,
                              batch_size=config['batch_size'],
                              shuffle=True,
                              is_train=True)
    loader_val = get_loader(dataset_name=config['dataset'],
                            images_root=config['images_root'],
                            json_file_path=config['json_file_path_val'],
                            vocab=vocab,
                            transform=transform,
                            batch_size=1,
                            shuffle=False,
                            is_val=True)

    ####################################################
    # create model
    ####################################################
    encoder = EncoderCNN(config['embed_size'])
    decoder = DecoderRNN(config['embed_size'], config['hidden_size'],
                         len(vocab), 1)
    encoder.load_state_dict(torch.load(config['encoder_path']))
    decoder.load_state_dict(torch.load(config['decoder_path']))
    encoder.to(device)
    decoder.to(device)

    ####################################################
    # create trainer
    ####################################################
    raw_captions = []
    sampled_captions = []

    encoder.eval()
    decoder.eval()
    for i, (image, caption, length) in enumerate(tqdm(loader_val)):
        image = image.to(device)
        feature = encoder(image)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<END>':
                break
        raw_caption = [[vocab(int(token)) for token in list(caption[0])]]
        sampled_caption = sampled_caption[1:-1]  # delete <START> and <END>
        # if sampled_caption[-1] != '.':
        #     sampled_caption.append('.')
        raw_caption[0] = raw_caption[0][1:-1]  # delete <START> and <END>
        raw_captions.append(raw_caption)
        sampled_captions.append(sampled_caption)

    hypo = {}
    for i, caption in enumerate(sampled_captions):
        hypo[i] = [' '.join(caption)]
    ref = {}
    for i, caption in enumerate(raw_captions):
        ref[i] = [' '.join(caption[0])]

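    # Bleu().compute_score is assumed to follow the pycocoevalcap interface and
    # return (corpus_scores, per_sentence_scores); final_scores[0] is then the
    # list of corpus-level BLEU-1..4 values.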
    final_scores = Bleu().compute_score(ref, hypo)
    print(final_scores[0])
Beispiel #24
0
test_dataset = CategoryDataset(
    transform=transform,
    data_file="test_no_dup_with_category_3more_name.json",
    use_mean_img=False,
    neg_samples=False)
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=4,
    collate_fn=lstm_collate_fn,
)
###############################################################################

encoder_cnn = EncoderCNN(emb_size)
encoder_cnn = encoder_cnn.to(device)

if model == "lstm":
    f_rnn = LSTMModel(emb_size,
                      emb_size,
                      emb_size,
                      device,
                      bidirectional=False)
    b_rnn = LSTMModel(emb_size,
                      emb_size,
                      emb_size,
                      device,
                      bidirectional=False)
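# Note: f_rnn and b_rnn are only defined when model == "lstm"; the two
# .to(device) calls below would raise a NameError for any other model value.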
f_rnn = f_rnn.to(device)
b_rnn = b_rnn.to(device)
Beispiel #25
0
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    ###### Prepare a batch of test images  #########
    data_loader = get_loader(args.image_path,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    actual = []
    predicted = []
    count = 0
    pdict = {}
    adict = {}

    for i, (images, captions, lengths) in enumerate(data_loader):
        #print(captions.shape,lengths)
        images = images.to(device)
        captions = captions.to(device)
        features = encoder(images)
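        # decoder.sample is assumed to greedily decode one caption per image,
        # returning word-id sequences of shape (batch_size, max_seq_length).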
        outputs = decoder.sample(features)
        for bnum in range(len(outputs)):
            output = outputs[bnum].cpu().numpy()
            predicted_array = []
            for wid in output:
                word = vocab.idx2word[wid]
                if word == '<end>':
                    break
                predicted_array.append(word)

            predicted.append(' '.join(predicted_array))

            actual_caption = captions[bnum]
            actual_arr = []
            actual_caption = actual_caption.cpu().numpy()
            for wid in actual_caption:
                word = vocab.idx2word[wid]
                actual_arr.append(word)
            actual.append(' '.join(actual_arr))
            if count % 128 == 0:
                pdict['output'] = predicted
                adict['output'] = actual
                with open('test_set_results/test_prediction.json',
                          'w') as outfile:
                    json.dump(pdict, outfile)
                with open('test_set_results/test_actual.json', 'w') as outfile:
                    json.dump(adict, outfile)
            count = count + 1

    pdict['output'] = predicted
    adict['output'] = actual
    with open('test_set_results/test_prediction.json', 'w') as outfile:
        json.dump(pdict, outfile)
    with open('test_set_results/test_actual.json', 'w') as outfile:
        json.dump(adict, outfile)
Beispiel #26
0
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    print('Loading vocab  ')
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    print('Building models')
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size,
                         args.hidden_size,
                         len(vocab),
                         args.num_layers,
                         use_inference=args.use_inference).eval()
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    print('Loading models')
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    frame_rate = 5.0  # frame rate
    im_size = 500
    w, h = 1000, 800
    size = (w, h)  # video frame size
    fmt = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')  # file format (mp4 here)
    writer = cv2.VideoWriter(args.out_file, fmt, frame_rate, size)  # create the video writer

    results_dict = {}

    accounts = [
        p.split('/')[-1].split('.')[0] for p in glob.glob('data/annos/*')
    ]
    # accounts = args.use_account
    for user in accounts:
        print(user)
        ann_path = os.path.join(f"data/annos/{user}.pickle")
        annos = loadPickle(ann_path)

        if args.split == 'train':
            annos = annos[:-20]
        elif args.split == 'val':
            annos = annos[-20:]

        m = min(len(annos), 100)  # number of images to process
        for i in range(m):
            ann = annos[i]

            image_path = f'data/images/{ann["filename"]}'
            if args.split == 'val' and image_path in duplicated_images_path:
                print('is duplicated image.')
            else:
                orig_text = ann['text']

                image = load_image(image_path, transform)
                # image = load_image(args.image, transform)
                image_tensor = image.to(device)

                # Generate a caption from the image
                feature = encoder(image_tensor)
                sampled_ids = decoder.sample(feature)
                sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

                # Convert word_ids to words
                sampled_caption = []
                for word_id in sampled_ids:
                    word = vocab.idx2word[word_id]
                    sampled_caption.append(word)
                    if word == '<end>':
                        break
                sentence = ' '.join(sampled_caption)

                # Print out the image and the generated caption
                # print(type(sentence))
                print(sentence)
                # print(sentence.encode('utf_8'))
                # image = Image.open(args.image)
                # plt.imshow(np.asarray(image))

                img = cv2.imread(image_path)
                img = resize_square_pad(img)

                frame = np.zeros((h, w, 3)).astype('uint8')
                frame[h - im_size:, :im_size, :] = img
                frame = cv2.putText(frame, image_path, (10, h - 20),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0),
                                    1, cv2.LINE_AA)

                n = 20
                s = 'GT: \n'
                for i in range(len(orig_text) // n + 1):
                    s += orig_text[n * i:n * (i + 1)] + '\n'
                frame = puttext(frame,
                                s,
                                point=(15, 20),
                                color=(255, 255, 255))

                s = 'Result: \n'
                res = sentence.replace('<start>',
                                       '').replace('<end>',
                                                   '').replace(' ', '')
                for i in range(len(res) // n + 1):
                    s += res[n * i:n * (i + 1)] + '\n'
                frame = puttext(frame,
                                s,
                                point=(15, (h - im_size) / 2 + 20),
                                color=(0, 255, 0))

                # Find a training image whose caption is similar to the generated result
                sim_image_file = None
                for text in text2file:
                    if (res[:8] in text) or (res[-8:] in text):
                        sim_image_file = text2file[text]
                        break
                if sim_image_file is not None:
                    img = cv2.imread(sim_image_file)
                    img = resize_square_pad(img)
                    frame[h - im_size:, im_size:, :] = img
                    frame = cv2.putText(frame, sim_image_file,
                                        (10 + im_size, h - 20),
                                        cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                                        (0, 255, 0), 1, cv2.LINE_AA)
                    s = 'Similar data: \n'
                    for i in range(len(text) // n + 1):
                        s += text[n * i:n * (i + 1)] + '\n'
                    frame = puttext(frame,
                                    s,
                                    point=(15 + im_size,
                                           (h - im_size) / 2 + 20),
                                    color=(0, 0, 255))
                sim_image_file = None

                writer.write(frame)

                # Record the result for this image
                results_dict[image_path] = res

    writer.release()

    # Save the results
    with open(args.result_out_file, 'wb') as f:
        pickle.dump(results_dict, f)
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    ###### Prepare a batch of test images  #########
    data_loader = get_loader(args.image_path,
                             args.caption_path,
                             vocab,
                             transform,
                             args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    arr_predicted = []
    arr_actual = []
    count = 0
    f_a = open('actual_label_coco.txt', 'w+')
    f_p = open('predict_label_coco.txt', 'w+')

    for i, (images, captions, lengths) in enumerate(data_loader):
        #print(captions.shape,lengths)
        images = images.to(device)
        captions = captions.to(device)
        features = encoder(images)
        outputs = decoder.sample(features)
        for bnum in range(len(outputs)):  # iterate over the actual batch size
            output = outputs[bnum].cpu().numpy()
            predicted_array = []
            for wid in output:
                word = vocab.idx2word[wid]
                if word == '<end>':
                    break
                predicted_array.append(word)

            predicted = ' '.join(predicted_array)
            f_p.write(predicted)
            f_p.write('\n')

            actual_caption = captions[bnum]
            actual_arr = []
            actual_caption = actual_caption.cpu().numpy()
            for wid in actual_caption:
                word = vocab.idx2word[wid]
                actual_arr.append(word)
            actual = ' '.join(actual_arr)
            f_a.write(actual)
            f_a.write('\n')
            print(predicted)
            print(actual)
            print(images[bnum].cpu().numpy().transpose(1, 2, 0).shape)
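            # scipy.misc.imsave was deprecated and later removed from SciPy;
            # imageio.imwrite is the usual drop-in replacement on newer installs.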
            scipy.misc.imsave('temp_dir/{}.jpg'.format(bnum),
                              images[bnum].cpu().numpy().transpose(1, 2, 0))
            count = count + 1
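        # Note: closing the output files and returning inside the batch loop
        # means only the first batch is evaluated; move these below the loop to
        # process the whole test set.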
        f_a.close()
        f_p.close()
        return