Code example #1
    def __init__(self, subencoder_setting, decoder_setting, padding_idx=0):
        super(SubToSeqFix, self).__init__()
        self.trainStep = 0
        self.subRnn = EncoderRNN(**subencoder_setting)
        self.fc = nn.Linear(subencoder_setting["output_size"],
                            decoder_setting["feature_size"])
        self.decoderRnn = DecoderRNN(**decoder_setting)
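    # The constructor above only wires the modules together. A minimal sketch of
    # how they might be chained in a forward pass, assuming the subtitle encoder
    # returns one feature vector per sequence (the method signature and shapes
    # below are assumptions, not part of the original code):
    def forward(self, subs, sub_lengths, captions, cap_lengths):
        # Encode the subtitle tokens into a fixed-size representation.
        sub_features = self.subRnn(subs, sub_lengths)
        # Project the encoder output into the decoder's feature size.
        features = self.fc(sub_features)
        # Decode the caption conditioned on the projected features.
        return self.decoderRnn(features, captions, cap_lengths)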
Code example #2
    def __init__(self,
                 video_setting,
                 subencoder_setting,
                 decoder_setting,
                 padding_idx=0):
        super(SubImgToSeq, self).__init__()
        self.trainStep = 0
        self.videoRnn = VideoEncoder(**video_setting)
        self.subRnn = EncoderRNN(**subencoder_setting)

        self.context = Context(video_feature=video_setting["output_size"],
                               sub_feature=subencoder_setting["output_size"],
                               output_size=decoder_setting["feature_size"])

        self.decoderRnn = DecoderRNN(**decoder_setting)
Code example #3
def main(image_path, decoder_path, encoder_path, vocab_path, embed_size,
         hidden_size, num_layers):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    # eval mode (batchnorm uses moving mean/variance)
    encoder = EncoderCNN(embed_size).eval()
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(encoder_path, map_location=device))
    decoder.load_state_dict(torch.load(decoder_path, map_location=device))

    # Prepare an image
    image = load_image(image_path, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    # (1, max_seq_length) -> (max_seq_length)
    sampled_ids = sampled_ids[0].cpu().numpy()

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the generated caption
    print(sentence)
    return sentence
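# `load_image` is called above but not defined in these snippets. A minimal
# sketch, assuming a PIL-based helper that applies the transform and adds a
# batch dimension (the exact behaviour is an assumption, not taken from the
# original code):
from PIL import Image


def load_image(image_path, transform=None):
    # Open the image and make sure it has three channels.
    image = Image.open(image_path).convert('RGB')
    if transform is not None:
        # Apply the preprocessing pipeline and add a batch dimension.
        image = transform(image).unsqueeze(0)
    return image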
Code example #4
def test(image_path, state_path, vocab_path, embed_size, hidden_size,
         num_layers, img_size):

    transform = transforms.Compose([
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(embed_size).eval()
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    state = torch.load(state_path, map_location=device)
    encoder.load_state_dict(state["encoder"])
    decoder.load_state_dict(state["decoder"])

    # Prepare an image
    image = load_image(image_path, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)

    # (1, max_seq_length) -> (max_seq_length)
    sampled_ids = sampled_ids[0].cpu().numpy()

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    return sentence
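# A hypothetical call to the function above; every path and hyperparameter here
# is an illustrative placeholder, not a value from the original project:
sentence = test(image_path='sample.jpg',
                state_path='models/state-epoch-5-loss-2.1.ckpt',
                vocab_path='data/vocab.pkl',
                embed_size=256,
                hidden_size=512,
                num_layers=1,
                img_size=224)
print(sentence)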
Code example #5
def main(args):
    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.Resize((args.crop_size, args.crop_size)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             args.test_img,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Build models
    # eval mode (batchnorm uses moving mean/variance)
    encoder = EncoderCNN(args.embed_size).eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    state = torch.load(args.state_path, map_location=device)
    encoder.load_state_dict(state["encoder"])
    decoder.load_state_dict(state["decoder"])

    scores = []
    # Evaluate the model
    total_step = len(data_loader)
    for i, (images, captions, lengths) in enumerate(data_loader):
        # Set mini-batch dataset
        images = images.to(device)
        captions = captions.to(device)
        # pack_padded_sequence expects the lengths on the CPU
        targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

        # Generate a caption from the image
        feature = encoder(images)

        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids.cpu().numpy()

        targets_captions = []
        captions = captions.cpu().numpy()
        for samp in captions:
            caption = []

            for word_id in samp:
                word = vocab.idx2word[word_id]
                caption.append(word)
                if word == '<end>':
                    break
            targets_captions.append(caption)

        # Convert word_ids to words
        predicted_captions = []
        for samp in sampled_ids:
            caption = []

            for word_id in samp:
                word = vocab.idx2word[word_id]
                caption.append(word)
                if word == '<end>':
                    break
            predicted_captions.append(caption)

        print("targets_captions: ", targets_captions[:20])
        print("predicted_captions: ", predicted_captions[:20])

        references = [[targets_captions[0]]]
        candidates = [predicted_captions[0]]
        score = corpus_bleu(references, candidates)

        print("references: ", references)
        print("candidates: ", candidates)
        print(score)
        scores.append(score)
    print('bleu av: ', sum(scores) / len(scores))
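# `corpus_bleu` above comes from NLTK, whose import is not shown in the snippet.
# A minimal usage sketch with smoothing added so short candidates do not score
# zero (the smoothing choice and example tokens are assumptions, not part of the
# original code):
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

references = [[['a', 'man', 'rides', 'a', 'horse', '<end>']]]
candidates = [['a', 'man', 'is', 'riding', 'a', 'horse', '<end>']]
print(corpus_bleu(references, candidates,
                  smoothing_function=SmoothingFunction().method1))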
Code example #6
])

# Build data loader.
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file)

# The size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)
print(f'Vocabulary size: {vocab_size}')

# Initialize the encoder and decoder.
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Move models to GPU if CUDA is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Define the loss function.
criterion = nn.CrossEntropyLoss().cuda() \
    if torch.cuda.is_available() else nn.CrossEntropyLoss()

# Specify the learnable parameters of the model.
# Dummy
params = list(decoder.parameters()) + list(encoder.embed.parameters())

# Define the optimizer.
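# The optimizer definition is cut off in this snippet. A minimal sketch, assuming
# Adam and a placeholder learning rate (neither is confirmed by the original):
optimizer = torch.optim.Adam(params, lr=1e-3)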
Code example #7
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.Resize((args.crop_size, args.crop_size)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir,
                             args.caption_path,
                             vocab,
                             args.train_img,
                             transform,
                             args.batch_size,
                             shuffle=True,
                             num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)

    # TODO: `encoder.train()` and `decoder.train()` should be called here

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.classifier.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):

        for i, (images, captions, lengths) in enumerate(tqdm(data_loader)):
            images = images.to(device)
            captions = captions.to(device)
            # pack_padded_sequence expects the lengths on the CPU
            targets = pack_padded_sequence(captions, lengths,
                                           batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # TODO: use `tqdm`
            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args.num_epochs, i, total_step, loss.item(),
                            np.exp(loss.item())))

        print('saving weights')
        torch.save(
            dict(decoder=decoder.state_dict(), encoder=encoder.state_dict()),
            os.path.join(
                args.model_path,
                'state-epoch-{}-loss-{:.4f}.ckpt'.format(epoch + 1,
                                                         loss.item())))
Code example #8
    def __init__(self, video_setting, decoder_setting, padding_idx=0):
        super(ImgToSeq, self).__init__()
        self.trainStep = 0
        self.videoRnn = VideoRNN(**video_setting)
        self.decoderRnn = DecoderRNN(**decoder_setting)
Code example #9
    def __init__(self, subencoder_setting, decoder_setting, padding_idx=0):
        super(SubToSeq, self).__init__()
        self.trainStep = 0
        self.subRnn = EncoderRNN(**subencoder_setting)

        self.decoderRnn = DecoderRNN(**decoder_setting)
Code example #10
# --- Load Model ---
# Specify the saved models to load.
encoder_file = 'encoder-3.pkl'
decoder_file = 'decoder-3.pkl'

# Select appropriate values for the Python variables below.
embed_size = 512
hidden_size = 256

# The size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the encoder and decoder, and set each to inference mode.
encoder = EncoderCNN(embed_size)
encoder.eval()
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
decoder.eval()

# Load the trained weights.
encoder.load_state_dict(
    torch.load(os.path.join('./models', encoder_file), map_location=device))
decoder.load_state_dict(
    torch.load(os.path.join('./models', decoder_file), map_location=device))

# Move models to GPU if CUDA is available.
encoder.to(device)
decoder.to(device)

# --- Create Sample Output ---
# Move the image PyTorch tensor to the GPU if CUDA is available.
image = image.to(device)

# Obtain the embedded image features.
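# The snippet is cut off after the comment above. A minimal sketch of the
# remaining feature-extraction and sampling steps, assuming the same
# `sample` / `idx2word` interface as the earlier examples (none of this is
# taken from the original notebook):
features = encoder(image)
sampled_ids = decoder.sample(features)

# Convert the sampled word ids back into words, stopping at '<end>'.
caption = []
for word_id in sampled_ids:
    word = data_loader.dataset.vocab.idx2word[word_id]
    caption.append(word)
    if word == '<end>':
        break
print(' '.join(caption))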
Code example #11
        plt.title(caps)
        plt.axis('off')
    plt.show()


# visualize(train_loader)

# assert len(os.listdir('/train2017')
#            ) == train_samples, 'Make sure to have full images in images/train2017'

# prepare training
VOCAB_SIZE = len(train_loader.dataset.vocab)

encoder = EncoderCNN(EMBED_SIZE)  # .to(device)
decoder = DecoderRNN(EMBED_SIZE, HIDDEN_SIZE, VOCAB_SIZE)  # .to(device)
print('Encoder:\n', encoder)
print('Decoder:\n', decoder)
all_params = list(encoder.linear.parameters()) + \
    list(encoder.bn1.parameters()) + list(decoder.parameters())

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=all_params, lr=1e-05)
total_step = math.ceil(len(train_loader.dataset.cap_len) / BATCH_SIZE)

model_save_path = '/checkpoint'
if not os.path.exists(model_save_path):
    os.makedirs(model_save_path, exist_ok=True)


def train_predict(encoder, decoder):