def __init__(self, subencoder_setting, decoder_setting, padding_idx=0):
    super(SubToSeqFix, self).__init__()
    self.trainStep = 0
    self.subRnn = EncoderRNN(**subencoder_setting)
    self.fc = nn.Linear(subencoder_setting["output_size"],
                        decoder_setting["feature_size"])
    self.decoderRnn = DecoderRNN(**decoder_setting)
def __init__(self, video_setting, subencoder_setting, decoder_setting,
             padding_idx=0):
    super(SubImgToSeq, self).__init__()
    self.trainStep = 0
    self.videoRnn = VideoEncoder(**video_setting)
    self.subRnn = EncoderRNN(**subencoder_setting)
    self.context = Context(video_feature=video_setting["output_size"],
                           sub_feature=subencoder_setting["output_size"],
                           output_size=decoder_setting["feature_size"])
    self.decoderRnn = DecoderRNN(**decoder_setting)
def main(image_path, decoder_path, encoder_path, vocab_path, embed_size,
         hidden_size, num_layers):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(encoder_path, map_location=device))
    decoder.load_state_dict(torch.load(decoder_path, map_location=device))

    # Prepare an image
    image = load_image(image_path, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the generated caption
    print(sentence, "!")
    return sentence
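# --- Hedged usage sketch (not part of the original script) ---
# Shows how the single-image captioning entry point above might be invoked.
# Every path and hyperparameter value here is a hypothetical placeholder;
# substitute your own checkpoints, vocabulary pickle, and model sizes.
if __name__ == '__main__':
    sentence = main(image_path='example.jpg',            # hypothetical image
                    decoder_path='models/decoder.ckpt',  # hypothetical checkpoint
                    encoder_path='models/encoder.ckpt',  # hypothetical checkpoint
                    vocab_path='data/vocab.pkl',         # hypothetical vocab pickle
                    embed_size=256,                      # assumed to match training
                    hidden_size=512,
                    num_layers=1)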
def test(image_path, state_path, vocab_path, embed_size, hidden_size,
         num_layers, img_size):
    transform = transforms.Compose([
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(embed_size).eval()
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    state = torch.load(state_path, map_location=device)
    encoder.load_state_dict(state["encoder"])
    decoder.load_state_dict(state["decoder"])

    # Prepare an image
    image = load_image(image_path, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    return sentence
def main(args):
    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.Resize((args.crop_size, args.crop_size)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             args.test_img, transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    state = torch.load(args.state_path, map_location=device)
    encoder.load_state_dict(state["encoder"])
    decoder.load_state_dict(state["decoder"])

    scores = []

    # Evaluate the model
    total_step = len(data_loader)
    for i, (images, captions, lengths) in enumerate(data_loader):
        # Set mini-batch dataset
        images = images.to(device)
        lengths = lengths.to(device)
        captions = captions.to(device)
        targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

        # Generate a caption from the image
        feature = encoder(images)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids.cpu().numpy()

        # Convert target word ids to words
        targets_captions = []
        captions = captions.cpu().numpy()
        for samp in captions:
            caption = []
            for word_id in samp:
                word = vocab.idx2word[word_id]
                caption.append(word)
                if word == '<end>':
                    break
            targets_captions.append(caption)

        # Convert predicted word ids to words
        predicted_captions = []
        for samp in sampled_ids:
            caption = []
            for word_id in samp:
                word = vocab.idx2word[word_id]
                caption.append(word)
                if word == '<end>':
                    break
            predicted_captions.append(caption)

        print("targets_captions: ", targets_captions[:20])
        print("predicted_captions: ", predicted_captions[:20])

        references = [[targets_captions[0]]]
        candidates = [predicted_captions[0]]
        score = corpus_bleu(references, candidates)
        print("references: ", references)
        print("candidates: ", candidates)
        print(score)
        scores.append(score)

    print('bleu av: ', sum(scores) / len(scores))
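# --- Hedged alternative BLEU computation (an assumption, not the original logic) ---
# The loop above scores only the first reference/candidate pair of each batch.
# The sketch below scores the whole mini-batch with NLTK's corpus_bleu and adds
# smoothing so that short candidates do not collapse to a zero score.
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction


def batch_bleu(targets_captions, predicted_captions):
    # One ground-truth caption per image, wrapped as a list of references.
    references = [[ref] for ref in targets_captions]
    return corpus_bleu(references, predicted_captions,
                       smoothing_function=SmoothingFunction().method1)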
])

# Build data loader.
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file)

# The size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)
print(f'Vocabulary size: {vocab_size}')

# Initialize the encoder and decoder.
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Move models to GPU if CUDA is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Define the loss function.
criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() \
    else nn.CrossEntropyLoss()

# Specify the learnable parameters of the model.
# Dummy
params = list(decoder.parameters()) + list(encoder.embed.parameters())

# Define the optimizer.
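# --- Hedged completion (assumption) ---
# The snippet above stops at "Define the optimizer." Mirroring the other training
# scripts in this file, Adam over `params` is a reasonable choice; the learning
# rate below is a placeholder, not a value taken from the original notebook.
optimizer = torch.optim.Adam(params, lr=0.001)  # lr=0.001 is hypothetical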
def main(args):
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)

    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.Resize((args.crop_size, args.crop_size)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             args.train_img, transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)

    # Build the models
    encoder = EncoderCNN(args.embed_size).to(device)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers).to(device)
    # TODO: `model.train()` should be called here

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    params = list(decoder.parameters()) + list(encoder.classifier.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # Train the models
    total_step = len(data_loader)
    for epoch in range(args.num_epochs):
        for i, (images, captions, lengths) in enumerate(tqdm(data_loader)):
            lengths = lengths.to(device)
            images = images.to(device)
            captions = captions.to(device)
            targets = pack_padded_sequence(captions, lengths, batch_first=True)[0]

            # Forward, backward and optimize
            features = encoder(images)
            outputs = decoder(features, captions, lengths)
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # TODO: use `tqdm` for logging
            # Print log info
            if i % args.log_step == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                      .format(epoch, args.num_epochs, i, total_step,
                              loss.item(), np.exp(loss.item())))

        print('saving weights')
        torch.save(
            dict(decoder=decoder.state_dict(), encoder=encoder.state_dict()),
            os.path.join(args.model_path,
                         'state-epoch-{}-loss-{}.ckpt'.format(epoch + 1, loss.item())))
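# --- Hedged helper sketch (not part of the original script) ---
# Restores a checkpoint written by the training loop above. The dict keys match
# the torch.save(dict(decoder=..., encoder=...)) call there; the checkpoint path
# passed in is up to the caller.
def load_checkpoint(encoder, decoder, ckpt_path, device):
    state = torch.load(ckpt_path, map_location=device)
    encoder.load_state_dict(state["encoder"])
    decoder.load_state_dict(state["decoder"])
    return encoder, decoder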
def __init__(self, video_setting, decoder_setting, padding_idx=0):
    super(ImgToSeq, self).__init__()
    self.trainStep = 0
    self.videoRnn = VideoRNN(**video_setting)
    self.decoderRnn = DecoderRNN(**decoder_setting)
def __init__(self, subencoder_setting, decoder_setting, padding_idx=0):
    super(SubToSeq, self).__init__()
    self.trainStep = 0
    self.subRnn = EncoderRNN(**subencoder_setting)
    self.decoderRnn = DecoderRNN(**decoder_setting)
# --- Load Model ---

# Specify the saved models to load.
encoder_file = 'encoder-3.pkl'
decoder_file = 'decoder-3.pkl'

# Select appropriate values for the Python variables below.
embed_size = 512
hidden_size = 256

# The size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the encoder and decoder, and set each to inference mode.
encoder = EncoderCNN(embed_size)
encoder.eval()
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
decoder.eval()

# Load the trained weights.
encoder.load_state_dict(torch.load(os.path.join('./models', encoder_file)))
decoder.load_state_dict(torch.load(os.path.join('./models', decoder_file)))

# Move models to GPU if CUDA is available.
encoder.to(device)
decoder.to(device)

# --- Create Sample Output ---

# Move the image PyTorch tensor to GPU if CUDA is available.
image = image.to(device)

# Obtain the embedded image features.
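# --- Hedged completion (assumption) ---
# The cell above stops at "Obtain the embedded image features." The two lines
# below follow the encoder -> decoder.sample pattern used elsewhere in this file;
# whether `sample` expects an extra sequence dimension on the features depends on
# this DecoderRNN implementation and is not confirmed here.
features = encoder(image)
output = decoder.sample(features)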
    plt.title(caps)
    plt.axis('off')
    plt.show()


# visualize(train_loader)
# assert len(os.listdir('/train2017')
#            ) == train_samples, 'Make sure to have full images in images/train2017'

# prepare training
VOCAB_SIZE = len(train_loader.dataset.vocab)

encoder = EncoderCNN(EMBED_SIZE)  # .to(device)
decoder = DecoderRNN(EMBED_SIZE, HIDDEN_SIZE, VOCAB_SIZE)  # .to(device)
print('Encoder:\n', encoder)
print('Decoder:\n', decoder)

all_params = list(encoder.linear.parameters()) + \
    list(encoder.bn1.parameters()) + list(decoder.parameters())
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=all_params, lr=1e-05)

total_step = math.ceil(len(train_loader.dataset.cap_len) / BATCH_SIZE)

model_save_path = '/checkpoint'
if not os.path.exists(model_save_path):
    os.makedirs(model_save_path, exist_ok=True)


def train_predict(encoder, decoder):