def inference_coco(encoder_file: str, decoder_file: str, embed_size: int,
                   hidden_size: int, from_cpu: bool) -> None:
    """
    Displays an original image from the COCO test dataset and prints its
    associated caption.

    encoder_file: Name of the encoder to load.
    decoder_file: Name of the decoder to load.
    embed_size: Word embedding size for the encoder.
    hidden_size: Hidden layer size of the LSTM.
    from_cpu: Whether the model has been saved on CPU.
    """
    # Define transform
    transform_test = transforms.Compose([
        transforms.Resize(256),                      # smaller edge of image resized to 256
        transforms.RandomCrop(224),                  # get 224x224 crop from random location
        transforms.ToTensor(),                       # convert the PIL Image to a tensor
        transforms.Normalize((0.485, 0.456, 0.406),  # normalize image for pre-trained model
                             (0.229, 0.224, 0.225))
    ])

    # Device to use for inference
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Create the data loader.
    data_loader = get_loader(transform=transform_test, mode='test')

    # Obtain sample image
    _, image = next(iter(data_loader))

    # The size of the vocabulary.
    vocab_size = len(data_loader.dataset.vocab)

    # Initialize the encoder and decoder, and set each to inference mode.
    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    # Load the trained weights.
    if from_cpu:
        encoder.load_state_dict(
            torch.load(os.path.join('./models', encoder_file),
                       map_location='cpu'))
        decoder.load_state_dict(
            torch.load(os.path.join('./models', decoder_file),
                       map_location='cpu'))
    else:
        encoder.load_state_dict(
            torch.load(os.path.join('./models', encoder_file)))
        decoder.load_state_dict(
            torch.load(os.path.join('./models', decoder_file)))

    # Move models to GPU if CUDA is available.
    encoder.to(device)
    decoder.to(device)

    get_prediction(encoder, decoder, data_loader, device)
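# Hedged usage sketch (not part of the original source): the checkpoint names
# and sizes below are illustrative assumptions; pass the values your models
# were actually trained with.
if __name__ == '__main__':
    inference_coco(encoder_file='encoder-5.pkl', decoder_file='decoder-5.pkl',
                   embed_size=256, hidden_size=512, from_cpu=True)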
def main(args):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load the vocabulary and record its size for model construction.
    vocab = Vocabulary.load_vocab(args['data_dir'])
    args['vocab_size'] = len(vocab)

    # Build the models in eval mode and move them to the target device.
    encoder = EncoderCNN(args).eval()
    decoder = DecoderRNN(args).eval()
    encoder.to(device)
    decoder.to(device)

    # Load the trained weights.
    encoder.load_state_dict(
        torch.load(os.path.join(args['model_dir'], args['encoder_name'])))
    decoder.load_state_dict(
        torch.load(os.path.join(args['model_dir'], args['decoder_name'])))

    test_caption_list = []
    for file_name in os.listdir(os.path.join(args['data_dir'], args['image_dir'])):
        # Skip anything that is not a regular file.
        if not os.path.isfile(
                os.path.join(args['data_dir'], args['image_dir'], file_name)):
            continue
        image = load_image(
            os.path.join(args['data_dir'], args['image_dir'], file_name),
            transform)
        image_tensor = image.to(device)

        # Generate a caption from the image.
        feature = encoder(image_tensor)
        sample_ids = decoder.sample(feature)
        sample_ids = sample_ids[0].cpu().numpy()

        # Convert word ids to words, stopping at the <end> token.
        sample_caption = []
        for word_id in sample_ids:
            word = vocab.idx2word[word_id]
            sample_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sample_caption)

        print(sentence)
        test_caption_list.append((file_name, sentence))
        # image = Image.open(os.path.join(args['data_dir'], args['image_dir'], file_name))
        # plt.imshow(np.asarray(image))

    with open(os.path.join(args['data_dir'], 'test_caption.txt'), 'w') as f:
        for item in test_caption_list:
            f.write('image_name:{} ---- generated_caption:{}\n'.format(
                item[0], item[1]))
        f.write('\n')
def main():
    st.title('Image Captioning App')
    st.markdown(STYLE, unsafe_allow_html=True)

    file = st.file_uploader("Upload file", type=["png", "jpg", "jpeg"])
    show_file = st.empty()
    if not file:
        show_file.info("Please upload a file of type: " +
                       ", ".join(["png", "jpg", "jpeg"]))
        return
    content = file.getvalue()
    show_file.image(file)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Trained checkpoint files and the hyper-parameters they were trained with.
    encoder_file = 'encoder-5-batch-128-hidden-256-epochs-5.pkl'
    decoder_file = 'decoder-5-batch-128-hidden-256-epochs-5.pkl'
    embed_size = 300
    hidden_size = 256

    vocab_size, word2idx, idx2word = get_vocab()

    # Initialize the encoder and decoder in inference mode.
    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    encoder.load_state_dict(torch.load(os.path.join('./models', encoder_file)))
    decoder.load_state_dict(torch.load(os.path.join('./models', decoder_file)))
    encoder.to(device)
    decoder.to(device)

    transform_test = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    PIL_image = Image.open(file).convert('RGB')
    orig_image = np.array(PIL_image)
    image = transform_test(PIL_image)
    image = image.to(device).unsqueeze(0)

    # Embed the image, decode, and clean up the generated token sequence.
    features = encoder(image).unsqueeze(1)
    output = decoder.sample(features)
    sentence = clean_sentence(output, idx2word)
    st.info("Generated caption --> " + sentence)
    file.close()
def getCaption(self, imgs, output_path='', vocab_path='data/vocab.pkl',
               decoder_path='models/decoder-5-3000.pkl',
               encoder_path='models/encoder-5-3000.pkl',
               embed_size=256, hidden_size=512, num_layers=1):
    if output_path == '':
        output_path = self.DEFAULT_OUTPUT_PATH
    device = self.device

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(encoder_path))
    decoder.load_state_dict(torch.load(decoder_path))

    CAPTIONS = []
    for img in imgs:
        # Prepare an image
        image = self.load_image(img, transform=transform)
        image_tensor = image.to(device)

        # Generate a caption from the image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        # Collect the pruned caption for this image.
        CAPTIONS.append(self.prune_caption(sentence))

    json_captions = self.writeJSON(imgs, CAPTIONS, output_path=output_path)
    return json_captions
def main(args):
    # Image preprocessing, normalization for the pretrained resnet
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size,
                             shuffle=True, num_workers=args.num_workers)
    total_step = len(data_loader)

    # List to store the BLEU scores
    bleu_scores = []

    for i, (images, captions, lengths) in enumerate(data_loader):
        # Set mini-batch dataset
        images = images.to(device)
        # captions = captions.to(device)

        # Generate a caption from the image
        feature = encoder(images)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        # NOTE: sentence_bleu expects tokenized references and a tokenized
        # hypothesis, so convert the ground-truth caption ids to word lists
        # instead of passing the raw tensor and the joined string.
        references = [[vocab.idx2word[int(idx)] for idx in cap] for cap in captions]
        score = sentence_bleu(references, sampled_caption, args.bleu_weights)
        bleu_scores.append(score)

        # Print log info
        if i % args.log_step == 0:
            print('Finish [{}/{}], Current BLEU Score: {:.4f}'
                  .format(i, total_step, np.mean(bleu_scores)))

    np.save('test_results.npy', [bleu_scores, np.mean(bleu_scores)])
def run_inference(image_path, encoder_path, decoder_path, vocab_path,
                  embed_size=256, hidden_size=512, num_layers=1):
    print('sample.py running ...')

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        print("using " + vocab_path)
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(
        torch.load(encoder_path, map_location=torch.device('cpu')))
    decoder.load_state_dict(
        torch.load(decoder_path, map_location=torch.device('cpu')))

    # Prepare an image
    image = load_image(image_path, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        print(word)
        sampled_caption.append(word)
        if word == '<end>':
            break

    # Strip the special tokens and rejoin underscored tokens.
    sentence = ' '.join(sampled_caption).replace('<start>', '')
    sentence = sentence.replace('<end>', '')
    sentence = sentence.replace('_', ' ')

    # Print out the generated caption
    print(sentence)
    print('debug: finished running')  # translated from Vietnamese: "chay xong roi ne"
    return sentence.strip().capitalize()
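# Hedged usage sketch; the file paths below are assumptions, not from the source:
if __name__ == '__main__':
    caption = run_inference('png/example.png',
                            'models/encoder-5-3000.pkl',
                            'models/decoder-5-3000.pkl',
                            'data/vocab.pkl')
    print(caption)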
def get_caption(self, img_tensor):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    print("running")

    # Trained model checkpoints
    encoder_file = 'legit_model/encoder_1.pkl'
    decoder_file = 'legit_model/decoder_1.pkl'

    # Embedding and hidden sizes used at training time
    embed_size = 512
    hidden_size = 512

    # The size of the vocabulary.
    vocab_size = 8856

    # Initialize the encoder and decoder, and set each to inference mode.
    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    # Load the trained weights.
    encoder.load_state_dict(torch.load(os.path.join('./models', encoder_file)))
    decoder.load_state_dict(torch.load(os.path.join('./models', decoder_file)))

    # Move models to GPU if CUDA is available.
    encoder.to(device)
    decoder.to(device)

    img_d = img_tensor.to(device)

    # Obtain the embedded image features.
    features = encoder(img_d).unsqueeze(1)

    # Pass the embedded image features through the model to get a predicted caption.
    img_output = decoder.sample(features)
    sentence = self.clean_sentence(img_output)
    return sentence
def main(args):
    # Image preprocessing
    predictions = []  # (file name, caption) pairs; was misspelled `prediction`
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Caption every image in the current working directory.
    fnames = listdir(os.getcwd())  # os.getcwd is a function and must be called
    for fname in fnames:
        # print(fname)
        # Prepare an image
        image = load_image(fname, transform)
        image_tensor = image.to(device)

        # Generate a caption from the image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        predictions.append([fname, sentence])

        # Print out the file name and the generated caption
        print(fname)
        print(sentence)
        # image = Image.open(args.image)
        # plt.imshow(np.asarray(image))

    df = pd.DataFrame(predictions, columns=['File Name', 'Caption'])
    df.to_excel('output.xls')
class Neuraltalk2:
    def __init__(self):
        print("Defining AI")
        # Device configuration
        self.device = torch.device('cpu')

        # Hyper-parameters and checkpoint paths
        embed_size = 256
        hidden_size = 512
        num_layers = 1
        encoder_path = 'models/encoder-5-3000.pkl'
        decoder_path = 'models/decoder-5-3000.pkl'
        vocab_path = 'data/vocab.pkl'

        # Image preprocessing
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406),
                                 (0.229, 0.224, 0.225))])

        # Load vocabulary wrapper
        with open(vocab_path, 'rb') as f:
            self.vocab = pickle.load(f)

        print("Building Model")
        # Build models
        self.encoder = EncoderCNN(embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
        self.decoder = DecoderRNN(embed_size, hidden_size, len(self.vocab),
                                  num_layers)
        self.encoder = self.encoder.to(self.device)
        self.decoder = self.decoder.to(self.device)

        print("Loading checkpoint")
        # Load the trained model parameters
        self.encoder.load_state_dict(torch.load(encoder_path))
        self.decoder.load_state_dict(torch.load(decoder_path))

    def eval_image(self, image_path):
        # Prepare an image
        image = load_image(image_path, self.transform)
        image_tensor = image.to(self.device)

        # Generate a caption from the image
        feature = self.encoder(image_tensor)
        sampled_ids = self.decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()

        # Convert word_ids to words, skipping <start> and stopping at <end>
        sampled_caption = []
        for word_id in sampled_ids:
            word = self.vocab.idx2word[word_id]
            if word == '<end>':
                break
            if word == '<start>':
                continue
            sampled_caption.append(word)
        sentence = ' '.join(sampled_caption)
        return sentence
def evaluate(encoder_model_path, decoder_model_path):
    transformation = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    VAL_LOADER_UNIQUE = {
        'root': config.VAL_IMG_PATH,
        'json': config.VAL_JSON_PATH,
        'batch_size': 16,
        'shuffle': False,
        'transform': transformation,
        'num_workers': 4
    }
    val_loader_unique = get_loader_unique(**VAL_LOADER_UNIQUE)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Build the models and load the trained weights.
    encoder = CNNfull(2048)
    encoder.to(device)
    decoder = DecoderRNN(2048, 300, 512, vocab_size)
    decoder.to(device)
    encoder.load_state_dict(torch.load(encoder_model_path))
    decoder.load_state_dict(torch.load(decoder_model_path))
    encoder.eval()
    decoder.eval()

    # Run one validation epoch and report the metrics.
    bleu2, bleu3, bleu4, meteor = val_epoch(val_loader_unique, device, encoder,
                                            decoder, vocab, 0,
                                            enc_scheduler=None,
                                            dec_scheduler=None,
                                            view_val_captions=False)

    print(f'Bleu2 score: {bleu2}')
    print(f'Bleu3 score: {bleu3}')
    print(f'Bleu4 score: {bleu4}')
    print(f'Meteor score: {meteor}')
def predict(self, args):
    print('predict..start')
    device = torch.device('cpu')

    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    # vocab = Vocabulary()
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path, map_location=device))
    decoder.load_state_dict(torch.load(args.decoder_path, map_location=device))

    # Prepare an image
    image = self.load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the generated caption
    print(sentence)
    return sentence
def sample(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)

    # Instrument the encoder: replace the output of the final batch-norm
    # layer with a fixed activation vector to probe its effect on captions.
    dissection.replace_layers(encoder, [
        ('resnet.7.2.bn3', 'final_layer'),
    ])
    vec = torch.zeros(2048).to(device)
    vec[0] = 100
    encoder.replacement['final_layer'] = vec

    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the generated caption
    print(sentence)
def main(cfg):
    # print(cfg.pretty())
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(cfg.image.mean, cfg.image.std)
    ])

    print(hydra.utils.to_absolute_path(cfg.train.vocab_path))
    with open(hydra.utils.to_absolute_path(cfg.train.vocab_path), 'rb') as f:
        vocab = pickle.load(f)

    # Build the models
    encoder = EncoderCNN(cfg.train.embed_size).eval()
    decoder = DecoderRNN(cfg.train.embed_size, cfg.train.hidden_size,
                         len(vocab), cfg.train.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(
        torch.load(hydra.utils.to_absolute_path(cfg.train.encoder_path)))
    decoder.load_state_dict(
        torch.load(hydra.utils.to_absolute_path(cfg.train.decoder_path)))

    with open(json_dir, encoding='utf-8') as f:
        data = json.loads(f.read())

    for key in data['images']:
        img_file_name = key['file_name']
        img_file_path = base_dir + '/data/val2014/' + img_file_name

        # Prepare the image
        image = load_image(hydra.utils.to_absolute_path(img_file_path),
                           transform)
        image_tensor = image.to(device)

        # Generate a caption from the input image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break

        # Drop <start>, <end>, and the trailing "." before printing
        sentence = ' '.join(sampled_caption[1:-2])
        print(sentence)
def main2(image,
          encoder_path='models/encoder-5-3000.pkl',
          decoder_path='models/decoder-5-3000.pkl',
          vocab_path="data/vocab.pkl",
          embed_size=256, hidden_size=512, num_layers=1):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(encoder_path))
    decoder.load_state_dict(torch.load(decoder_path))

    # Prepare an image
    image = load_image(image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the generated caption
    print(sentence)
    # image = Image.open(args.image)
    # plt.imshow(np.asarray(image))
    return sentence
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break

    # Strip the special tokens if present (list.remove raises ValueError
    # when the decoder never emitted them).
    for token in ('<start>', '<end>'):
        if token in sampled_caption:
            sampled_caption.remove(token)
    sentence = ' '.join(sampled_caption)

    # Print out the generated caption and write it to the output file
    # (mode "w" already truncates the file).
    print(sentence)
    with open("demofile3.txt", "w") as f:
        f.write(sentence)

    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
def main(args):
    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))  # path to the trained encoder weights
    decoder.load_state_dict(torch.load(args.decoder_path))  # path to the trained decoder weights

    # Generate a small held-out set of textures and their transform sequences.
    test_data = generate_training_data(5)
    textures_test = generate_textures(test_data)
    transforms_test = generate_transforms(test_data)
    for i in range(len(textures_test)):
        plt.imsave('predictions/texture4_0%i.png' % i, textures_test[i],
                   cmap="gray")
    print(transforms_test)

    predicted_progs = []
    for texture in textures_test:
        texture = torch.tensor(texture, device=device)
        # EncoderCNN expects a (batch, channel, H, W) tensor, so unsqueeze twice.
        texture = texture.unsqueeze(0)
        texture = texture.unsqueeze(0)

        feature = encoder(texture)
        sampled_seq = decoder.sample(feature)
        sampled_seq = sampled_seq[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

        # Convert the sampled sequence of transform ids to words
        prog = []
        for int_word in sampled_seq:
            word = int_to_word(int_word)
            prog.append(word)
            if word == '<end>':
                break
        trans_seq = '-->'.join(prog)
        predicted_progs.append([trans_seq])

    # Print out the generated transform sequences
    print(predicted_progs)
def check_decoder(features, captions):
    # Specify the number of features in the hidden state of the RNN decoder.
    hidden_size = 512

    # Initialize the decoder.
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

    # Move the decoder to GPU if CUDA is available.
    decoder.to(device)

    # Move the last batch of captions (from Step 1) to GPU if CUDA is available.
    captions = captions.to(device)

    # Pass the encoder output and captions through the decoder.
    outputs = decoder(features, captions)

    print('type(outputs):', type(outputs))
    print('outputs.shape:', outputs.shape)

    # Check that your decoder satisfies some requirements of the project! :D
    assert type(outputs) == torch.Tensor, \
        "Decoder output needs to be a PyTorch Tensor."
    assert (outputs.shape[0] == batch_size) & \
           (outputs.shape[1] == captions.shape[1]) & \
           (outputs.shape[2] == vocab_size), \
        "The shape of the decoder output is incorrect."

    return outputs
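# Hedged smoke test for check_decoder (not from the original notebook). The
# module-level names it relies on (embed_size, vocab_size, batch_size, device)
# are assumptions standing in for values defined in earlier cells, and
# DecoderRNN.forward is assumed to map (batch, embed_size) features plus
# (batch, cap_len) captions to (batch, cap_len, vocab_size) scores.
embed_size, vocab_size, batch_size = 256, 8856, 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
features = torch.randn(batch_size, embed_size).to(device)             # fake encoder output
captions = torch.randint(0, vocab_size, (batch_size, 20)).to(device)  # fake token ids
outputs = check_decoder(features, captions)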
def get_model(device, vocab_size):
    # Trained model weight files
    encoder_file = "models/encoder-3.pkl"
    decoder_file = "models/decoder-3.pkl"
    embed_size = 512
    hidden_size = 512

    # Initialize the encoder and decoder, and set each to inference mode.
    encoder = EncoderCNN(embed_size)
    encoder.eval()
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
    decoder.eval()

    # Load the trained weights.
    # print(torch.load(encoder_file))
    encoder.load_state_dict(torch.load(encoder_file))
    decoder.load_state_dict(torch.load(decoder_file))

    # Move models to GPU if CUDA is available.
    encoder.to(device)
    decoder.to(device)

    return encoder, decoder
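# Hedged usage sketch: vocab_size must match the vocabulary the checkpoints
# were trained with (8856 mirrors a value hard-coded elsewhere in this
# collection and is an assumption here).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder, decoder = get_model(device, vocab_size=8856)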
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an input sample (audio features instead of a single image file)
    # image = load_image(args.image, transform)
    # image_tensor = image.to(device)
    data_loader, _ = get_loader(transforms=False)
    inp, targets = next(iter(data_loader))
    audio = inp_transform(inp)
    audio = audio.to(device)

    # Generate a caption from the audio features
    feature = encoder(audio)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the prediction and the target
    print("Logits : {}\nTarget : {}".format(sentence, targets))
def generatecaption(self, image):  # needs self: the body uses self.Entry1/self.sentence
    # Image preprocessing
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open('/root/ImageCaptioning/data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(256).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(256, 512, len(vocab), 1)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load('models/encoder-5-3000.pkl',
                                       map_location='cpu'))
    decoder.load_state_dict(torch.load('models/decoder-5-3000.pkl',
                                       map_location='cpu'))
    encoder.eval()
    decoder.eval()

    # Prepare an image
    image = load_image(image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    self.sentence = ' '.join(sampled_caption)

    # Show the caption in the entry widget; the slice drops '<start>'/'<end>'.
    self.Entry1.delete(0, END)
    self.Entry1.insert(0, self.sentence[7:-5])
def img2txt(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args['vocab_path'], 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args['embed_size']).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args['embed_size'], args['hidden_size'], len(vocab),
                         args['num_layers'])
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args['encoder_path']))
    decoder.load_state_dict(torch.load(args['decoder_path']))

    # Prepare an image
    image = load_image(args['image'], transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break

    # str.lstrip/rstrip strip *characters*, not substrings, so remove the
    # special tokens with replace() instead.
    sentence = (' '.join(sampled_caption)
                .replace('<start>', '').replace('<end>', '').strip())

    # Print out the generated caption
    print(sentence)
    # (translated from Chinese: "this image is captioned as follows")
    return f'[The image is captioned as follows]:\n{sentence}\n{translate(sentence)}'
def main(cfg):
    # print(cfg.pretty())
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(cfg.image.mean, cfg.image.std)
    ])

    print(hydra.utils.to_absolute_path(cfg.train.vocab_path))
    with open(hydra.utils.to_absolute_path(cfg.train.vocab_path), 'rb') as f:
        vocab = pickle.load(f)

    # Build the models
    encoder = EncoderCNN(cfg.train.embed_size).eval()
    decoder = DecoderRNN(cfg.train.embed_size, cfg.train.hidden_size,
                         len(vocab), cfg.train.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(
        torch.load(hydra.utils.to_absolute_path(cfg.train.encoder_path)))
    decoder.load_state_dict(
        torch.load(hydra.utils.to_absolute_path(cfg.train.decoder_path)))

    # Prepare the image
    image = load_image(hydra.utils.to_absolute_path(cfg.sample.image_path),
                       transform)
    image_tensor = image.to(device)

    # Generate a caption from the input image
    feature = encoder(image_tensor)
    # sampled_ids = decoder.sample(feature)
    sampled_ids = decoder.greedy_decode(features=feature)
    sampled_ids = sampled_ids[0].cpu().numpy()

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)
    print(sentence)

    image = Image.open(hydra.utils.to_absolute_path(cfg.sample.image_path))
    plt.imshow(np.asarray(image))
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load the vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build the models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print the image and the generated caption
    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Caption every training image in the Flickr directory.
    images = glob.glob('../data/flickr/train/*.jpg')
    for i in images:
        # Prepare an image
        image = load_image(i, transform)
        image_tensor = image.to(device)

        # Generate a caption from the image
        feature = encoder(image_tensor)
        sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

        # Convert word_ids to words
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.idx2word[word_id]
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        # i[21:] strips the '../data/flickr/train/' directory prefix.
        print("{} : {}".format(i[21:], sentence))
def get_text_caption(image):
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size, args.model_type, args.mode)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters and switch to eval mode
    # (batchnorm uses moving mean/variance).
    encoder.load_state_dict(
        torch.load(args.model_path + "_" + args.model_type + "/encoder.pt"))
    encoder.eval()
    decoder.load_state_dict(
        torch.load(args.model_path + "_" + args.model_type + "/decoder.pt"))
    decoder.eval()

    # Prepare an image
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)
    print(sampled_ids)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Strip the <start>/<end> tokens and the trailing " .", then tidy the text.
    return (sentence.split("<start> ")[1].split(" <end>")[0]
            [:-2].capitalize().replace(" , ", ", "))
def main(args):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the image and the generated caption
    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
def main(args):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image
    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print the image and the generated caption
    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
def predict(path):
    # Image preprocessing
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])

    # Load vocabulary wrapper
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    encoder = EncoderCNN(300).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(300, 512, len(vocab), 1)  # DecoderRNN expects the vocabulary *size*
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    encoder.load_state_dict(torch.load('models/encoder-5-3000.ckpt'))
    decoder.load_state_dict(torch.load('models/decoder-5-3000.ckpt'))

    # Prepare an image
    image = load_image(path, transform)
    image_tensor = image.to(device)  # (1, 3, w, h)

    # Generate a caption from the image
    feature = encoder(image_tensor)
    sampled_ids = decoder.sample(feature)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break

    # Drop <start> and <end> before joining
    sentence = ' '.join(sampled_caption[1:-1])
    print(sentence)
    return sentence
def main(args):
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Build models
    # encoder = EncoderCNN(args.embed_size).eval()  # eval mode (batchnorm uses moving mean/variance)
    decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab),
                         args.num_layers, args.max_length)
    # encoder = encoder.to(device)
    decoder = decoder.to(device)

    # Load the trained model parameters
    # encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    # Prepare an image: load precomputed features from a .npy file
    image = torch.from_numpy(
        np.load(args.image_dir + '/' + args.image + '.npy'))
    image_tensor = image.to(device)

    # Generate a caption directly from the precomputed features
    # feature = encoder(image_tensor)
    print(image_tensor.shape)
    sampled_ids = decoder.sample(image_tensor)
    sampled_ids = sampled_ids[0].cpu().numpy()  # (1, max_seq_length) -> (max_seq_length)

    # Convert word_ids to words
    sampled_caption = []
    for word_id in sampled_ids:
        word = vocab.idx2word[word_id]
        sampled_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sampled_caption)

    # Print out the generated caption
    print(sentence)
def __init__(self, encoder_path, decoder_path, vocab_path,
             general_predictor_path, embed_size=256, hidden_size=512,
             num_layers=1, num_general_categories=3):
    # Load vocabulary
    with open(vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # Check device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Build models
    encoder = EncoderCNN(embed_size).eval()  # eval mode (batch-norm uses moving mean/variance)
    decoder = DecoderRNN(embed_size, hidden_size, len(vocab), num_layers).eval()
    general_categ_predictor = CategoryPredictor(
        embed_size, num_general_categories).eval()
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    general_categ_predictor = general_categ_predictor.to(device)

    # Load model parameters
    encoder.load_state_dict(torch.load(encoder_path, map_location=device))
    decoder.load_state_dict(torch.load(decoder_path, map_location=device))
    general_categ_predictor.load_state_dict(
        torch.load(general_predictor_path, map_location=device))

    self.device = device
    self.vocab = vocab
    self.encoder = encoder
    self.decoder = decoder
    self.general_categ_predictor = general_categ_predictor
def main(args):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))
    ])

    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    encoder = EncoderCNN(args.embed_size).eval()
    decoder = DecoderRNN(len(vocab), args.embed_size, args.hidden_size,
                         args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    encoder.load_state_dict(torch.load(args.encoder_path))
    decoder.load_state_dict(torch.load(args.decoder_path))

    image = load_image(args.image, transform)
    image_tensor = image.to(device)

    feature = encoder(image_tensor)
    sample_ids = decoder.sample(feature)  # was `decoder.sample(features)`, a NameError
    sample_ids = sample_ids[0].cpu().numpy()

    sample_caption = []
    for word_id in sample_ids:
        word = vocab.idx2word[word_id]
        sample_caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(sample_caption)

    print(sentence)
    image = Image.open(args.image)
    plt.imshow(np.asarray(image))
# Build the training data loader. (The opening of this get_loader call was
# truncated in the source; the transform and mode arguments below are a
# reconstruction based on the matching get_loader calls elsewhere.)
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=vocab_from_file,
                         cocoapi_loc=COCOPATH)

# The size of the vocabulary.
vocab_size = len(data_loader.dataset.vocab)

# Initialize the encoder and decoder.
encoder = EncoderCNN(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Move models to GPU if CUDA is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)

# Define the loss function.
criterion = nn.CrossEntropyLoss().cuda() if torch.cuda.is_available() \
    else nn.CrossEntropyLoss()

# TODO #3: Specify the learnable parameters of the model.
params = list(decoder.parameters()) + \
    list(encoder.embed.parameters())  # we don't want to retrain the resnet

# TODO #4: Define the optimizer.
optimizer = torch.optim.RMSprop(params)

# Set the total number of training steps per epoch.
total_step = math.ceil(len(data_loader.dataset.caption_lengths) /
                       data_loader.batch_sampler.batch_size)

################################################################
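# Hedged sketch (not from the original notebook) of the training step the
# objects above feed into; it assumes data_loader yields (images, captions)
# batches as in the Udacity CNN-RNN project, and num_epochs is illustrative.
num_epochs = 3
for epoch in range(1, num_epochs + 1):
    for i_step in range(1, total_step + 1):
        images, captions = next(iter(data_loader))
        images, captions = images.to(device), captions.to(device)
        features = encoder(images)             # (batch, embed_size)
        outputs = decoder(features, captions)  # (batch, cap_len, vocab_size)
        loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()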