class Encoder_Decoder(nn.Module):
    """LSTM encoder/decoder language model over a vocabulary loaded from disk.

    The same embedded input is fed first to an encoder LSTM and then to a
    decoder LSTM that is initialised with the encoder's final state. The
    decoder outputs are projected to one logit per vocabulary entry.
    """

    def __init__(self, dim_emb, dim_hid,
                 vocab_file='./data/preprocessed/vocab_file.vocab'):
        """Load the vocabulary and build the layers.

        Args:
            dim_emb: Size of the word embedding vectors.
            dim_hid: Hidden size of both LSTMs.
            vocab_file: Path handed to ``Vocabulary.load``.
        """
        super(Encoder_Decoder, self).__init__()
        self.vocab = Vocabulary()
        self.vocab.load(vocab_file=vocab_file)
        self.dim_hid = dim_hid
        self.word_embeddings = nn.Embedding(len(self.vocab), dim_emb)
        self.en_lstm = nn.LSTM(dim_emb, dim_hid, batch_first=True)
        self.de_lstm = nn.LSTM(dim_emb, dim_hid, batch_first=True)
        # Fully connected layer projecting the decoder hidden state
        # (dim_hid) to vocabulary-sized logits.
        self.hidden2linear = nn.Linear(dim_hid, len(self.vocab))

    def forward(self, sequence, state=None):
        """Run the encoder and decoder over ``sequence``.

        Args:
            sequence: Tensor of token indices; the LSTMs are built with
                ``batch_first=True``, so (batch, seq_len) is expected.
            state: Optional ``(h, c)`` initial state for the encoder LSTM.

        Returns:
            ``(logits, (h, c))`` where ``logits`` has one score per
            vocabulary entry at every position and ``(h, c)`` is the
            decoder's final state.
        """
        embedding = self.word_embeddings(sequence)
        # `hs` (all encoder outputs) is only needed by the attention
        # sketch below and is otherwise unused.
        hs, (h, c) = self.en_lstm(embedding, state)
        output, (h, c) = self.de_lstm(embedding, (h, c))
        # TODO: attention is not implemented yet. Sketch:
        #   t_output = torch.transpose(output, 1, 2)
        #   s = torch.bmm(hs, t_output)
        #   attention_weight = self.softmax(s)
        output = self.hidden2linear(output)
        return output, (h, c)

    def generate(self, start=None, max_len=17):
        """Greedily decode a sentence, one token at a time.

        Args:
            start: First word; when ``None`` a random entry of
                ``self.vocab.index2word`` is used.
            max_len: Maximum number of words (including ``start``).

        Returns:
            The decoded words joined by single spaces. Decoding stops
            after ``'<eos>'`` is produced or ``max_len`` is reached.
        """
        if start is None:
            start = random.choice(self.vocab.index2word)
        # Build the index tensor from the embedding weight so it lives on
        # the same device as the model. (The original read `self.embed`,
        # which does not exist on this class.)
        idx = self.word_embeddings.weight.new_full(
            (1, 1), self.vocab.get_index(start), dtype=torch.long)
        decoded = [start]
        state = None
        unk = self.vocab.get_index('<unk>')
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            while decoded[-1] != '<eos>' and len(decoded) < max_len:
                logits, state = self.forward(idx, state)
                # Never emit the unknown-word token.
                logits[:, :, unk] = -float('inf')
                idx = torch.argmax(logits, dim=-1)
                decoded.append(self.vocab.get_word(idx.item()))
        return ' '.join(decoded)
def main(args):
    """Score a trained captioning encoder/decoder with BLEU.

    Builds the vocabulary from the training captions, loads the
    validation data, restores the encoder/decoder weights, generates one
    caption per mini-batch (only the first sample of each batch is
    decoded) and stores the per-step BLEU scores together with their mean
    in ``test_results.npy``.

    Args:
        args: Namespace providing ``batch_size``, ``num_workers``,
            ``embed_size``, ``hidden_size``, ``num_layers``,
            ``bleu_weights`` and ``log_step``.
    """
    threshold = 20
    captions_dict = load_captions(train_dir)
    vocab = Vocabulary(captions_dict, threshold)
    vocab_size = vocab.index

    # Image preprocessing: resize to 224x224 and normalize every channel
    # with mean 0.5 / std 0.5.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])

    dataloader = DataLoader(val_dir, vocab, transform)
    imagenumbers, captiontotal, imagetotal = dataloader.gen_data()

    # Build data loader
    data_loader = get_loader(imagenumbers, captiontotal, imagetotal,
                             args.batch_size, shuffle=True,
                             num_workers=args.num_workers)

    # Build models
    encoder = EncoderCNN(args.embed_size).eval()
    decoder = DecoderRNN(args.embed_size, args.hidden_size,
                         vocab_size, args.num_layers)
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    # Evaluation only: put the decoder in inference mode as well.
    decoder.eval()

    # Load the trained model parameters. map_location lets checkpoints
    # saved on another device (e.g. GPU) load on the current one.
    encoder.load_state_dict(torch.load(encoder_path, map_location=device))
    decoder.load_state_dict(torch.load(decoder_path, map_location=device))

    total_step = len(data_loader)

    # List to store the BLEU scores
    bleu_scores = []

    for i, (images, captions, _lengths) in enumerate(data_loader):
        # Set mini-batch dataset
        images = images.to(device)

        # Generate a caption from the image; no gradients are needed.
        with torch.no_grad():
            feature = encoder(images)
            sampled_ids = decoder.sample(feature)
        sampled_ids = sampled_ids[0].cpu().numpy()

        # Convert word ids to words, stopping at the end marker.
        sampled_caption = []
        for word_id in sampled_ids:
            word = vocab.get_word(word_id)
            sampled_caption.append(word)
            if word == '<end>':
                break
        sentence = ' '.join(sampled_caption)

        # NOTE(review): `sentence` is a joined string and `captions` is
        # passed exactly as yielded by the loader. If `sentence_bleu` is
        # NLTK's, it expects token lists for both - confirm the intended
        # granularity before relying on these scores.
        score = sentence_bleu([captions], sentence, args.bleu_weights)
        bleu_scores.append(score)

        # Print log info
        if i % args.log_step == 0:
            print('Finish [{}/{}], Current BLEU Score: {:.4f}'.format(
                i, total_step, np.mean(bleu_scores)))
            print(sentence)
            print(captions)

    # [list_of_scores, mean] is a ragged sequence; recent NumPy refuses to
    # build an array from it implicitly, so store it as an explicit
    # 2-element object array (load with allow_pickle=True).
    results = np.empty(2, dtype=object)
    results[0] = bleu_scores
    results[1] = np.mean(bleu_scores)
    np.save('test_results.npy', results)
class LM(torch.nn.Module):
    """Two-layer LSTM language model over a vocabulary loaded from disk."""

    def __init__(self, dim_emb, dim_hid,
                 vocab_file='./data/preprocessed/vocab_file.vocab'):
        """Load the vocabulary and build the layers.

        Args:
            dim_emb: Size of the word embedding vectors.
            dim_hid: Hidden size of both LSTMs.
            vocab_file: Path handed to ``Vocabulary.load``.
        """
        super().__init__()
        self.vocab = Vocabulary()
        self.vocab.load(vocab_file=vocab_file)
        self.embed = torch.nn.Embedding(len(self.vocab), dim_emb)
        self.rnn1 = torch.nn.LSTM(dim_emb, dim_hid, batch_first=True)
        self.rnn2 = torch.nn.LSTM(dim_hid, dim_hid, batch_first=True)
        # NOTE: additional LSTM layers (rnn3, rnn4) are currently disabled.
        self.out = torch.nn.Linear(dim_hid, len(self.vocab))

    def forward(self, x, state1=None, state2=None):
        """Compute next-token logits.

        Args:
            x: Tensor of token indices; the LSTMs are built with
                ``batch_first=True``, so (batch, seq_len) is expected.
            state1: Optional ``(h, c)`` initial state for ``rnn1``.
            state2: Optional ``(h, c)`` initial state for ``rnn2``.

        Returns:
            ``(logits, state1, state2)`` with the updated LSTM states.
        """
        out = self.embed(x)
        out, state1 = self.rnn1(out, state1)
        out, state2 = self.rnn2(out, state2)
        out = self.out(out)
        return out, state1, state2

    def generate(self, prefix, max_len=30):
        """Sample a sentence, forcing the words of ``prefix`` first.

        Starting from ``'<bos>'``, the words in ``prefix`` are fed to the
        model and appended to the output; afterwards tokens are sampled
        from the softmax distribution (with ``'<unk>'`` masked out) until
        ``'<eos>'`` is produced or ``max_len`` words have been decoded.

        NOTE(review): prefix words are consumed from the END of the list
        (``pop()``), i.e. in reverse order. This matches the previous
        behaviour and is kept for compatibility - confirm it is intended.

        Args:
            prefix: List of words to force. It is no longer modified.
            max_len: Maximum number of words, including ``'<bos>'``.

        Returns:
            ``(sentence, cost)``: the decoded words joined by spaces, and
            the sum of ``log2`` probabilities of the *sampled* tokens
            divided by the total number of decoded words.
        """
        cost = 0
        softmax = torch.nn.Softmax(dim=-1)
        start = '<bos>'
        # All index tensors are created from the embedding weight so they
        # always live on the model's own device.
        weight = self.embed.weight
        idx = weight.new_full((1, 1), self.vocab.get_index(start),
                              dtype=torch.long)
        decoded = [start]
        state1, state2 = None, None
        unk = self.vocab.get_index('<unk>')
        # Work on a copy so the caller's list is left untouched.
        pending = list(prefix)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            while decoded[-1] != '<eos>' and len(decoded) < max_len:
                logits, state1, state2 = self.forward(idx, state1, state2)
                if pending:
                    word = pending.pop()
                    next_id = self.vocab.get_index(word)
                else:
                    # Never sample the unknown-word token.
                    logits[:, :, unk] = -float('inf')
                    probs = softmax(logits).squeeze().cpu().numpy()
                    # Inverse-CDF sampling over the cumulative sums.
                    accum = list(accumulate(probs))
                    next_id = bisect(accum, random.random() * accum[-1])
                    # Floating-point rounding can push the draw onto
                    # accum[-1]; clamp so the index stays valid.
                    next_id = min(next_id, len(accum) - 1)
                    cost += np.log2(probs[next_id])
                    word = self.vocab.get_word(next_id)
                idx = weight.new_full((1, 1), next_id, dtype=torch.long)
                decoded.append(word)

        cost /= len(decoded)
        return ' '.join(decoded), cost