def test_dropout_WITH_PROB_ZERO(self):
    """With dropout_p=0 the decoder is deterministic: two passes over the
    same batch must produce identical outputs."""
    decoder = DecoderRNN(self.dataset.output_vocab, 50, 16, dropout_p=0)
    for weight in decoder.parameters():
        weight.data.uniform_(-1, 1)
    sequences = [[1, 2, 3], [1, 2], [1]]
    first_pass, _, _ = decoder(sequences)
    second_pass, _, _ = decoder(sequences)
    self.assertEqual(first_pass, second_pass)
def test_dropout_WITH_PROB_ZERO(self):
    """Zero dropout makes repeated unrolled forward passes identical
    step-for-step."""
    decoder = DecoderRNN(self.vocab_size, 50, 16, 0, 1, dropout_p=0)
    for weight in decoder.parameters():
        weight.data.uniform_(-1, 1)
    first_run, _, _ = decoder()
    second_run, _, _ = decoder()
    for step_a, step_b in zip(first_run, second_run):
        self.assertTrue(torch.equal(step_a.data, step_b.data))
def test_input_dropout_WITH_NON_ZERO_PROB(self):
    """Non-zero input dropout should make at least one of 50 repeated
    forward-pass pairs diverge."""
    decoder = DecoderRNN(self.vocab_size, 50, 16, 0, 1, input_dropout_p=0.5)
    for weight in decoder.parameters():
        weight.data.uniform_(-1, 1)
    outputs_always_match = True
    for _ in range(50):
        run_a, _, _ = decoder()
        run_b, _, _ = decoder()
        if not torch.equal(run_a[0].data, run_b[0].data):
            outputs_always_match = False
            break
    self.assertFalse(outputs_always_match)
def test_dropout_WITH_NON_ZERO_PROB(self):
    """Non-zero dropout should make repeated passes over the same batch
    diverge within 50 attempts."""
    decoder = DecoderRNN(self.dataset.output_vocab, 50, 16, dropout_p=0.5)
    for weight in decoder.parameters():
        weight.data.uniform_(-1, 1)
    sequences = [[1, 2, 3], [1, 2], [1]]
    outputs_always_match = True
    for _ in range(50):
        run_a, _, _ = decoder(sequences)
        run_b, _, _ = decoder(sequences)
        if run_a[0] != run_b[0]:
            outputs_always_match = False
            break
    self.assertFalse(outputs_always_match)
def test_k_1(self):
    """ When k=1, the output of topk decoder should be the same as a normal decoder. """
    batch_size = 1
    eos = 1
    for _ in range(10):
        # Repeat the randomized test multiple times
        decoder = DecoderRNN(self.vocab_size, 50, 16, 0, eos)
        for param in decoder.parameters():
            param.data.uniform_(-1, 1)
        topk_decoder = TopKDecoder(decoder, 1)
        # Run both decoders with no inputs (free-running generation).
        output, _, other = decoder(None)
        output_topk, _, other_topk = topk_decoder(None)
        self.assertEqual(len(output), len(output_topk))
        # Per-batch-entry bookkeeping: whether the sequence hit EOS, and
        # the accumulated greedy log-score so far.
        finished = [False] * batch_size
        seq_scores = [0] * batch_size
        for t_step in range(len(output)):
            # Greedy (top-1) score/symbol from the plain decoder at this step.
            score, _ = output[t_step].topk(1)
            symbols = other['sequence'][t_step]
            for b in range(batch_size):
                seq_scores[b] += score[b].data[0]
                symbol = symbols[b].data[0]
                if not finished[b] and symbol == eos:
                    # Sequence just ended: beam length and accumulated score
                    # must agree with the greedy decoder's.
                    finished[b] = True
                    self.assertEqual(other_topk['length'][b], t_step + 1)
                    self.assertTrue(
                        np.isclose(seq_scores[b], other_topk['score'][b][0]))
                if not finished[b]:
                    # While still generating, the k=1 beam must pick the same
                    # symbol and produce the same step distribution.
                    symbol_topk = other_topk['topk_sequence'][t_step][b].data[0][0]
                    self.assertEqual(symbol, symbol_topk)
                    self.assertTrue(
                        torch.equal(output[t_step].data, output_topk[t_step].data))
            # Stop comparing once every sequence in the batch has finished.
            if sum(finished) == batch_size:
                break
def test_k_greater_than_1(self):
    """ Implement beam search manually and compare results from topk decoder. """
    max_len = 50
    beam_size = 3
    batch_size = 1
    hidden_size = 8
    sos = 0
    eos = 1
    for _ in range(10):
        # Randomize decoder weights each repetition.
        decoder = DecoderRNN(self.vocab_size, max_len, hidden_size, sos, eos)
        for param in decoder.parameters():
            param.data.uniform_(-1, 1)
        topk_decoder = TopKDecoder(decoder, beam_size)
        encoder_hidden = torch.autograd.Variable(
            torch.randn(1, batch_size, hidden_size))
        _, hidden_topk, other_topk = topk_decoder(
            None, encoder_hidden=encoder_hidden)
        # Queue state:
        # 1. time step
        # 2. symbol
        # 3. hidden state
        # 4. accumulated log likelihood
        # 5. beam number
        batch_queue = [[(-1, sos, encoder_hidden[:, b, :].unsqueeze(1), 0, None)]
                       for b in range(batch_size)]
        time_batch_queue = [batch_queue]
        batch_finished_seqs = [list() for _ in range(batch_size)]
        # Manual beam search: expand each live beam entry one step at a time.
        for t in range(max_len):
            new_batch_queue = []
            for b in range(batch_size):
                new_queue = []
                for k in range(min(len(time_batch_queue[t][b]), beam_size)):
                    _, inputs, hidden, seq_score, _ = time_batch_queue[t][b][k]
                    if inputs == eos:
                        # Beam already ended; move it to the finished pool.
                        batch_finished_seqs[b].append(time_batch_queue[t][b][k])
                        continue
                    inputs = torch.autograd.Variable(
                        torch.LongTensor([[inputs]]))
                    context, hidden, attn = decoder.forward_step(
                        inputs, hidden, None)
                    decoder_outputs, symbols = decoder.decoder(
                        context, attn, None, None)
                    # Work in log-space so scores add along the sequence.
                    decoder_outputs = decoder_outputs.log()
                    topk_score, topk = decoder_outputs[0].data.topk(beam_size)
                    for score, sym in zip(topk_score.tolist()[0],
                                          topk.tolist()[0]):
                        new_queue.append((t, sym, hidden, score + seq_score, k))
                # Keep only the best beam_size candidates for the next step.
                new_queue = sorted(new_queue, key=lambda x: x[3],
                                   reverse=True)[:beam_size]
                new_batch_queue.append(new_queue)
            time_batch_queue.append(new_batch_queue)
        # finished beams
        finalist = [l[:beam_size] for l in batch_finished_seqs]
        # unfinished beams
        for b in range(batch_size):
            if len(finalist[b]) < beam_size:
                # Fill the remaining slots with the best still-running beams.
                last_step = sorted(time_batch_queue[-1][b],
                                   key=lambda x: x[3], reverse=True)
                finalist[b] += last_step[:beam_size - len(finalist[b])]
        # back track
        topk = []
        for b in range(batch_size):
            batch_topk = []
            for k in range(beam_size):
                # Follow the (time step, beam number) back-pointers to
                # reconstruct the full sequence, then reverse it.
                seq = [finalist[b][k]]
                prev_k = seq[-1][4]
                prev_t = seq[-1][0]
                while prev_k is not None:
                    seq.append(time_batch_queue[prev_t][b][prev_k])
                    prev_k = seq[-1][4]
                    prev_t = seq[-1][0]
                batch_topk.append([s for s in reversed(seq)])
            topk.append(batch_topk)
        for b in range(batch_size):
            # Order reconstructed sequences by final accumulated score.
            topk[b] = sorted(topk[b], key=lambda s: s[-1][3], reverse=True)
        topk_scores = other_topk['score']
        topk_lengths = other_topk['topk_length']
        topk_pred_symbols = other_topk['topk_sequence']
        for b in range(batch_size):
            # If two beams have (nearly) identical scores, tie-breaking order
            # is implementation-defined, so skip the comparison entirely.
            precision_error = False
            for k in range(beam_size - 1):
                if np.isclose(topk_scores[b][k], topk_scores[b][k + 1]):
                    precision_error = True
                    break
            if precision_error:
                break
            for k in range(beam_size):
                self.assertEqual(topk_lengths[b][k], len(topk[b][k]) - 1)
                self.assertTrue(
                    np.isclose(topk_scores[b][k], topk[b][k][-1][3]))
                total_steps = topk_lengths[b][k]
                for t in range(total_steps):
                    self.assertEqual(topk_pred_symbols[t][b, k].data[0],
                                     topk[b][k][t + 1][1])  # topk includes SOS
def main(option):
    """End-to-end training entry point: builds vocabularies and datasets,
    trains a NeuralTensorNetwork encoder + DecoderRNN intent decoder, then
    prints predictions for a few hard-coded sample events.

    `option` is a parsed-arguments namespace; the attributes read below
    (random_seed, emb_file, intent_vocab, train_dataset, dev_dataset,
    device, batch_size, emb_dim, em_k, im_max_len, im_hidden_size, lr,
    report_every, epochs, im_teacher_forcing_ratio) must all be present.
    """
    # Seed both RNGs for reproducible runs.
    random.seed(option.random_seed)
    torch.manual_seed(option.random_seed)
    LOG_FORMAT = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    logging.basicConfig(format=LOG_FORMAT, level='INFO', stream=sys.stdout)
    glove = Glove(option.emb_file)
    logging.info('loaded embeddings from ' + option.emb_file)
    # Source vocabulary comes from the pretrained embeddings; target
    # vocabulary (intents) is loaded from file.
    src_vocab = Vocab.build_from_glove(glove)
    tgt_vocab = Vocab.load(option.intent_vocab)
    train_dataset = load_intent_prediction_dataset(option.train_dataset,
                                                   src_vocab, tgt_vocab,
                                                   device=option.device)
    dev_dataset = load_intent_prediction_dataset(option.dev_dataset,
                                                 src_vocab, tgt_vocab,
                                                 device=option.device)
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=option.batch_size,
                                   shuffle=True)
    # Dev set is evaluated in a single batch.
    dev_data_loader = DataLoader(dev_dataset,
                                 batch_size=len(dev_dataset),
                                 shuffle=False)
    src_vocab_size = len(src_vocab)
    tgt_vocab_size = len(tgt_vocab)
    # Prepare loss
    # Uniform class weights; padding positions are excluded from the loss.
    weight = torch.ones(tgt_vocab_size)
    pad = tgt_vocab.stoi[tgt_vocab.pad_token]
    loss = Perplexity(weight, pad)
    loss.criterion.to(option.device)
    # Initialize model
    encoder = NeuralTensorNetwork(nn.Embedding(src_vocab_size, option.emb_dim),
                                  option.em_k)
    decoder = DecoderRNN(tgt_vocab_size, option.im_max_len,
                         option.im_hidden_size,
                         use_attention=False, bidirectional=False,
                         eos_id=tgt_vocab.stoi[tgt_vocab.eos_token],
                         sos_id=tgt_vocab.stoi[tgt_vocab.bos_token])
    encoder.to(option.device)
    decoder.to(option.device)
    init_model(encoder)
    init_model(decoder)
    # Overwrite the (just initialized) embedding weights with GloVe vectors.
    encoder.embeddings.weight.data.copy_(torch.from_numpy(glove.embd).float())
    # One optimizer over both modules' parameters, with gradient clipping.
    optimizer_params = [{
        'params': encoder.parameters()
    }, {
        'params': decoder.parameters()
    }]
    optimizer = Optimizer(optim.Adam(optimizer_params, lr=option.lr),
                          max_grad_norm=5)
    trainer = NTNTrainer(loss, print_every=option.report_every,
                         device=option.device)
    encoder, decoder = trainer.train(
        encoder, decoder, optimizer, train_data_loader,
        num_epochs=option.epochs,
        dev_data_loader=dev_data_loader,
        teacher_forcing_ratio=option.im_teacher_forcing_ratio)
    predictor = NTNPredictor(encoder, decoder, src_vocab, tgt_vocab,
                             option.device)
    # Smoke-test predictions on a few (subject, verb, object) events.
    samples = [
        ("PersonX", "eventually told", "___"),
        ("PersonX", "tells", "PersonY 's tale"),
        ("PersonX", "always played", " ___"),
        ("PersonX", "would teach", "PersonY"),
        ("PersonX", "gets", "a ride"),
    ]
    for sample in samples:
        subj, verb, obj = sample
        # NOTE(review): predictor.predict appears to expect lower-cased,
        # whitespace-tokenized word lists — confirm against NTNPredictor.
        subj = subj.lower().split(' ')
        verb = verb.lower().split(' ')
        obj = obj.lower().split(' ')
        print(sample, predictor.predict(subj, verb, obj))
if not opt.resume: # Initialize model decoder = DecoderRNN(train_label_lang.word2index, x_mean_std[0], y_mean_std[0], w_mean_std[0], r_mean_std[0], opt.batch_size, opt.max_len, hidden_size, opt.gmm_comp_num, dropout_p=0.2, use_attention=False, bidirectional=bidirectional) for param in decoder.parameters(): param.data.uniform_(-0.08, 0.08) if torch.cuda.is_available(): decoder.cuda() # train t = SupervisedTrainer(lloss=lloss, bloss=bloss, batch_size=opt.batch_size, checkpoint_every=100, print_every=50, expt_dir=opt.expt_dir, train_cap_lang=train_cap_lang, train_label_lang=train_label_lang, x_mean_std=x_mean_std,
class RNNDecoder():
    """RNN decoder class. Wraps the IBM seq2seq decoder (using GRU or LSTM units).

    The decoder is conditioned by using an input embedding as the initial
    hidden state of every RNN layer (see ``_create_init_hidden``); there is
    no encoder/attention. Training uses NLLLoss with ``mask_token`` ignored.
    """

    def __init__(self,
                 vocab_size: int,
                 embedding_size: int,
                 n_hidden: int,
                 sos_token: int = 0,
                 eos_token: int = 1,
                 mask_token: int = 2,
                 max_output_length: int = 100,
                 rnn_cell: str = 'lstm') -> None:
        """Build the wrapped DecoderRNN and its masked NLL loss.

        :param vocab_size: size of the output vocabulary.
        :param embedding_size: hidden size of the RNN (must equal the size
            of the conditioning embedding passed to train/predict).
        :param n_hidden: number of stacked RNN layers.
        :param sos_token: start-of-sequence token id.
        :param eos_token: end-of-sequence token id.
        :param mask_token: padding token id, excluded from the loss.
        :param max_output_length: maximum decoded sequence length.
        :param rnn_cell: 'lstm' or 'gru'.
        """
        self.decoder = DecoderRNN(vocab_size,
                                  max_output_length,
                                  embedding_size,
                                  n_layers=n_hidden,
                                  rnn_cell=rnn_cell,
                                  use_attention=False,
                                  bidirectional=False,
                                  eos_id=eos_token,
                                  sos_id=sos_token)
        if torch.cuda.is_available():
            self.decoder.cuda()
        self.rnn_cell = rnn_cell
        self.n_hidden = n_hidden
        self.embedding_size = embedding_size
        self.SOS_token = sos_token
        self.EOS_token = eos_token
        self.mask_token = mask_token
        self.max_output_length = max_output_length
        # Uniform token weights; mask_token positions are ignored by the loss.
        token_weights = torch.ones(vocab_size)
        if torch.cuda.is_available():
            token_weights = token_weights.cuda()
        self.loss = NLLLoss(weight=token_weights, mask=mask_token)
        # Created lazily on the first call to train_iters.
        self.optimizer = None

    def _create_init_hidden(self, embedding):
        """Stack the conditioning embedding once per RNN layer.

        All hidden states start as the embedding.
        Input is (1, batch_size, embedding_size); returns a tensor of shape
        num_layers x batch_size x embedding_size.
        """
        return torch.cat([embedding for _ in range(self.n_hidden)], 0)

    def train(self, input_tensor, target_tensor, teacher_forcing_ratio=0.5):
        """Train for one batch.

        :param input_tensor: initial hidden state(s) for the decoder
            (a (h, c) tuple when rnn_cell == 'lstm').
        :param target_tensor: (batch, max_output_length + 1) target ids,
            starting with SOS.
        :returns: the batch loss value (or 10 if nothing was generated).
        """
        decoder_outputs, decoder_hidden, ret_dict = self.decoder(
            inputs=target_tensor,
            encoder_hidden=input_tensor,
            teacher_forcing_ratio=teacher_forcing_ratio)
        if len(decoder_outputs) == 0:
            # Nothing was generated. This number (10) was arbitrarily chosen.
            return 10
        loss = self.loss
        loss.reset()
        # Hoisted out of the loop: batch size does not change per step.
        batch_size = target_tensor.size(0)
        for step, step_output in enumerate(decoder_outputs):
            # Step t predicts target position t+1 (targets start with SOS).
            loss.eval_batch(step_output.contiguous().view(batch_size, -1),
                            target_tensor[:, step + 1])
        self.decoder.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.get_loss()

    def train_iters(self,
                    pairs,
                    n_iters,
                    batch_size=64,
                    print_every=1000,
                    learning_rate=0.0002,
                    teacher_forcing_ratio=0.5):
        """Train for some number of iterations choosing randomly from the list of tensor pairs."""
        print("Initializing training.")
        # Fixed: identity comparison with None (was `== None`).
        if self.optimizer is None:
            adam = optim.Adam(self.decoder.parameters(), lr=learning_rate)
            self.optimizer = Optimizer(adam, max_grad_norm=5)
        else:
            print("Using existing optimizer.")
        random.shuffle(pairs)
        if len(pairs) < batch_size:
            print("Not enough examples for one batch.")
            return
        # Turn the pairs into big tensors.
        # TODO: instead of saving pairs, save tensors directly. Otherwise this operation takes too much space.
        # Input: num_layers x num_examples x embedding_size
        # Target: num_examples x max_output_length+1
        input_tensors = [torch.reshape(i, (1, 1, -1)) for i, j in pairs]
        input_tensor = torch.cat(input_tensors, 1)
        input_tensor = self._create_init_hidden(input_tensor)
        target_tensors = [j for i, j in pairs]
        targets = []
        for target in target_tensors:
            target_tensor = torch.reshape(target, (1, -1))
            if target_tensor.size(1) >= self.max_output_length:
                # Truncate overlong targets.
                target_tensor = target_tensor[0][0:self.max_output_length]
                target_tensor = torch.reshape(target_tensor, (1, -1))
            else:
                # Right-pad short targets with the mask token.
                pad = torch.zeros(
                    1, self.max_output_length - target_tensor.size(1)).long()
                for i in range(self.max_output_length - target_tensor.size(1)):
                    pad[0][i] = self.mask_token
                target_tensor = torch.cat((target_tensor, pad), 1)
            # Add the start token.
            start_tensor = torch.zeros(1, 1).long()
            start_tensor[0][0] = self.SOS_token
            target_tensor = torch.cat((start_tensor, target_tensor), 1)
            targets.append(target_tensor)
        target_tensor = torch.cat(targets, 0)
        if torch.cuda.is_available():
            target_tensor = target_tensor.cuda()
        if torch.cuda.is_available():
            input_tensor = input_tensor.cuda()
        print("Starting training.")
        print_loss_total = 0  # Reset every print_every.
        batch = 0
        # Renamed loop variable (was `iter`, which shadows the builtin).
        for step in range(n_iters):
            # Create the batch.
            if (batch + 1) * batch_size > len(pairs):
                print("Finished an epoch!")
                batch = 0
            batch_input = input_tensor[:, batch * batch_size:(batch + 1) *
                                       batch_size, :].contiguous()
            batch_target = target_tensor[batch * batch_size:(batch + 1) *
                                         batch_size, :].contiguous()
            if self.rnn_cell == 'lstm':
                # LSTM hidden state is an (h, c) tuple.
                batch_input = (batch_input, batch_input)
            loss = self.train(batch_input, batch_target,
                              teacher_forcing_ratio=teacher_forcing_ratio)
            print_loss_total += loss
            if step % print_every == print_every - 1:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('Steps: {0}\nAverage loss: {1}'.format(
                    step, print_loss_avg))
            batch += 1

    def predict(self, input_tensor, beam_size: int):
        """Greedily decode (beam_size == 1) or beam-search decode one input.

        :param input_tensor: conditioning embedding for a single example.
        :param beam_size: number of beams; > 1 wraps the decoder in TopKDecoder.
        :returns: list of predicted token ids.
        """
        if beam_size > 1:
            beam_decoder = TopKDecoder(self.decoder, beam_size)
        else:
            beam_decoder = self.decoder
        with torch.no_grad():
            decoder_hidden = self._create_init_hidden(
                torch.reshape(input_tensor, (1, 1, -1)))
            if torch.cuda.is_available():
                decoder_hidden = decoder_hidden.cuda()
            if self.rnn_cell == 'lstm':
                decoder_hidden = (decoder_hidden, decoder_hidden)
            decoder_outputs, decoder_hidden, ret_dict = beam_decoder(
                inputs=None,
                encoder_hidden=decoder_hidden,
                teacher_forcing_ratio=0)
            output_sequence = []
            for item in ret_dict['sequence']:
                output_sequence.append(item[0].item())
            return output_sequence