def train_batch(self, data, clip, reset=0):
    if reset:
        self.reset()
    # Zero gradients of all three optimizers
    self.encoder_optimizer.zero_grad()
    self.extKnow_optimizer.zero_grad()
    self.decoder_optimizer.zero_grad()

    # Encode and Decode
    use_teacher_forcing = random.random() < args['teacher_forcing_ratio']
    max_target_length = max(data['response_lengths'])
    all_decoder_outputs_vocab, all_decoder_outputs_ptr, _, _, global_pointer, \
        label_e, label_d, label_mix_e, label_mix_d = self.encode_and_decode(
            data, max_target_length, use_teacher_forcing, False)

    # Loss calculation and backpropagation
    domains = []
    for domain in data['domain']:
        domains.append(self.domains[domain])
    loss_g = self.criterion_bce(global_pointer, data['selector_index'])
    loss_v = masked_cross_entropy(
        all_decoder_outputs_vocab.transpose(0, 1).contiguous(),
        data['sketch_response'].contiguous(),
        data['response_lengths'])
    loss_l = masked_cross_entropy(
        all_decoder_outputs_ptr.transpose(0, 1).contiguous(),
        data['ptr_index'].contiguous(),
        data['response_lengths'])
    loss = loss_g + loss_v + loss_l

    golden_labels = torch.zeros_like(label_e).scatter_(1, data['label_arr'], 1)
    loss += self.criterion_label(label_e, golden_labels)
    loss += self.criterion_label(label_d, golden_labels)
    domains = self._cuda(torch.Tensor(domains)).long().unsqueeze(-1)
    loss += masked_cross_entropy(
        label_mix_e,
        domains.expand(len(domains), label_mix_e.size(1)).contiguous(),
        data['conv_arr_lengths'])
    loss += masked_cross_entropy(
        label_mix_d,
        domains.expand(len(domains), label_mix_d.size(1)).contiguous(),
        data['response_lengths'])
    loss.backward()

    # Clip gradient norms
    ec = torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), clip)
    ec = torch.nn.utils.clip_grad_norm_(self.extKnow.parameters(), clip)
    dc = torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), clip)

    # Update parameters with optimizers
    self.encoder_optimizer.step()
    self.extKnow_optimizer.step()
    self.decoder_optimizer.step()

    self.loss += loss.item()
    self.loss_g += loss_g.item()
    self.loss_v += loss_v.item()
    self.loss_l += loss_l.item()
def train_batch(self, data, clip, reset=0, ss=1.0): if reset: self.reset() # Zero gradients of both optimizers self.encoder_optimizer.zero_grad() self.extKnow_optimizer.zero_grad() self.decoder_optimizer.zero_grad() """ optimizer.zero_grad() """ """ for k, v in data.items(): print(k) if isinstance(v, torch.Tensor): print(v.size()) print(v) else: print(v) """ # Encode and Decode max_target_length = data["sketch_response"].size(1) all_decoder_outputs_vocab, all_decoder_outputs_ptr, _, _ = self.encode_and_decode( data, max_target_length, ss, False) # Loss calculation and backpropagation loss_v = masked_cross_entropy( all_decoder_outputs_vocab.transpose(0, 1).contiguous(), data['sketch_response'].contiguous(), data['response'], PAD_token) loss_l = masked_cross_entropy( all_decoder_outputs_ptr.transpose(0, 1).contiguous(), data['ptr_index'].contiguous(), data['response'], PAD_token) loss = loss_v + loss_l loss.backward() # Clip gradient norms ec = torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), clip) ec = torch.nn.utils.clip_grad_norm_(self.extKnow.parameters(), clip) dc = torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), clip) # Update parameters with optimizers self.encoder_optimizer.step() self.extKnow_optimizer.step() self.decoder_optimizer.step() """ torch.nn.utils.clip_grad_norm_(self.parameters(), clip) optimizer.step() """ self.loss += loss.item() self.loss_v += loss_v.item() self.loss_l += loss_l.item()
def evaluate(self, batch):
    r"""Run the model on a batch in evaluation mode.

    Args: batch
        - **batch** torchtext.data.batch.Batch object of TabularDataset

    Output: loss, target_seq, target_seq_distr, pointer_distr, att_distr
        - **loss** loss value
        - **target_seq** of shape `(max_target_len, batch)`: tensor containing the encoded generated sequence.
        - **target_seq_distr** of shape `(max_target_len, batch, vocabulary_size + input_seq_len)`: tensor containing the vocabulary distribution of the generated sequence.
        - **pointer_distr** of shape `(max_target_len, batch)`: tensor containing the pointer probabilities.
        - **att_distr** of shape `(max_target_len, seq_len, batch)`: tensor containing the attention distribution for each generated word.
    """
    self.eval()
    input_seq, input_lens, target_seq, target_lens = *batch.src, *batch.trg
    target_seq_ext, _ = batch.trg_ext
    max_target_len = target_lens.max().item()
    # Out-of-vocabulary mask: a word encoded as <unk> (index 0) is OOV.
    oov_mask = input_seq == 0
    target_seq, target_seq_distr, pointer_distr, att_distr = self.forward(
        input_seq, input_lens, oov_mask, max_target_len)
    loss = masked_cross_entropy(target_seq_distr.transpose(0, 1).contiguous(),
                                target_seq_ext.transpose(0, 1).contiguous(),
                                target_lens)
    self.train()
    return loss.item(), target_seq, target_seq_distr, pointer_distr, att_distr
def train_step(self, optimizer, batch, grad_clip):
    r"""Take a step of the learning process.

    Args: optimizer, batch, grad_clip
        - **optimizer** optimizer used for learning
        - **batch** torchtext.data.batch.Batch object of TabularDataset
        - **grad_clip** value for gradient clipping

    Output: loss, grad_norm
        - **loss** loss value
        - **grad_norm** total gradient norm before clipping
    """
    optimizer.zero_grad()
    input_seq, input_lens, target_seq, target_lens = *batch.src, *batch.trg
    # Extended target is the target encoded with the OOV (copy) vocabulary
    target_seq_ext, _ = batch.trg_ext
    max_target_len = target_lens.max().item()
    # Out-of-vocabulary mask: a word encoded as <unk> (index 0) is OOV.
    oov_mask = (input_seq == 0).long().to(input_seq.device)
    out_seq_distr = self.forward(input_seq, input_lens, oov_mask, max_target_len, target_seq)
    loss = masked_cross_entropy(out_seq_distr.transpose(0, 1).contiguous(),
                                target_seq_ext.transpose(0, 1).contiguous(),
                                target_lens)
    loss.backward()
    grad_norm = torch.nn.utils.clip_grad_norm_(self.parameters(), grad_clip)
    optimizer.step()
    return loss.item(), grad_norm
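# --- Sketch (not from the original repository) ---------------------------------
# A minimal example of how train_step / evaluate above might be driven from a
# training loop. It assumes a `model` exposing train_step, a torchtext batch
# iterator `train_iter`, an `optimizer`, and a `grad_clip` value; all of these
# names are illustrative, not taken from the original code.
import torch


def run_epoch(model, train_iter, optimizer, grad_clip=5.0):
    model.train()
    total_loss, steps = 0.0, 0
    for batch in train_iter:
        # train_step zeroes gradients, backpropagates, clips, and steps the optimizer.
        loss, _grad_norm = model.train_step(optimizer, batch, grad_clip)
        total_loss += loss
        steps += 1
    # Average training loss over the epoch.
    return total_loss / max(steps, 1)


# Example usage (hypothetical names):
#   optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
#   avg_loss = run_epoch(model, train_iter, optimizer, grad_clip=5.0)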
def train_batch(self, data, clip, reset=0, ss=1.0):
    if reset:
        self.reset()
    # Zero gradients of all three optimizers
    self.encoder_optimizer.zero_grad()
    self.extKnow_optimizer.zero_grad()
    self.decoder_optimizer.zero_grad()

    # Encode and Decode
    max_target_length = max(data['response_lengths'])
    all_decoder_outputs_vocab, all_decoder_outputs_ptr, _, _ = self.encode_and_decode(
        data, max_target_length, ss, False)

    # Loss calculation and backpropagation
    loss_v = masked_cross_entropy(
        all_decoder_outputs_vocab.transpose(0, 1).contiguous(),
        data['sketch_response'].contiguous(),
        data['response_lengths'])
    loss_l = masked_cross_entropy(
        all_decoder_outputs_ptr.transpose(0, 1).contiguous(),
        data['response_entity_id'].contiguous(),
        data['response_lengths'], is_logit=False)
    loss = loss_v + loss_l
    loss.backward()

    # Clip gradient norms
    ec = torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), clip)
    ec = torch.nn.utils.clip_grad_norm_(self.extKnow.parameters(), clip)
    dc = torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), clip)

    # Update parameters with optimizers
    self.encoder_optimizer.step()
    self.extKnow_optimizer.step()
    self.decoder_optimizer.step()

    self.loss += loss.item()
    self.loss_v += loss_v.item()
    self.loss_l += loss_l.item()
def train_batch(self, input_batches, input_lengths, target_batches,
                target_lengths, target_index, batch_size, clip,
                teacher_forcing_ratio):
    self.batch_size = batch_size
    # Zero gradients of both optimizers
    self.encoder_optimizer.zero_grad()
    self.decoder_optimizer.zero_grad()
    loss_Vocab, loss_Ptr = 0, 0

    # Run words through encoder
    decoder_hidden = self.encoder(input_batches.transpose(0, 1)).unsqueeze(0)
    # Load memories with the input
    self.decoder.load_memory(input_batches.transpose(0, 1))

    # Prepare input and output variables
    decoder_input = Variable(torch.LongTensor([SOS_token] * batch_size))

    max_target_length = max(target_lengths)
    all_decoder_outputs_vocab = Variable(
        torch.zeros(max_target_length, batch_size, self.output_size))
    all_decoder_outputs_ptr = Variable(
        torch.zeros(max_target_length, batch_size, input_batches.size(0)))

    # Move new Variables to CUDA
    if USE_CUDA:
        all_decoder_outputs_vocab = all_decoder_outputs_vocab.cuda()
        all_decoder_outputs_ptr = all_decoder_outputs_ptr.cuda()
        decoder_input = decoder_input.cuda()

    # Choose whether to use teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    if use_teacher_forcing:
        # Run through decoder one time step at a time
        for t in range(max_target_length):
            decoder_ptr, decoder_vacab, decoder_hidden = self.decoder.ptrMemDecoder(
                decoder_input, decoder_hidden)
            all_decoder_outputs_vocab[t] = decoder_vacab
            all_decoder_outputs_ptr[t] = decoder_ptr
            decoder_input = target_batches[t]  # Chosen word is next input
            if USE_CUDA:
                decoder_input = decoder_input.cuda()
    else:
        for t in range(max_target_length):
            decoder_ptr, decoder_vacab, decoder_hidden = self.decoder.ptrMemDecoder(
                decoder_input, decoder_hidden)
            _, toppi = decoder_ptr.data.topk(1)
            _, topvi = decoder_vacab.data.topk(1)
            all_decoder_outputs_vocab[t] = decoder_vacab
            all_decoder_outputs_ptr[t] = decoder_ptr
            # Get the corresponding word in the input
            top_ptr_i = torch.gather(input_batches, 0, Variable(toppi.view(1, -1)))
            next_in = [
                top_ptr_i.squeeze()[i].data[0]
                if (toppi.squeeze()[i] < input_lengths[i] - 1)
                else topvi.squeeze()[i]
                for i in range(batch_size)
            ]
            decoder_input = Variable(torch.LongTensor(next_in))  # Chosen word is next input
            if USE_CUDA:
                decoder_input = decoder_input.cuda()

    # Loss calculation and backpropagation
    loss_Vocab = masked_cross_entropy(
        all_decoder_outputs_vocab.transpose(0, 1).contiguous(),  # -> batch x seq
        target_batches.transpose(0, 1).contiguous(),  # -> batch x seq
        target_lengths)
    loss_Ptr = masked_cross_entropy(
        all_decoder_outputs_ptr.transpose(0, 1).contiguous(),  # -> batch x seq
        target_index.transpose(0, 1).contiguous(),  # -> batch x seq
        target_lengths)
    loss = loss_Vocab + loss_Ptr
    loss.backward()

    # Clip gradient norms
    ec = torch.nn.utils.clip_grad_norm(self.encoder.parameters(), clip)
    dc = torch.nn.utils.clip_grad_norm(self.decoder.parameters(), clip)

    # Update parameters with optimizers
    self.encoder_optimizer.step()
    self.decoder_optimizer.step()

    self.loss += loss.data[0]
    self.loss_ptr += loss_Ptr.data[0]
    self.loss_vac += loss_Vocab.data[0]
def train_batch(self, input_batches, input_lengths, target_batches,
                target_lengths, target_index, target_gate, batch_size, clip,
                teacher_forcing_ratio):
    self.batch_size = batch_size
    # Zero gradients of both optimizers
    self.encoder_optimizer.zero_grad()
    self.decoder_optimizer.zero_grad()
    loss_Vocab, loss_Ptr = 0, 0

    # Run words through encoder to get the initial decoder hidden state
    decoder_hidden = self.encoder(input_batches).unsqueeze(0)
    self.decoder.load_memory(input_batches.transpose(0, 1))

    # Prepare input and output variables: the first decoder input is SOS
    decoder_input = Variable(torch.LongTensor(
        [SOS_token_index] * batch_size).to(device=DEVICE))

    # self.output_size is the vocabulary size of the whole dataset; the word
    # probability distribution is used to choose the best output word.
    max_target_length = max(target_lengths)
    self.max_response = max(max_target_length, self.max_response)
    all_decoder_outputs_vocab = Variable(torch.zeros(
        max_target_length, batch_size, self.output_size).to(device=DEVICE))
    # input_batches.size(0) is the token length of the input sequence; the
    # pointer uses it to copy keywords from the user input into the response.
    all_decoder_outputs_ptr = Variable(torch.zeros(
        max_target_length, batch_size, input_batches.size(0)).to(device=DEVICE))

    # Choose whether to use teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    if use_teacher_forcing:
        # Run through decoder one time step at a time
        for t in range(max_target_length):
            decoder_ptr, decoder_vacab, decoder_hidden = self.decoder.ptrMemDecoder(
                decoder_input, decoder_hidden)
            # Store the predicted distributions for this output step
            all_decoder_outputs_vocab[t] = decoder_vacab
            all_decoder_outputs_ptr[t] = decoder_ptr
            decoder_input = target_batches[t]  # Chosen word is next input
            decoder_input = decoder_input.to(device=DEVICE)
    else:
        for t in range(max_target_length):
            decoder_ptr, decoder_vacab, decoder_hidden = self.decoder.ptrMemDecoder(
                decoder_input, decoder_hidden)
            _, toppi = decoder_ptr.data.topk(1)   # top-1 index of the pointer distribution
            _, topvi = decoder_vacab.data.topk(1)
            all_decoder_outputs_vocab[t] = decoder_vacab
            all_decoder_outputs_ptr[t] = decoder_ptr
            # Get the corresponding word in the input
            top_ptr_i = torch.gather(input_batches[:, :, 0], 0,
                                     Variable(toppi.view(1, -1))).transpose(0, 1)
            next_in = [top_ptr_i[i].item()
                       if (toppi[i].item() < input_lengths[i] - 1)
                       else topvi[i].item()
                       for i in range(batch_size)]
            # Chosen word is next input
            decoder_input = Variable(torch.LongTensor(next_in).to(device=DEVICE))
            if USE_CUDA:
                decoder_input = decoder_input.cuda()

    # Loss calculation and backpropagation
    loss_Vocab = masked_cross_entropy(
        all_decoder_outputs_vocab.transpose(0, 1).contiguous(),  # -> batch x seq
        target_batches.transpose(0, 1).contiguous(),  # -> batch x seq
        target_lengths)
    loss_Ptr = masked_cross_entropy(
        all_decoder_outputs_ptr.transpose(0, 1).contiguous(),  # -> batch x seq
        target_index.transpose(0, 1).contiguous(),  # -> batch x seq
        target_lengths)
    loss = loss_Vocab + loss_Ptr
    loss.backward()

    # Clip gradient norms
    ec = torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), clip)
    dc = torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), clip)

    # Update parameters with optimizers
    self.encoder_optimizer.step()
    self.decoder_optimizer.step()

    self.loss += loss.item()
    self.loss_ptr += loss_Ptr.item()
    self.loss_vac += loss_Vocab.item()
def train_batch(self, input_batches, input_lengths, target_batches,
                target_lengths, target_index, target_gate, batch_size, clip,
                teacher_forcing_ratio, reset):
    if reset:
        self.loss = 0
        self.loss_gate = 0
        self.loss_ptr = 0
        self.loss_vac = 0
        self.print_every = 1

    # Zero gradients of both optimizers
    self.encoder_optimizer.zero_grad()
    self.decoder_optimizer.zero_grad()
    loss_Vocab, loss_Ptr, loss_Gate = 0, 0, 0

    # Run words through encoder
    encoder_outputs, encoder_hidden = self.encoder(input_batches, input_lengths)

    # Prepare input and output variables
    decoder_input = Variable(torch.LongTensor([SOS_token] * batch_size))
    decoder_hidden = (encoder_hidden[0][:self.decoder.n_layers],
                      encoder_hidden[1][:self.decoder.n_layers])

    max_target_length = max(target_lengths)
    all_decoder_outputs_vocab = Variable(torch.zeros(max_target_length, batch_size, self.output_size))
    all_decoder_outputs_ptr = Variable(torch.zeros(max_target_length, batch_size, encoder_outputs.size(0)))
    all_decoder_outputs_gate = Variable(torch.zeros(max_target_length, batch_size))

    # Move new Variables to CUDA
    if USE_CUDA:
        all_decoder_outputs_vocab = all_decoder_outputs_vocab.cuda()
        all_decoder_outputs_ptr = all_decoder_outputs_ptr.cuda()
        all_decoder_outputs_gate = all_decoder_outputs_gate.cuda()
        decoder_input = decoder_input.cuda()

    # Choose whether to use teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    if use_teacher_forcing:
        # Run through decoder one time step at a time
        for t in range(max_target_length):
            decoder_ptr, decoder_vacab, gate, decoder_hidden = self.decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            all_decoder_outputs_vocab[t] = decoder_vacab
            all_decoder_outputs_ptr[t] = decoder_ptr
            all_decoder_outputs_gate[t] = gate
            decoder_input = target_batches[t]  # Next input is current target
            if USE_CUDA:
                decoder_input = decoder_input.cuda()
    else:
        for t in range(max_target_length):
            decoder_ptr, decoder_vacab, gate, decoder_hidden = self.decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            all_decoder_outputs_vocab[t] = decoder_vacab
            all_decoder_outputs_ptr[t] = decoder_ptr
            all_decoder_outputs_gate[t] = gate
            topv, topvi = decoder_vacab.data.topk(1)
            topp, toppi = decoder_ptr.data.topk(1)
            # Get the corresponding word in the input
            top_ptr_i = torch.gather(input_batches, 0, toppi.view(1, -1))
            next_in = [top_ptr_i.squeeze()[i].data[0]
                       if (gate.squeeze()[i].data[0] >= 0.5)
                       else topvi.squeeze()[i]
                       for i in range(batch_size)]
            decoder_input = Variable(torch.LongTensor(next_in))  # Chosen word is next input
            if USE_CUDA:
                decoder_input = decoder_input.cuda()

    # Loss calculation and backpropagation
    loss_Vocab = masked_cross_entropy(
        all_decoder_outputs_vocab.transpose(0, 1).contiguous(),  # -> batch x seq
        target_batches.transpose(0, 1).contiguous(),  # -> batch x seq
        target_lengths)
    loss_Ptr = masked_cross_entropy(
        all_decoder_outputs_ptr.transpose(0, 1).contiguous(),  # -> batch x seq
        target_index.transpose(0, 1).contiguous(),  # -> batch x seq
        target_lengths)
    loss_gate = self.criterion(all_decoder_outputs_gate, target_gate.float())
    loss = loss_Vocab + loss_Ptr + loss_gate
    loss.backward()

    # Clip gradient norms
    ec = torch.nn.utils.clip_grad_norm(self.encoder.parameters(), clip)
    dc = torch.nn.utils.clip_grad_norm(self.decoder.parameters(), clip)

    # Update parameters with optimizers
    self.encoder_optimizer.step()
    self.decoder_optimizer.step()

    self.loss += loss.data[0]
    self.loss_gate += loss_gate.data[0]
    self.loss_ptr += loss_Ptr.data[0]
    self.loss_vac += loss_Vocab.data[0]
def train_batch(self, input_batches, input_lengths, target_batches,
                target_lengths, target_index, target_gate, batch_size, clip,
                teacher_forcing_ratio, conv_seqs, conv_lengths, reset):
    '''
    :param input_batches: (T, B, 3) or something else
    :param input_lengths: (B,) length of each instance in the batch
    :param target_batches: (T', B) where T' is the max response length
    :param target_lengths: (B,)
    :param target_index: (T', B), same shape as target_batches
    :param target_gate: (T, B), not used
    :param batch_size: not strictly necessary
    :param clip:
    :param teacher_forcing_ratio: a trick for sentence generation
    :param conv_seqs:
    :param conv_lengths:
    :param reset: a flag marking the beginning of each epoch
    :return:
    '''
    if reset:
        self.loss = 0
        self.loss_ptr = 0
        self.loss_vac = 0
        self.print_every = 1

    self.batch_size = batch_size
    # Zero gradients of both optimizers
    self.encoder_optimizer.zero_grad()
    self.decoder_optimizer.zero_grad()
    loss_Vocab, loss_Ptr = 0, 0

    # Run words through encoder; this is h_0 in the paper.
    # unsqueeze is for the GRU, which expects h_0 of shape
    # (num_layers * num_directions, batch, hidden_size)
    decoder_hidden = self.encoder(input_batches).unsqueeze(0)  # (B,E) ---> (1,B,E)

    # Dialog History + KB in Fig. 1 of the paper: load the memory embeddings
    self.decoder.load_memory(input_batches.transpose(0, 1))

    # Prepare input and output variables; this is y_0 in Fig. 1 of the paper
    decoder_input = Variable(torch.LongTensor([SOS_token] * batch_size))

    # Max length in this batch only; may be smaller than max_r_len over all data
    max_target_length = max(target_lengths)
    all_decoder_outputs_vocab = Variable(
        torch.zeros(max_target_length, batch_size, self.output_size))
    # input_batches.size(0) is the time length
    all_decoder_outputs_ptr = Variable(
        torch.zeros(max_target_length, batch_size, input_batches.size(0)))

    # Move new Variables to CUDA
    if USE_CUDA:
        all_decoder_outputs_vocab = all_decoder_outputs_vocab.cuda()
        all_decoder_outputs_ptr = all_decoder_outputs_ptr.cuda()
        decoder_input = decoder_input.cuda()

    # Choose whether to use teacher forcing, see
    # https://machinelearningmastery.com/teacher-forcing-for-recurrent-neural-networks/
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    if use_teacher_forcing:
        # Run through decoder one time step at a time
        for t in range(max_target_length):
            # decoder_input (word indices) has shape (B,), decoder_hidden is (1,B,E)
            # outputs: (B,M), (B,V), (1,B,E or H)
            decoder_ptr, decoder_vacab, decoder_hidden = self.decoder.ptrMemDecoder(
                decoder_input, decoder_hidden)
            all_decoder_outputs_vocab[t] = decoder_vacab
            all_decoder_outputs_ptr[t] = decoder_ptr
            decoder_input = target_batches[t]  # Chosen word is next input
            if USE_CUDA:
                decoder_input = decoder_input.cuda()
    else:
        for t in range(max_target_length):
            # outputs: (B,M), (B,V), (1,B,E or H)
            decoder_ptr, decoder_vacab, decoder_hidden = self.decoder.ptrMemDecoder(
                decoder_input, decoder_hidden)
            # topk returns (values, indices); the indices refer to the original input tensor
            _, toppi = decoder_ptr.data.topk(1)
            _, topvi = decoder_vacab.data.topk(1)  # shape (B,1)
            all_decoder_outputs_vocab[t] = decoder_vacab
            all_decoder_outputs_ptr[t] = decoder_ptr
            # Get the corresponding word in the input. For torch.gather with dim == 0:
            #   out[i][j] = input[index[i][j]][j]
            # input_batches[:, :, 0] has shape (M,B); the output has the same shape as index, (1,B)
            top_ptr_i = torch.gather(input_batches[:, :, 0], 0,
                                     Variable(toppi.view(1, -1)))
            # Indices start at 0 and the last memory slot is the $$$$ sentinel,
            # so compare against input_lengths[i] - 1.
            next_in = [
                top_ptr_i.squeeze()[i].data[0]
                if (toppi.squeeze()[i] < input_lengths[i] - 1)
                else topvi.squeeze()[i]  # topvi is the word index
                for i in range(batch_size)
            ]
            decoder_input = Variable(torch.LongTensor(next_in))  # Chosen word is next input
            if USE_CUDA:
                decoder_input = decoder_input.cuda()

    # Loss calculation and backpropagation
    # (http://www.studyai.com/article/bba734ff: PyTorch transpose vs. permute for high-dimensional tensors)
    loss_Vocab = masked_cross_entropy(
        all_decoder_outputs_vocab.transpose(0, 1).contiguous(),  # -> batch x seq
        target_batches.transpose(0, 1).contiguous(),  # -> batch x seq
        target_lengths)
    # target_index holds the pointer positions
    loss_Ptr = masked_cross_entropy(
        all_decoder_outputs_ptr.transpose(0, 1).contiguous(),  # -> batch x seq
        target_index.transpose(0, 1).contiguous(),  # -> batch x seq
        target_lengths)
    loss = loss_Vocab + loss_Ptr
    loss.backward()

    # Clip gradient norms (the returned norms are not used)
    ec = torch.nn.utils.clip_grad_norm(self.encoder.parameters(), clip)
    dc = torch.nn.utils.clip_grad_norm(self.decoder.parameters(), clip)

    # Update parameters with optimizers
    self.encoder_optimizer.step()
    self.decoder_optimizer.step()

    self.loss += loss.data[0]
    self.loss_ptr += loss_Ptr.data[0]  # .data[0] extracts the scalar from the Variable
    self.loss_vac += loss_Vocab.data[0]
def train_batch(self, input_batches, input_lengths, target_batches,
                target_lengths, target_index, target_gate, batch_size, clip,
                teacher_forcing_ratio, conv_seqs, conv_lengths, kb_seqs,
                kb_lengths, kb_target_index, kb_plain, reset):
    """
    TODO: check the shapes of the inputs
    :param input_batches: seq_len x batch_size x MEM_SIZE, e.g. torch.Size([37, 2, 5])
    :param input_lengths:
    :param target_batches:
    :param target_lengths:
    :param target_index:
    :param target_gate:
    :param batch_size:
    :param clip:
    :param teacher_forcing_ratio:
    :param conv_seqs:
    :param conv_lengths:
    :param kb_seqs: lens x batch x mem_size, e.g. torch.Size([41, 2, 5])
    :param kb_lengths:
    :param reset:
    :return:
    """
    if reset:
        self.loss = 0
        self.loss_memory = 0
        self.loss_vocabulary = 0
        self.loss_kb = 0
        self.print_every = 1

    self.batch_size = batch_size
    # Zero gradients of both optimizers
    self.encoder_optimizer.zero_grad()
    self.decoder_optimizer.zero_grad()

    decoder_hidden = self.encoder(input_batches).unsqueeze(0)
    self.decoder.load_memory(input_batches.transpose(0, 1))
    self.decoder.kb_memory.load_memory(kb_seqs.transpose(0, 1))

    decoder_input = torch.LongTensor([SOS_token] * batch_size)
    max_target_length = max(target_lengths)

    # Store the decoder outputs for every time step. The memory and KB outputs
    # are probabilities over positions, not over the vocabulary.
    all_decoder_outputs_vocab = torch.zeros(max_target_length, batch_size, self.output_size)
    all_decoder_outputs_memory = torch.zeros(max_target_length, batch_size, input_batches.size(0))
    all_decoder_outputs_kb = torch.zeros(max_target_length, batch_size, kb_seqs.size(0))

    # Move new tensors to CUDA
    if USE_CUDA:
        all_decoder_outputs_vocab = all_decoder_outputs_vocab.cuda()
        all_decoder_outputs_memory = all_decoder_outputs_memory.cuda()
        all_decoder_outputs_kb = all_decoder_outputs_kb.cuda()
        decoder_input = decoder_input.cuda()

    # Choose whether to use teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    if use_teacher_forcing:
        # Run through decoder one time step at a time
        for t in range(max_target_length):
            decoder_pkb, decoder_pmemory, decoder_vacab, switch_probality, decoder_hidden, pg_state = \
                self.decoder.ptrMemDecoder(decoder_input, decoder_hidden)
            # Mask-fill first, then apply softmax
            decoder_pmemory_normalized = self._masked(decoder_pmemory, input_batches[:, :, 0])
            decoder_pkb_normalized = self._masked(decoder_pkb, kb_seqs[:, :, 0])
            all_decoder_outputs_vocab[t] = decoder_vacab
            all_decoder_outputs_memory[t] = decoder_pmemory_normalized
            all_decoder_outputs_kb[t] = decoder_pkb_normalized
            decoder_input = target_batches[t]  # Chosen word is next input
            if USE_CUDA:
                decoder_input = decoder_input.cuda()
    else:
        for t in range(max_target_length):
            decoder_pkb, decoder_pmemory, decoder_vacab, switch_probality, decoder_hidden, pg_state = \
                self.decoder.ptrMemDecoder(decoder_input, decoder_hidden)
            decoder_pmemory_normalized = self._masked(decoder_pmemory, input_batches[:, :, 0])
            decoder_pkb_normalized = self._masked(decoder_pkb, kb_seqs[:, :, 0])
            all_decoder_outputs_vocab[t] = decoder_vacab
            all_decoder_outputs_memory[t] = decoder_pmemory_normalized
            all_decoder_outputs_kb[t] = decoder_pkb_normalized
            next_in, _, _ = self._infer_get_next_in(memory_pro=decoder_pmemory_normalized,
                                                    kb_pro=decoder_pkb_normalized,
                                                    vocab_pro=decoder_vacab,
                                                    inputs=input_batches[:, :, 0],
                                                    kb_inputs=kb_seqs[:, :, 0],
                                                    input_lengths=input_lengths,
                                                    kb_lengths=kb_lengths)
            decoder_input = torch.LongTensor(next_in)  # Chosen word is next input
            if USE_CUDA:
                decoder_input = decoder_input.cuda()

    # Loss calculation and backpropagation
    loss_Vocab = masked_cross_entropy(
        all_decoder_outputs_vocab.transpose(0, 1).contiguous(),  # -> batch x seq
        target_batches.transpose(0, 1).contiguous(),  # -> batch x seq
        target_lengths)
    loss_memory = masked_cross_entropy(
        all_decoder_outputs_memory.transpose(0, 1).contiguous(),  # -> batch x seq
        target_index.transpose(0, 1).contiguous(),  # -> batch x seq
        target_lengths)
    loss_kb = masked_cross_entropy(
        all_decoder_outputs_kb.transpose(0, 1).contiguous(),  # -> batch x seq
        kb_target_index.transpose(0, 1).contiguous(),  # -> batch x seq
        target_lengths)
    loss = loss_Vocab + loss_memory + loss_kb
    loss.backward()

    # Clip gradient norms
    torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), clip)
    torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), clip)

    # Update parameters with optimizers
    self.encoder_optimizer.step()
    self.decoder_optimizer.step()

    self.loss += loss.item()
    self.loss_memory += loss_memory.item()
    self.loss_vocabulary += loss_Vocab.item()
    self.loss_kb += loss_kb.item()
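# --- Sketch (not from the original repositories) --------------------------------
# Every training routine above calls a masked_cross_entropy(logits, targets,
# lengths) helper whose implementation is not shown here. A minimal sketch,
# assuming logits of shape (batch, seq, num_classes), integer targets of shape
# (batch, seq), and a 1-D LongTensor of valid lengths. Some variants above pass
# extra arguments (PAD_token, is_logit=False); this sketch ignores those.
import torch
import torch.nn.functional as F


def sequence_mask(lengths, max_len=None):
    # Boolean mask of shape (batch, max_len): True where position < length.
    if max_len is None:
        max_len = int(lengths.max())
    positions = torch.arange(max_len, device=lengths.device).unsqueeze(0)
    return positions < lengths.unsqueeze(1)


def masked_cross_entropy(logits, target, lengths):
    # logits:  (batch, seq, num_classes) unnormalized scores
    # target:  (batch, seq) class indices
    # lengths: (batch,) number of valid (non-padded) positions per sequence
    batch, seq_len, num_classes = logits.size()
    log_probs = F.log_softmax(logits.view(-1, num_classes), dim=-1)
    # Negative log-likelihood of the gold class at every position.
    nll = -log_probs.gather(1, target.view(-1, 1)).squeeze(1).view(batch, seq_len)
    mask = sequence_mask(lengths, max_len=seq_len).float()
    # Average over non-padded positions only; padded steps contribute nothing.
    return (nll * mask).sum() / mask.sum()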