Example #1
    def train_batch(self, data, clip, reset=0):
        if reset: self.reset()
        # Zero gradients of all three optimizers
        self.encoder_optimizer.zero_grad()
        self.extKnow_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        # Encode and Decode
        use_teacher_forcing = random.random() < args['teacher_forcing_ratio']
        max_target_length = max(data['response_lengths'])
        all_decoder_outputs_vocab, all_decoder_outputs_ptr, _, _, global_pointer, label_e, label_d, label_mix_e, label_mix_d = self.encode_and_decode(
            data, max_target_length, use_teacher_forcing, False)

        # Loss calculation and backpropagation
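        # Map each dialogue's domain name to its integer id; these ids are the
        # targets for the label_mix_e / label_mix_d losses below.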
        domains = []
        for domain in data['domain']:
            domains.append(self.domains[domain])
        loss_g = self.criterion_bce(global_pointer, data['selector_index'])
        loss_v = masked_cross_entropy(
            all_decoder_outputs_vocab.transpose(0, 1).contiguous(),
            data['sketch_response'].contiguous(), data['response_lengths'])
        loss_l = masked_cross_entropy(
            all_decoder_outputs_ptr.transpose(0, 1).contiguous(),
            data['ptr_index'].contiguous(), data['response_lengths'])
        loss = loss_g + loss_v + loss_l

        # Multi-hot gold labels: 1 at every index listed in data['label_arr'], 0 elsewhere
        golden_labels = torch.zeros_like(label_e).scatter_(
            1, data['label_arr'], 1)
        loss += self.criterion_label(label_e, golden_labels)
        loss += self.criterion_label(label_d, golden_labels)

        domains = self._cuda(torch.Tensor(domains)).long().unsqueeze(-1)
        loss += masked_cross_entropy(
            label_mix_e,
            domains.expand(len(domains), label_mix_e.size(1)).contiguous(),
            data['conv_arr_lengths'])
        loss += masked_cross_entropy(
            label_mix_d,
            domains.expand(len(domains), label_mix_d.size(1)).contiguous(),
            data['response_lengths'])
        loss.backward()

        # Clip gradient norms
        ec = torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), clip)
        kc = torch.nn.utils.clip_grad_norm_(self.extKnow.parameters(), clip)
        dc = torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), clip)

        # Update parameters with optimizers
        self.encoder_optimizer.step()
        self.extKnow_optimizer.step()
        self.decoder_optimizer.step()
        self.loss += loss.item()
        self.loss_g += loss_g.item()
        self.loss_v += loss_v.item()
        self.loss_l += loss_l.item()
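
All of these examples rely on a masked_cross_entropy helper that is not shown here, and its exact signature varies between snippets (some pass a PAD token, one passes probabilities via is_logit=False). The following is only a minimal sketch of the common logits-based variant, assuming logits of shape (batch, max_len, num_classes), integer targets of shape (batch, max_len) and a tensor or list of per-sequence lengths; it is not the implementation used by any particular repository above:

import torch
import torch.nn.functional as F

def masked_cross_entropy(logits, target, lengths):
    # Token-level log-probabilities, flattened to (batch * max_len, num_classes).
    log_probs = F.log_softmax(logits.reshape(-1, logits.size(-1)), dim=-1)
    # Negative log-likelihood of each gold token, reshaped back to (batch, max_len).
    nll = -log_probs.gather(1, target.reshape(-1, 1)).squeeze(1).view(target.size())
    # Zero out positions beyond each sequence's true length.
    lengths = torch.as_tensor(lengths, device=target.device)
    mask = torch.arange(target.size(1), device=target.device)[None, :] < lengths[:, None]
    # Average the loss over the real (unmasked) tokens only.
    return (nll * mask.float()).sum() / mask.float().sum()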
Example #2
    def train_batch(self, data, clip, reset=0, ss=1.0):
        if reset: self.reset()

        # Zero gradients of all three optimizers
        self.encoder_optimizer.zero_grad()
        self.extKnow_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        # Encode and Decode
        max_target_length = data["sketch_response"].size(1)
        all_decoder_outputs_vocab, all_decoder_outputs_ptr, _, _ = self.encode_and_decode(
            data, max_target_length, ss, False)

        # Loss calculation and backpropagation
        loss_v = masked_cross_entropy(
            all_decoder_outputs_vocab.transpose(0, 1).contiguous(),
            data['sketch_response'].contiguous(), data['response'], PAD_token)
        loss_l = masked_cross_entropy(
            all_decoder_outputs_ptr.transpose(0, 1).contiguous(),
            data['ptr_index'].contiguous(), data['response'], PAD_token)
        loss = loss_v + loss_l
        loss.backward()

        # Clip gradient norms
        ec = torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), clip)
        kc = torch.nn.utils.clip_grad_norm_(self.extKnow.parameters(), clip)
        dc = torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), clip)

        # Update parameters with optimizers
        self.encoder_optimizer.step()
        self.extKnow_optimizer.step()
        self.decoder_optimizer.step()
        self.loss += loss.item()
        self.loss_v += loss_v.item()
        self.loss_l += loss_l.item()
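Example #3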
    def evaluate(self, batch):
        r""" Evaluate the model on a single batch (no parameter update)

            Args: batch
                - **batch** torchtext.data.batch.Batch object of TabularDataset
            Output: loss, target_seq, target_seq_distr, pointer_distr, att_distr
                - **loss** loss value
                - **target_seq** of shape `(max_target_len, batch)`: tensor containing the encoded generated sequence.
                - **target_seq_distr** of shape `(max_target_len, batch, vocabulary_size + input_seq_len)`: tensor
                  containing the vocabulary distribution over the generated sequence.
                - **pointer_distr** of shape `(max_target_len, batch)`: tensor containing pointer probabilities.
                - **att_distr** of shape `(max_target_len, seq_len, batch)`: tensor containing attention distributions
                  for each generated word.
        """
        self.eval()

        input_seq, input_lens, target_seq, target_lens = *batch.src, *batch.trg
        target_seq_ext, _ = batch.trg_ext
        max_target_len = target_lens.max().item()

        # Calculate out of vocabulary mask. If word has <unk> tag, this is oov.
        oov_mask = input_seq == 0

        target_seq, target_seq_distr, pointer_distr, att_distr = self.forward(input_seq, input_lens,
                                                                              oov_mask, max_target_len)
        loss = masked_cross_entropy(target_seq_distr.transpose(0, 1).contiguous(),
                                    target_seq_ext.transpose(0, 1).contiguous(), target_lens)
        self.train()  # switch back to training mode

        return loss.item(), target_seq, target_seq_distr, pointer_distr, att_distr
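Example #4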
    def train_step(self, optimizer, batch, grad_clip):
        r"""Take a step of learning process

            Args: optimizer, batch, grad_clip
                - **optimizer** optimizer for learning
                - **batch** torchtext.data.batch.Batch object of TabularDataset
                - **grad_clip** value of gradient clipping

            Output: loss, grad_norm
                - **loss** loss value
                - **grad_norm** total norm of the parameter gradients, as returned by clip_grad_norm_
        """
        optimizer.zero_grad()

        input_seq, input_lens, target_seq, target_lens = *batch.src, *batch.trg
        # Extended target is a target, encoded using OOV encoding
        target_seq_ext, _ = batch.trg_ext
        max_target_len = target_lens.max().item()

        # Calculate out of vocabulary mask. If word has <unk> tag, this is oov.
        oov_mask = (input_seq == 0).long().to(input_seq.device)

        out_seq_distr = self.forward(input_seq, input_lens, oov_mask, max_target_len, target_seq)

        loss = masked_cross_entropy(out_seq_distr.transpose(0, 1).contiguous(),
                                    target_seq_ext.transpose(0, 1).contiguous(), target_lens)
        loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(self.parameters(), grad_clip)
        optimizer.step()

        return loss.item(), grad_norm
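
For context, a hypothetical driver loop for the train_step / evaluate pair above might look like the following sketch; model, train_iter, the choice of Adam, and the learning-rate / clipping values are illustrative assumptions, not taken from the source:

import torch

# `model` is assumed to be an instance of the pointer-generator class defined above;
# `train_iter` is assumed to yield torchtext batches exposing .src, .trg and .trg_ext.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
for epoch in range(10):
    for batch in train_iter:
        loss, grad_norm = model.train_step(optimizer, batch, grad_clip=5.0)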
Example #5
    def train_batch(self, data, clip, reset=0, ss=1.0):
        if reset: self.reset()
        # Zero gradients of all three optimizers
        self.encoder_optimizer.zero_grad()
        self.extKnow_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        # Encode and Decode
        max_target_length = max(data['response_lengths'])
        all_decoder_outputs_vocab, all_decoder_outputs_ptr, _, _ = self.encode_and_decode(
            data, max_target_length, ss, False)

        # Loss calculation and backpropagation
        loss_v = masked_cross_entropy(
            all_decoder_outputs_vocab.transpose(0, 1).contiguous(),
            data['sketch_response'].contiguous(), data['response_lengths'])
        loss_l = masked_cross_entropy(
            all_decoder_outputs_ptr.transpose(0, 1).contiguous(),
            data['response_entity_id'].contiguous(),
            data['response_lengths'],
            is_logit=False)
        loss = loss_v + loss_l
        loss.backward()

        # Clip gradient norms
        ec = torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), clip)
        kc = torch.nn.utils.clip_grad_norm_(self.extKnow.parameters(), clip)
        dc = torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), clip)

        # Update parameters with optimizers
        self.encoder_optimizer.step()
        self.extKnow_optimizer.step()
        self.decoder_optimizer.step()

        self.loss += loss.item()
        self.loss_v += loss_v.item()
        self.loss_l += loss_l.item()
Example #6
    def train_batch(self, input_batches, input_lengths, target_batches,
                    target_lengths, target_index, batch_size, clip,
                    teacher_forcing_ratio):

        self.batch_size = batch_size
        # Zero gradients of both optimizers
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()
        loss_Vocab, loss_Ptr = 0, 0

        # Run words through encoder
        decoder_hidden = self.encoder(input_batches.transpose(0,
                                                              1)).unsqueeze(0)

        # load memories with input
        self.decoder.load_memory(input_batches.transpose(0, 1))

        # Prepare input and output variables
        decoder_input = Variable(torch.LongTensor([SOS_token] * batch_size))

        max_target_length = max(target_lengths)
        all_decoder_outputs_vocab = Variable(
            torch.zeros(max_target_length, batch_size, self.output_size))
        all_decoder_outputs_ptr = Variable(
            torch.zeros(max_target_length, batch_size, input_batches.size(0)))

        # Move new Variables to CUDA
        if USE_CUDA:
            all_decoder_outputs_vocab = all_decoder_outputs_vocab.cuda()
            all_decoder_outputs_ptr = all_decoder_outputs_ptr.cuda()
            decoder_input = decoder_input.cuda()

        # Choose whether to use teacher forcing
        use_teacher_forcing = random.random() < teacher_forcing_ratio

        if use_teacher_forcing:
            # Run through decoder one time step at a time
            for t in range(max_target_length):
                decoder_ptr, decoder_vacab, decoder_hidden = self.decoder.ptrMemDecoder(
                    decoder_input, decoder_hidden)
                all_decoder_outputs_vocab[t] = decoder_vacab
                all_decoder_outputs_ptr[t] = decoder_ptr
                decoder_input = target_batches[t]  # Chosen word is next input
                if USE_CUDA: decoder_input = decoder_input.cuda()
        else:
            for t in range(max_target_length):
                decoder_ptr, decoder_vacab, decoder_hidden = self.decoder.ptrMemDecoder(
                    decoder_input, decoder_hidden)
                _, toppi = decoder_ptr.data.topk(1)
                _, topvi = decoder_vacab.data.topk(1)
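                # Greedy step: keep the top pointer position and the top vocabulary word for the next input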
                all_decoder_outputs_vocab[t] = decoder_vacab
                all_decoder_outputs_ptr[t] = decoder_ptr
                # get the corresponding word in the input
                top_ptr_i = torch.gather(input_batches, 0,
                                         Variable(toppi.view(1, -1)))
                next_in = [
                    top_ptr_i.squeeze()[i].data[0] if
                    (toppi.squeeze()[i] < input_lengths[i] - 1) else
                    topvi.squeeze()[i] for i in range(batch_size)
                ]
                decoder_input = Variable(
                    torch.LongTensor(next_in))  # Chosen word is next input
                if USE_CUDA: decoder_input = decoder_input.cuda()

        #Loss calculation and backpropagation
        loss_Vocab = masked_cross_entropy(
            all_decoder_outputs_vocab.transpose(
                0, 1).contiguous(),  # -> batch x seq
            target_batches.transpose(0, 1).contiguous(),  # -> batch x seq
            target_lengths)
        loss_Ptr = masked_cross_entropy(
            all_decoder_outputs_ptr.transpose(
                0, 1).contiguous(),  # -> batch x seq
            target_index.transpose(0, 1).contiguous(),  # -> batch x seq
            target_lengths)

        loss = loss_Vocab + loss_Ptr
        loss.backward()

        # Clip gradient norms
        ec = torch.nn.utils.clip_grad_norm(self.encoder.parameters(), clip)
        dc = torch.nn.utils.clip_grad_norm(self.decoder.parameters(), clip)
        # Update parameters with optimizers
        self.encoder_optimizer.step()
        self.decoder_optimizer.step()
        self.loss += loss.data[0]
        #self.loss_gate += loss_gate.data[0]
        self.loss_ptr += loss_Ptr.data[0]
        self.loss_vac += loss_Vocab.data[0]
Example #7
    def train_batch(self, input_batches, input_lengths, target_batches,
                    target_lengths, target_index, target_gate, batch_size, clip,
                    teacher_forcing_ratio):

        # if reset:
        #     self.loss = 0
        #     self.loss_ptr = 0
        #     self.loss_vac = 0
        #     self.print_every = 1

        self.batch_size = batch_size
        # Zero gradients of both optimizers
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()
        loss_Vocab, loss_Ptr = 0, 0
        

        # Run words through encoder
        # get the hidden processed by encoder
        decoder_hidden = self.encoder(input_batches).unsqueeze(0)
        self.decoder.load_memory(input_batches.transpose(0, 1))

        # Prepare input and output variables
        # init the first decoder_input
        decoder_input = Variable(torch.LongTensor(
            [SOS_token_index] * batch_size).to(device=DEVICE))

        # self.output_size is the total vocabulary size of the dataset; the word-probability distribution over it is used to pick the best output word
        max_target_length = max(target_lengths)
        self.max_response = max(max_target_length,self.max_response)
        all_decoder_outputs_vocab = Variable(torch.zeros(
            max_target_length, batch_size, self.output_size).to(device=DEVICE))

        # input_batches.size(0) is the token length of the input sequence; the pointer uses these positions to copy keywords from the user input into the response
        all_decoder_outputs_ptr = Variable(torch.zeros(
            max_target_length, batch_size, input_batches.size(0)).to(device=DEVICE))

        # Move new Variables to CUDA
        # if USE_CUDA:
        #     all_decoder_outputs_vocab = all_decoder_outputs_vocab.cuda()
        #     all_decoder_outputs_ptr = all_decoder_outputs_ptr.cuda()
        #     decoder_input = decoder_input.cuda()

        # Choose whether to use teacher forcing
        use_teacher_forcing = random.random() < teacher_forcing_ratio

        if use_teacher_forcing:
            # Run through decoder one time step at a time
            for t in range(max_target_length):
                decoder_ptr, decoder_vacab, decoder_hidden = self.decoder.ptrMemDecoder(
                    decoder_input, decoder_hidden)
                # store the predicted distributions for each output word
                all_decoder_outputs_vocab[t] = decoder_vacab
                all_decoder_outputs_ptr[t] = decoder_ptr
                decoder_input = target_batches[t]  # Chosen word is next input
                decoder_input = decoder_input.to(device=DEVICE)
                # if USE_CUDA:
                #     decoder_input = decoder_input.cuda()
        else:
            for t in range(max_target_length):
                decoder_ptr, decoder_vacab, decoder_hidden = self.decoder.ptrMemDecoder(
                    decoder_input, decoder_hidden)
                _, toppi = decoder_ptr.data.topk(1)  # index of the top-1 pointer probability
                _, topvi = decoder_vacab.data.topk(1)
                all_decoder_outputs_vocab[t] = decoder_vacab
                all_decoder_outputs_ptr[t] = decoder_ptr
                # get the correspective word in input
                top_ptr_i = torch.gather(input_batches[:, :, 0], 0, Variable(
                    toppi.view(1, -1))).transpose(0, 1)
                next_in = [top_ptr_i[i].item() if (toppi[i].item() < input_lengths[i] - 1)
                           else topvi[i].item() for i in range(batch_size)]

                # Chosen word is next input
                decoder_input = Variable(torch.LongTensor(next_in).to(device=DEVICE))
                if USE_CUDA:
                    decoder_input = decoder_input.cuda()

        # Loss calculation and backpropagation
        loss_Vocab = masked_cross_entropy(
            all_decoder_outputs_vocab.transpose(
                0, 1).contiguous(),  # -> batch x seq
            target_batches.transpose(0, 1).contiguous(),  # -> batch x seq
            target_lengths
        )
        loss_Ptr = masked_cross_entropy(
            all_decoder_outputs_ptr.transpose(
                0, 1).contiguous(),  # -> batch x seq
            target_index.transpose(0, 1).contiguous(),  # -> batch x seq
            target_lengths
        )

        loss = loss_Vocab + loss_Ptr
        loss.backward()

        # Clip gradient norms
        ec = torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), clip)
        dc = torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), clip)
        # Update parameters with optimizers
        self.encoder_optimizer.step()
        self.decoder_optimizer.step()
        self.loss += loss.item()
        self.loss_ptr += loss_Ptr.item()
        self.loss_vac += loss_Vocab.item()
Example #8
    def train_batch(self, input_batches, input_lengths, target_batches, 
                    target_lengths, target_index, target_gate, batch_size, clip,
                    teacher_forcing_ratio,reset):   
        if reset:
            self.loss = 0
            self.loss_gate = 0
            self.loss_ptr = 0
            self.loss_vac = 0
            self.print_every = 1 
        # Zero gradients of both optimizers
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()
        loss_Vocab, loss_Ptr, loss_gate = 0, 0, 0
        # Run words through encoder
        encoder_outputs, encoder_hidden = self.encoder(input_batches, input_lengths)
      
        # Prepare input and output variables
        decoder_input = Variable(torch.LongTensor([SOS_token] * batch_size))
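        # Initialise the decoder state from the encoder's final (h, c) states, truncated to the decoder's layer count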
        decoder_hidden = (encoder_hidden[0][:self.decoder.n_layers],encoder_hidden[1][:self.decoder.n_layers])
        
        max_target_length = max(target_lengths)
        all_decoder_outputs_vocab = Variable(torch.zeros(max_target_length, batch_size, self.output_size))
        all_decoder_outputs_ptr = Variable(torch.zeros(max_target_length, batch_size, encoder_outputs.size(0)))
        all_decoder_outputs_gate = Variable(torch.zeros(max_target_length, batch_size))
        # Move new Variables to CUDA
        if USE_CUDA:
            all_decoder_outputs_vocab = all_decoder_outputs_vocab.cuda()
            all_decoder_outputs_ptr = all_decoder_outputs_ptr.cuda()
            all_decoder_outputs_gate = all_decoder_outputs_gate.cuda()
            decoder_input = decoder_input.cuda()

        # Choose whether to use teacher forcing
        use_teacher_forcing = random.random() < teacher_forcing_ratio
        
        if use_teacher_forcing:    
            # Run through decoder one time step at a time
            for t in range(max_target_length):
                decoder_ptr,decoder_vacab,gate,decoder_hidden = self.decoder(
                    decoder_input, decoder_hidden, encoder_outputs)

                all_decoder_outputs_vocab[t] = decoder_vacab
                all_decoder_outputs_ptr[t] = decoder_ptr
                all_decoder_outputs_gate[t] = gate
                decoder_input = target_batches[t] # Next input is current target
                if USE_CUDA: decoder_input = decoder_input.cuda()
                
        else:
            for t in range(max_target_length):
                decoder_ptr,decoder_vacab,gate,decoder_hidden = self.decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                all_decoder_outputs_vocab[t] = decoder_vacab
                all_decoder_outputs_ptr[t] = decoder_ptr
                all_decoder_outputs_gate[t] = gate
                topv, topvi = decoder_vacab.data.topk(1)
                topp, toppi = decoder_ptr.data.topk(1)
                # get the corresponding word in the input
                top_ptr_i = torch.gather(input_batches,0,toppi.view(1, -1))
                next_in = [top_ptr_i.squeeze()[i].data[0] if (gate.squeeze()[i].data[0] >= 0.5)
                           else topvi.squeeze()[i] for i in range(batch_size)]
                decoder_input = Variable(torch.LongTensor(next_in)) # Chosen word is next input
                if USE_CUDA: decoder_input = decoder_input.cuda()
                  
        #Loss calculation and backpropagation
        loss_Vocab = masked_cross_entropy(
            all_decoder_outputs_vocab.transpose(0, 1).contiguous(), # -> batch x seq
            target_batches.transpose(0, 1).contiguous(), # -> batch x seq
            target_lengths
        )
        loss_Ptr = masked_cross_entropy(
            all_decoder_outputs_ptr.transpose(0, 1).contiguous(), # -> batch x seq
            target_index.transpose(0, 1).contiguous(), # -> batch x seq
            target_lengths
        )
        loss_gate = self.criterion(all_decoder_outputs_gate,target_gate.float())


        loss = loss_Vocab + loss_Ptr + loss_gate
        loss.backward()
        
        # Clip gradient norms
        ec = torch.nn.utils.clip_grad_norm(self.encoder.parameters(), clip)
        dc = torch.nn.utils.clip_grad_norm(self.decoder.parameters(), clip)
        # Update parameters with optimizers
        self.encoder_optimizer.step()
        self.decoder_optimizer.step()
        self.loss += loss.data[0]
        self.loss_gate += loss_gate.data[0] 
        self.loss_ptr += loss_Ptr.data[0]
        self.loss_vac += loss_Vocab.data[0]
Example #9
    def train_batch(self, input_batches, input_lengths, target_batches,
                    target_lengths, target_index, target_gate, batch_size,
                    clip, teacher_forcing_ratio, conv_seqs, conv_lengths,
                    reset):
        '''
        :param input_batches:   (T, B, 3) or something else
        :param input_lengths:   (B,)    length of each instance in the batch
        :param target_batches:  (T',B)  T' is the max response length
        :param target_lengths:  (B,)
        :param target_index:    (T',B)  as shape of target_batches
        :param target_gate:     (T,B)       not used
        :param batch_size:      not necessary (can be inferred from the batch)
        :param clip:
        :param teacher_forcing_ratio:       # a trick, for sentence generation
        :param conv_seqs:
        :param conv_lengths:
        :param reset:          a flag for the begin of each epoch
        :return:
        '''
        if reset:
            self.loss = 0
            self.loss_ptr = 0
            self.loss_vac = 0
            self.print_every = 1

        self.batch_size = batch_size
        # Zero gradients of both optimizers
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()
        loss_Vocab, loss_Ptr = 0, 0

        # Run words through encoder; this is h_0 in the paper;
        # unsqueeze is for GRU as it requires **h_0** (num_layers * num_directions, batch, hidden_size)
        decoder_hidden = self.encoder(input_batches).unsqueeze(
            0)  # (B,E) ---> (1,B,E)
        # TODO: this is Dialog History + KB in Fig. 1 of the paper.
        # get the embedding
        self.decoder.load_memory(input_batches.transpose(0, 1))

        # Prepare input and output variables
        # this is y_0 in the paper. Fig.1
        decoder_input = Variable(torch.LongTensor([SOS_token] * batch_size))

        max_target_length = max(
            target_lengths
        )  # max over this batch only; may be smaller than max_r_len over the whole dataset
        all_decoder_outputs_vocab = Variable(
            torch.zeros(max_target_length, batch_size, self.output_size))
        # input_batches.size(0) is time_length
        all_decoder_outputs_ptr = Variable(
            torch.zeros(max_target_length, batch_size, input_batches.size(0)))

        # Move new Variables to CUDA
        if USE_CUDA:
            all_decoder_outputs_vocab = all_decoder_outputs_vocab.cuda()
            all_decoder_outputs_ptr = all_decoder_outputs_ptr.cuda()
            decoder_input = decoder_input.cuda()

        # Choose whether to use teacher forcing
        '''
        https://machinelearningmastery.com/teacher-forcing-for-recurrent-neural-networks/
        '''
        use_teacher_forcing = random.random() < teacher_forcing_ratio

        if use_teacher_forcing:
            # Run through decoder one time step at a time
            for t in range(max_target_length):
                # decoder_input(in the form of word) shape(B,)   decoder_hidden shape(1,B,E)
                # (B,M)     (B,V)           (1,B,E or H)
                decoder_ptr, decoder_vacab, decoder_hidden = self.decoder.ptrMemDecoder(
                    decoder_input, decoder_hidden)
                all_decoder_outputs_vocab[t] = decoder_vacab
                all_decoder_outputs_ptr[t] = decoder_ptr
                decoder_input = target_batches[t]  # Chosen word is next input
                if USE_CUDA: decoder_input = decoder_input.cuda()
        else:
            for t in range(max_target_length):
                # (B,M)     (B,V)           (1,B,E or H)
                decoder_ptr, decoder_vacab, decoder_hidden = self.decoder.ptrMemDecoder(
                    decoder_input, decoder_hidden)
                '''A tuple of (values, indices) is returned,
                where the indices are the indices of the elements in the original input tensor'''
                _, toppi = decoder_ptr.data.topk(
                    1)  # return topk_value, topk_position
                _, topvi = decoder_vacab.data.topk(1)  # shape (B,1)
                all_decoder_outputs_vocab[t] = decoder_vacab
                all_decoder_outputs_ptr[t] = decoder_ptr
                # get the corresponding word in the input
                '''
                    out[i][j][k] = input[index[i][j][k]][j][k]  # if dim == 0
                    out[i][j][k] = input[i][index[i][j][k]][k]  # if dim == 1
                    out[i][j][k] = input[i][j][index[i][j][k]]  # if dim == 2
                '''
                # input_batches[:,:,0] has shape (M,B); the gathered output has the same shape as the index, i.e. (1,B)
                # input tokens selected via the top pointer positions
                top_ptr_i = torch.gather(input_batches[:, :, 0], 0,
                                         Variable(toppi.view(1, -1)))
                # positions are 0-based and the last element is the $$$$ sentinel, hence the strict '< length - 1' check
                next_in = [
                    top_ptr_i.squeeze()[i].data[0] if
                    (toppi.squeeze()[i] < input_lengths[i] - 1) else
                    topvi.squeeze()[i] for i in range(batch_size)
                ]
                # next_in = []
                # toppi = toppi.squeeze()
                # top_ptr_i = top_ptr_i.squeeze()
                # topvi = topvi.squeeze()
                # for i in range(batch_size):
                #     if toppi[i] < input_lengths[i]:
                #         next_in.append(top_ptr_i[i].data[0])
                #     else:
                #         next_in.append(topvi[i])    # topvi is the word index
                decoder_input = Variable(
                    torch.LongTensor(next_in))  # Chosen word is next input
                if USE_CUDA: decoder_input = decoder_input.cuda()

        # Loss calculation and backpropagation
        '''
        http://www.studyai.com/article/bba734ff     PyTorch high-dimensional tensor transposition: Transpose and Permute
        '''
        loss_Vocab = masked_cross_entropy(
            all_decoder_outputs_vocab.transpose(
                0, 1).contiguous(),  # -> batch x seq
            target_batches.transpose(0, 1).contiguous(),  # -> batch x seq
            target_lengths)
        # target_index holds the gold pointer positions
        loss_Ptr = masked_cross_entropy(
            all_decoder_outputs_ptr.transpose(
                0, 1).contiguous(),  # -> batch x seq
            target_index.transpose(0, 1).contiguous(),  # -> batch x seq
            target_lengths)

        loss = loss_Vocab + loss_Ptr
        loss.backward()

        # TODO: the returned norms ec/dc are not used
        # Clip gradient norms
        ec = torch.nn.utils.clip_grad_norm(self.encoder.parameters(), clip)
        dc = torch.nn.utils.clip_grad_norm(self.decoder.parameters(), clip)

        # Update parameters with optimizers
        self.encoder_optimizer.step()
        self.decoder_optimizer.step()
        self.loss += loss.data[0]
        self.loss_ptr += loss_Ptr.data[0]  # to get the data in Variable
        self.loss_vac += loss_Vocab.data[0]
Example #10
 def train_batch(self, input_batches, input_lengths, target_batches, target_lengths, target_index, target_gate,
                 batch_size, clip,
                 teacher_forcing_ratio, conv_seqs, conv_lengths, kb_seqs, kb_lengths, kb_target_index, kb_plain,
                 reset):
     """
     TODO: Check shapes of inputs
     :param input_batches: seq_len x batch_size x MEM_SIZE, e.g. torch.Size([37, 2, 5])
     :param input_lengths:
     :param target_batches:
     :param target_lengths:
     :param target_index:
     :param target_gate:
     :param batch_size:
     :param clip:
     :param teacher_forcing_ratio:
     :param conv_seqs:
     :param conv_lengths:
     :param kb_seqs:  lens x batch x mem_size torch.Size([41, 2, 5])
     :param kb_lengths:
     :param reset:
     :return:
     """
     if reset:
         self.loss = 0
         self.loss_memory = 0
         self.loss_vocabulary = 0
         self.loss_kb = 0
         self.print_every = 1
     self.batch_size = batch_size
     # Zero gradients of both optimizers
     self.encoder_optimizer.zero_grad()
     self.decoder_optimizer.zero_grad()
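      # Encoder output (B, E) becomes the initial decoder hidden state (1, B, E);
      # the dialog history and the KB are then loaded into their respective memory modules.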
     decoder_hidden = self.encoder(input_batches).unsqueeze(0)
     self.decoder.load_memory(input_batches.transpose(0, 1))
     self.decoder.kb_memory.load_memory(kb_seqs.transpose(0, 1))
     decoder_input = torch.LongTensor([SOS_token] * batch_size)
     max_target_length = max(target_lengths)
      # store the per-step decoder outputs: vocabulary, memory-pointer and KB-pointer distributions
     all_decoder_outputs_vocab = torch.zeros(max_target_length, batch_size, self.output_size)
     all_decoder_outputs_memory = torch.zeros(max_target_length, batch_size, input_batches.size(0))
      all_decoder_outputs_kb = torch.zeros(max_target_length, batch_size, kb_seqs.size(0))  # probabilities over KB positions, not over the vocabulary!
     # Move new Variables to CUDA
     if USE_CUDA:
         all_decoder_outputs_vocab = all_decoder_outputs_vocab.cuda()
         all_decoder_outputs_memory = all_decoder_outputs_memory.cuda()
         all_decoder_outputs_kb = all_decoder_outputs_kb.cuda()
         decoder_input = decoder_input.cuda()
     # Choose whether to use teacher forcing
     use_teacher_forcing = random.random() < teacher_forcing_ratio
     if use_teacher_forcing:
         # Run through decoder one time step at a time
         for t in range(max_target_length):
             decoder_pkb, decoder_pmemory, decoder_vacab, switch_probality, decoder_hidden, pg_state = self.decoder.ptrMemDecoder(
                 decoder_input, decoder_hidden)
              # apply the mask fill first, then softmax
             decoder_pmemory_normalized = self._masked(decoder_pmemory, input_batches[:, :, 0])
             decoder_pkb_normalized = self._masked(decoder_pkb, kb_seqs[:, :, 0])
             all_decoder_outputs_vocab[t] = decoder_vacab
             all_decoder_outputs_memory[t] = decoder_pmemory_normalized
             all_decoder_outputs_kb[t] = decoder_pkb_normalized
             decoder_input = target_batches[t]  # Chosen word is next input
             if USE_CUDA: decoder_input = decoder_input.cuda()
     else:
         for t in range(max_target_length):
             decoder_pkb, decoder_pmemory, decoder_vacab, switch_probality, decoder_hidden, pg_state = self.decoder.ptrMemDecoder(
                 decoder_input, decoder_hidden)
             decoder_pmemory_normalized = self._masked(decoder_pmemory, input_batches[:, :, 0])
             decoder_pkb_normalized = self._masked(decoder_pkb, kb_seqs[:, :, 0])
             all_decoder_outputs_vocab[t] = decoder_vacab
             all_decoder_outputs_memory[t] = decoder_pmemory_normalized
             all_decoder_outputs_kb[t] = decoder_pkb_normalized
             next_in, _, _ = self._infer_get_next_in(memory_pro=decoder_pmemory_normalized,
                                                     kb_pro=decoder_pkb_normalized,
                                                     vocab_pro=decoder_vacab,
                                                     inputs=input_batches[:, :, 0],
                                                     kb_inputs=kb_seqs[:, :, 0],
                                                     input_lengths=input_lengths,
                                                     kb_lengths=kb_lengths)
             decoder_input = torch.LongTensor(next_in)  # Chosen word is next input
             if USE_CUDA: decoder_input = decoder_input.cuda()
     # Loss calculation and backpropagation
     loss_Vocab = masked_cross_entropy(
         all_decoder_outputs_vocab.transpose(0, 1).contiguous(),  # -> batch x seq
         target_batches.transpose(0, 1).contiguous(),  # -> batch x seq
         target_lengths
     )
     loss_memory = masked_cross_entropy(
         all_decoder_outputs_memory.transpose(0, 1).contiguous(),  # -> batch x seq
         target_index.transpose(0, 1).contiguous(),  # -> batch x seq
         target_lengths
     )
     loss_kb = masked_cross_entropy(
         all_decoder_outputs_kb.transpose(0, 1).contiguous(),  # -> batch x seq
         kb_target_index.transpose(0, 1).contiguous(),  # -> batch x seq
         target_lengths
     )
     loss = loss_Vocab + loss_memory + loss_kb
     loss.backward()
     # Clip gradient norms
     torch.nn.utils.clip_grad_norm_(self.encoder.parameters(), clip)
     torch.nn.utils.clip_grad_norm_(self.decoder.parameters(), clip)
     # Update parameters with optimizers
     self.encoder_optimizer.step()
     self.decoder_optimizer.step()
     self.loss += loss.item()
     self.loss_memory += loss_memory.item()
     self.loss_vocabulary += loss_Vocab.item()
     self.loss_kb += loss_kb.item()