Code example #1
    def train_one_batch(self, batch, iter):
        '''
        Perform MLE & RL based training and compute their losses
        '''
        enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, context = get_enc_data(
            batch)

        enc_batch = self.model.embeds(
            enc_batch)  #Get embeddings for encoder input
        enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)

        # -------------------------------Summarization-----------------------
        if opt.train_mle == "yes":  #perform MLE training
            mle_loss = self.train_batch_MLE(enc_out, enc_hidden,
                                            enc_padding_mask, context,
                                            extra_zeros,
                                            enc_batch_extend_vocab, batch)
        else:
            mle_loss = get_cuda(T.FloatTensor([0]))
        # --------------------------------RL training-------------------------
        if opt.train_rl == "yes":  #perform reinforcement learning training
            #multinomial based sampling
            sample_sents, RL_log_probs = self.train_batch_RL(
                enc_out,
                enc_hidden,
                enc_padding_mask,
                context,
                extra_zeros,
                enc_batch_extend_vocab,
                batch.art_oovs,
                greedy=False)
            with T.autograd.no_grad():
                #greedy based sampling
                greedy_sents, _ = self.train_batch_RL(enc_out,
                                                      enc_hidden,
                                                      enc_padding_mask,
                                                      context,
                                                      extra_zeros,
                                                      enc_batch_extend_vocab,
                                                      batch.art_oovs,
                                                      greedy=True)

            sample_reward = self.reward_function(sample_sents,
                                                 batch.original_abstracts)
            baseline_reward = self.reward_function(greedy_sents,
                                                   batch.original_abstracts)
            rl_loss = -(
                sample_reward - baseline_reward
            ) * RL_log_probs  #Self-critic policy gradient training (eq 15 in https://arxiv.org/pdf/1705.04304.pdf)
            rl_loss = T.mean(rl_loss)
            batch_reward = T.mean(sample_reward).item()
        else:
            rl_loss = get_cuda(T.FloatTensor([0]))
            batch_reward = 0
        #------------------------------------------------------------------------------------
        self.trainer.zero_grad()
        (opt.mle_weight * mle_loss + opt.rl_weight * rl_loss).backward()
        self.trainer.step()

        return mle_loss.item(), batch_reward
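For reference, the self-critical objective that the comment above cites (eq. 15 in https://arxiv.org/pdf/1705.04304.pdf) and the weighted sum passed to backward() amount, roughly, to

    L_{rl} = \left(r(\hat{y}) - r(y^{s})\right) \sum_{t} \log p\left(y^{s}_{t} \mid y^{s}_{1},\dots,y^{s}_{t-1}, x\right)
    L_{total} = \lambda_{mle}\, L_{ml} + \lambda_{rl}\, L_{rl}

where y^{s} is the multinomially sampled summary (sample_reward = r(y^{s})), \hat{y} is the greedy summary used as the self-critical baseline (baseline_reward = r(\hat{y})), RL_log_probs is the length-normalized log-probability of the sample, and \lambda_{mle}, \lambda_{rl} correspond to opt.mle_weight and opt.rl_weight.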
Code example #2
File: model.py  Project: Leyan529/Master-Summarizer
    def __init__(self, pre_train_emb, word_emb_type, vocab):
        super(Model, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()
        if pre_train_emb:
            self.embeds = nn.Embedding(config.vocab_size, config.emb_dim)
            # FastText & word2Vec same format
            if word_emb_type == 'word2Vec' or word_emb_type == 'FastText':
                weight = get_Word2Vec_weight(vocab)
                # print('weight',len(weight))
            elif word_emb_type == 'glove':
                weight = get_glove_weight(vocab)
#                weight = get_glove_weight2(vocab)
            elif word_emb_type == 'bert':
                weight = get_bert_weight(vocab)

            self.embeds = T.nn.Embedding.from_pretrained(weight)
            self.embeds.weight.requires_grad = config.emb_grad

        else:
            self.embeds = nn.Embedding(config.vocab_size, config.emb_dim)
            init_wt_normal(self.embeds.weight)
            # requires_grad controls whether the word-embedding weights are fine-tuned during training
            self.embeds.weight.requires_grad = True

        self.encoder = get_cuda(self.encoder)
        self.decoder = get_cuda(self.decoder)
        self.embeds = get_cuda(self.embeds)
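get_cuda is used throughout these examples but is not defined on this page. A minimal sketch of the kind of helper the snippets assume is shown below; the project's actual train_util.get_cuda may differ (for example, it may consult a config flag instead of torch.cuda.is_available()).

    import torch as T

    def get_cuda(x):
        # Hypothetical helper: move a tensor or nn.Module to the GPU when one is
        # available, otherwise return it unchanged.
        if T.cuda.is_available():
            x = x.cuda()
        return x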
Code example #3
    def __init__(self, pre_train_emb, word_emb_type, vocab):
        super(Model, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()
#         if pre_train_emb:
#             self.embeds = nn.Embedding(config.vocab_size, config.emb_dim)
#             weight = get_Word2Vec_weight(vocab)
#             if word_emb_type == 'word2Vec':    
#                 weight = get_Word2Vec_weight(vocab)
#             elif word_emb_type == 'glove':
#                 weight = get_glove_weight(vocab)
# #                weight = get_glove_weight2(vocab)                
# #             elif word_emb_type == 'bert':    
# #                 weight = get_bert_weight(vocab)
                
#             self.embeds = T.nn.Embedding.from_pretrained(weight)
#             self.embeds.weight.requires_grad = config.emb_grad

#         else:
        # self.embeds = nn.Embedding(config.vocab_size, config.emb_dim)
        # init_wt_normal(self.embeds.weight)
        # # requires_grad controls whether the word-embedding weights are fine-tuned during training
        # self.embeds.weight.requires_grad = True

        self.encoder = get_cuda(self.encoder)
        self.decoder = get_cuda(self.decoder)
Code example #4
    def __init__(self, start_id, end_id, unk_id, hidden_state, context):
        # beam_size = batch_size * beam_n
        h, c = hidden_state  #(hid_size,)
        self.tokens = T.LongTensor(config.beam_size, 1).fill_(
            start_id)  #(beam_size, t) after t time steps
        # Initial beam scores are set to -30
        self.scores = T.FloatTensor(config.beam_size, 1).fill_(
            -30)  #beam_size,1; Initial score of beams = -30
        self.tokens, self.scores = get_cuda(self.tokens), get_cuda(self.scores)
        self.scores[0][0] = 0

        # Each batch element to be decoded is replicated beam_size times
        # At time step t=0, all beams should extend from a single beam, so the 1st beam is given a higher initial score (0 vs -30)
        self.hid_h = h.unsqueeze(0).repeat(config.beam_size,
                                           1)  #beam_size, hid_size
        self.hid_c = c.unsqueeze(0).repeat(config.beam_size,
                                           1)  #beam_size, hid_size
        # print('self.hid_h',self.hid_h.shape);print('self.hid_c',self.hid_c.shape)
        # print('context',context.shape)
        self.context = context.unsqueeze(0).repeat(config.beam_size,
                                                   1)  #beam_size, 2*hid_size
        self.sum_temporal_srcs = None
        self.prev_s = None
        self.done = False
        self.end_id = end_id
        self.unk_id = unk_id
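Why one score of 0 and the rest -30: at t=0 every hypothesis must grow out of the same single beam, and the large negative offset guarantees that the top-k selection over (running score + step log-probability) only ever keeps continuations of beam 0. A self-contained illustration with toy sizes (not the project's actual decode loop):

    import torch as T

    beam_size, toy_vocab = 4, 5
    scores = T.full((beam_size, 1), -30.0)
    scores[0][0] = 0.0                                   # only beam 0 starts "alive"
    step_log_probs = T.log_softmax(T.randn(beam_size, toy_vocab), dim=1)
    totals = (scores + step_log_probs).view(-1)          # beam_size * toy_vocab candidates
    top_vals, top_idx = totals.topk(beam_size)
    print(top_idx // toy_vocab)                          # every survivor comes from beam 0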
Code example #5
    def forward(self, s_t, prev_s):
        '''Perform intra-decoder attention
        Args:
        :param s_t: hidden state of the decoder at the current time step
        :param prev_s: if intra-decoder attention is enabled, contains the previous decoder hidden states
        '''
        if config.intra_decoder is False:
            ct_d = get_cuda(T.zeros(s_t.size()))
        elif prev_s is None:
            ct_d = get_cuda(T.zeros(s_t.size()))
            prev_s = s_t.unsqueeze(1)  #bs, 1, n_hid
        else:
            # Standard attention technique (eq 1 in https://arxiv.org/pdf/1704.04368.pdf)
            et = self.W_prev(prev_s)  # bs,t-1,n_hid
            dec_fea = self.W_s(s_t).unsqueeze(1)  # bs,1,n_hid
            et = et + dec_fea
            et = T.tanh(et)  # bs,t-1,n_hid
            et = self.v(et).squeeze(2)  # bs,t-1

            # intra-decoder attention     (eq 7 & 8 in https://arxiv.org/pdf/1705.04304.pdf)
            at = F.softmax(et, dim=1).unsqueeze(1)  #bs, 1, t-1
            ct_d = T.bmm(at, prev_s).squeeze(1)  #bs, n_hid
            prev_s = T.cat([prev_s, s_t.unsqueeze(1)], dim=1)  #bs, t, n_hid

        return ct_d, prev_s
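The intra-decoder attention the comments refer to (eqs. 7-8 in https://arxiv.org/pdf/1705.04304.pdf) normalizes the scores over the previously generated decoder states and uses them to build a decoder-side context vector; note that this implementation computes the scores with the additive form of eq. 1 from See et al. rather than the bilinear form of the original paper:

    \alpha^{d}_{tt'} = \frac{\exp(e^{d}_{tt'})}{\sum_{j=1}^{t-1} \exp(e^{d}_{tj})}
    \qquad
    c^{d}_{t} = \sum_{j=1}^{t-1} \alpha^{d}_{tj}\, h^{d}_{j}

These correspond to at and ct_d above, with prev_s holding the stacked states h^{d}_{1..t-1}.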
Code example #6
File: model.py  Project: Leyan529/Master-Summarizer
    def forward(self, s_t, prev_s, sum_k_emb):
        '''Perform intra-decoder attention
        Args:
        :param s_t: hidden state of the decoder at the current time step
        :param prev_s: if intra-decoder attention is enabled, contains the previous decoder hidden states
        :param sum_k_emb: summed keyword embedding added to the attention scores when config.key_attention is enabled
        '''
        if config.intra_decoder is False:
            ct_d = get_cuda(T.zeros(s_t.size()))  # set c1_d to vector of zeros
        elif prev_s is None:
            ct_d = get_cuda(T.zeros(s_t.size()))
            prev_s = s_t.unsqueeze(1)  #batch_size, 1, hid_size
        else:
            # Standard attention technique (eq 1 in Pointer-Generator Networks - https://arxiv.org/pdf/1704.04368.pdf)
            # et = tanh ( W_prev(prev_s)  + W_s(st_hat) )
            et = self.W_prev(prev_s)  # batch_size,t-1,hid_size
            dec_fea = self.W_s(s_t).unsqueeze(1)  # batch_size,1,hid_size
            et = et + dec_fea

            if config.key_attention:
                k_t = self.W_t(sum_k_emb).unsqueeze(1)
                if k_t.shape[0] == et.shape[0]: et = et + k_t

            et = T.tanh(et)  # batch_size,t-1,hid_size
            et = self.v(et).squeeze(2)  # batch_size,t-1
            # intra-decoder attention     (eq 7 & 8 in DEEP REINFORCED MODEL - https://arxiv.org/pdf/1705.04304.pdf)
            at = F.softmax(et, dim=1).unsqueeze(1)  #batch_size, 1, t-1
            ct_d = T.bmm(at, prev_s).squeeze(1)  #batch_size, hid_size   # multiply the attention distribution with the previous decoder hidden states to get the decoder context vector
            prev_s = T.cat([prev_s, s_t.unsqueeze(1)], dim=1)  #batch_size, t, hid_size   # append the current decoder state to the previous decoder hidden states

        return ct_d, prev_s
Code example #7
    def __init__(self, vocab_size):
        super(Model, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder(vocab_size)
        self.embeds = nn.Embedding(vocab_size, config.emb_dim)
        init_wt_normal(self.embeds.weight)

        self.encoder = get_cuda(self.encoder)
        self.decoder = get_cuda(self.decoder)
        self.embeds = get_cuda(self.embeds)
Code example #8
    def train_batch_MLE(self, enc_out, enc_hidden, enc_padding_mask, ct_e,
                        extra_zeros, enc_batch_extend_vocab, batch):
        ''' Calculate the negative log-likelihood loss for the given batch. To reduce exposure bias,
        the previously generated token is passed as input with a probability of 0.25 instead of the ground-truth label

        Args:
        :param enc_out: Outputs of the encoder for all time steps (batch_size, length_input_sequence, 2*hidden_size)
        :param enc_hidden: Tuple containing final hidden state & cell state of encoder. Shape of h & c: (batch_size, hidden_size)
        :param enc_padding_mask: Mask for encoder input; Tensor of size (batch_size, length_input_sequence) with values of 0 for pad tokens & 1 for others
        :param ct_e: encoder context vector for time_step=0 (eq 5 in https://arxiv.org/pdf/1705.04304.pdf)
        :param extra_zeros: Tensor used to extend vocab distribution for pointer mechanism
        :param enc_batch_extend_vocab: Input batch that stores OOV ids
        :param batch: batch object
        '''
        sum_dec_batch, sum_max_dec_len, sum_dec_lens, sum_target_batch = get_sum_dec_data(
            batch)  #Get input and target batches for training the decoder
        step_losses = []
        s_t = (enc_hidden[0], enc_hidden[1])  #Decoder hidden states
        y_t = get_cuda(T.LongTensor(len(enc_out)).fill_(
            self.start_id))  #Input to the decoder
        prev_s = None  #Used for intra-decoder attention (section 2.2 in https://arxiv.org/pdf/1705.04304.pdf)
        sum_temporal_srcs = None  #Used for intra-temporal attention (section 2.1 in https://arxiv.org/pdf/1705.04304.pdf)
        for t in range(min(sum_max_dec_len, config.max_dec_steps)):
            use_ground_truth = get_cuda((T.rand(len(enc_out)) > 0.25)).long(
            )  #Per-example 0/1 mask indicating whether to use the ground-truth label instead of the previously decoded token
            y_t = use_ground_truth * sum_dec_batch[:, t] + (
                1 - use_ground_truth
            ) * y_t  #Select decoder input based on the use_ground_truth mask
            y_t = self.model.embeds(y_t)
            #decoder step
            final_dist, s_t, ct_e, sum_temporal_srcs, prev_s = self.model.sum_decoder(
                y_t, s_t, enc_out, enc_padding_mask, ct_e, extra_zeros,
                enc_batch_extend_vocab, sum_temporal_srcs, prev_s)
            target = sum_target_batch[:, t]
            log_probs = T.log(final_dist + config.eps)
            step_loss = F.nll_loss(log_probs,
                                   target,
                                   reduction="none",
                                   ignore_index=self.pad_id)
            step_losses.append(step_loss)
            y_t = T.multinomial(final_dist, 1).squeeze(
            )  #Sample words from final distribution which can be used as input in next time step
            is_oov = (y_t >= config.vocab_size
                      ).long()  #Mask indicating whether sampled word is OOV
            y_t = (1 - is_oov) * y_t.detach() + (
                is_oov) * self.unk_id  #Replace OOVs with [UNK] token

        sum_losses = T.sum(
            T.stack(step_losses, 1), 1
        )  #unnormalized losses for each example in the batch; (batch_size)
        batch_avg_loss = sum_losses / sum_dec_lens  #Normalized losses; (batch_size)
        mle_loss = T.mean(batch_avg_loss)  #Average batch loss
        return mle_loss
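Written as one formula, the returned mle_loss is the per-example, length-normalized negative log-likelihood averaged over the batch (with the ground-truth input replaced by the model's own sample roughly 25% of the time):

    L_{ml} = \frac{1}{B} \sum_{b=1}^{B} \frac{1}{n_{b}} \sum_{t=1}^{n_{b}} -\log p\left(y^{*}_{b,t} \mid y_{b,<t}, x_{b}\right)

where n_{b} is sum_dec_lens[b] and y_{b,<t} mixes ground-truth and sampled tokens as selected by the use_ground_truth mask.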
Code example #9
    def __init__(self, vocab_size):
        super(TaskModel, self).__init__()
        self.encoder = TaskEncoder()
        self.decoder = Decoder(vocab_size)
        self.embeds = nn.Embedding(vocab_size, config.emb_dim)
        self.seg_embeds = nn.Embedding(len(SEGMENT), config.emb_dim)

        init_wt_normal(self.embeds.weight)
        init_wt_normal(self.seg_embeds.weight)

        self.encoder = get_cuda(self.encoder)
        self.decoder = get_cuda(self.decoder)
        self.embeds = get_cuda(self.embeds)
        self.seg_embeds = get_cuda(self.seg_embeds)
Code example #10
 def __init__(self, start_id, end_id, unk_id, hidden_state, context):
     h,c = hidden_state                                              #(n_hid,)
     self.tokens = T.LongTensor(config.beam_size,1).fill_(start_id)  #(beam, t) after t time steps
     self.scores = T.FloatTensor(config.beam_size,1).fill_(-30)      #beam,1; Initial score of beams = -30
     self.tokens, self.scores = get_cuda(self.tokens), get_cuda(self.scores)
     self.scores[0][0] = 0                                           #At time step t=0, all beams should extend from a single beam. So, I am giving high initial score to 1st beam
     self.hid_h = h.unsqueeze(0).repeat(config.beam_size, 1)         #beam, n_hid
     self.hid_c = c.unsqueeze(0).repeat(config.beam_size, 1)         #beam, n_hid
     self.context = context.unsqueeze(0).repeat(config.beam_size, 1) #beam, 2*n_hid
     self.sum_temporal_srcs = None
     self.prev_s = None
     self.done = False
     self.end_id = end_id
     self.unk_id = unk_id
Code example #11
 def reward_function(self, decoded_sents, original_sents):
     ''' Calculate ROUGE-L scores for a batch of decoded sentences given their original (reference) sentences
     :param decoded_sents: List containing decoded sentences
     :param original_sents: List containing original sentences
     '''
     rouge = Rouge()
     try:
         scores = rouge.get_scores(decoded_sents, original_sents)
     except Exception:
         print(
             "Rouge failed for multi sentence evaluation.. Finding exact pair"
         )
         scores = []
         for i in range(len(decoded_sents)):
             try:
                 score = rouge.get_scores(decoded_sents[i],
                                          original_sents[i])
             except Exception:
                 print("Error occured at:")
                 print("decoded_sents:", decoded_sents[i])
                 print("original_sents:", original_sents[i])
                 score = [{"rouge-l": {"f": 0.0}}]
             scores.append(score[0])
     rouge_l_f1 = [score["rouge-l"]["f"] for score in scores]
     rouge_l_f1 = get_cuda(T.FloatTensor(rouge_l_f1))
     return rouge_l_f1
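For context, a minimal usage sketch of the Rouge API assumed above (the rouge pip package; the example strings are hypothetical):

    from rouge import Rouge

    rouge = Rouge()
    # get_scores accepts a single hypothesis/reference pair or two equal-length lists
    scores = rouge.get_scores("the cat sat on the mat",
                              "a cat was sitting on the mat")
    print(scores[0]["rouge-l"]["f"])   # ROUGE-L F1, the quantity used as the reward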
Code example #12
 def forward(self, x):
     x = train_util.get_cuda(x)
     out = self.conv1(x)
     out = self.norm1(out)
     out = self.relu1(out)
     out = self.layers1(out)
     out = self.layers2(out)
     out = self.layers3(out)
     out = self.avgpool(out)
     out = out.view(out.size(0), -1)
     out = self.linear(out)
     return out
Code example #13
 def setup_train(self):
     self.model = Model()
     self.model = get_cuda(self.model)
     self.trainer = T.optim.Adam(self.model.parameters(), lr=config.lr)
     start_iter = 0
     if config.resume_training:
         checkpoint = T.load(config.load_model_path)
         start_iter = checkpoint["iter"]
         self.model.load_state_dict(checkpoint["model_dict"])
         self.trainer.load_state_dict(checkpoint["trainer_dict"])
         print("Loaded model at " + config.load_model_path)
     return start_iter
Code example #14
 def setup_train(self):
     self.model = Model()
     self.model = get_cuda(self.model)
     self.trainer = T.optim.Adam(self.model.parameters(), lr=config.lr)
     start_iter = 0
     if self.opt.load_model is not None:
         load_model_path = os.path.join(config.save_model_path,
                                        self.opt.load_model)
         checkpoint = T.load(load_model_path)
         start_iter = checkpoint["iter"]
         self.model.load_state_dict(checkpoint["model_dict"])
         self.trainer.load_state_dict(checkpoint["trainer_dict"])
         print("Loaded model at " + load_model_path)
     if self.opt.new_lr is not None:
         self.trainer = T.optim.Adam(self.model.parameters(),
                                     lr=self.opt.new_lr)
     return start_iter
Code example #15
def test(testloader, model):
    """
    Test unit
    # TODO: Move to eval.py later
    """

    correct = 0
    total = 0

    with torch.no_grad():
        for data in testloader:
            images, labels = data
            labels = get_cuda(labels)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print('Accuracy of the network on the 10000 test images: {}%'.format(
        100 * correct / total))
Code example #16
    def forward(self, x, use_ccu=False, loss_samples=None, remaining_step=None, training_step=None):
        x = train_util.get_cuda(x)
        x = self.ResNet(x)

        if use_ccu:
            if training_step and training_step >= config.warm_up:

                loss_begin = loss_samples[0]
                loss_end = loss_samples[-1]
                loss_samples = loss_samples.view(1, -1)

                c_ccu, final_p = self.CCU(loss_samples, loss_begin, loss_end, remaining_step, self)
                p = F.relu(self.w_p(final_p))
                p = F.relu(self.w_p2(p))
                c_ccu = F.relu(self.w_ccu(c_ccu))
                c_ccu = F.relu(self.w_ccu_2(c_ccu))

                # Add c_ccu to enable learning for CCU parameters:
                x = x + c_ccu + p

        return x
Code example #17
    def forward(self, st_hat, h, enc_padding_mask, sum_temporal_srcs):
        ''' Perform attention over encoder hidden states
        :param st_hat: decoder hidden state at current time step
        :param h: encoder hidden states
        :param enc_padding_mask: mask for the encoder input (1 for real tokens, 0 for padding)
        :param sum_temporal_srcs: if using intra-temporal attention, contains the summation of attention weights from previous decoder time steps
        '''

        # Standard attention technique (eq 1 in https://arxiv.org/pdf/1704.04368.pdf)
        et = self.W_h(h)  # bs,n_seq,2*n_hid
        dec_fea = self.W_s(st_hat).unsqueeze(1)  # bs,1,2*n_hid
        et = et + dec_fea
        et = T.tanh(et)  # bs,n_seq,2*n_hid
        et = self.v(et).squeeze(2)  # bs,n_seq

        # intra-temporal attention     (eq 3 in https://arxiv.org/pdf/1705.04304.pdf)
        if config.intra_encoder:
            exp_et = T.exp(et)
            if sum_temporal_srcs is None:
                et1 = exp_et
                sum_temporal_srcs = get_cuda(
                    T.FloatTensor(et.size()).fill_(1e-10)) + exp_et
            else:
                et1 = exp_et / sum_temporal_srcs  #bs, n_seq
                sum_temporal_srcs = sum_temporal_srcs + exp_et
        else:
            et1 = F.softmax(et, dim=1)

        # assign 0 probability for padded elements
        at = et1 * enc_padding_mask
        normalization_factor = at.sum(1, keepdim=True)
        at = at / normalization_factor

        at = at.unsqueeze(1)  #bs,1,n_seq
        # Compute encoder context vector
        ct_e = T.bmm(at, h)  #bs, 1, 2*n_hid
        ct_e = ct_e.squeeze(1)
        at = at.squeeze(1)

        return ct_e, at, sum_temporal_srcs
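The temporal attention cited in the comment (eq. 3 in https://arxiv.org/pdf/1705.04304.pdf, followed by the usual renormalization) divides the current attention scores by the sum of the scores from all previous decoder steps, which is exactly what sum_temporal_srcs accumulates:

    e'_{ti} = \begin{cases} \exp(e_{ti}) & t = 1 \\ \dfrac{\exp(e_{ti})}{\sum_{j=1}^{t-1} \exp(e_{ji})} & t > 1 \end{cases}
    \qquad
    \alpha^{e}_{ti} = \frac{e'_{ti}}{\sum_{k} e'_{tk}}
    \qquad
    c^{e}_{t} = \sum_{i} \alpha^{e}_{ti}\, h^{e}_{i}

with the renormalization restricted to non-pad positions via enc_padding_mask and c^{e}_{t} returned as ct_e.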
Code example #18
 def reward_function(self, decoded_sents, original_sents):
     rouge = Rouge()
     try:
         scores = rouge.get_scores(decoded_sents, original_sents)
     except Exception:
         print(
             "Rouge failed for multi sentence evaluation.. Finding exact pair"
         )
         scores = []
         for i in range(len(decoded_sents)):
             try:
                 score = rouge.get_scores(decoded_sents[i],
                                          original_sents[i])
             except Exception:
                 print("Error occured at:")
                 print("decoded_sents:", decoded_sents[i])
                 print("original_sents:", original_sents[i])
                 score = [{"rouge-l": {"f": 0.0}}]
             scores.append(score[0])
     rouge_l_f1 = [score["rouge-l"]["f"] for score in scores]
     rouge_l_f1 = get_cuda(T.FloatTensor(rouge_l_f1))
     return rouge_l_f1
Code example #19
def train(trainloader):
    # Init data loader:
    dataloader = iter(trainloader)

    # Init model:
    model = Model()
    model = model.type(torch.cuda.FloatTensor)

    # Init Optimization scheme
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = optim.Adam(model.parameters())

    # Used in CCU:
    loss_samples = get_cuda(torch.zeros(CCU_gap, 1))
    loss_counter = 0

    # Main loop:
    running_loss = 0.0
    for i in tqdm(
            range(training_step)):  # loop over the dataset multiple times

        # get data here
        try:
            inputs, labels = next(dataloader)
        except StopIteration:
            dataloader = iter(trainloader)
            inputs, labels = next(dataloader)
        labels = get_cuda(labels)

        # zero the parameter gradients
        optimizer.zero_grad()

        # Choose CCU or not:
        if loss_counter < CCU_gap:
            outputs = model(inputs)

        elif loss_counter == CCU_gap and use_CCU:
            # print('use CCU:')
            remain_step = get_cuda(
                torch.tensor(training_step - i, dtype=torch.float))
            outputs = model(inputs,
                            use_ccu=True,
                            loss_samples=loss_samples,
                            remaining_step=remain_step,
                            training_step=i)

            # reset CCU loss counter and samples.
            loss_counter = 0
            loss_samples = get_cuda(torch.zeros(CCU_gap, 1))

        elif loss_counter == CCU_gap and not use_CCU:
            outputs = model(inputs)

            # TODO: remove redundancy
            loss_counter = 0
            loss_samples = get_cuda(torch.zeros(CCU_gap, 1))
        else:
            # Should not get into here
            raise RuntimeError('There may be bugs in loss_counter')

        # calculate loss
        loss = criterion(outputs, labels)
        running_loss += loss.item()

        # Update ccu_loss_samples and loss_counter
        loss_samples[loss_counter] = loss.item()
        loss_counter += 1

        # 1 step BP
        loss.backward()
        optimizer.step()

        # print statistics
        if i % showstep == 0:
            print('Training step: {} loss: {}'.format(i,
                                                      running_loss / showstep))

            running_loss = 0.0

    print('Finished Training')
    return model
Code example #20
    def train_batch_RL(self, enc_out, enc_hidden, enc_padding_mask, ct_e,
                       extra_zeros, enc_batch_extend_vocab, article_oovs,
                       greedy):
        '''Generate sentences from decoder entirely using sampled tokens as input. These sentences are used for ROUGE evaluation
        Args
        :param enc_out: Outputs of the encoder for all time steps (batch_size, length_input_sequence, 2*hidden_size)
        :param enc_hidden: Tuple containing final hidden state & cell state of encoder. Shape of h & c: (batch_size, hidden_size)
        :param enc_padding_mask: Mask for encoder input; Tensor of size (batch_size, length_input_sequence) with values of 0 for pad tokens & 1 for others
        :param ct_e: encoder context vector for time_step=0 (eq 5 in https://arxiv.org/pdf/1705.04304.pdf)
        :param extra_zeros: Tensor used to extend vocab distribution for pointer mechanism
        :param enc_batch_extend_vocab: Input batch that stores OOV ids
        :param article_oovs: Batch containing list of OOVs in each example
        :param greedy: If True, performs greedy sampling, else performs multinomial sampling
        Returns:
        :decoded_strs: List of decoded sentences
        :log_probs: Log probabilities of sampled words
        '''
        s_t = enc_hidden  #Decoder hidden states
        x_t = get_cuda(T.LongTensor(len(enc_out)).fill_(
            self.start_id))  #Input to the decoder
        prev_s = None  #Used for intra-decoder attention (section 2.2 in https://arxiv.org/pdf/1705.04304.pdf)
        sum_temporal_srcs = None  #Used for intra-temporal attention (section 2.1 in https://arxiv.org/pdf/1705.04304.pdf)
        inds = []  #Stores sampled indices for each time step
        decoder_padding_mask = []  #Stores padding masks of generated samples
        log_probs = []  #Stores log probabilities of generated samples
        mask = get_cuda(
            T.LongTensor(len(enc_out)).fill_(1)
        )  #Values that indicate whether [STOP] token has already been encountered; 1 => Not encountered, 0 otherwise

        for t in range(config.max_dec_steps):
            x_t = self.model.embeds(x_t)
            probs, s_t, ct_e, sum_temporal_srcs, prev_s = self.model.decoder(
                x_t, s_t, enc_out, enc_padding_mask, ct_e, extra_zeros,
                enc_batch_extend_vocab, sum_temporal_srcs, prev_s)
            if greedy is False:
                multi_dist = Categorical(probs)
                x_t = multi_dist.sample()  #perform multinomial sampling
                log_prob = multi_dist.log_prob(x_t)
                log_probs.append(log_prob)
            else:
                _, x_t = T.max(probs, dim=1)  #perform greedy sampling
            x_t = x_t.detach()
            inds.append(x_t)
            mask_t = get_cuda(T.zeros(
                len(enc_out)))  #Padding mask of batch for current time step
            mask_t[
                mask ==
                1] = 1  #If [STOP] is not encountered till previous time step, mask_t = 1 else mask_t = 0
            mask[(mask == 1) & (x_t == self.end_id)] = 0  #If [STOP] has not been encountered before this step and the current word is [STOP], set mask = 0
            decoder_padding_mask.append(mask_t)
            is_oov = (x_t >= config.vocab_size
                      ).long()  #Mask indicating whether sampled word is OOV
            x_t = (1 - is_oov) * x_t + (
                is_oov) * self.unk_id  #Replace OOVs with [UNK] token

        inds = T.stack(inds, dim=1)
        decoder_padding_mask = T.stack(decoder_padding_mask, dim=1)
        if greedy is False:  #If multinomial based sampling, compute log probabilities of sampled words
            log_probs = T.stack(log_probs, dim=1)
            log_probs = log_probs * decoder_padding_mask  #Not considering sampled words with padding mask = 0
            lens = T.sum(decoder_padding_mask,
                         dim=1)  #Length of sampled sentence
            log_probs = T.sum(
                log_probs, dim=1
            ) / lens  # (bs,)   #compute the normalized log probability of each sentence
        decoded_strs = []
        for i in range(len(enc_out)):
            id_list = inds[i].cpu().numpy()
            oovs = article_oovs[i]
            S = data.outputids2words(
                id_list, self.vocab,
                oovs)  # Generate sentence corresponding to sampled words
            try:
                end_idx = S.index(data.STOP_DECODING)
                S = S[:end_idx]
            except ValueError:
                pass
            if len(S) < 2:  #If the sentence has fewer than 2 words, replace it with "xxx"; avoids sentences like "." which throw errors when computing ROUGE
                S = ["xxx"]
            S = " ".join(S)
            decoded_strs.append(S)

        return decoded_strs, log_probs
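The log_probs returned when greedy=False are per-sentence and length-normalized,

    \log p(y^{s}) = \frac{1}{T_{s}} \sum_{t=1}^{T_{s}} m_{t} \, \log p\left(y^{s}_{t} \mid y^{s}_{<t}, x\right)

where m_{t} is decoder_padding_mask (0 after [STOP]) and T_{s} = lens; this is the RL_log_probs factor used in the self-critical loss of Code example #1.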
Code example #21
File: model.py  Project: Leyan529/Master-Summarizer
    def forward(self, st_hat, h, enc_padding_mask, sum_temporal_srcs,
                sum_k_emb):
        ''' Perform attention over encoder hidden states
        :param st_hat: decoder hidden state at current time step
        :param h: encoder hidden states
        :param enc_padding_mask: mask for the encoder input (1 for real tokens, 0 for padding)
        :param sum_temporal_srcs: if using intra-temporal attention, contains the summation of attention scores from previous decoder time steps
        :param sum_k_emb: summed keyword embedding added to the attention scores when config.key_attention is enabled
        Self-attention is also commonly called intra-attention: attention computed among the elements of the source (or of the target) themselves, i.e. the special case where Target = Source.
        The computation itself is unchanged; only the objects being attended over differ.
        '''

        # Standard attention technique (eq 1 in Pointer-Generator Networks - https://arxiv.org/pdf/1704.04368.pdf)
        # et = tanh ( W_h(h) + W_s(st_hat) )
        et = self.W_h(h)  # batch_size,n_seq,2*hid_size
        dec_fea = self.W_s(st_hat).unsqueeze(1)  # batch_size,1,2*hid_size
        et = et + dec_fea  # et incorporates the decoder hidden state and the encoder hidden states
        if config.key_attention:
            k_t = self.W_t(sum_k_emb).unsqueeze(1)
            if k_t.shape[0] == et.shape[0]: et = et + k_t
        et = T.tanh(et)  # batch_size,b_seq_len,2*hid_size
        et = self.v(et).squeeze(2)  # batch_size,b_seq_len
        # intra-temporal attention     (eq 3 in DEEP REINFORCED MODEL - https://arxiv.org/pdf/1705.04304.pdf)
        if config.intra_encoder:
            exp_et = T.exp(et)
            if sum_temporal_srcs is None:
                et1 = exp_et  # eq 3 if t = 1 condition
                sum_temporal_srcs = get_cuda(
                    T.FloatTensor(et.size()).fill_(1e-10)) + exp_et
            else:
                et1 = exp_et / sum_temporal_srcs  # eq 3 otherwise condition   #batch_size, b_seq_len
                sum_temporal_srcs = sum_temporal_srcs + exp_et  # accumulate the source attention scores over all previous decoder steps (temporal attention)
        else:
            # (eq 2 in Pointer-Generator Networks - https://arxiv.org/pdf/1704.04368.pdf)
            et1 = F.softmax(et, dim=1)  # et1 = softmax(et)
        # et1: final attention scores
        # assign 0 probability for padded elements
        at = et1 * enc_padding_mask
        # torch.sum(input, dim, keepdim=True) sums over dim and keeps the reduced dimension with size 1
        normalization_factor = at.sum(1, keepdim=True)
        at = at / normalization_factor  # renormalize the attention distribution over non-pad tokens

        at = at.unsqueeze(1)  #batch_size,1,b_seq_len   # unsqueeze adds a dimension of size 1 at the given position
        # Compute encoder context vector
        ct_e = T.bmm(at, h)  #batch_size, 1, 2*hid_size   # multiply the attention distribution with the encoder hidden states to get the context vector
        ct_e = ct_e.squeeze(1)
        at = at.squeeze(1)  # squeeze removes the dimension of size 1
        return ct_e, at, sum_temporal_srcs  # context vector, attention scores, sum_temporal_srcs (non-None when intra-temporal attention is used)