def train_one_batch(self, batch, iter):
    '''Perform MLE & RL based training and compute their losses'''
    enc_batch, enc_lens, enc_padding_mask, enc_batch_extend_vocab, extra_zeros, context = get_enc_data(batch)

    enc_batch = self.model.embeds(enc_batch)  # Get embeddings for encoder input
    enc_out, enc_hidden = self.model.encoder(enc_batch, enc_lens)

    # -------------------------------Summarization-----------------------
    if opt.train_mle == "yes":  # perform MLE training
        mle_loss = self.train_batch_MLE(enc_out, enc_hidden, enc_padding_mask, context,
                                        extra_zeros, enc_batch_extend_vocab, batch)
    else:
        mle_loss = get_cuda(T.FloatTensor([0]))

    # --------------------------------RL training-------------------------
    if opt.train_rl == "yes":  # perform reinforcement learning training
        # multinomial sampling
        sample_sents, RL_log_probs = self.train_batch_RL(enc_out, enc_hidden, enc_padding_mask,
                                                         context, extra_zeros, enc_batch_extend_vocab,
                                                         batch.art_oovs, greedy=False)
        with T.autograd.no_grad():
            # greedy sampling
            greedy_sents, _ = self.train_batch_RL(enc_out, enc_hidden, enc_padding_mask, context,
                                                  extra_zeros, enc_batch_extend_vocab,
                                                  batch.art_oovs, greedy=True)

        sample_reward = self.reward_function(sample_sents, batch.original_abstracts)
        baseline_reward = self.reward_function(greedy_sents, batch.original_abstracts)
        # Self-critical policy gradient training (eq 15 in https://arxiv.org/pdf/1705.04304.pdf)
        rl_loss = -(sample_reward - baseline_reward) * RL_log_probs
        rl_loss = T.mean(rl_loss)

        batch_reward = T.mean(sample_reward).item()
    else:
        rl_loss = get_cuda(T.FloatTensor([0]))
        batch_reward = 0

    # ------------------------------------------------------------------------------------
    self.trainer.zero_grad()
    (opt.mle_weight * mle_loss + opt.rl_weight * rl_loss).backward()
    self.trainer.step()

    return mle_loss.item(), batch_reward

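# A minimal, self-contained sketch (not part of the trainer above) of the self-critical
# loss computed in train_one_batch; the reward values and log-probability are hypothetical.
import torch as T

sample_reward = T.tensor([0.42])    # ROUGE-L F1 of the multinomial sample
baseline_reward = T.tensor([0.40])  # ROUGE-L F1 of the greedy (baseline) sample
RL_log_probs = T.tensor([-1.8])     # length-normalized log-probability of the sample
rl_loss = T.mean(-(sample_reward - baseline_reward) * RL_log_probs)
# When the sample beats the greedy baseline, minimizing rl_loss raises the sample's probability.
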
def __init__(self, pre_train_emb, word_emb_type, vocab):
    super(Model, self).__init__()
    self.encoder = Encoder()
    self.decoder = Decoder()
    if pre_train_emb:
        self.embeds = nn.Embedding(config.vocab_size, config.emb_dim)
        # FastText & word2Vec share the same format
        if word_emb_type == 'word2Vec' or word_emb_type == 'FastText':
            weight = get_Word2Vec_weight(vocab)
        elif word_emb_type == 'glove':
            weight = get_glove_weight(vocab)
            # weight = get_glove_weight2(vocab)
        elif word_emb_type == 'bert':
            weight = get_bert_weight(vocab)
        self.embeds = T.nn.Embedding.from_pretrained(weight)
        self.embeds.weight.requires_grad = config.emb_grad
    else:
        self.embeds = nn.Embedding(config.vocab_size, config.emb_dim)
        init_wt_normal(self.embeds.weight)
        # requires_grad controls whether the word embeddings are fine-tuned during training
        self.embeds.weight.requires_grad = True

    self.encoder = get_cuda(self.encoder)
    self.decoder = get_cuda(self.decoder)
    self.embeds = get_cuda(self.embeds)

def __init__(self, pre_train_emb, word_emb_type, vocab):
    super(Model, self).__init__()
    self.encoder = Encoder()
    self.decoder = Decoder()
    # if pre_train_emb:
    #     self.embeds = nn.Embedding(config.vocab_size, config.emb_dim)
    #     weight = get_Word2Vec_weight(vocab)
    #     if word_emb_type == 'word2Vec':
    #         weight = get_Word2Vec_weight(vocab)
    #     elif word_emb_type == 'glove':
    #         weight = get_glove_weight(vocab)
    #         # weight = get_glove_weight2(vocab)
    #     # elif word_emb_type == 'bert':
    #     #     weight = get_bert_weight(vocab)
    #     self.embeds = T.nn.Embedding.from_pretrained(weight)
    #     self.embeds.weight.requires_grad = config.emb_grad
    # else:
    #     self.embeds = nn.Embedding(config.vocab_size, config.emb_dim)
    #     init_wt_normal(self.embeds.weight)
    #     # requires_grad controls whether the word embeddings are fine-tuned during training
    #     self.embeds.weight.requires_grad = True

    self.encoder = get_cuda(self.encoder)
    self.decoder = get_cuda(self.decoder)

def __init__(self, start_id, end_id, unk_id, hidden_state, context):
    # beam_size = batch_size * beam_n
    h, c = hidden_state  # (hid_size,)
    self.tokens = T.LongTensor(config.beam_size, 1).fill_(start_id)  # (beam_size, t) after t time steps
    self.scores = T.FloatTensor(config.beam_size, 1).fill_(-30)      # beam_size, 1; Initial score of beams = -30
    self.tokens, self.scores = get_cuda(self.tokens), get_cuda(self.scores)
    # At time step t=0, all beams should extend from a single beam,
    # so only the 1st beam gets a high initial score
    self.scores[0][0] = 0
    # Each element of the batch to be decoded is replicated beam_size times
    self.hid_h = h.unsqueeze(0).repeat(config.beam_size, 1)          # beam_size, hid_size
    self.hid_c = c.unsqueeze(0).repeat(config.beam_size, 1)          # beam_size, hid_size
    self.context = context.unsqueeze(0).repeat(config.beam_size, 1)  # beam_size, 2*hid_size
    self.sum_temporal_srcs = None
    self.prev_s = None
    self.done = False
    self.end_id = end_id
    self.unk_id = unk_id

def forward(self, s_t, prev_s):
    '''Perform intra_decoder attention
    Args
    :param s_t: hidden state of decoder at current time step
    :param prev_s: If intra_decoder attention, contains list of previous decoder hidden states
    '''
    if config.intra_decoder is False:
        ct_d = get_cuda(T.zeros(s_t.size()))
    elif prev_s is None:
        ct_d = get_cuda(T.zeros(s_t.size()))
        prev_s = s_t.unsqueeze(1)  # bs, 1, n_hid
    else:
        # Standard attention technique (eq 1 in https://arxiv.org/pdf/1704.04368.pdf)
        et = self.W_prev(prev_s)              # bs, t-1, n_hid
        dec_fea = self.W_s(s_t).unsqueeze(1)  # bs, 1, n_hid
        et = et + dec_fea
        et = T.tanh(et)                       # bs, t-1, n_hid
        et = self.v(et).squeeze(2)            # bs, t-1
        # intra-decoder attention (eq 7 & 8 in https://arxiv.org/pdf/1705.04304.pdf)
        at = F.softmax(et, dim=1).unsqueeze(1)  # bs, 1, t-1
        ct_d = T.bmm(at, prev_s).squeeze(1)     # bs, n_hid
        prev_s = T.cat([prev_s, s_t.unsqueeze(1)], dim=1)  # bs, t, n_hid

    return ct_d, prev_s

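# A tiny sketch (hypothetical shapes: batch 1, two previous steps, hidden size 4) of the
# intra-decoder context computation used above.
import torch as T
import torch.nn.functional as F

prev_s = T.randn(1, 2, 4)               # previous decoder hidden states
et = T.randn(1, 2)                      # unnormalized scores of those states against the current state
at = F.softmax(et, dim=1).unsqueeze(1)  # 1, 1, 2: attention over previous decoder steps
ct_d = T.bmm(at, prev_s).squeeze(1)     # 1, 4: decoder context vector
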
def forward(self, s_t, prev_s, sum_k_emb):
    '''Perform intra_decoder attention
    Args
    :param s_t: hidden state of decoder at current time step
    :param prev_s: If intra_decoder attention, contains list of previous decoder hidden states
    '''
    if config.intra_decoder is False:
        ct_d = get_cuda(T.zeros(s_t.size()))  # set ct_d to a vector of zeros
    elif prev_s is None:
        ct_d = get_cuda(T.zeros(s_t.size()))
        prev_s = s_t.unsqueeze(1)  # batch_size, 1, hid_size
    else:
        # Standard attention technique (eq 1 in Pointer-Generator Networks - https://arxiv.org/pdf/1704.04368.pdf)
        # et = tanh(W_prev(prev_s) + W_s(st_hat))
        et = self.W_prev(prev_s)              # batch_size, t-1, hid_size
        dec_fea = self.W_s(s_t).unsqueeze(1)  # batch_size, 1, hid_size
        et = et + dec_fea
        if config.key_attention:
            k_t = self.W_t(sum_k_emb).unsqueeze(1)
            if k_t.shape[0] == et.shape[0]:
                et = et + k_t
        et = T.tanh(et)             # batch_size, t-1, hid_size
        et = self.v(et).squeeze(2)  # batch_size, t-1
        # intra-decoder attention (eq 7 & 8 in DEEP REINFORCED MODEL - https://arxiv.org/pdf/1705.04304.pdf)
        at = F.softmax(et, dim=1).unsqueeze(1)  # batch_size, 1, t-1
        # Multiply the previous decoder hidden states by the attention distribution
        # to obtain the decoder context vector
        ct_d = T.bmm(at, prev_s).squeeze(1)  # batch_size, hid_size
        # Append the current decoder state to the previous decoder hidden states
        prev_s = T.cat([prev_s, s_t.unsqueeze(1)], dim=1)  # batch_size, t, hid_size

    return ct_d, prev_s

def __init__(self, vocab_size):
    super(Model, self).__init__()
    self.encoder = Encoder()
    self.decoder = Decoder(vocab_size)
    self.embeds = nn.Embedding(vocab_size, config.emb_dim)
    init_wt_normal(self.embeds.weight)

    self.encoder = get_cuda(self.encoder)
    self.decoder = get_cuda(self.decoder)
    self.embeds = get_cuda(self.embeds)

def train_batch_MLE(self, enc_out, enc_hidden, enc_padding_mask, ct_e, extra_zeros,
                    enc_batch_extend_vocab, batch):
    '''Calculate Negative Log Likelihood Loss for the given batch. In order to reduce exposure bias,
    pass the previously generated token as input with a probability of 0.25 instead of the ground truth label
    Args:
    :param enc_out: Outputs of the encoder for all time steps (batch_size, length_input_sequence, 2*hidden_size)
    :param enc_hidden: Tuple containing final hidden state & cell state of encoder. Shape of h & c: (batch_size, hidden_size)
    :param enc_padding_mask: Mask for encoder input; Tensor of size (batch_size, length_input_sequence) with values of 0 for pad tokens & 1 for others
    :param ct_e: encoder context vector for time_step=0 (eq 5 in https://arxiv.org/pdf/1705.04304.pdf)
    :param extra_zeros: Tensor used to extend vocab distribution for pointer mechanism
    :param enc_batch_extend_vocab: Input batch that stores OOV ids
    :param batch: batch object
    '''
    # Get input and target batches for training the decoder
    sum_dec_batch, sum_max_dec_len, sum_dec_lens, sum_target_batch = get_sum_dec_data(batch)

    step_losses = []
    s_t = (enc_hidden[0], enc_hidden[1])  # Decoder hidden states
    y_t = get_cuda(T.LongTensor(len(enc_out)).fill_(self.start_id))  # Input to the decoder
    prev_s = None             # Used for intra-decoder attention (section 2.2 in https://arxiv.org/pdf/1705.04304.pdf)
    sum_temporal_srcs = None  # Used for intra-temporal attention (section 2.1 in https://arxiv.org/pdf/1705.04304.pdf)

    for t in range(min(sum_max_dec_len, config.max_dec_steps)):
        # Probabilities indicating whether to use ground truth labels instead of previously decoded tokens
        use_ground_truth = get_cuda((T.rand(len(enc_out)) > 0.25)).long()
        # Select decoder input based on use_ground_truth probabilities
        y_t = use_ground_truth * sum_dec_batch[:, t] + (1 - use_ground_truth) * y_t
        y_t = self.model.embeds(y_t)
        # decoder step
        final_dist, s_t, ct_e, sum_temporal_srcs, prev_s = self.model.sum_decoder(
            y_t, s_t, enc_out, enc_padding_mask, ct_e, extra_zeros,
            enc_batch_extend_vocab, sum_temporal_srcs, prev_s)
        target = sum_target_batch[:, t]
        log_probs = T.log(final_dist + config.eps)
        step_loss = F.nll_loss(log_probs, target, reduction="none", ignore_index=self.pad_id)
        step_losses.append(step_loss)
        # Sample words from the final distribution which can be used as input at the next time step
        y_t = T.multinomial(final_dist, 1).squeeze()
        is_oov = (y_t >= config.vocab_size).long()  # Mask indicating whether sampled word is OOV
        y_t = (1 - is_oov) * y_t.detach() + is_oov * self.unk_id  # Replace OOVs with [UNK] token

    sum_losses = T.sum(T.stack(step_losses, 1), 1)  # unnormalized losses for each example in the batch; (batch_size)
    batch_avg_loss = sum_losses / sum_dec_lens      # Normalized losses; (batch_size)
    mle_loss = T.mean(batch_avg_loss)               # Average batch loss

    return mle_loss

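# A minimal, self-contained sketch of the 75%/25% decoder-input mixing used in
# train_batch_MLE above; the token ids below are hypothetical.
import torch as T

use_ground_truth = (T.rand(4) > 0.25).long()  # 1 -> feed the gold token, 0 -> feed the model's previous token
gold_tokens = T.tensor([12, 7, 30, 5])        # ground-truth tokens at this time step
prev_tokens = T.tensor([12, 9, 30, 8])        # tokens the decoder produced at the previous step
y_t = use_ground_truth * gold_tokens + (1 - use_ground_truth) * prev_tokens
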
def __init__(self, vocab_size):
    super(TaskModel, self).__init__()
    self.encoder = TaskEncoder()
    self.decoder = Decoder(vocab_size)
    self.embeds = nn.Embedding(vocab_size, config.emb_dim)
    self.seg_embeds = nn.Embedding(len(SEGMENT), config.emb_dim)
    init_wt_normal(self.embeds.weight)
    init_wt_normal(self.seg_embeds.weight)

    self.encoder = get_cuda(self.encoder)
    self.decoder = get_cuda(self.decoder)
    self.embeds = get_cuda(self.embeds)
    self.seg_embeds = get_cuda(self.seg_embeds)

def __init__(self, start_id, end_id, unk_id, hidden_state, context):
    h, c = hidden_state  # (n_hid,)
    self.tokens = T.LongTensor(config.beam_size, 1).fill_(start_id)  # (beam, t) after t time steps
    self.scores = T.FloatTensor(config.beam_size, 1).fill_(-30)      # beam, 1; Initial score of beams = -30
    self.tokens, self.scores = get_cuda(self.tokens), get_cuda(self.scores)
    # At time step t=0, all beams should extend from a single beam,
    # so only the 1st beam gets a high initial score
    self.scores[0][0] = 0
    self.hid_h = h.unsqueeze(0).repeat(config.beam_size, 1)          # beam, n_hid
    self.hid_c = c.unsqueeze(0).repeat(config.beam_size, 1)          # beam, n_hid
    self.context = context.unsqueeze(0).repeat(config.beam_size, 1)  # beam, 2*n_hid
    self.sum_temporal_srcs = None
    self.prev_s = None
    self.done = False
    self.end_id = end_id
    self.unk_id = unk_id

def reward_function(self, decoded_sents, original_sents):
    '''Calculate ROUGE-L scores for a batch of decoded sentences given their original sentences
    :param decoded_sents: List containing decoded sentences
    :param original_sents: List containing original sentences
    '''
    rouge = Rouge()
    try:
        scores = rouge.get_scores(decoded_sents, original_sents)
    except Exception:
        print("Rouge failed for multi sentence evaluation.. Finding exact pair")
        scores = []
        for i in range(len(decoded_sents)):
            try:
                score = rouge.get_scores(decoded_sents[i], original_sents[i])
            except Exception:
                print("Error occurred at:")
                print("decoded_sents:", decoded_sents[i])
                print("original_sents:", original_sents[i])
                score = [{"rouge-l": {"f": 0.0}}]
            scores.append(score[0])
    rouge_l_f1 = [score["rouge-l"]["f"] for score in scores]
    rouge_l_f1 = get_cuda(T.FloatTensor(rouge_l_f1))
    return rouge_l_f1

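# A minimal usage sketch of the rouge package as it is used above; the sentences are hypothetical.
from rouge import Rouge

rouge = Rouge()
scores = rouge.get_scores(["the cat sat on the mat"], ["a cat was sitting on the mat"])
print(scores[0]["rouge-l"]["f"])  # ROUGE-L F1 for the first (decoded, reference) pair
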
def forward(self, x):
    x = train_util.get_cuda(x)
    out = self.conv1(x)
    out = self.norm1(out)
    out = self.relu1(out)
    out = self.layers1(out)
    out = self.layers2(out)
    out = self.layers3(out)
    out = self.avgpool(out)
    out = out.view(out.size(0), -1)
    out = self.linear(out)
    return out

def setup_train(self):
    self.model = Model()
    self.model = get_cuda(self.model)
    self.trainer = T.optim.Adam(self.model.parameters(), lr=config.lr)
    start_iter = 0
    if config.resume_training:
        checkpoint = T.load(config.load_model_path)
        start_iter = checkpoint["iter"]
        self.model.load_state_dict(checkpoint["model_dict"])
        self.trainer.load_state_dict(checkpoint["trainer_dict"])
        print("Loaded model at " + config.load_model_path)
    return start_iter

def setup_train(self):
    self.model = Model()
    self.model = get_cuda(self.model)
    self.trainer = T.optim.Adam(self.model.parameters(), lr=config.lr)
    start_iter = 0
    if self.opt.load_model is not None:
        load_model_path = os.path.join(config.save_model_path, self.opt.load_model)
        checkpoint = T.load(load_model_path)
        start_iter = checkpoint["iter"]
        self.model.load_state_dict(checkpoint["model_dict"])
        self.trainer.load_state_dict(checkpoint["trainer_dict"])
        print("Loaded model at " + load_model_path)
    if self.opt.new_lr is not None:
        self.trainer = T.optim.Adam(self.model.parameters(), lr=self.opt.new_lr)
    return start_iter

def test(testloader, model):
    """
    Test unit
    # TODO: Move to eval.py later
    """
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            labels = get_cuda(labels)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print('Accuracy of the network on the 10000 test images: {}%'.format(100 * correct / total))

def forward(self, x, use_ccu=False, loss_samples=None, remaining_step=None, training_step=None):
    x = train_util.get_cuda(x)
    x = self.ResNet(x)
    if use_ccu:
        if training_step and training_step >= config.warm_up:
            loss_begin = loss_samples[0]
            loss_end = loss_samples[-1]
            loss_samples = loss_samples.view(1, -1)
            c_ccu, final_p = self.CCU(loss_samples, loss_begin, loss_end, remaining_step, self)
            p = F.relu(self.w_p(final_p))
            p = F.relu(self.w_p2(p))
            c_ccu = F.relu(self.w_ccu(c_ccu))
            c_ccu = F.relu(self.w_ccu_2(c_ccu))
            # Add c_ccu to enable learning for CCU parameters:
            x = x + c_ccu + p
    return x

def forward(self, st_hat, h, enc_padding_mask, sum_temporal_srcs):
    '''Perform attention over encoder hidden states
    :param st_hat: decoder hidden state at current time step
    :param h: encoder hidden states
    :param enc_padding_mask:
    :param sum_temporal_srcs: if using intra-temporal attention, contains summation of attention weights from previous decoder time steps
    '''
    # Standard attention technique (eq 1 in https://arxiv.org/pdf/1704.04368.pdf)
    et = self.W_h(h)                         # bs, n_seq, 2*n_hid
    dec_fea = self.W_s(st_hat).unsqueeze(1)  # bs, 1, 2*n_hid
    et = et + dec_fea
    et = T.tanh(et)                          # bs, n_seq, 2*n_hid
    et = self.v(et).squeeze(2)               # bs, n_seq

    # intra-temporal attention (eq 3 in https://arxiv.org/pdf/1705.04304.pdf)
    if config.intra_encoder:
        exp_et = T.exp(et)
        if sum_temporal_srcs is None:
            et1 = exp_et
            sum_temporal_srcs = get_cuda(T.FloatTensor(et.size()).fill_(1e-10)) + exp_et
        else:
            et1 = exp_et / sum_temporal_srcs  # bs, n_seq
            sum_temporal_srcs = sum_temporal_srcs + exp_et
    else:
        et1 = F.softmax(et, dim=1)

    # assign 0 probability for padded elements
    at = et1 * enc_padding_mask
    normalization_factor = at.sum(1, keepdim=True)
    at = at / normalization_factor
    at = at.unsqueeze(1)  # bs, 1, n_seq

    # Compute encoder context vector
    ct_e = T.bmm(at, h)  # bs, 1, 2*n_hid
    ct_e = ct_e.squeeze(1)
    at = at.squeeze(1)

    return ct_e, at, sum_temporal_srcs

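# A small numeric sketch (hypothetical values, batch of one, source length 3) of the
# intra-temporal normalization above: tokens attended heavily at earlier decoder steps are damped.
import torch as T

et = T.tensor([[1.0, 2.0, 0.5]])                 # current step's raw attention scores
sum_temporal_srcs = T.tensor([[3.0, 1.0, 4.0]])  # sum of exp(scores) from earlier decoder steps
et1 = T.exp(et) / sum_temporal_srcs              # temporally normalized scores (eq 3)
at = et1 / et1.sum(1, keepdim=True)              # final attention distribution over source tokens
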
def reward_function(self, decoded_sents, original_sents):
    rouge = Rouge()
    try:
        scores = rouge.get_scores(decoded_sents, original_sents)
    except Exception:
        print("Rouge failed for multi sentence evaluation.. Finding exact pair")
        scores = []
        for i in range(len(decoded_sents)):
            try:
                score = rouge.get_scores(decoded_sents[i], original_sents[i])
            except Exception:
                print("Error occurred at:")
                print("decoded_sents:", decoded_sents[i])
                print("original_sents:", original_sents[i])
                score = [{"rouge-l": {"f": 0.0}}]
            scores.append(score[0])
    rouge_l_f1 = [score["rouge-l"]["f"] for score in scores]
    rouge_l_f1 = get_cuda(T.FloatTensor(rouge_l_f1))
    return rouge_l_f1

def train(trainloader):
    # Init data loader:
    dataloader = iter(trainloader)
    # Init model:
    model = Model()
    model = model.type(torch.cuda.FloatTensor)
    # Init optimization scheme:
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = optim.Adam(model.parameters())
    # Used in CCU:
    loss_samples = get_cuda(torch.zeros(CCU_gap, 1))
    loss_counter = 0

    # Main loop:
    for i in tqdm(range(training_step)):  # loop over the dataset multiple times
        running_loss = 0.0
        # get data here
        try:
            inputs, labels = next(dataloader)
        except StopIteration:
            dataloader = iter(trainloader)
            inputs, labels = next(dataloader)
        labels = get_cuda(labels)

        # zero the parameter gradients
        optimizer.zero_grad()

        # Choose CCU or not:
        if loss_counter < CCU_gap:
            outputs = model(inputs)
        elif loss_counter == CCU_gap and use_CCU:
            remain_step = get_cuda(torch.tensor(training_step - i, dtype=torch.float))
            outputs = model(inputs, use_ccu=True, loss_samples=loss_samples,
                            remaining_step=remain_step, training_step=i)
            # reset CCU loss counter and samples.
            loss_counter = 0
            loss_samples = get_cuda(torch.zeros(CCU_gap, 1))
        elif loss_counter == CCU_gap and not use_CCU:
            outputs = model(inputs)
            # TODO: remove redundancy
            loss_counter = 0
            loss_samples = get_cuda(torch.zeros(CCU_gap, 1))
        else:
            # Should not get into here
            raise RuntimeError('There may be bugs in loss_counter')

        # calculate loss
        loss = criterion(outputs, labels)
        running_loss += loss.item()
        # Update ccu_loss_samples and loss_counter
        loss_samples[loss_counter] = loss.item()
        loss_counter += 1
        # 1 step BP
        loss.backward()
        optimizer.step()

        # print statistics
        if i % showstep == 0:
            print('Training step: {} loss: {}'.format(i, running_loss / showstep))
            running_loss = 0.0

    print('Finished Training')
    return model

def train_batch_RL(self, enc_out, enc_hidden, enc_padding_mask, ct_e, extra_zeros,
                   enc_batch_extend_vocab, article_oovs, greedy):
    '''Generate sentences from the decoder entirely using sampled tokens as input. These sentences are used for ROUGE evaluation
    Args
    :param enc_out: Outputs of the encoder for all time steps (batch_size, length_input_sequence, 2*hidden_size)
    :param enc_hidden: Tuple containing final hidden state & cell state of encoder. Shape of h & c: (batch_size, hidden_size)
    :param enc_padding_mask: Mask for encoder input; Tensor of size (batch_size, length_input_sequence) with values of 0 for pad tokens & 1 for others
    :param ct_e: encoder context vector for time_step=0 (eq 5 in https://arxiv.org/pdf/1705.04304.pdf)
    :param extra_zeros: Tensor used to extend vocab distribution for pointer mechanism
    :param enc_batch_extend_vocab: Input batch that stores OOV ids
    :param article_oovs: Batch containing list of OOVs in each example
    :param greedy: If true, performs greedy based sampling, else performs multinomial sampling

    Returns:
    :decoded_strs: List of decoded sentences
    :log_probs: Log probabilities of sampled words
    '''
    s_t = enc_hidden  # Decoder hidden states
    x_t = get_cuda(T.LongTensor(len(enc_out)).fill_(self.start_id))  # Input to the decoder
    prev_s = None              # Used for intra-decoder attention (section 2.2 in https://arxiv.org/pdf/1705.04304.pdf)
    sum_temporal_srcs = None   # Used for intra-temporal attention (section 2.1 in https://arxiv.org/pdf/1705.04304.pdf)
    inds = []                  # Stores sampled indices for each time step
    decoder_padding_mask = []  # Stores padding masks of generated samples
    log_probs = []             # Stores log probabilities of generated samples
    # Values that indicate whether [STOP] token has already been encountered; 1 => not encountered, 0 otherwise
    mask = get_cuda(T.LongTensor(len(enc_out)).fill_(1))

    for t in range(config.max_dec_steps):
        x_t = self.model.embeds(x_t)
        probs, s_t, ct_e, sum_temporal_srcs, prev_s = self.model.decoder(
            x_t, s_t, enc_out, enc_padding_mask, ct_e, extra_zeros,
            enc_batch_extend_vocab, sum_temporal_srcs, prev_s)
        if greedy is False:
            multi_dist = Categorical(probs)
            x_t = multi_dist.sample()  # perform multinomial sampling
            log_prob = multi_dist.log_prob(x_t)
            log_probs.append(log_prob)
        else:
            _, x_t = T.max(probs, dim=1)  # perform greedy sampling
        x_t = x_t.detach()
        inds.append(x_t)
        # Padding mask of batch for current time step
        mask_t = get_cuda(T.zeros(len(enc_out)))
        # If [STOP] was not encountered up to the previous time step, mask_t = 1, else mask_t = 0
        mask_t[mask == 1] = 1
        # If [STOP] was not encountered up to the previous time step and the current word is [STOP], set mask = 0
        mask[(mask == 1) + (x_t == self.end_id) == 2] = 0
        decoder_padding_mask.append(mask_t)
        is_oov = (x_t >= config.vocab_size).long()  # Mask indicating whether sampled word is OOV
        x_t = (1 - is_oov) * x_t + is_oov * self.unk_id  # Replace OOVs with [UNK] token

    inds = T.stack(inds, dim=1)
    decoder_padding_mask = T.stack(decoder_padding_mask, dim=1)
    if greedy is False:  # If multinomial sampling, compute log probabilities of the sampled words
        log_probs = T.stack(log_probs, dim=1)
        log_probs = log_probs * decoder_padding_mask  # Do not consider sampled words with padding mask = 0
        lens = T.sum(decoder_padding_mask, dim=1)     # Length of sampled sentence
        log_probs = T.sum(log_probs, dim=1) / lens    # (bs,) normalized log probability of a sentence

    decoded_strs = []
    for i in range(len(enc_out)):
        id_list = inds[i].cpu().numpy()
        oovs = article_oovs[i]
        S = data.outputids2words(id_list, self.vocab, oovs)  # Generate sentence corresponding to sampled words
        try:
            end_idx = S.index(data.STOP_DECODING)
            S = S[:end_idx]
        except ValueError:
            S = S
        if len(S) < 2:
            # If the sentence is shorter than 2 words, replace it with "xxx";
            # avoids sentences like "." which throw an error while calculating ROUGE
            S = ["xxx"]
        S = " ".join(S)
        decoded_strs.append(S)

    return decoded_strs, log_probs

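# A minimal sketch of the multinomial sampling step used in train_batch_RL above;
# the output distribution is hypothetical and over a tiny vocabulary.
import torch as T
from torch.distributions import Categorical

probs = T.tensor([[0.1, 0.6, 0.3]])  # decoder's output distribution for one example
multi_dist = Categorical(probs)
x_t = multi_dist.sample()            # sampled token id, shape (1,)
log_prob = multi_dist.log_prob(x_t)  # log-probability of the sampled token, used for the RL loss
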
def forward(self, st_hat, h, enc_padding_mask, sum_temporal_srcs, sum_k_emb):
    '''Perform attention over encoder hidden states
    :param st_hat: decoder hidden state at current time step
    :param h: encoder hidden states
    :param enc_padding_mask:
    :param sum_temporal_srcs: if using intra-temporal attention, contains summation of attention weights from previous decoder time steps

    Self-attention is also often called intra-attention: attention computed among the elements
    within the Source (or within the Target), i.e. the special case Target = Source.
    The computation itself is identical; only the objects being attended over change.
    '''
    # Standard attention technique (eq 1 in Pointer-Generator Networks - https://arxiv.org/pdf/1704.04368.pdf)
    # et = tanh(W_h(h) + W_s(st_hat))
    et = self.W_h(h)                         # batch_size, n_seq, 2*hid_size
    dec_fea = self.W_s(st_hat).unsqueeze(1)  # batch_size, 1, 2*hid_size
    et = et + dec_fea  # et => incorporate h_td (hidden decoder state) & h_te (hidden encoder state)
    if config.key_attention:
        k_t = self.W_t(sum_k_emb).unsqueeze(1)
        if k_t.shape[0] == et.shape[0]:
            et = et + k_t
    et = T.tanh(et)             # batch_size, b_seq_len, 2*hid_size
    et = self.v(et).squeeze(2)  # batch_size, b_seq_len

    # intra-temporal attention (eq 3 in DEEP REINFORCED MODEL - https://arxiv.org/pdf/1705.04304.pdf)
    if config.intra_encoder:
        exp_et = T.exp(et)
        if sum_temporal_srcs is None:
            et1 = exp_et  # eq 3, t = 1 condition
            sum_temporal_srcs = get_cuda(T.FloatTensor(et.size()).fill_(1e-10)) + exp_et
        else:
            et1 = exp_et / sum_temporal_srcs  # eq 3, otherwise condition; batch_size, b_seq_len
            # accumulate all past source attention scores (self-attention)
            sum_temporal_srcs = sum_temporal_srcs + exp_et
    else:
        # (eq 2 in Pointer-Generator Networks - https://arxiv.org/pdf/1704.04368.pdf)
        et1 = F.softmax(et, dim=1)  # et1 is the final attention score

    # assign 0 probability for padded elements
    at = et1 * enc_padding_mask
    # sum over dim 1 (keepdim=True keeps the summed dim with size 1), then renormalize the masked weights
    normalization_factor = at.sum(1, keepdim=True)
    at = at / normalization_factor
    at = at.unsqueeze(1)  # batch_size, 1, b_seq_len (unsqueeze inserts a size-1 dimension)

    # Compute encoder context vector: matrix-multiply the attention distribution
    # with the encoder hidden states
    ct_e = T.bmm(at, h)  # batch_size, 1, 2*hid_size
    ct_e = ct_e.squeeze(1)  # squeeze removes the size-1 dimension
    at = at.squeeze(1)

    # context vector, attention score, sum_temporal_srcs (not None if intra-temporal attention is used)
    return ct_e, at, sum_temporal_srcs
