def forward(self, inputs, state):
    """
    Teacher-forced decoding over a whole batch.
    Sorts the batch by length so that only valid (non-padding) positions are
    fed to the decoder at each time step, then restores the original order.
    """
    inputs, lengths = inputs
    batch_size, max_len = inputs.size()

    out_inputs = inputs.new_zeros(
        size=(batch_size, max_len, self.out_input_size),
        dtype=torch.float)

    # sort by lengths
    sorted_lengths, indices = lengths.sort(descending=True)
    inputs = inputs.index_select(0, indices)
    state = state.index_select(indices)

    # number of valid inputs (i.e. not the padding index) at each time step
    num_valid_list = sequence_mask(sorted_lengths).int().sum(dim=0)

    for i, num_valid in enumerate(num_valid_list):
        dec_input = inputs[:num_valid, i]
        valid_state = state.slice_select(num_valid)

        out_input, valid_state, _ = self.decode(
            dec_input, valid_state, is_training=True)
        state.hidden[:, :num_valid] = valid_state.hidden
        out_inputs[:num_valid, i] = out_input.squeeze(1)

    # restore the original batch order
    _, inv_indices = indices.sort()
    state = state.index_select(inv_indices)
    out_inputs = out_inputs.index_select(0, inv_indices)

    log_probs = self.output_layer(out_inputs)
    return log_probs, state
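# `sequence_mask` is used throughout this file but not defined in it. The block
# below is a minimal sketch of the assumed helper: given a tensor of lengths, it
# returns a boolean mask that is True for positions before each length. The name
# and exact semantics are an assumption inferred from how it is called above.
import torch

def sequence_mask(lengths, max_len=None):
    """Boolean mask of shape lengths.shape + (max_len,); True where position < length."""
    if max_len is None:
        max_len = int(lengths.max().item())
    positions = torch.arange(max_len, device=lengths.device)
    return positions < lengths.unsqueeze(-1)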
def initialize_state(self,
                     hidden,
                     feature=None,
                     attn_memory=None,
                     attn_mask=None,
                     memory_lengths=None,
                     knowledge=None):
    """ initialize_state """
    if self.feature_size is not None:
        assert feature is not None

    if self.attn_mode is not None:
        assert attn_memory is not None

    if memory_lengths is not None and attn_mask is None:
        max_len = attn_memory.size(1)
        attn_mask = sequence_mask(memory_lengths, max_len).eq(0)

    init_state = DecoderState(
        hidden=hidden,
        feature=feature,
        attn_memory=attn_memory,
        attn_mask=attn_mask,
        knowledge=knowledge,
    )
    return init_state
def forward(self, hidden, hidden_states, attention_mask):
    mixed_query_layer = self.query(hidden)
    mixed_key_layer = self.key(hidden_states)
    mixed_value_layer = self.value(hidden_states)

    query_layer = self.transpose_for_scores(mixed_query_layer)
    key_layer = self.transpose_for_scores(mixed_key_layer)
    value_layer = self.transpose_for_scores(mixed_value_layer)

    # Take the dot product between "query" and "key" to get the raw attention scores.
    attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
    attention_scores = attention_scores / math.sqrt(self.attention_head_size)

    # Apply the attention mask (precomputed for all layers in the BertModel forward() function)
    if attention_mask is not None:
        if attention_mask.dim() < 2:
            # a 1-D tensor of lengths: expand it into a boolean padding mask
            attention_mask = sequence_mask(attention_mask)
        reverse_mask = torch.ones(attention_mask.size(), device=attention_scores.device)
        reverse_mask[attention_mask.bool()] = 0.0
        attention_scores = attention_scores + reverse_mask.unsqueeze(1).unsqueeze(2) * (-1e9)
    else:
        raise NotImplementedError

    # Normalize the attention scores to probabilities.
    attention_probs = nn.Softmax(dim=-1)(attention_scores)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = self.dropout(attention_probs)

    context_layer = torch.matmul(attention_probs, value_layer)
    context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
    new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
    context_layer = context_layer.view(*new_context_layer_shape)

    # Return the context vector of the first position (e.g. [CLS]) and the attention weights
    return context_layer[:, 0, :], attention_probs
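# `transpose_for_scores` is called above but not shown in this file. Below is a
# minimal sketch of the assumed method, following the standard BERT layout and
# assuming `num_attention_heads` and `attention_head_size` attributes exist
# (with all_head_size = num_attention_heads * attention_head_size):
def transpose_for_scores(self, x):
    # (batch, seq_len, all_head_size) -> (batch, num_heads, seq_len, head_size)
    new_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
    x = x.view(*new_shape)
    return x.permute(0, 2, 1, 3)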
def forward(self, source, memory_bank, memory_lengths=None, coverage=None, is_mask=False):
    """
    Args:
      source (`FloatTensor`): query vectors `[batch x tgt_len x dim]`
      memory_bank (`FloatTensor`): source vectors `[batch x src_len x dim]`
      memory_lengths (`LongTensor`): the source context lengths `[batch]`
      coverage (`FloatTensor`): None (not supported yet)

    Returns:
      (`FloatTensor`, `FloatTensor`):
      * Computed vector `[tgt_len x batch x dim]`
      * Attention distributions for each query `[tgt_len x batch x src_len]`
    """
    # one step input
    if source.dim() == 2:
        one_step = True
        source = source.unsqueeze(1)
    else:
        one_step = False

    batch, source_l, dim = memory_bank.size()
    batch_, target_l, dim_ = source.size()
    aeq(batch, batch_)
    aeq(dim, dim_)
    aeq(self.dim, dim)
    if coverage is not None:
        batch_, source_l_ = coverage.size()
        aeq(batch, batch_)
        aeq(source_l, source_l_)

    if coverage is not None:
        cover = coverage.view(-1).unsqueeze(1)
        memory_bank += self.linear_cover(cover).view_as(memory_bank)
        memory_bank = torch.tanh(memory_bank)

    # compute attention scores, as in Luong et al.
    align = self.score(source, memory_bank)

    if memory_lengths is not None:
        if not is_mask:
            mask = sequence_mask(memory_lengths, max_len=align.size(-1))
        else:
            # `memory_lengths` already holds a precomputed mask
            mask = memory_lengths.bool()
        mask = mask.unsqueeze(1)  # make it broadcastable
        align.masked_fill_(~mask, -float('inf'))

    # Softmax or sparsemax to normalize attention weights
    if self.attn_func == "softmax":
        align_vectors = F.softmax(align.view(batch * target_l, source_l), -1)
    else:
        align_vectors = sparsemax(align.view(batch * target_l, source_l), -1)
    align_vectors = align_vectors.view(batch, target_l, source_l)

    # each context vector c_t is the weighted average
    # over all the source hidden states
    c = torch.bmm(align_vectors, memory_bank)

    # concatenate
    concat_c = torch.cat([c, source], 2).view(batch * target_l, dim * 2)
    attn_h = self.linear_out(concat_c).view(batch, target_l, dim)
    # attn_h = self.dropout(attn_h)

    # if self.attn_type in ["general", "dot"]:
    #     attn_h = torch.tanh(attn_h)

    if one_step:
        attn_h = attn_h.squeeze(1)
        align_vectors = align_vectors.squeeze(1)

        # Check output sizes
        batch_, dim_ = attn_h.size()
        aeq(batch, batch_)
        aeq(dim, dim_)
        batch_, source_l_ = align_vectors.size()
        aeq(batch, batch_)
        aeq(source_l, source_l_)
    else:
        attn_h = attn_h.transpose(0, 1).contiguous()
        align_vectors = align_vectors.transpose(0, 1).contiguous()

        # Check output sizes
        target_l_, batch_, dim_ = attn_h.size()
        aeq(target_l, target_l_)
        aeq(batch, batch_)
        aeq(dim, dim_)
        target_l_, batch_, source_l_ = align_vectors.size()
        aeq(target_l, target_l_)
        aeq(batch, batch_)
        aeq(source_l, source_l_)

    return attn_h, align_vectors
def decode(self, dec_state):
    """ Beam-search decoding. """
    long_tensor_type = torch.cuda.LongTensor if self.use_gpu else torch.LongTensor

    b = dec_state.get_batch_size()

    # [[0], [k*1], [k*2], ..., [k*(b-1)]]
    self.pos_index = (long_tensor_type(range(b)) * self.k).view(-1, 1)

    # Inflate the initial hidden states to be of size: (b*k, H)
    dec_state = dec_state.inflate(self.k)

    # Initialize the scores; for the first step,
    # ignore the inflated copies to avoid duplicate entries in the top k
    sequence_scores = long_tensor_type(b * self.k).float()
    sequence_scores.fill_(-float('inf'))
    sequence_scores.index_fill_(
        0, long_tensor_type([i * self.k for i in range(b)]), 0.0)

    # Initialize the input vector
    input_var = long_tensor_type([self.BOS] * b * self.k)

    # Store decisions for backtracking
    stored_scores = list()
    stored_predecessors = list()
    stored_emitted_symbols = list()

    for t in range(1, self.max_length + 1):
        # Run the RNN one step forward
        output, dec_state, attn = self.model.decode(input_var, dec_state)
        log_softmax_output = output.squeeze(1)

        # To get the full sequence scores for the new candidates, add the
        # local scores for t_i to the predecessor scores for t_(i-1)
        sequence_scores = sequence_scores.unsqueeze(1).repeat(1, self.V)
        if self.length_average and t > 1:
            sequence_scores = sequence_scores * (1 - 1 / t) + log_softmax_output / t
        else:
            sequence_scores += log_softmax_output

        scores, candidates = sequence_scores.view(b, -1).topk(self.k, dim=1)

        # Reshape input = (b*k, 1) and sequence_scores = (b*k)
        input_var = (candidates % self.V)
        sequence_scores = scores.view(b * self.k)
        input_var = input_var.view(b * self.k)

        # Update fields for next timestep
        predecessors = (candidates // self.V +
                        self.pos_index.expand_as(candidates)).view(b * self.k)
        dec_state = dec_state.index_select(predecessors)

        # Update sequence scores and erase scores for the end-of-sentence symbol
        # so that they aren't expanded
        stored_scores.append(sequence_scores.clone())
        eos_indices = input_var.data.eq(self.EOS)
        if eos_indices.any():
            sequence_scores.data.masked_fill_(eos_indices, -float('inf'))

        if self.ignore_unk:
            # Erase scores for the UNK symbol so that they aren't expanded
            unk_indices = input_var.data.eq(self.UNK)
            if unk_indices.any():
                sequence_scores.data.masked_fill_(unk_indices, -float('inf'))

        # Cache results for backtracking
        stored_predecessors.append(predecessors)
        stored_emitted_symbols.append(input_var)

    predicts, scores, lengths = self._backtrack(
        stored_predecessors, stored_emitted_symbols, stored_scores, b)

    # Keep only the top-scoring hypothesis for each batch entry and pad the rest
    predicts = predicts[:, :1]
    scores = scores[:, :1]
    lengths = long_tensor_type(lengths)[:, :1]
    mask = sequence_mask(lengths, max_len=self.max_length).eq(0)
    predicts[mask] = self.PAD

    return predicts, lengths, scores
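# Standalone illustration (not part of the decoder above) of the beam bookkeeping
# used in `decode`: after a top-k over the flattened (k * V) scores per batch
# entry, the emitted token id and the predecessor beam index are recovered with
# modulo and floor division. The sizes below are hypothetical.
import torch

b, k, V = 2, 3, 5                          # batch size, beam width, vocab size
scores = torch.randn(b * k, V)             # per-beam log-probabilities for one step
flat = scores.view(b, k * V)               # flatten beams and vocab per batch entry
top_scores, candidates = flat.topk(k, dim=1)

tokens = candidates % V                    # token chosen by each surviving candidate
beams = candidates // V                    # which of the k beams it extends
pos_index = (torch.arange(b) * k).view(-1, 1)
predecessors = (beams + pos_index).view(b * k)   # indices into the (b*k) beam states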
def forward(self, source, memory_bank, memory_lengths=None):
    # Whether input is provided one step at a time or not
    if source.dim() == 2:
        one_step = True
        source = source.unsqueeze(1)
    else:
        one_step = False

    batch, source_l, dim = memory_bank.size()
    batch_, target_l, dim_ = source.size()
    assert batch == batch_
    assert dim == dim_
    assert self.input_size == dim

    # Compute attention scores
    align = self.score(source, memory_bank)

    if memory_lengths is not None:
        mask = sequence_mask(memory_lengths, max_len=align.size(-1))
        mask = mask.unsqueeze(1)  # make it broadcastable
        align.masked_fill_(~mask, -float('inf'))

    # Normalize attention weights
    align_vectors = F.softmax(align.view(batch * target_l, source_l), -1)
    align_vectors = align_vectors.view(batch, target_l, source_l)

    # Generate context vector c_t as the weighted average
    # over all the source hidden states
    c = torch.bmm(align_vectors, memory_bank)

    # Concatenate context vector with source
    concat_c = torch.cat([c, source], 2).view(batch * target_l, dim * 2)
    attn_h = self.linear_out(concat_c).view(batch, target_l, dim)

    if self.attention_type in ["general", "dot"]:
        attn_h = torch.tanh(attn_h)

    if one_step:
        attn_h = attn_h.squeeze(1)
        align_vectors = align_vectors.squeeze(1)

        # Check output sizes
        batch_, dim_ = attn_h.size()
        assert batch == batch_
        assert dim == dim_
        batch_, source_l_ = align_vectors.size()
        assert batch == batch_
        assert source_l == source_l_
    else:
        attn_h = attn_h.transpose(0, 1).contiguous()
        align_vectors = align_vectors.transpose(0, 1).contiguous()

        # Check output sizes
        target_l_, batch_, dim_ = attn_h.size()
        assert target_l == target_l_
        assert batch == batch_
        assert dim == dim_
        target_l_, batch_, source_l_ = align_vectors.size()
        assert target_l == target_l_
        assert batch == batch_
        assert source_l == source_l_

    return attn_h, align_vectors
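# `self.score` is called by both attention forward passes above but not defined in
# this file. Below is a minimal sketch of a Luong-style scoring method, assuming a
# "dot"/"general" `attention_type` and a hypothetical `self.linear_in` projection:
def score(self, h_t, h_s):
    """h_t: (batch, tgt_len, dim), h_s: (batch, src_len, dim) -> (batch, tgt_len, src_len)"""
    if self.attention_type == "general":
        h_t = self.linear_in(h_t)          # hypothetical learned projection for "general"
    return torch.bmm(h_t, h_s.transpose(1, 2))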