def __init__(self, d_model, heads, d_ff, dropout):
    super(TransformerEncoderLayer, self).__init__()

    self.self_attn = MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)
def __init__(self, d_model, heads, d_ff, dropout, mem_args=None, use_mem=False):
    super(TransformerInterLayer, self).__init__()
    self.d_model, self.heads = d_model, heads
    self.d_per_head = self.d_model // self.heads
    self.layer_norm1 = nn.LayerNorm(d_model, eps=1e-6)
    self.pooling = MultiHeadedPooling(
        heads, d_model, dropout=dropout, use_final_linear=False)
    self.layer_norm2 = nn.LayerNorm(self.d_per_head, eps=1e-6)
    self.inter_att = MultiHeadedAttention(
        1, self.d_per_head, dropout, use_final_linear=False)
    self.linear = nn.Linear(self.d_model, self.d_model)
    self.dropout = nn.Dropout(dropout)
    if mem_args is not None and use_mem:
        self.feed_forward = HashingMemory.build(d_model, d_model, mem_args)
    else:
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
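# The inter-block attention above works per head: MultiHeadedPooling with
# use_final_linear=False yields one d_per_head-sized vector per (head, block),
# the single-head inter_att then attends across blocks at that granularity,
# and concatenating the heads again restores a d_model-sized vector for
# self.linear. The standalone sketch below only illustrates that dimension
# bookkeeping with made-up sizes; it does not use the repo's pooling or
# attention classes, and the tensor layout is an assumption.
import torch

d_model, heads = 512, 8                       # hypothetical sizes
d_per_head = d_model // heads                 # 64; requires d_model % heads == 0
batch, n_blocks = 2, 6

# What a per-head pooled representation is expected to look like, shape-wise:
# one d_per_head vector per head for every block in every example.
pooled = torch.randn(batch * n_blocks, heads, d_per_head)

# After the (single-head, per-head-dimension) inter attention, concatenating
# the heads gives back a d_model vector per block, ready for self.linear.
recombined = pooled.view(batch * n_blocks, heads * d_per_head)
assert recombined.size(-1) == d_model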
def __init__(self, d_model, heads, d_ff, dropout):
    super(TransformerQueryInterLayer, self).__init__()
    self.d_model, self.heads = d_model, heads
    self.d_per_head = self.d_model // self.heads
    self.layer_norm1 = nn.LayerNorm(d_model, eps=1e-6)
    ### query related
    self.paragraph_pooling = MultiHeadedPooling(
        heads, d_model, dropout=dropout, use_final_linear=False)
    self.query_pooling = MultiHeadedPooling(
        heads, d_model, dropout=dropout, use_final_linear=False)
    self.query_layer_norm1 = nn.LayerNorm(self.d_model, eps=1e-6)
    self.query_layer_norm2 = nn.LayerNorm(self.d_per_head, eps=1e-6)
    self.layer_norm2 = nn.LayerNorm(self.d_per_head, eps=1e-6)
    self.inter_att = MultiHeadedAttention(
        1, self.d_per_head, dropout, use_final_linear=False)
    self.linear = nn.Linear(self.d_model, self.d_model)
    self.dropout = nn.Dropout(dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
def __init__(self, d_model, heads, d_ff, dropout):
    super(TransformerQueryEncoderLayer, self).__init__()

    self.self_attn = MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)
    ## query related
    self.query_pooling = MultiHeadedPooling(
        heads, d_model, dropout=dropout, use_final_linear=False)
    self.query_layer_norm1 = nn.LayerNorm(d_model, eps=1e-6)
    self.query_layer_norm2 = nn.LayerNorm(d_model, eps=1e-6)
def __init__(self, d_model, heads, d_ff, dropout):
    super(TransformerNewInterLayer, self).__init__()

    self.layer_norm1 = nn.LayerNorm(d_model, eps=1e-6)
    self.pooling = MultiHeadedPooling(heads, d_model, dropout=dropout)
    self.layer_norm2 = nn.LayerNorm(d_model, eps=1e-6)
    self.inter_att = MultiHeadedAttention(
        heads, d_model, dropout, use_final_linear=True)
    self.dropout = nn.Dropout(dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
def __init__(self, d_model, heads, d_ff, dropout, mem_args=None, use_mem=False):
    super(TransformerEncoderLayer, self).__init__()

    self.self_attn = MultiHeadedAttention(
        heads, d_model, dropout=dropout)
    if mem_args is not None and use_mem:
        self.feed_forward = HashingMemory.build(d_model, d_model, mem_args)
    else:
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)
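# The encoder layers above all declare the same building blocks (a layer norm,
# multi-headed self-attention, dropout, and a feed-forward or memory module);
# their forward passes are not shown in this section. The sketch below is a
# minimal, self-contained illustration of the pre-norm residual wiring such a
# layer typically uses, assuming that wiring and substituting
# torch.nn.MultiheadAttention and a plain two-layer MLP for the repo's
# MultiHeadedAttention / PositionwiseFeedForward / HashingMemory. Names,
# signatures, and shapes here are illustrative assumptions, not the repo's API.
import torch
import torch.nn as nn


class PreNormEncoderLayerSketch(nn.Module):
    def __init__(self, d_model, heads, d_ff, dropout):
        super().__init__()
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.self_attn = nn.MultiheadAttention(
            d_model, heads, dropout=dropout, batch_first=True)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model, d_ff), nn.ReLU(), nn.Linear(d_ff, d_model))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, key_padding_mask=None):
        # normalize -> self-attention -> dropout + residual -> feed-forward
        x_norm = self.layer_norm(x)
        context, _ = self.self_attn(
            x_norm, x_norm, x_norm, key_padding_mask=key_padding_mask)
        out = self.dropout(context) + x       # residual around attention
        return self.feed_forward(out) + out   # residual around the FFN


# Usage with made-up sizes: batch x src_len x d_model in, same shape out.
layer = PreNormEncoderLayerSketch(d_model=512, heads=8, d_ff=2048, dropout=0.1)
tokens = torch.randn(2, 30, 512)
print(layer(tokens).shape)  # torch.Size([2, 30, 512])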
class TransformerDecoderLayer(nn.Module):
    """
    Args:
        d_model (int): the dimension of keys/values/queries in
            MultiHeadedAttention, also the input size of the first
            layer of the PositionwiseFeedForward.
        heads (int): the number of heads for MultiHeadedAttention.
        d_ff (int): the hidden size of the second layer of the
            PositionwiseFeedForward.
        dropout (float): dropout probability (0-1.0).
    """

    def __init__(self, d_model, heads, d_ff, dropout):
        super(TransformerDecoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.context_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        mask = self._get_attn_subsequent_mask(MAX_SIZE)
        # Register self.mask as a buffer in TransformerDecoderLayer, so
        # it gets TransformerDecoderLayer's cuda behavior automatically.
        self.register_buffer('mask', mask)

    def forward(self, inputs, memory_bank, src_pad_mask, tgt_pad_mask,
                layer_cache=None, step=None, para_attn=None):
        """
        Args:
            inputs (`FloatTensor`): `[batch_size x 1 x model_dim]`
            memory_bank (`FloatTensor`): `[batch_size x src_len x model_dim]`
            src_pad_mask (`LongTensor`): `[batch_size x 1 x src_len]`
            tgt_pad_mask (`LongTensor`): `[batch_size x 1 x 1]`

        Returns:
            (`FloatTensor`, `FloatTensor`, `FloatTensor`):

            * output `[batch_size x 1 x model_dim]`
            * attn `[batch_size x 1 x src_len]`
            * all_input `[batch_size x current_step x model_dim]`
        """
        # Combine the padding mask with the subsequent (causal) mask so that
        # position i can only attend to non-pad positions <= i.
        dec_mask = torch.gt(
            tgt_pad_mask +
            self.mask[:, :tgt_pad_mask.size(1), :tgt_pad_mask.size(1)],
            0)
        input_norm = self.layer_norm_1(inputs)
        all_input = input_norm

        query, _ = self.self_attn(all_input, all_input, input_norm,
                                  mask=dec_mask,
                                  layer_cache=layer_cache,
                                  type="self")

        query = self.drop(query) + inputs
        query_norm = self.layer_norm_2(query)
        mid, attn = self.context_attn(memory_bank, memory_bank, query_norm,
                                      mask=src_pad_mask,
                                      layer_cache=layer_cache,
                                      type="context")

        if para_attn is not None:
            # para_attn size is batch x block_size
            # attn size is slength x batch
            batch_size = memory_bank.size(0)
            dim_per_head = self.context_attn.dim_per_head
            head_count = self.context_attn.head_count

            def shape(x):
                """ projection """
                return x.view(batch_size, -1, head_count, dim_per_head) \
                    .transpose(1, 2)

            def unshape(x):
                """ compute context """
                return x.transpose(1, 2).contiguous() \
                    .view(batch_size, -1, head_count * dim_per_head)

            if layer_cache is not None:
                value = layer_cache['memory_values']
            else:
                value = self.context_attn.linear_values(memory_bank)
                value = shape(value)

            attn = attn * para_attn.unsqueeze(1).repeat(
                1, head_count, 1, 1)  # multiply for one step
            # renormalize attention
            attn = attn / attn.sum(-1).unsqueeze(-1)

            drop_attn = self.context_attn.dropout(attn)
            mid = unshape(torch.matmul(drop_attn, value))
            mid = self.context_attn.final_linear(mid)

        output = self.feed_forward(self.drop(mid) + query)

        return output, all_input, attn
        # return output

    def _get_attn_subsequent_mask(self, size):
        """
        Get an attention mask to avoid using the subsequent info.

        Args:
            size: int

        Returns:
            (`LongTensor`):

            * subsequent_mask `[1 x size x size]`
        """
        attn_shape = (1, size, size)
        subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
        subsequent_mask = torch.from_numpy(subsequent_mask)
        return subsequent_mask
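# A small standalone illustration of how the subsequent mask produced by
# _get_attn_subsequent_mask combines with a target padding mask in forward().
# The sizes and the toy padding pattern below are made up for the example;
# only numpy and torch are required.
import numpy as np
import torch

size = 5
attn_shape = (1, size, size)
# 1s strictly above the diagonal: position i may not attend to positions > i.
subsequent_mask = torch.from_numpy(
    np.triu(np.ones(attn_shape), k=1).astype('uint8'))

# Toy padding mask for a batch of 2 sequences of length 5, where the last two
# positions of the second sequence are padding.
tgt_pad_mask = torch.zeros(2, 1, size, dtype=torch.uint8)
tgt_pad_mask[1, :, 3:] = 1
tgt_pad_mask = tgt_pad_mask.expand(2, size, size)

# As in forward(): a position is masked out if it is padding OR in the future.
dec_mask = torch.gt(tgt_pad_mask + subsequent_mask[:, :size, :size], 0)
print(dec_mask[0])  # True marks positions sequence 0 must not attend to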