    def __init__(self, d_model, heads, d_ff, dropout):
        super(TransformerDecoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        mask = self._get_attn_subsequent_mask(MAX_SIZE)
        # Register self.mask as a buffer in TransformerDecoderLayer, so
        # it gets TransformerDecoderLayer's cuda behavior automatically.
        self.register_buffer('mask', mask)
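# A minimal standalone sketch (hypothetical helper, not part of the snippet above)
# of why the mask is registered as a buffer: buffers follow the module's device,
# so .cuda() / .to(device) moves the mask together with the parameters.
import torch
import torch.nn as nn

class _MaskHolder(nn.Module):
    def __init__(self, size=4):
        super(_MaskHolder, self).__init__()
        mask = torch.triu(torch.ones(1, size, size, dtype=torch.uint8), diagonal=1)
        self.register_buffer('mask', mask)

holder = _MaskHolder()
print(holder.mask.device)             # cpu
if torch.cuda.is_available():
    print(holder.cuda().mask.device)  # cuda:0 -- the buffer moved with the module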
Example #2
    def __init__(self, d_model, heads, d_ff, dropout):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 mem_args=None,
                 use_mem=False):
        super(TransformerInterLayer, self).__init__()
        self.d_model, self.heads = d_model, heads
        self.d_per_head = self.d_model // self.heads
        self.layer_norm1 = nn.LayerNorm(d_model, eps=1e-6)

        self.pooling = MultiHeadedPooling(heads,
                                          d_model,
                                          dropout=dropout,
                                          use_final_linear=False)

        self.layer_norm2 = nn.LayerNorm(self.d_per_head, eps=1e-6)

        self.inter_att = MultiHeadedAttention(1,
                                              self.d_per_head,
                                              dropout,
                                              use_final_linear=False)

        self.linear = nn.Linear(self.d_model, self.d_model)

        self.dropout = nn.Dropout(dropout)
        if mem_args is not None and use_mem:
            self.feed_forward = HashingMemory.build(d_model, d_model, mem_args)
        else:
            self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    def __init__(self, d_model, heads, d_ff, dropout):
        super(TransformerQueryInterLayer, self).__init__()
        self.d_model, self.heads = d_model, heads
        self.d_per_head = self.d_model // self.heads
        self.layer_norm1 = nn.LayerNorm(d_model, eps=1e-6)

        ### query related
        self.paragraph_pooling = MultiHeadedPooling(heads,
                                                    d_model,
                                                    dropout=dropout,
                                                    use_final_linear=False)
        self.query_pooling = MultiHeadedPooling(heads,
                                                d_model,
                                                dropout=dropout,
                                                use_final_linear=False)
        self.query_layer_norm1 = nn.LayerNorm(self.d_model, eps=1e-6)
        self.query_layer_norm2 = nn.LayerNorm(self.d_per_head, eps=1e-6)

        self.layer_norm2 = nn.LayerNorm(self.d_per_head, eps=1e-6)

        self.inter_att = MultiHeadedAttention(1,
                                              self.d_per_head,
                                              dropout,
                                              use_final_linear=False)

        self.linear = nn.Linear(self.d_model, self.d_model)

        self.dropout = nn.Dropout(dropout)

        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    def __init__(self, d_model, heads, d_ff, dropout):
        super(TransformerQueryEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

        ## query related
        self.query_pooling = MultiHeadedPooling(heads,
                                                d_model,
                                                dropout=dropout,
                                                use_final_linear=False)
        self.query_layer_norm1 = nn.LayerNorm(d_model, eps=1e-6)
        self.query_layer_norm2 = nn.LayerNorm(d_model, eps=1e-6)
Example #6
    def __init__(self, d_model, heads, d_ff, dropout):
        super(TransformerNewInterLayer, self).__init__()

        self.layer_norm1 = nn.LayerNorm(d_model, eps=1e-6)

        self.pooling = MultiHeadedPooling(heads, d_model, dropout=dropout)

        self.layer_norm2 = nn.LayerNorm(d_model, eps=1e-6)

        self.inter_att = MultiHeadedAttention(heads,
                                              d_model,
                                              dropout,
                                              use_final_linear=True)
        self.dropout = nn.Dropout(dropout)

        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    def __init__(self,
                 d_model,
                 heads,
                 d_ff,
                 dropout,
                 mem_args=None,
                 use_mem=False):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
        if mem_args is not None and use_mem:
            self.feed_forward = HashingMemory.build(d_model, d_model, mem_args)
        else:
            self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)
Example #8
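# Assumed imports for the full class below; the project-specific pieces
# (MultiHeadedAttention, PositionwiseFeedForward, MAX_SIZE) come from the
# surrounding codebase, whose module paths are not shown in this snippet.
import numpy as np
import torch
import torch.nn as nn
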
class TransformerDecoderLayer(nn.Module):
    """
    Args:
      d_model (int): the dimension of keys/values/queries in
                       MultiHeadedAttention, also the input size of
                       the first layer of the PositionwiseFeedForward.
      heads (int): the number of heads for MultiHeadedAttention.
      d_ff (int): the hidden size of the second layer of the
                    PositionwiseFeedForward.
      dropout (float): dropout probability (0-1.0).
    """
    def __init__(self, d_model, heads, d_ff, dropout):
        super(TransformerDecoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
        self.context_attn = MultiHeadedAttention(heads,
                                                 d_model,
                                                 dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm_1 = nn.LayerNorm(d_model, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(d_model, eps=1e-6)
        self.drop = nn.Dropout(dropout)
        mask = self._get_attn_subsequent_mask(MAX_SIZE)
        # Register self.mask as a buffer in TransformerDecoderLayer, so
        # it gets TransformerDecoderLayer's cuda behavior automatically.
        self.register_buffer('mask', mask)

    def forward(self,
                inputs,
                memory_bank,
                src_pad_mask,
                tgt_pad_mask,
                layer_cache=None,
                step=None,
                para_attn=None):
        """
        Args:
            inputs (`FloatTensor`): `[batch_size x 1 x model_dim]`
            memory_bank (`FloatTensor`): `[batch_size x src_len x model_dim]`
            src_pad_mask (`LongTensor`): `[batch_size x 1 x src_len]`
            tgt_pad_mask (`LongTensor`): `[batch_size x 1 x 1]`

        Returns:
            (`FloatTensor`, `FloatTensor`, `FloatTensor`):

            * output `[batch_size x 1 x model_dim]`
            * attn `[batch_size x 1 x src_len]`
            * all_input `[batch_size x current_step x model_dim]`

        """
        dec_mask = torch.gt(
            tgt_pad_mask +
            self.mask[:, :tgt_pad_mask.size(1), :tgt_pad_mask.size(1)], 0)
        input_norm = self.layer_norm_1(inputs)
        all_input = input_norm

        query, _ = self.self_attn(all_input,
                                  all_input,
                                  input_norm,
                                  mask=dec_mask,
                                  layer_cache=layer_cache,
                                  type="self")

        query = self.drop(query) + inputs

        query_norm = self.layer_norm_2(query)
        mid, attn = self.context_attn(memory_bank,
                                      memory_bank,
                                      query_norm,
                                      mask=src_pad_mask,
                                      layer_cache=layer_cache,
                                      type="context")

        if para_attn is not None:
            # para_attn: paragraph-level attention weights over the source,
            # tiled across heads below
            # attn: [batch x heads x 1 x src_len] at the current decoding step
            batch_size = memory_bank.size(0)
            dim_per_head = self.context_attn.dim_per_head
            head_count = self.context_attn.head_count

            def shape(x):
                """  projection """
                return x.view(batch_size, -1, head_count, dim_per_head) \
                    .transpose(1, 2)

            def unshape(x):
                """  compute context """
                return x.transpose(1, 2).contiguous() \
                    .view(batch_size, -1, head_count * dim_per_head)

            if layer_cache is not None:
                value = layer_cache['memory_values']
            else:
                value = self.context_attn.linear_values(memory_bank)
                value = shape(value)

            attn = attn * para_attn.unsqueeze(1).repeat(
                1, head_count, 1, 1)  # multiply for one step
            # renormalize attention
            attn = attn / attn.sum(-1).unsqueeze(-1)
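            # e.g. (hypothetical numbers) a head's step attention [0.5, 0.3, 0.2]
            # times paragraph weights [1.0, 0.5, 0.1] gives [0.5, 0.15, 0.02],
            # which the division above rescales to sum to 1: [0.746, 0.224, 0.030]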
            drop_attn = self.context_attn.dropout(attn)

            mid = unshape(torch.matmul(drop_attn, value))
            mid = self.context_attn.final_linear(mid)

        output = self.feed_forward(self.drop(mid) + query)

        return output, all_input, attn
        # return output

    def _get_attn_subsequent_mask(self, size):
        """
        Get an attention mask to avoid using the subsequent info.

        Args:
            size: int

        Returns:
            (`ByteTensor`):

            * subsequent_mask `[1 x size x size]`
        """
        attn_shape = (1, size, size)
        subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype('uint8')
        subsequent_mask = torch.from_numpy(subsequent_mask)
        return subsequent_mask
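
# A minimal standalone sketch (hypothetical size) of what _get_attn_subsequent_mask
# returns: an upper-triangular uint8 mask where 1 marks future positions the
# decoder's self-attention must not look at.
import numpy as np
import torch

size = 4
mask = torch.from_numpy(np.triu(np.ones((1, size, size)), k=1).astype('uint8'))
# tensor([[[0, 1, 1, 1],
#          [0, 0, 1, 1],
#          [0, 0, 0, 1],
#          [0, 0, 0, 0]]], dtype=torch.uint8)
# In forward(), torch.gt(tgt_pad_mask + mask[:, :t, :t], 0) merges this with the
# target padding mask, so position i attends only to positions <= i.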