Example no. 1
    def __init__(self,
                 d_model,
                 d_ff,
                 attn_type,
                 n_heads,
                 dropout,
                 dropout_att,
                 layer_norm_eps):
        super(TransformerEncoderBlock, self).__init__()

        self.n_heads = n_heads

        # self-attention
        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.self_attn = MultiheadAttentionMechanism(key_dim=d_model,
                                                     query_dim=d_model,
                                                     attn_type=attn_type,
                                                     attn_dim=d_model,
                                                     n_heads=n_heads,
                                                     dropout=dropout_att)
        self.dropout1 = nn.Dropout(dropout)

        # feed-forward
        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.dropout2 = nn.Dropout(dropout)
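
The constructor above relies on a PositionwiseFeedForward module with the signature PositionwiseFeedForward(d_model, d_ff, dropout). A minimal sketch of such a module is shown below; it is an assumption for illustration, and the project's actual implementation may differ.

import torch.nn as nn


class PositionwiseFeedForward(nn.Module):
    """Minimal position-wise feed-forward sketch: d_model -> d_ff -> d_model."""

    def __init__(self, d_model, d_ff, dropout):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)   # expansion
        self.w_2 = nn.Linear(d_ff, d_model)   # projection back to d_model
        self.dropout = nn.Dropout(dropout)
        self.activation = nn.ReLU()

    def forward(self, xs):
        # [B, T, d_model] -> [B, T, d_ff] -> [B, T, d_model]
        return self.w_2(self.dropout(self.activation(self.w_1(xs))))
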
Example no. 2
class TransformerEncoderBlock(nn.Module):
    """A single layer of the transformer encoder.

    Args:
        d_model (int): dimension of MultiheadAttentionMechanism
        d_ff (int): dimension of PositionwiseFeedForward
        attn_type (str): type of attention mechanism
        n_heads (int): number of heads for multi-head attention
        dropout (float): dropout probabilities for linear layers
        dropout_att (float): dropout probabilities for attention distributions
        layer_norm_eps (float): epsilon parameter for layer normalization

    """
    def __init__(self, d_model, d_ff, attn_type, n_heads, dropout, dropout_att,
                 layer_norm_eps):
        super(TransformerEncoderBlock, self).__init__()

        self.n_heads = n_heads

        # self-attention
        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.self_attn = MultiheadAttentionMechanism(key_dim=d_model,
                                                     query_dim=d_model,
                                                     attn_type=attn_type,
                                                     attn_dim=d_model,
                                                     n_heads=n_heads,
                                                     dropout=dropout_att)
        self.dropout1 = nn.Dropout(dropout)

        # feed-forward
        self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, xs, xx_mask=None, cache=False):
        """Transformer encoder layer definition.

        Args:
            xs (FloatTensor): `[B, T, d_model]`
            xx_mask (ByteTensor): `[B, n_heads, T, T]`
            cache (bool): if True, skip resetting the self-attention state
        Returns:
            xs (FloatTensor): `[B, T, d_model]`
            xx_aws (FloatTensor): `[B, T, T]`

        """
        # self-attention
        if not cache:
            self.self_attn.reset()
        _xs = self.norm1(xs)
        _xs, xx_aws = self.self_attn(_xs, _xs, _xs, mask=xx_mask)
        xs = self.dropout1(_xs) + xs

        # position-wise feed-forward
        _xs = self.norm2(xs)
        _xs = self.feed_forward(_xs)
        xs = self.dropout2(_xs) + xs

        return xs, xx_aws
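
A usage sketch for TransformerEncoderBlock. The hyperparameter values are illustrative, and it assumes the MultiheadAttentionMechanism and PositionwiseFeedForward modules referenced in the constructor are importable and behave as documented above.

import torch

enc_block = TransformerEncoderBlock(d_model=256, d_ff=1024,
                                    attn_type='scaled_dot', n_heads=4,
                                    dropout=0.1, dropout_att=0.1,
                                    layer_norm_eps=1e-12)

xs = torch.randn(8, 50, 256)        # [B, T, d_model]
xs_out, xx_aws = enc_block(xs)      # xx_mask=None: attend over all frames
# xs_out keeps the input shape [B, T, d_model]; xx_aws holds the attention weights.
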
Example no. 3
    def __init__(self,
                 d_model,
                 d_ff,
                 atype,
                 n_heads,
                 dropout,
                 dropout_att,
                 layer_norm_eps,
                 ffn_activation,
                 param_init,
                 src_tgt_attention=True):
        super(TransformerDecoderBlock, self).__init__()

        self.atype = atype
        self.n_heads = n_heads
        self.src_tgt_attention = src_tgt_attention

        # self-attention
        if atype == "average":
            raise NotImplementedError
        else:
            self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            self.self_attn = MultiheadAttentionMechanism(kdim=d_model,
                                                         qdim=d_model,
                                                         adim=d_model,
                                                         atype=atype,
                                                         n_heads=n_heads,
                                                         dropout=dropout_att,
                                                         param_init=param_init)

        # attention for encoder stacks
        if src_tgt_attention:
            self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            self.src_attn = MultiheadAttentionMechanism(kdim=d_model,
                                                        qdim=d_model,
                                                        adim=d_model,
                                                        atype=atype,
                                                        n_heads=n_heads,
                                                        dropout=dropout_att,
                                                        param_init=param_init)

        # feed-forward
        self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, d_model,
                                                    dropout, ffn_activation,
                                                    param_init)

        self.dropout = nn.Dropout(p=dropout)
Example no. 4
class TransformerDecoderBlock(nn.Module):
    """A single layer of the transformer decoder.

    Args:
        d_model (int): dimension of keys/values/queries in
                       MultiheadAttentionMechanism, also the input size of
                       the first layer of the PositionwiseFeedForward
        d_ff (int): hidden dimension of the second layer of the
                    PositionwiseFeedForward
        attn_type (str): type of self-attention, scaled_dot or average
        n_heads (int): number of heads for multi-head attention
        dropout (float): dropout probabilities for linear layers
        dropout_att (float): dropout probabilities for attention distributions
        layer_norm_eps (float): epsilon parameter for layer normalization
        src_attention (bool): if False, ignore source-target attention

    """

    def __init__(self,
                 d_model,
                 d_ff,
                 attn_type,
                 n_heads,
                 dropout,
                 dropout_att,
                 layer_norm_eps,
                 src_attention=True):
        super(TransformerDecoderBlock, self).__init__()

        self.attn_type = attn_type
        self.n_heads = n_heads
        self.src_attention = src_attention

        # self-attention
        if attn_type == "average":
            raise NotImplementedError
        else:
            self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            self.self_attn = MultiheadAttentionMechanism(key_dim=d_model,
                                                         query_dim=d_model,
                                                         attn_type=attn_type,
                                                         attn_dim=d_model,
                                                         n_heads=n_heads,
                                                         dropout=dropout_att)
            self.dropout1 = nn.Dropout(dropout)

        # attention for encoder stacks
        if src_attention:
            self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            self.src_attn = MultiheadAttentionMechanism(key_dim=d_model,
                                                        query_dim=d_model,
                                                        attn_type=attn_type,
                                                        attn_dim=d_model,
                                                        n_heads=n_heads,
                                                        dropout=dropout_att)
            self.dropout2 = nn.Dropout(dropout)

        # feed-forward
        self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, ys, yy_mask=None, xs=None, xy_mask=None):
        """Transformer decoder layer definition.

        Args:
            ys (FloatTensor): `[B, L, d_model]`
            yy_mask (ByteTensor): mask for self-attention
            xs (FloatTensor): encoder outputs. `[B, T, d_model]`
            xy_mask (ByteTensor): mask for source-target attention
        Returns:
            ys (FloatTensor): `[B, L, d_model]`
            yy_aw (FloatTensor): `[B, L, L]`
            xy_aw (FloatTensor): `[B, L, T]`

        """
        # self-attention
        if self.attn_type == "average":
            raise NotImplementedError
        else:
            self.self_attn.reset()
            _ys = self.norm1(ys)
            _ys, yy_aw = self.self_attn(_ys, _ys, _ys, mask=yy_mask)
            ys = self.dropout1(_ys) + ys

        # attention for encoder stacks
        xy_aw = None
        if self.src_attention:
            self.src_attn.reset()
            _ys = self.norm2(ys)
            _ys, xy_aw = self.src_attn(key=xs, value=xs, query=_ys, mask=xy_mask)
            ys = self.dropout2(_ys) + ys

        # position-wise feed-forward
        _ys = self.norm3(ys)
        _ys = self.feed_forward(_ys)
        ys = self.dropout3(_ys) + ys

        return ys, yy_aw, xy_aw
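
A usage sketch for this TransformerDecoderBlock. The mask construction below is a common pattern and an assumption on my part; the project may build yy_mask/xy_mask differently (for example with an extra head dimension), so treat the shapes as illustrative.

import torch

dec_block = TransformerDecoderBlock(d_model=256, d_ff=1024,
                                    attn_type='scaled_dot', n_heads=4,
                                    dropout=0.1, dropout_att=0.1,
                                    layer_norm_eps=1e-12, src_attention=True)

B, L, T = 8, 20, 50
ys = torch.randn(B, L, 256)   # decoder inputs   [B, L, d_model]
xs = torch.randn(B, T, 256)   # encoder outputs  [B, T, d_model]

# Causal (lower-triangular) mask: position i may only attend to positions <= i.
yy_mask = torch.tril(torch.ones(L, L, dtype=torch.uint8))
yy_mask = yy_mask.unsqueeze(0).expand(B, -1, -1)    # [B, L, L]
# Let every target position see every encoder frame (no padding in this toy batch).
xy_mask = torch.ones(B, L, T, dtype=torch.uint8)    # [B, L, T]

ys_out, yy_aw, xy_aw = dec_block(ys, yy_mask=yy_mask, xs=xs, xy_mask=xy_mask)
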
Example no. 5
    def __init__(self,
                 d_model,
                 d_ff,
                 atype,
                 n_heads,
                 dropout,
                 dropout_att,
                 dropout_layer,
                 layer_norm_eps,
                 ffn_activation,
                 param_init,
                 src_tgt_attention=True,
                 memory_transformer=False,
                 mma_chunk_size=0,
                 mma_n_heads_mono=1,
                 mma_n_heads_chunk=1,
                 mma_init_r=2,
                 mma_eps=1e-6,
                 mma_std=1.0,
                 mma_no_denominator=False,
                 mma_1dconv=False,
                 dropout_head=0,
                 share_chunkwise_attention=False,
                 lm_fusion='',
                 ffn_bottleneck_dim=0):

        super().__init__()

        self.atype = atype
        self.n_heads = n_heads
        self.src_tgt_attention = src_tgt_attention
        self.memory_transformer = memory_transformer

        # self-attention
        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        mha = RelMHA if memory_transformer else MHA
        self.self_attn = mha(kdim=d_model,
                             qdim=d_model,
                             adim=d_model,
                             odim=d_model,
                             n_heads=n_heads,
                             dropout=dropout_att,
                             dropout_head=dropout_head,
                             param_init=param_init,
                             xl_like=memory_transformer)

        # attention over encoder stacks
        if src_tgt_attention:
            self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            if 'mocha' in atype:
                self.n_heads = mma_n_heads_mono
                self.src_attn = MoChA(
                    kdim=d_model,
                    qdim=d_model,
                    adim=d_model,
                    odim=d_model,
                    atype='scaled_dot',
                    chunk_size=mma_chunk_size,
                    n_heads_mono=mma_n_heads_mono,
                    n_heads_chunk=mma_n_heads_chunk,
                    init_r=mma_init_r,
                    eps=mma_eps,
                    noise_std=mma_std,
                    no_denominator=mma_no_denominator,
                    conv1d=mma_1dconv,
                    dropout=dropout_att,
                    dropout_head=dropout_head,
                    param_init=param_init,
                    share_chunkwise_attention=share_chunkwise_attention)
            else:
                self.src_attn = MHA(kdim=d_model,
                                    qdim=d_model,
                                    adim=d_model,
                                    odim=d_model,
                                    n_heads=n_heads,
                                    dropout=dropout_att,
                                    param_init=param_init)
        else:
            self.src_attn = None

        # position-wise feed-forward
        self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation,
                                param_init, ffn_bottleneck_dim)

        self.dropout = nn.Dropout(p=dropout)
        self.dropout_layer = dropout_layer

        # LM fusion
        self.lm_fusion = lm_fusion
        if lm_fusion:
            self.norm_lm = nn.LayerNorm(d_model, eps=layer_norm_eps)
            # NOTE: LM should be projected to d_model in advance
            self.linear_lm_feat = nn.Linear(d_model, d_model)
            self.linear_lm_gate = nn.Linear(d_model * 2, d_model)
            self.linear_lm_fusion = nn.Linear(d_model * 2, d_model)
            if 'attention' in lm_fusion:
                self.lm_attn = MHA(kdim=d_model,
                                   qdim=d_model,
                                   adim=d_model,
                                   odim=d_model,
                                   n_heads=n_heads,
                                   dropout=dropout_att,
                                   param_init=param_init)

        self.reset_visualization()
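
The lm_fusion branch configured above sets up a gated (cold-fusion-style) combination of decoder states and pre-projected LM features; the forward pass in the next example applies it. The same computation, pulled out into a standalone helper with illustrative tensor names, looks like this.

import torch

def gated_lm_fusion(out, lmout, linear_lm_feat, linear_lm_gate, linear_lm_fusion):
    """Gate the LM features and fuse them with the decoder states."""
    lm_feat = linear_lm_feat(lmout)                            # [B, L, d_model]
    gate = torch.sigmoid(linear_lm_gate(torch.cat([out, lm_feat], dim=-1)))
    gated_lm_feat = gate * lm_feat                             # element-wise gating
    return linear_lm_fusion(torch.cat([out, gated_lm_feat], dim=-1))
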
Example no. 6
class TransformerDecoderBlock(nn.Module):
    """A single layer of the Transformer decoder.

    Args:
        d_model (int): input dimension of MultiheadAttentionMechanism and PositionwiseFeedForward
        d_ff (int): hidden dimension of PositionwiseFeedForward
        atype (str): type of attention mechanism
        n_heads (int): number of heads for multi-head attention
        dropout (float): dropout probabilities for linear layers
        dropout_att (float): dropout probabilities for attention probabilities
        dropout_layer (float): LayerDrop probability
        dropout_head (float): HeadDrop probability
        layer_norm_eps (float): epsilon parameter for layer normalization
        ffn_activation (str): nonlinear function for PositionwiseFeedForward
        param_init (str): parameter initialization method
        src_tgt_attention (bool): use source-target attention
        memory_transformer (bool): TransformerXL decoder
        mma_chunk_size (int): chunk size for chunkwise attention. -1 means infinite lookback.
        mma_n_heads_mono (int): number of MMA heads
        mma_n_heads_chunk (int): number of hard chunkwise attention heads
        mma_init_r (int): initial bias value for MMA
        mma_eps (float): epsilon value for MMA
        mma_std (float): standard deviation of Gaussian noise for MMA
        mma_no_denominator (bool): remove denominator in MMA
        mma_1dconv (bool): 1dconv for MMA
        share_chunkwise_attention (bool): share chunkwise attention in the same layer of MMA
        lm_fusion (str): type of LM fusion
        ffn_bottleneck_dim (int): bottleneck dimension for the light-weight FFN layer

    """
    def __init__(self,
                 d_model,
                 d_ff,
                 atype,
                 n_heads,
                 dropout,
                 dropout_att,
                 dropout_layer,
                 layer_norm_eps,
                 ffn_activation,
                 param_init,
                 src_tgt_attention=True,
                 memory_transformer=False,
                 mma_chunk_size=0,
                 mma_n_heads_mono=1,
                 mma_n_heads_chunk=1,
                 mma_init_r=2,
                 mma_eps=1e-6,
                 mma_std=1.0,
                 mma_no_denominator=False,
                 mma_1dconv=False,
                 dropout_head=0,
                 share_chunkwise_attention=False,
                 lm_fusion='',
                 ffn_bottleneck_dim=0):

        super().__init__()

        self.atype = atype
        self.n_heads = n_heads
        self.src_tgt_attention = src_tgt_attention
        self.memory_transformer = memory_transformer

        # self-attention
        self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        mha = RelMHA if memory_transformer else MHA
        self.self_attn = mha(kdim=d_model,
                             qdim=d_model,
                             adim=d_model,
                             odim=d_model,
                             n_heads=n_heads,
                             dropout=dropout_att,
                             dropout_head=dropout_head,
                             param_init=param_init,
                             xl_like=memory_transformer)

        # attention over encoder stacks
        if src_tgt_attention:
            self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps)
            if 'mocha' in atype:
                self.n_heads = mma_n_heads_mono
                self.src_attn = MoChA(
                    kdim=d_model,
                    qdim=d_model,
                    adim=d_model,
                    odim=d_model,
                    atype='scaled_dot',
                    chunk_size=mma_chunk_size,
                    n_heads_mono=mma_n_heads_mono,
                    n_heads_chunk=mma_n_heads_chunk,
                    init_r=mma_init_r,
                    eps=mma_eps,
                    noise_std=mma_std,
                    no_denominator=mma_no_denominator,
                    conv1d=mma_1dconv,
                    dropout=dropout_att,
                    dropout_head=dropout_head,
                    param_init=param_init,
                    share_chunkwise_attention=share_chunkwise_attention)
            else:
                self.src_attn = MHA(kdim=d_model,
                                    qdim=d_model,
                                    adim=d_model,
                                    odim=d_model,
                                    n_heads=n_heads,
                                    dropout=dropout_att,
                                    param_init=param_init)
        else:
            self.src_attn = None

        # position-wise feed-forward
        self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps)
        self.feed_forward = FFN(d_model, d_ff, dropout, ffn_activation,
                                param_init, ffn_bottleneck_dim)

        self.dropout = nn.Dropout(p=dropout)
        self.dropout_layer = dropout_layer

        # LM fusion
        self.lm_fusion = lm_fusion
        if lm_fusion:
            self.norm_lm = nn.LayerNorm(d_model, eps=layer_norm_eps)
            # NOTE: LM should be projected to d_model in advance
            self.linear_lm_feat = nn.Linear(d_model, d_model)
            self.linear_lm_gate = nn.Linear(d_model * 2, d_model)
            self.linear_lm_fusion = nn.Linear(d_model * 2, d_model)
            if 'attention' in lm_fusion:
                self.lm_attn = MHA(kdim=d_model,
                                   qdim=d_model,
                                   adim=d_model,
                                   odim=d_model,
                                   n_heads=n_heads,
                                   dropout=dropout_att,
                                   param_init=param_init)

        self.reset_visualization()

    @property
    def yy_aws(self):
        return self._yy_aws

    @property
    def xy_aws(self):
        return self._xy_aws

    @property
    def xy_aws_beta(self):
        return self._xy_aws_beta

    @property
    def xy_aws_p_choose(self):
        return self._xy_aws_p_choose

    @property
    def yy_aws_lm(self):
        return self._yy_aws_lm

    def reset_visualization(self):
        self._yy_aws = None
        self._xy_aws = None
        self._xy_aws_beta = None
        self._xy_aws_p_choose = None
        self._yy_aws_lm = None

    def reset(self):
        if self.src_attn is not None:
            self.src_attn.reset()

    def forward(self,
                ys,
                yy_mask,
                xs=None,
                xy_mask=None,
                cache=None,
                xy_aws_prev=None,
                mode='hard',
                eps_wait=-1,
                lmout=None,
                pos_embs=None,
                memory=None,
                u_bias=None,
                v_bias=None):
        """Transformer decoder forward pass.

        Args:
            ys (FloatTensor): `[B, L, d_model]`
            yy_mask (ByteTensor): `[B, L (query), L (key)]`
            xs (FloatTensor): encoder outputs. `[B, T, d_model]`
            xy_mask (ByteTensor): `[B, L, T]`
            cache (FloatTensor): `[B, L-1, d_model]`
            xy_aws_prev (FloatTensor): `[B, H, L, T]`
            mode (str): decoding mode for MMA
            eps_wait (int): wait time delay for head-synchronous decoding in MMA
            lmout (FloatTensor): `[B, L, d_model]`
            pos_embs (LongTensor): `[L, 1, d_model]`
            memory (FloatTensor): `[B, L_prev, d_model]`
            u_bias (FloatTensor): global parameter for TransformerXL
            v_bias (FloatTensor): global parameter for TransformerXL
        Returns:
            out (FloatTensor): `[B, L, d_model]`

        """
        self.reset_visualization()

        # LayerDrop
        if (self.dropout_layer > 0 and self.training
                and random.random() < self.dropout_layer):
            return ys

        residual = ys
        if self.memory_transformer:
            if cache is not None:
                pos_embs = pos_embs[-ys.size(1):]
            if memory is not None and memory.dim() > 1:
                cat = self.norm1(torch.cat([memory, ys], dim=1))
                ys = cat[:, memory.size(1):]
            else:
                ys = self.norm1(ys)
                cat = ys
        else:
            ys = self.norm1(ys)  # pre-norm

        if cache is not None:
            ys_q = ys[:, -1:]
            residual = residual[:, -1:]
            yy_mask = yy_mask[:, -1:]
        else:
            ys_q = ys

        # self-attention
        if self.memory_transformer:
            out, self._yy_aws = self.self_attn(cat, ys_q, pos_embs, yy_mask,
                                               u_bias, v_bias)  # k/q/m
        else:
            out, self._yy_aws = self.self_attn(ys, ys, ys_q,
                                               mask=yy_mask)[:2]  # k/v/q
        out = self.dropout(out) + residual

        # attention over encoder stacks
        if self.src_tgt_attention:
            residual = out
            out = self.norm2(out)
            out, self._xy_aws, attn_state = self.src_attn(
                xs, xs, out,  # key/value/query
                mask=xy_mask,
                aw_prev=xy_aws_prev,
                mode=mode,
                eps_wait=eps_wait)
            out = self.dropout(out) + residual

            if attn_state.get('beta', None) is not None:
                self._xy_aws_beta = attn_state['beta']
            if attn_state.get('p_choose', None) is not None:
                self._xy_aws_p_choose = attn_state['p_choose']

        # LM integration
        if self.lm_fusion:
            residual = out
            out = self.norm_lm(out)
            lmout = self.linear_lm_feat(lmout)

            # attention over LM outputs
            if 'attention' in self.lm_fusion:
                out, self._yy_aws_lm, _ = self.lm_attn(lmout,
                                                       lmout,
                                                       out,
                                                       mask=yy_mask)  # k/v/q

            gate = torch.sigmoid(
                self.linear_lm_gate(torch.cat([out, lmout], dim=-1)))
            gated_lmout = gate * lmout
            out = self.linear_lm_fusion(torch.cat([out, gated_lmout], dim=-1))
            out = self.dropout(out) + residual

        # position-wise feed-forward
        residual = out
        out = self.norm3(out)
        out = self.feed_forward(out)
        out = self.dropout(out) + residual

        if cache is not None:
            out = torch.cat([cache, out], dim=1)

        return out
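
The cache argument enables step-wise decoding: when cache is passed, only the newest query position is recomputed and the block returns the running sequence of outputs. Below is a hedged sketch of such a driving loop, assuming a block built with standard multi-head attention (atype='scaled_dot', src_tgt_attention=True, no memory_transformer) and that the MHA/FFN modules accept the arguments used in the forward pass above. New token embeddings are faked with random tensors here; a real decoder would embed the previously predicted token instead.

import torch

def incremental_decode_sketch(block, xs, d_model, max_len=5):
    """Illustrative step-wise decoding loop around a single decoder block."""
    B, T, _ = xs.size()
    ys = torch.randn(B, 1, d_model)   # stands in for the <sos> embedding
    cache = None
    for _ in range(max_len):
        L = ys.size(1)
        # Causal mask over the target side, full visibility over the encoder side.
        yy_mask = torch.tril(torch.ones(L, L, dtype=torch.uint8))
        yy_mask = yy_mask.unsqueeze(0).expand(B, -1, -1)    # [B, L, L]
        xy_mask = torch.ones(B, L, T, dtype=torch.uint8)    # [B, L, T]
        out = block(ys, yy_mask, xs=xs, xy_mask=xy_mask, cache=cache)
        cache = out   # [B, L, d_model]; only the last position was recomputed this step
        ys = torch.cat([ys, torch.randn(B, 1, d_model)], dim=1)
    return cache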