Example #1
def test_basetransformerlayer():
    # note the trailing comma: attn_cfgs becomes a 1-element tuple of cfg
    # dicts, which BaseTransformerLayer also accepts for a single attention op
    attn_cfgs = dict(type='MultiheadAttention', embed_dims=256, num_heads=8),
    feedforward_channels = 2048
    ffn_dropout = 0.1
    operation_order = ('self_attn', 'norm', 'ffn', 'norm')

    # test deprecated args: feedforward_channels and ffn_dropout passed
    # directly instead of through ffn_cfgs
    baselayer = BaseTransformerLayer(attn_cfgs=attn_cfgs,
                                     feedforward_channels=feedforward_channels,
                                     ffn_dropout=ffn_dropout,
                                     operation_order=operation_order)
    assert baselayer.batch_first is False
    assert baselayer.ffns[0].feedforward_channels == feedforward_channels

    # same configuration again, but with batch_first=True so inputs are
    # expected as (batch, seq_len, embed_dims)
    attn_cfgs = dict(type='MultiheadAttention', num_heads=8, embed_dims=256),
    feedforward_channels = 2048
    ffn_dropout = 0.1
    operation_order = ('self_attn', 'norm', 'ffn', 'norm')
    baselayer = BaseTransformerLayer(attn_cfgs=attn_cfgs,
                                     feedforward_channels=feedforward_channels,
                                     ffn_dropout=ffn_dropout,
                                     operation_order=operation_order,
                                     batch_first=True)
    assert baselayer.attentions[0].batch_first
    in_tensor = torch.rand(2, 10, 256)  # (batch, seq_len, embed_dims)
    baselayer(in_tensor)
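
These snippets leave out their imports. A minimal import block that the test examples (#1 and #4) appear to rely on is sketched below; the module paths follow mmcv 1.x and are an assumption, and PositionalEncoding used in Examples #2 and #3 comes from the surrounding project (e.g. mmocr), so it is not listed here.

import copy

import torch

from mmcv.cnn.bricks.transformer import BaseTransformerLayer
from mmcv.runner import ModuleList
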
Example #2
    def __init__(self,
                 n_layers=2,
                 n_head=8,
                 d_model=512,
                 d_inner=2048,
                 dropout=0.1,
                 max_len=8 * 32,
                 init_cfg=None):
        super().__init__(init_cfg=init_cfg)

        assert d_model % n_head == 0, 'd_model must be divisible by n_head'

        self.pos_encoder = PositionalEncoding(d_model, n_position=max_len)
        # one post-norm encoder block: self-attention -> LayerNorm -> FFN -> LayerNorm
        encoder_layer = BaseTransformerLayer(
            operation_order=('self_attn', 'norm', 'ffn', 'norm'),
            attn_cfgs=dict(
                type='MultiheadAttention',
                embed_dims=d_model,
                num_heads=n_head,
                attn_drop=dropout,
                dropout_layer=dict(type='Dropout', drop_prob=dropout),
            ),
            ffn_cfgs=dict(
                type='FFN',
                embed_dims=d_model,
                feedforward_channels=d_inner,
                ffn_drop=dropout,
            ),
            norm_cfg=dict(type='LN'),
        )
        # stack n_layers independent (deep-copied) encoder blocks
        self.transformer = ModuleList(
            [copy.deepcopy(encoder_layer) for _ in range(n_layers)])
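
Since batch_first is left at its default (False), each of these encoder blocks expects input shaped (seq_len, batch, d_model). A hypothetical sketch of how the stack could be applied to a flattened (N, C, H, W) feature map follows; the forward method and the feature variable are assumptions, not part of the original snippet.

    def forward(self, feature):  # hypothetical usage sketch
        n, c, h, w = feature.shape
        feature = feature.view(n, c, h * w).transpose(1, 2)  # (N, H*W, C)
        feature = self.pos_encoder(feature)                  # add positional encodings
        feature = feature.transpose(0, 1)                    # (H*W, N, C) for batch_first=False
        for layer in self.transformer:
            feature = layer(feature)  # self-attention only: key/value default to the query
        return feature.transpose(0, 1)                       # back to (N, H*W, C)
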
Example #3
    def __init__(self,
                 d_model=512,
                 n_head=8,
                 d_inner=2048,
                 n_layers=4,
                 max_seq_len=40,
                 dropout=0.1,
                 detach_tokens=True,
                 num_chars=90,
                 use_self_attn=False,
                 pad_idx=0,
                 init_cfg=None,
                 **kwargs):
        super().__init__(init_cfg=init_cfg)
        self.detach_tokens = detach_tokens

        self.d_model = d_model
        self.max_seq_len = max_seq_len

        # project num_chars-dimensional token vectors to d_model (no bias)
        self.proj = nn.Linear(num_chars, d_model, False)
        self.token_encoder = PositionalEncoding(d_model,
                                                n_position=self.max_seq_len,
                                                dropout=0.1)
        self.pos_encoder = PositionalEncoding(d_model,
                                              n_position=self.max_seq_len)
        self.pad_idx = pad_idx

        # optionally run self-attention before the cross-attention in each block
        if use_self_attn:
            operation_order = ('self_attn', 'norm', 'cross_attn', 'norm',
                               'ffn', 'norm')
        else:
            operation_order = ('cross_attn', 'norm', 'ffn', 'norm')

        # one post-norm decoder block; 'cross_attn' attends to key/value
        # features supplied at forward time
        decoder_layer = BaseTransformerLayer(
            operation_order=operation_order,
            attn_cfgs=dict(
                type='MultiheadAttention',
                embed_dims=d_model,
                num_heads=n_head,
                attn_drop=dropout,
                dropout_layer=dict(type='Dropout', drop_prob=dropout),
            ),
            ffn_cfgs=dict(
                type='FFN',
                embed_dims=d_model,
                feedforward_channels=d_inner,
                ffn_drop=dropout,
            ),
            norm_cfg=dict(type='LN'),
        )
        # stack n_layers independent (deep-copied) decoder blocks
        self.decoder_layers = ModuleList(
            [copy.deepcopy(decoder_layer) for _ in range(n_layers)])

        # classifier head projecting back to the character vocabulary
        self.cls = nn.Linear(d_model, num_chars)
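
As above, batch_first is not set, so the attention modules expect (seq_len, batch, d_model) inputs. A hypothetical sketch of a forward pass through this decoder stack is shown below; tokens is an assumed name for (N, max_seq_len, num_chars) character-token scores, and nothing in it is part of the original snippet.

    def forward(self, tokens):  # hypothetical usage sketch
        if self.detach_tokens:
            tokens = tokens.detach()                       # cut gradients into the token scores
        embed = self.token_encoder(self.proj(tokens))      # (N, T, d_model) token embeddings
        query = self.pos_encoder(torch.zeros_like(embed))  # position-only queries, (N, T, d_model)
        query, embed = query.transpose(0, 1), embed.transpose(0, 1)  # (T, N, d_model)
        output = query
        for layer in self.decoder_layers:
            # cross_attn: positional queries attend to the token embeddings
            output = layer(query=output, key=embed, value=embed)
        return self.cls(output.transpose(0, 1))            # (N, T, num_chars) logits
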
Example #4
def test_basetransformerlayer_cuda():
    # check that BaseTransformerLayer behaves consistently after being
    # deepcopied and moved to the GPU
    operation_order = ('self_attn', 'ffn')
    baselayer = BaseTransformerLayer(
        operation_order=operation_order,
        batch_first=True,
        attn_cfgs=dict(
            type='MultiheadAttention',
            embed_dims=256,
            num_heads=8,
        ),
    )
    baselayers = ModuleList([copy.deepcopy(baselayer) for _ in range(2)])
    baselayers.to('cuda')
    x = torch.rand(2, 10, 256).cuda()
    for m in baselayers:
        x = m(x)
        assert x.shape == torch.Size([2, 10, 256])