Example #1
class ConformerEncoder(BaseEncoder):
    def __init__(self,
                 d_model,
                 d_ff,
                 cov_kernel_size,
                 n_heads,
                 nblocks=12,
                 pos_dropout=0.0,
                 slf_attn_dropout=0.0,
                 ffn_dropout=0.0,
                 residual_dropout=0.1,
                 conv_dropout=0.0,
                 macaron_style=True,
                 ffn_scale=0.5,
                 conv_bias=True,
                 relative_positional=True,
                 activation='glu'):
        super(ConformerEncoder, self).__init__()

        self.relative_positional = relative_positional

        self.pos_emb = PositionalEncoding(d_model, pos_dropout)

        self.blocks = nn.ModuleList([
            ConformerEncoderBlock(d_model, d_ff, cov_kernel_size, n_heads,
                                  slf_attn_dropout, ffn_dropout,
                                  residual_dropout, conv_dropout,
                                  macaron_style, ffn_scale, conv_bias,
                                  relative_positional, activation)
            for _ in range(nblocks)
        ])

        self.output_size = d_model
Example #2
class TransformerDecoder(nn.Module):
    def __init__(self, vocab_size, d_model=256, n_heads=4, d_ff=2048, memory_dim=256,
                 n_blocks=6, pos_dropout=0.0, slf_attn_dropout=0.0, src_attn_dropout=0.0,
                 ffn_dropout=0.0, residual_dropout=0.1, activation='relu',
                 normalize_before=True, concat_after=False, share_embedding=False):
        super(TransformerDecoder, self).__init__()

        self.decoder_type = 'transformer'
        self.normalize_before = normalize_before
        self.relative_positional = False

        self.d_model = d_model

        self.embedding = nn.Embedding(vocab_size, d_model)

        self.pos_emb = PositionalEncoding(d_model, pos_dropout)

        self.blocks = nn.ModuleList([
            TransformerDecoderLayer(
                n_heads, d_model, d_ff, memory_dim, slf_attn_dropout, src_attn_dropout,
                ffn_dropout, residual_dropout, normalize_before=normalize_before, concat_after=concat_after,
                relative_positional=False, activation=activation) for _ in range(n_blocks)
        ])

        if self.normalize_before:
            self.after_norm = nn.LayerNorm(d_model)

        self.output_layer = nn.Linear(d_model, vocab_size)

        if share_embedding:
            assert self.embedding.weight.size() == self.output_layer.weight.size()
            self.output_layer.weight = self.embedding.weight
            logger.info('Tie the weights between the embedding and output layer.')
Example #3
    def __init__(self, params):
        super(TransformerLanguageModel, self).__init__(params)

        self.model_type = 'transformer_lm'
        self.normalize_before = False
        self.smoothing = params['smoothing']
        self.vocab_size = params['vocab_size']
        self.num_blocks = params['num_blocks']

        self.embedding = nn.Embedding(self.vocab_size, params['d_model'])
        self.pos_embedding = PositionalEncoding(params['d_model'], 0.0)

        self.blocks = nn.ModuleList([
            TransformerEncoderLayer(
                params['n_heads'], params['d_model'], params['d_ff'],
                slf_attn_dropout=0.0, ffn_dropout=0.0,
                residual_dropout=params['residual_dropout'],
                normalize_before=False, concat_after=False, activation='glu') for _ in range(self.num_blocks)
        ])

        if self.normalize_before:
            self.after_norm = nn.LayerNorm(params['d_model'])

        self.output_project = nn.Linear(params['d_model'], self.vocab_size)

        if params['share_embedding']:
            self.output_project.weight = self.embedding.weight
            print('Tie the embedding weights to the output projection layer.')

        self.crit = LabelSmoothingLoss(size=self.vocab_size, smoothing=self.smoothing, padding_idx=PAD)
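
A hypothetical configuration for the language model above, shown only to make the expected `params` keys concrete; the values are illustrative, the base class may require additional keys, and `LabelSmoothingLoss` and `PAD` are assumed to come from the same codebase.

params = {
    'vocab_size': 5000,
    'd_model': 512,
    'n_heads': 8,
    'd_ff': 2048,
    'num_blocks': 6,
    'residual_dropout': 0.1,
    'smoothing': 0.1,
    'share_embedding': True,
}
lm = TransformerLanguageModel(params)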
Example #4
class ConformerEncoder(BaseEncoder):
    def __init__(self,
                 d_model,
                 d_ff,
                 cov_kernel_size,
                 n_heads,
                 nblocks=12,
                 pos_dropout=0.0,
                 slf_attn_dropout=0.0,
                 ffn_dropout=0.0,
                 residual_dropout=0.1,
                 conv_dropout=0.0,
                 macaron_style=True,
                 ffn_scale=0.5,
                 conv_bias=True,
                 relative_positional=True,
                 activation='glu'):
        super(ConformerEncoder, self).__init__()

        self.relative_positional = relative_positional

        self.pos_emb = PositionalEncoding(d_model, pos_dropout)

        self.blocks = nn.ModuleList([
            ConformerEncoderBlock(d_model, d_ff, cov_kernel_size, n_heads,
                                  slf_attn_dropout, ffn_dropout,
                                  residual_dropout, conv_dropout,
                                  macaron_style, ffn_scale, conv_bias,
                                  relative_positional, activation)
            for _ in range(nblocks)
        ])

        self.output_size = d_model

    def forward(self, inputs, mask):

        if self.relative_positional:
            enc_output = inputs
            # [1, 2T - 1]
            position = torch.arange(-(inputs.size(1) - 1),
                                    inputs.size(1),
                                    device=inputs.device).reshape(1, -1)
            pos = self.pos_emb._embedding_from_positions(position)
        else:
            enc_output, pos = self.pos_emb(inputs)

        # zero out padded frames before the first block
        enc_output.masked_fill_(~mask.unsqueeze(2), 0.0)

        attn_weights = {}
        for i, block in enumerate(self.blocks):
            enc_output, attn_weight = block(enc_output, mask, pos)
            attn_weights['enc_block_%d' % i] = attn_weight

        return enc_output, mask, attn_weights
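
A minimal usage sketch for this encoder, assuming `BaseEncoder`, `PositionalEncoding`, and `ConformerEncoderBlock` are available from the surrounding codebase; the shapes and hyperparameters below are illustrative only.

import torch

encoder = ConformerEncoder(d_model=256, d_ff=1024, cov_kernel_size=15, n_heads=4, nblocks=12)

inputs = torch.randn(8, 100, 256)            # [batch, time, d_model] frame features
mask = torch.ones(8, 100, dtype=torch.bool)  # True for valid frames, False for padding

enc_output, mask, attn_weights = encoder(inputs, mask)
# enc_output: [8, 100, 256]; attn_weights['enc_block_0'] holds the first block's attention map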
Example #5
class TransformerEncoder(nn.Module):
    def __init__(
        self,
        d_model=256,
        n_heads=4,
        d_ff=2048,
        n_blocks=6,
        pos_dropout=0.0,
        slf_attn_dropout=0.0,
        ffn_dropout=0.0,
        residual_dropout=0.1,
        normalize_before=False,
        concat_after=False,
        relative_positional=False,
        activation="relu",
    ):
        super(TransformerEncoder, self).__init__()

        self.normalize_before = normalize_before
        self.relative_positional = relative_positional

        self.pos_emb = PositionalEncoding(d_model, pos_dropout)

        self.blocks = nn.ModuleList([
            TransformerEncoderLayer(
                n_heads,
                d_model,
                d_ff,
                slf_attn_dropout,
                ffn_dropout,
                residual_dropout=residual_dropout,
                normalize_before=normalize_before,
                concat_after=concat_after,
                relative_positional=relative_positional,
                activation=activation,
            ) for _ in range(n_blocks)
        ])

        if self.normalize_before:
            self.norm = nn.LayerNorm(d_model)
Example #6
class TransformerEncoder(nn.Module):
    def __init__(
        self,
        d_model=256,
        n_heads=4,
        d_ff=2048,
        n_blocks=6,
        pos_dropout=0.0,
        slf_attn_dropout=0.0,
        ffn_dropout=0.0,
        residual_dropout=0.1,
        normalize_before=False,
        concat_after=False,
        relative_positional=False,
        activation="relu",
    ):
        super(TransformerEncoder, self).__init__()

        self.normalize_before = normalize_before
        self.relative_positional = relative_positional

        self.pos_emb = PositionalEncoding(d_model, pos_dropout)

        self.blocks = nn.ModuleList([
            TransformerEncoderLayer(
                n_heads,
                d_model,
                d_ff,
                slf_attn_dropout,
                ffn_dropout,
                residual_dropout=residual_dropout,
                normalize_before=normalize_before,
                concat_after=concat_after,
                relative_positional=relative_positional,
                activation=activation,
            ) for _ in range(n_blocks)
        ])

        if self.normalize_before:
            self.norm = nn.LayerNorm(d_model)

    def forward(self, inputs, mask):
        if self.relative_positional:
            enc_output = inputs

            # relative positions in [-(T - 1), T - 1], shape [1, 2T - 1]
            position = flow.arange(-(inputs.size(1) - 1),
                                   inputs.size(1),
                                   device=inputs.device).reshape(1, -1)
            pos = self.pos_emb._embedding_from_positions(position)
        else:
            enc_output, pos = self.pos_emb(inputs)

        attn_weights = {}
        for i, block in enumerate(self.blocks):
            enc_output, attn_weight = block(enc_output, mask.unsqueeze(1), pos)
            attn_weights["enc_block_%d" % i] = attn_weight

        if self.normalize_before:
            enc_output = self.norm(enc_output)

        return enc_output, mask, attn_weights
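
A brief usage sketch, assuming the `flow` alias refers to `import oneflow as flow` and that `PositionalEncoding` and `TransformerEncoderLayer` come from the same codebase; shapes are illustrative.

import oneflow as flow

encoder = TransformerEncoder(d_model=256, n_heads=4, d_ff=2048, n_blocks=6)

inputs = flow.randn(8, 100, 256)           # [batch, time, d_model]
mask = flow.ones(8, 100, dtype=flow.bool)  # True for valid positions

enc_output, mask, attn_weights = encoder(inputs, mask)
# enc_output: [8, 100, 256]; the mask is returned unchanged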
Example #7
class ConformerEncoder(BaseEncoder):
    def __init__(
        self,
        d_model,
        d_ff,
        cov_kernel_size,
        n_heads,
        nblocks=12,
        pos_dropout=0.0,
        slf_attn_dropout=0.0,
        ffn_dropout=0.0,
        residual_dropout=0.1,
        conv_dropout=0.0,
        macaron_style=True,
        ffn_scale=0.5,
        conv_bias=True,
        positional_encoding=True,
        relative_positional=True,
        conv_first=False,
        activation="glu",
    ):
        super(ConformerEncoder, self).__init__()

        self.positional_encoding = positional_encoding
        self.relative_positional = relative_positional
        self.output_size = d_model

        if self.positional_encoding:
            self.pos_emb = PositionalEncoding(d_model, pos_dropout)

        self.blocks = nn.ModuleList([
            ConformerEncoderBlock(
                d_model,
                d_ff,
                cov_kernel_size,
                n_heads,
                slf_attn_dropout,
                ffn_dropout,
                residual_dropout,
                conv_dropout,
                macaron_style,
                conv_first,
                ffn_scale,
                conv_bias,
                relative_positional,
                activation,
            ) for _ in range(nblocks)
        ])

    def _pos_encoding(self, inputs):
        if self.relative_positional:
            enc_output = inputs
            # [1, 2T - 1]
            position = flow.arange(-(inputs.size(1) - 1),
                                   inputs.size(1),
                                   device=inputs.device).reshape(1, -1)
            pos = self.pos_emb._embedding_from_positions(position)
        else:
            enc_output, pos = self.pos_emb(inputs)
        return enc_output, pos

    def forward(self, inputs, mask):

        if self.positional_encoding:
            enc_output, pos = self._pos_encoding(inputs)
        else:
            enc_output = inputs
            pos = None

        attn_weights = {}
        for i, block in enumerate(self.blocks):
            enc_output, attn_weight = block(enc_output, mask, pos)
            attn_weights["enc_block_%d" % i] = attn_weight

        return enc_output, mask, attn_weights
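
A short sketch of the `positional_encoding=False` path, in which `pos` is passed to every block as `None`; whether the blocks accept that depends on `ConformerEncoderBlock`, which is not shown here, so treat this strictly as an assumption. `flow` is assumed to be `import oneflow as flow`.

import oneflow as flow

# internal positional encoding disabled; each block receives pos=None
encoder = ConformerEncoder(d_model=256, d_ff=1024, cov_kernel_size=15, n_heads=4,
                           nblocks=12, positional_encoding=False)

inputs = flow.randn(8, 100, 256)           # [batch, time, d_model]
mask = flow.ones(8, 100, dtype=flow.bool)

enc_output, mask, attn_weights = encoder(inputs, mask)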
Example #8
class TransformerDecoder(nn.Module):
    def __init__(self, vocab_size, d_model=256, n_heads=4, d_ff=2048, memory_dim=256,
                 n_blocks=6, pos_dropout=0.0, slf_attn_dropout=0.0, src_attn_dropout=0.0,
                 ffn_dropout=0.0, residual_dropout=0.1, activation='relu',
                 normalize_before=True, concat_after=False, share_embedding=False):
        super(TransformerDecoder, self).__init__()

        self.decoder_type = 'transformer'
        self.normalize_before = normalize_before
        self.relative_positional = False

        self.d_model = d_model

        self.embedding = nn.Embedding(vocab_size, d_model)

        self.pos_emb = PositionalEncoding(d_model, pos_dropout)

        self.blocks = nn.ModuleList([
            TransformerDecoderLayer(
                n_heads, d_model, d_ff, memory_dim, slf_attn_dropout, src_attn_dropout,
                ffn_dropout, residual_dropout, normalize_before=normalize_before, concat_after=concat_after,
                relative_positional=False, activation=activation) for _ in range(n_blocks)
        ])

        if self.normalize_before:
            self.after_norm = nn.LayerNorm(d_model)

        self.output_layer = nn.Linear(d_model, vocab_size)

        if share_embedding:
            assert self.embedding.weight.size() == self.output_layer.weight.size()
            self.output_layer.weight = self.embedding.weight
            logger.info('Tie the weights between the embedding and output layer.')

    def forward(self, targets, memory, memory_mask):

        dec_output = self.embedding(targets)
        if self.relative_positional:
            # [1, 2T - 1]
            position = torch.arange(-(dec_output.size(1)-1), dec_output.size(1), device=dec_output.device).reshape(1, -1)
            pos = self.pos_emb._embedding_from_positions(position)
        else:
            dec_output, pos = self.pos_emb(dec_output)

        dec_mask = get_transformer_decoder_mask(targets)

        attn_weights = {}
        for i, block in enumerate(self.blocks):
            dec_output, attn_weight = block(dec_output, dec_mask, memory, memory_mask.unsqueeze(1), pos)
            attn_weights['dec_block_%d' % i] = attn_weight

        if self.normalize_before:
            dec_output = self.after_norm(dec_output)

        logits = self.output_layer(dec_output)

        return logits, attn_weights

    def inference(self, preds, memory, memory_mask=None, cache=None):

        assert preds.dim() == 2
        # dec_output = self.embedding(preds)
        # dec_output, pos = self.pos_encoding.inference(dec_output)
        # mask = get_transformer_decoder_mask(preds)

        # new_caches = []
        # attn_weights = {}
        # for i, block in enumerate(self.blocks):
        #     block_cache = cache[i] if cache is not None else {'slf': None, 'src': None}
        #     dec_output, attn_weight, block_cache = block.inference(dec_output, mask, memory, memory_mask.unsqueeze(1), pos, cache=block_cache)
        #     attn_weights['dec_block_%d' % i] = attn_weight
        #     new_caches.append(block_cache)

        # if self.normalize_before:
        #     dec_output = self.after_norm(dec_output)

        # logits = self.output_layer(dec_output) # logits [batch_size, 1, model_size]
        logits, attn_weights = self.forward(preds, memory, memory_mask)

        log_probs = F.log_softmax(logits[:, -1, :], dim=-1)  # log_probs: [batch_size, vocab_size]

        return log_probs, cache, attn_weights
Example #9
class TransformerDecoder(nn.Module):
    def __init__(
        self,
        vocab_size,
        d_model=256,
        n_heads=4,
        d_ff=2048,
        memory_dim=256,
        n_blocks=6,
        pos_dropout=0.0,
        slf_attn_dropout=0.0,
        src_attn_dropout=0.0,
        ffn_dropout=0.0,
        residual_dropout=0.1,
        activation="relu",
        normalize_before=True,
        concat_after=False,
        share_embedding=False,
    ):
        super(TransformerDecoder, self).__init__()

        self.decoder_type = "transformer"
        self.normalize_before = normalize_before
        self.relative_positional = False

        self.d_model = d_model

        self.embedding = nn.Embedding(vocab_size, d_model)

        self.pos_emb = PositionalEncoding(d_model, pos_dropout)

        self.blocks = nn.ModuleList(
            [
                TransformerDecoderLayer(
                    n_heads,
                    d_model,
                    d_ff,
                    memory_dim,
                    slf_attn_dropout,
                    src_attn_dropout,
                    ffn_dropout,
                    residual_dropout,
                    normalize_before=normalize_before,
                    concat_after=concat_after,
                    relative_positional=False,
                    activation=activation,
                )
                for _ in range(n_blocks)
            ]
        )

        if self.normalize_before:
            self.after_norm = nn.LayerNorm(d_model)

        self.output_layer = nn.Linear(d_model, vocab_size)

        if share_embedding:
            assert self.embedding.weight.size() == self.output_layer.weight.size()
            self.output_layer.weight = self.embedding.weight
            logger.info("Tie the weights between the embedding and output layer.")

    def forward(self, targets, memory, memory_mask):

        dec_output = self.embedding(targets)
        if self.relative_positional:

            # [1, 2T - 1]
            position = flow.arange(
                -(dec_output.size(1) - 1), dec_output.size(1), device=dec_output.device
            ).reshape(1, -1)
            pos = self.pos_emb._embedding_from_positions(position)
        else:
            dec_output, pos = self.pos_emb(dec_output)

        dec_mask = get_transformer_decoder_mask(targets)

        attn_weights = {}
        for i, block in enumerate(self.blocks):
            dec_output, attn_weight = block(
                dec_output, dec_mask, memory, memory_mask.unsqueeze(1), pos
            )
            attn_weights["dec_block_%d" % i] = attn_weight

        if self.normalize_before:
            dec_output = self.after_norm(dec_output)

        logits = self.output_layer(dec_output)

        return logits, attn_weights

    def inference(self, preds, memory, memory_mask=None, cache=None):

        assert preds.dim() == 2
        logits, attn_weights = self.forward(preds, memory, memory_mask)
        logsoftmax = nn.LogSoftmax(dim=-1)
        log_probs = logsoftmax(logits[:, -1, :])

        return log_probs, cache, attn_weights
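
A hypothetical end-to-end call for this decoder, assuming integer token ids, encoder memory with `memory_dim` features, and that `get_transformer_decoder_mask`, `PositionalEncoding`, and `TransformerDecoderLayer` are defined in the same codebase (with `import oneflow as flow`); shapes and sizes are illustrative.

import oneflow as flow

decoder = TransformerDecoder(vocab_size=5000, d_model=256, n_heads=4, d_ff=2048, memory_dim=256)

targets = flow.randint(0, 5000, (8, 20))         # [batch, target_len] token ids
memory = flow.randn(8, 100, 256)                 # encoder output [batch, time, memory_dim]
memory_mask = flow.ones(8, 100, dtype=flow.bool)

logits, attn_weights = decoder(targets, memory, memory_mask)         # training path
log_probs, _, _ = decoder.inference(targets, memory, memory_mask)    # scores for the last step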