Example #1
class PositionalEmbeddingAudio(nn.Module):
    """This module learns audio positional embeddings up to a fixed maximum size.

    Padding symbols are ignored, but it is necessary to specify whether padding
    is added on the left side (left_pad=True) or right side (left_pad=False).
    """

    def __init__(self, num_embeddings, embedding_dim, padding_idx, learned=True):
        super().__init__()
        if learned:
            self.embeddings = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx)
        else:
            self.embeddings = SinusoidalPositionalEmbedding(int(embedding_dim), padding_idx)
        self.padding_idx = padding_idx

    def forward(self, input, lengths, incremental_state=None):
        """Input is expected to be of size [bsz x seqlen x feature_dim]."""
        # Build a dummy token tensor: padded slots keep padding_idx, real
        # positions get any non-padding value; the wrapped embedding module
        # derives position indices from it.
        max_length = max(lengths)
        pos_tensor = lengths.new(input.size(0), max_length).fill_(self.padding_idx)
        for i, l in enumerate(lengths):
            pos_tensor[i, :l] = self.padding_idx + 1
        return self.embeddings(pos_tensor)

    def max_positions(self):
        """Maximum number of supported positions."""
        return self.embeddings.max_positions()

    @property
    def weight(self):
        return self.embeddings.weight
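
The wrapper above marks which slots are real tokens versus padding and lets the wrapped embedding derive position indices from that. Below is a minimal standalone sketch of the end result (lengths in, positional embeddings out) in plain PyTorch, computing the positions directly; the class and parameter names are illustrative, not part of the snippet:

import torch
import torch.nn as nn

class SimpleLengthPositionalEmbedding(nn.Module):
    """Learned positional embeddings looked up from sequence lengths;
    padded slots map to a reserved padding index (0 here)."""

    def __init__(self, max_positions, embedding_dim, padding_idx=0):
        super().__init__()
        # index 0 is reserved for padding; real positions start at 1
        self.embed = nn.Embedding(max_positions + 1, embedding_dim,
                                  padding_idx=padding_idx)
        self.padding_idx = padding_idx

    def forward(self, lengths):
        bsz, max_len = len(lengths), int(max(lengths))
        positions = torch.full((bsz, max_len), self.padding_idx, dtype=torch.long)
        for i, l in enumerate(lengths.tolist()):
            positions[i, :l] = torch.arange(1, l + 1)
        return self.embed(positions)  # [bsz x max_len x embedding_dim]

emb = SimpleLengthPositionalEmbedding(max_positions=100, embedding_dim=16)
print(emb(torch.tensor([5, 3])).shape)  # torch.Size([2, 5, 16])
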
Example #2
class TransformerEncoder(FairseqEncoder):
    """Transformer encoder."""
    def __init__(self,
                 dictionary,
                 embed_dim=256,
                 max_positions=1024,
                 pos="learned",
                 num_layers=2,
                 num_heads=8,
                 filter_size=256,
                 hidden_size=256,
                 dropout=0.1,
                 attention_dropout=0.1,
                 relu_dropout=0.1,
                 rank_scale=0.0):
        super().__init__(dictionary)
        assert pos == "learned" or pos == "timing" or pos == "nopos"

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions,
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)

        self.layers = num_layers

        self.self_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()
        for i in range(num_layers):
            self.self_attention_blocks.append(
                MultiheadAttention(hidden_size,
                                   hidden_size,
                                   hidden_size,
                                   num_heads,
                                   rank_scale=rank_scale))
            self.ffn_blocks.append(
                FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
            self.norm1_blocks.append(LayerNormalization(hidden_size))
            self.norm2_blocks.append(LayerNormalization(hidden_size))
        self.out_norm = LayerNormalization(hidden_size)

    def forward(self, src_tokens, src_lengths):
        # build the self-attention bias that masks out padded positions
        input_to_padding = attention_bias_ignore_padding(
            src_tokens, self.dictionary.pad())
        encoder_self_attention_bias = encoder_attention_bias(input_to_padding)
        # embed tokens plus positions
        encoder_input = self.embed_tokens(src_tokens)
        if self.pos != "nopos":
            encoder_input += self.embed_positions(src_tokens)

        x = F.dropout(encoder_input, p=self.dropout, training=self.training)

        for self_attention, ffn, norm1, norm2 in zip(
                self.self_attention_blocks, self.ffn_blocks, self.norm1_blocks,
                self.norm2_blocks):
            y = self_attention(norm1(x), None, encoder_self_attention_bias)
            x = residual(x, y, self.dropout, self.training)
            y = ffn(norm2(x))
            x = residual(x, y, self.dropout, self.training)
        x = self.out_norm(x)
        return x

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        if self.pos == "learned":
            return self.embed_positions.max_positions()
        else:
            return 1024
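
Each layer in the loop above uses a pre-norm residual layout: normalize, apply self-attention or the feed-forward network, then add the result back through dropout. Below is a minimal standalone sketch of one such block, built with torch.nn primitives rather than the snippet's MultiheadAttention/FeedForwardNetwork helpers (names and defaults are illustrative):

import torch
import torch.nn as nn
import torch.nn.functional as F

class PreNormEncoderBlock(nn.Module):
    """One pre-norm block: x + Dropout(SelfAttn(LN(x))), then
    x + Dropout(FFN(LN(x)))."""

    def __init__(self, hidden_size=256, num_heads=8, filter_size=256, dropout=0.1):
        super().__init__()
        self.norm1 = nn.LayerNorm(hidden_size)
        self.attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
        self.norm2 = nn.LayerNorm(hidden_size)
        self.ffn = nn.Sequential(nn.Linear(hidden_size, filter_size), nn.ReLU(),
                                 nn.Linear(filter_size, hidden_size))
        self.dropout = dropout

    def forward(self, x, key_padding_mask=None):
        h = self.norm1(x)
        y, _ = self.attn(h, h, h, key_padding_mask=key_padding_mask)
        x = x + F.dropout(y, p=self.dropout, training=self.training)
        y = self.ffn(self.norm2(x))
        return x + F.dropout(y, p=self.dropout, training=self.training)

block = PreNormEncoderBlock()
x = torch.randn(2, 10, 256)                   # [bsz x seqlen x hidden_size]
pad = torch.zeros(2, 10, dtype=torch.bool)    # True marks padded positions
print(block(x, key_padding_mask=pad).shape)   # torch.Size([2, 10, 256])
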
Example #3
class DualPathEncoder(FairseqEncoder):
    """Transformer encoder."""
    def __init__(self,
                 dictionary,
                 embed_dim=256,
                 max_positions=1024,
                 pos="learned",
                 num_layers=2,
                 num_heads=8,
                 filter_size=256,
                 hidden_size=256,
                 dropout=0.1,
                 attention_dropout=0.1,
                 relu_dropout=0.1,
                 convolutions=((256, 3), ) * 4):
        super().__init__(dictionary)
        assert pos == "learned" or pos == "timing" or pos == "nopos"

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions,
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)

        self.layers = num_layers

        self.self_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()
        for i in range(num_layers):
            self.self_attention_blocks.append(
                MultiheadAttention(hidden_size, hidden_size, hidden_size,
                                   num_heads))
            self.ffn_blocks.append(
                FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
            self.norm1_blocks.append(LayerNormalization(hidden_size))
            self.norm2_blocks.append(LayerNormalization(hidden_size))
        self.out_norm = LayerNormalization(hidden_size)

        in_channels = convolutions[0][0]
        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        for (out_channels, kernel_size) in convolutions:
            pad = (kernel_size - 1) // 2  # integer padding for odd kernel sizes
            self.projections.append(
                Linear(in_channels, out_channels
                       ) if in_channels != out_channels else None)
            self.convolutions.append(
                ConvTBC(in_channels,
                        out_channels * 2,
                        kernel_size,
                        padding=pad,
                        dropout=dropout))
            in_channels = out_channels
        self.fc2 = Linear(in_channels, embed_dim)

    def forward(self, src_tokens, src_lengths):
        # build the self-attention bias that masks out padded positions
        input_to_padding = attention_bias_ignore_padding(
            src_tokens, self.dictionary.pad())
        encoder_self_attention_bias = encoder_attention_bias(input_to_padding)
        # embed tokens plus positions
        encoder_input = self.embed_tokens(src_tokens)
        if self.pos != "nopos":
            encoder_input += self.embed_positions(src_tokens)

        x = F.dropout(encoder_input, p=self.dropout, training=self.training)
        z = x

        for self_attention, ffn, norm1, norm2 in zip(
                self.self_attention_blocks, self.ffn_blocks, self.norm1_blocks,
                self.norm2_blocks):
            y = self_attention(norm1(x), None, encoder_self_attention_bias)
            x = residual(x, y, self.dropout, self.training)
            y = ffn(norm2(x))
            x = residual(x, y, self.dropout, self.training)
        x = self.out_norm(x)

        z = self.fc1(z)
        z = z.transpose(0, 1)
        for proj, conv in zip(self.projections, self.convolutions):
            r = z if proj is None else proj(z)  # residual comes from the conv path
            z = F.dropout(z, p=self.dropout, training=self.training)
            z = conv(z)
            z = F.glu(z, dim=2)
            z = (z + r) * math.sqrt(0.5)
        z = z.transpose(1, 0)
        z = self.fc2(z)
        return (x, z)

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        if self.pos == "learned":
            return self.embed_positions.max_positions()
        else:
            return 1024
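
The convolutional path above follows the fconv pattern: project into the channel dimension, run convolutions that double the channels, apply a GLU, and add a residual scaled by sqrt(0.5). Below is a minimal standalone sketch in batch-first layout, using nn.Conv1d in place of ConvTBC (names are illustrative, not from the snippet):

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class GLUConvPath(nn.Module):
    """Convolutional path: each layer doubles the channels, a GLU halves
    them again, and a residual scaled by sqrt(0.5) is added."""

    def __init__(self, embed_dim=256, channels=256, kernel_size=3,
                 num_layers=4, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, channels)
        self.convs = nn.ModuleList(
            nn.Conv1d(channels, channels * 2, kernel_size,
                      padding=(kernel_size - 1) // 2)
            for _ in range(num_layers))
        self.fc2 = nn.Linear(channels, embed_dim)
        self.dropout = dropout

    def forward(self, x):                    # x: [bsz x seqlen x embed_dim]
        z = self.fc1(x).transpose(1, 2)      # -> [bsz x channels x seqlen]
        for conv in self.convs:
            r = z                            # residual within the conv path
            z = F.dropout(z, p=self.dropout, training=self.training)
            z = F.glu(conv(z), dim=1)        # back to `channels` channels
            z = (z + r) * math.sqrt(0.5)
        return self.fc2(z.transpose(1, 2))   # -> [bsz x seqlen x embed_dim]

print(GLUConvPath()(torch.randn(2, 10, 256)).shape)  # torch.Size([2, 10, 256])
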
Example #4
class TransformerDecoder(FairseqIncrementalDecoder):
    """Transformer decoder."""
    def __init__(self,
                 dictionary,
                 embed_dim=256,
                 max_positions=1024,
                 pos="learned",
                 num_layers=2,
                 num_heads=8,
                 filter_size=256,
                 hidden_size=256,
                 dropout=0.1,
                 attention_dropout=0.1,
                 relu_dropout=0.1,
                 share_embed=False,
                 rank_scale=0.0):
        super().__init__(dictionary)
        self.register_buffer('version', torch.Tensor([2]))
        assert pos == "learned" or pos == "timing" or pos == "nopos"

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions,
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_TARGET)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_TARGET)

        self.layers = num_layers

        self.self_attention_blocks = nn.ModuleList()
        self.encdec_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()
        self.norm3_blocks = nn.ModuleList()
        for i in range(num_layers):
            self.self_attention_blocks.append(
                MultiheadAttentionDecoder(hidden_size,
                                          hidden_size,
                                          hidden_size,
                                          num_heads,
                                          rank_scale=rank_scale))
            self.ffn_blocks.append(
                FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
            self.norm1_blocks.append(LayerNormalization(hidden_size))
            self.norm2_blocks.append(LayerNormalization(hidden_size))
            self.norm3_blocks.append(LayerNormalization(hidden_size))
            self.encdec_attention_blocks.append(
                MultiheadAttention(hidden_size,
                                   hidden_size,
                                   hidden_size,
                                   num_heads,
                                   rank_scale=rank_scale))
        self.out_norm = LayerNormalization(hidden_size)
        out_embed_dim = hidden_size
        if share_embed:
            assert out_embed_dim == embed_dim, \
                "Shared embed weights require matching dimensions: " \
                "out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim)
            self.out_embed = nn.Linear(hidden_size, num_embeddings)
            self.out_embed.weight = self.embed_tokens.weight
        else:
            self.out_embed = Linear(hidden_size,
                                    num_embeddings,
                                    dropout=dropout)

    def forward(self, input_tokens, encoder_out, incremental_state=None):
        # build the decoder self-attention bias: mask padding and future positions
        input_to_padding = attention_bias_ignore_padding(
            input_tokens, self.dictionary.pad())
        decoder_self_attention_bias = encoder_attention_bias(input_to_padding)
        decoder_self_attention_bias += attention_bias_lower_triangle(
            input_tokens)
        # embed positions (skipped when pos == "nopos")
        positions = 0
        if self.pos != "nopos":
            positions = self.embed_positions(input_tokens, incremental_state)
        if incremental_state is not None:
            # incremental decoding: only the last time step is needed
            input_tokens = input_tokens[:, -1:]
            decoder_self_attention_bias = decoder_self_attention_bias[:, -1:, :]

        # embed tokens and positions
        x = self.embed_tokens(input_tokens) + positions
        x = F.dropout(x, p=self.dropout, training=self.training)

        avg_attn_scores = None
        num_attn_layers = len(self.encdec_attention_blocks)
        for self_attention, encdec_attention, ffn, norm1, norm2, norm3 in zip(
                self.self_attention_blocks, self.encdec_attention_blocks,
                self.ffn_blocks, self.norm1_blocks, self.norm2_blocks,
                self.norm3_blocks):
            y = self_attention(norm1(x), None, decoder_self_attention_bias,
                               incremental_state)
            x = residual(x, y, self.dropout, self.training)

            if incremental_state is not None:
                y, attn_scores = encdec_attention(norm2(x), encoder_out, None,
                                                  True)
                attn_scores = attn_scores / self.layers
                if avg_attn_scores is None:
                    avg_attn_scores = attn_scores
                else:
                    avg_attn_scores.add_(attn_scores)
            else:
                y = encdec_attention(norm2(x), encoder_out, None)
            x = residual(x, y, self.dropout, self.training)

            y = ffn(norm3(x))
            x = residual(x, y, self.dropout, self.training)
        x = self.out_embed(self.out_norm(x))
        return x, avg_attn_scores

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        if self.pos == "nopos":
            return 1024
        return self.embed_positions.max_positions()

    def upgrade_state_dict(self, state_dict):
        if state_dict.get('decoder.version', torch.Tensor([1]))[0] < 2:
            # this decoder has no convolutional layers, so there is no weight
            # norm to reconfigure; just record the current version
            state_dict['decoder.version'] = torch.Tensor([2])
        return state_dict
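
For decoder self-attention the snippet adds two biases: one that masks padded key positions and one that masks future positions (lower-triangular causality). The helpers attention_bias_ignore_padding, encoder_attention_bias and attention_bias_lower_triangle are not shown, so the following is only a sketch, under an assumed [bsz x seqlen x seqlen] bias shape, of how such a combined additive bias can be built:

import torch

def causal_padding_bias(tokens, padding_idx):
    """Additive attention bias of shape [bsz x seqlen x seqlen]:
    -inf where a query must not attend (padding or future keys), 0 elsewhere."""
    bsz, seqlen = tokens.size()
    neg_inf = float("-inf")
    # block attention to padded key positions
    pad_bias = torch.zeros(bsz, 1, seqlen)
    pad_bias = pad_bias.masked_fill(tokens.eq(padding_idx).unsqueeze(1), neg_inf)
    # block attention to future key positions (strict upper triangle)
    future_bias = torch.triu(torch.full((seqlen, seqlen), neg_inf), diagonal=1)
    return pad_bias + future_bias.unsqueeze(0)

tokens = torch.tensor([[5, 6, 7, 1, 1]])        # assume 1 is the padding index
print(causal_padding_bias(tokens, padding_idx=1)[0])
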
Example #5
class DPEncoder(FairseqEncoder):
    """Transformer encoder."""
    def __init__(self,
                 dictionary,
                 embed_dim=256,
                 max_positions=1024,
                 pos="learned",
                 num_layers=2,
                 num_heads=8,
                 filter_size=256,
                 hidden_size=256,
                 dropout=0.1,
                 attention_dropout=0.1,
                 relu_dropout=0.1,
                 convolutions=4):
        super().__init__(dictionary)
        assert pos == "learned" or pos == "timing" or pos == "nopos"

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions,
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)

        self.layers = num_layers
        self.attnpath = AttnPathEncoder(self.layers,
                                        num_heads=num_heads,
                                        filter_size=filter_size,
                                        hidden_size=hidden_size,
                                        dropout=dropout,
                                        attention_dropout=attention_dropout,
                                        relu_dropout=relu_dropout)
        self.cnnpath = CNNPathEncoder(self.layers,
                                      hidden_size=hidden_size,
                                      dropout=dropout,
                                      in_embed=hidden_size,
                                      out_embed=hidden_size)

    def forward(self, src_tokens, src_lengths):
        # build the self-attention bias that masks out padded positions
        input_to_padding = attention_bias_ignore_padding(
            src_tokens, self.dictionary.pad())
        encoder_self_attention_bias = encoder_attention_bias(input_to_padding)
        # embed tokens plus positions
        encoder_input = self.embed_tokens(src_tokens)
        if self.pos != "nopos":
            encoder_input += self.embed_positions(src_tokens)

        x = F.dropout(encoder_input, p=self.dropout, training=self.training)

        attn_x = self.attnpath(x)
        cnn_x = self.cnnpath(x)

        return (attn_x, cnn_x)

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        if self.pos == "learned":
            return self.embed_positions.max_positions()
        else:
            return 1024
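
DPEncoder simply routes the same embedded input through two sub-encoders (AttnPathEncoder and CNNPathEncoder, neither shown here) and returns both outputs. Below is a minimal standalone sketch of that layout with stand-in torch.nn sub-modules (the stand-ins are illustrative, not the snippet's classes):

import torch
import torch.nn as nn

class TwoPathEncoder(nn.Module):
    """Feed the same embedded input to two sub-encoders and return
    both results as a tuple (attention path, convolutional path)."""

    def __init__(self, embed_dim=256, num_heads=8):
        super().__init__()
        # stand-ins for AttnPathEncoder / CNNPathEncoder
        self.attn_path = nn.TransformerEncoderLayer(embed_dim, num_heads,
                                                    batch_first=True)
        self.cnn_path = nn.Sequential(
            nn.Conv1d(embed_dim, embed_dim, kernel_size=3, padding=1), nn.ReLU())

    def forward(self, x):                        # x: [bsz x seqlen x embed_dim]
        attn_out = self.attn_path(x)
        cnn_out = self.cnn_path(x.transpose(1, 2)).transpose(1, 2)
        return attn_out, cnn_out

enc = TwoPathEncoder()
a, c = enc(torch.randn(2, 10, 256))
print(a.shape, c.shape)   # both torch.Size([2, 10, 256])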