Example No. 1
    def __init__(self, num_embeddings, embedding_dim, padding_idx, learned=True):
        super().__init__()
        if learned:
            self.embeddings = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx)
        else:
            self.embeddings = SinusoidalPositionalEmbedding(int(embedding_dim), padding_idx)
        self.padding_idx = padding_idx
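
This wrapper only chooses between a learned table and a fixed sinusoidal one. As a point of reference, the sketch below builds the usual sin/cos position table from scratch; it is a standalone approximation, not fairseq's SinusoidalPositionalEmbedding (which also handles padding indices and grows the table on demand).

import math
import torch

def sinusoidal_table(num_positions, embedding_dim):
    # First half of each row holds sines, second half cosines,
    # over a geometric progression of frequencies.
    half_dim = embedding_dim // 2
    inv_freq = torch.exp(torch.arange(half_dim, dtype=torch.float)
                         * -(math.log(10000.0) / max(half_dim - 1, 1)))
    angles = torch.arange(num_positions, dtype=torch.float).unsqueeze(1) * inv_freq
    return torch.cat([torch.sin(angles), torch.cos(angles)], dim=1)

table = sinusoidal_table(1024, 256)  # (1024, 256), no learned parameters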
Example No. 2
    def __init__(self,
                 dictionary,
                 embed_dim=256,
                 max_positions=1024,
                 pos="learned",
                 num_layers=2,
                 num_heads=8,
                 filter_size=256,
                 hidden_size=256,
                 dropout=0.1,
                 attention_dropout=0.1,
                 relu_dropout=0.1,
                 rank_scale=0.0):
        super().__init__(dictionary)
        assert pos == "learned" or pos == "timing" or pos == "nopos"

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions,
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)

        self.layers = num_layers

        self.self_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()
        for i in range(num_layers):
            self.self_attention_blocks.append(
                MultiheadAttention(hidden_size,
                                   hidden_size,
                                   hidden_size,
                                   num_heads,
                                   rank_scale=rank_scale))
            self.ffn_blocks.append(
                FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
            self.norm1_blocks.append(LayerNormalization(hidden_size))
            self.norm2_blocks.append(LayerNormalization(hidden_size))
        self.out_norm = LayerNormalization(hidden_size)
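
Each encoder layer above is pre-norm: LayerNormalization is applied before the sublayer, and the residual(...) helper (defined elsewhere in this codebase) presumably adds the dropped-out sublayer output back onto the input. A minimal sketch of that helper, under that assumption:

import torch.nn.functional as F

def residual(x, y, dropout, training):
    # add the sublayer output y back onto its input x, with dropout on y
    return x + F.dropout(y, p=dropout, training=training)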
Example No. 3
    def __init__(self,
                 dictionary,
                 embed_dim=256,
                 max_positions=1024,
                 pos="learned",
                 num_layers=2,
                 num_heads=8,
                 filter_size=256,
                 hidden_size=256,
                 dropout=0.1,
                 attention_dropout=0.1,
                 relu_dropout=0.1,
                 convolutions=4):
        super().__init__(dictionary)
        assert pos == "learned" or pos == "timing" or pos == "nopos"

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions,
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)

        self.layers = num_layers
        self.attnpath = AttnPathEncoder(self.layers,
                                        num_heads=num_heads,
                                        filter_size=filter_size,
                                        hidden_size=hidden_size,
                                        dropout=dropout,
                                        attention_dropout=attention_dropout,
                                        relu_dropout=relu_dropout)
        self.cnnpath = CNNPathEncoder(self.layers,
                                      hidden_size=hidden_size,
                                      dropout=dropout,
                                      in_embed=hidden_size,
                                      out_embed=hidden_size)
Example No. 4
class PositionalEmbeddingAudio(nn.Module):
    """This module learns audio positional embeddings up to a fixed maximum size.

    Padding symbols are ignored, but it is necessary to specify whether padding
    is added on the left side (left_pad=True) or right side (left_pad=False).
    """

    def __init__(self, num_embeddings, embedding_dim, padding_idx, learned=True):
        super().__init__()
        if learned:
            self.embeddings = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx)
        else:
            self.embeddings = SinusoidalPositionalEmbedding(int(embedding_dim), padding_idx)
        self.padding_idx = padding_idx

    def forward(self, input, lengths, incremental_state=None):
        """Input is expected to be of size [bsz x seqlen x feature_dim]."""
        max_length = int(max(lengths))  # plain int, usable as a tensor size
        pos_tensor = lengths.new(input.size(0), max_length).fill_(self.padding_idx)
        for i, l in enumerate(lengths):
            pos_tensor[i, :l] = self.padding_idx + 1
        return self.embeddings(pos_tensor)

    def max_positions(self):
        """Maximum number of supported positions."""
        return self.embeddings.max_positions()


    @property
    def weight(self):
        return self.embeddings.weight
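
The forward() above never looks at the feature values; it only needs the lengths to build a position-index tensor in which padded slots keep padding_idx and real frames get padding_idx + 1 (the wrapped embedding then derives actual positions from that non-padding mask). A standalone illustration of just that tensor:

import torch

padding_idx = 1
lengths = torch.tensor([7, 5])
pos_tensor = lengths.new(len(lengths), int(lengths.max())).fill_(padding_idx)
for i, l in enumerate(lengths):
    pos_tensor[i, :l] = padding_idx + 1
# pos_tensor is now
# tensor([[2, 2, 2, 2, 2, 2, 2],
#         [2, 2, 2, 2, 2, 1, 1]])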
Example No. 5
def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx, left_pad, learned=False):
    if learned:
        m = LearnedPositionalEmbedding(num_embeddings + padding_idx + 1, embedding_dim, padding_idx, left_pad)
        nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
        nn.init.constant_(m.weight[padding_idx], 0)
    else:
        m = SinusoidalPositionalEmbedding(embedding_dim, padding_idx, left_pad, num_embeddings + padding_idx + 1)
    return m
Example No. 6
def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx, learned=False):
    if learned:
        m = LearnedPositionalEmbedding(num_embeddings + padding_idx + 1, embedding_dim, padding_idx)
        nn.init.normal_(m.weight, mean=0, std=0.02)
        nn.init.constant_(m.weight[padding_idx], 0)
    else:
        m = SinusoidalPositionalEmbedding(
            embedding_dim,
            padding_idx,
            init_size=num_embeddings + padding_idx + 1,
        )
    return m
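
The num_embeddings + padding_idx + 1 sizing follows fairseq's convention that real positions start at padding_idx + 1, so a sequence of length max_positions needs rows up to padding_idx + max_positions. A hand-rolled sketch of that indexing (roughly what fairseq's utils.make_positions does; the names below are illustrative):

import torch
import torch.nn as nn

max_positions, embed_dim, padding_idx = 1024, 512, 1
table = nn.Embedding(max_positions + padding_idx + 1, embed_dim,
                     padding_idx=padding_idx)

tokens = torch.tensor([[5, 8, 9, padding_idx]])             # one padded sequence
mask = tokens.ne(padding_idx).long()
positions = torch.cumsum(mask, dim=1) * mask + padding_idx  # pads stay at padding_idx
pos_emb = table(positions)                                  # (1, 4, 512)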
Example No. 7
    def __init__(self, args):
        """
        Transformer model, following Vaswani et al., 2017.
        """
        scale_factor = args.encoder_embed_dim // 256
        super(RotomerTransformerModel,
              self).__init__(scale_factor=scale_factor)

        self.layers = nn.ModuleList([
            TransformerEncoderLayer(args) for i in range(args.encoder_layers)
        ])

        self.fc1 = nn.Linear(args.encoder_embed_dim, args.encoder_embed_dim)
        self.fc2 = nn.Linear(args.encoder_embed_dim, 1)

        self.embed_positions = SinusoidalPositionalEmbedding(
            args.encoder_embed_dim,
            -1,
            left_pad=False,
        )
Example No. 8
def PositionalEmbeddingCreator(
    num_embeddings, embedding_dim, padding_idx, left_pad, learned=False
):
    if learned:
        m = LearnedPositionalEmbedding(
            num_embeddings + padding_idx + 1, embedding_dim, padding_idx
        )
        nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5)
        nn.init.constant_(m.weight[padding_idx], 0)
    else:
        # sys.stderr.write(str(type(embedding_dim)) + ",")
        # sys.stderr.write(str(type(padding_idx)) + ",")
        # sys.stderr.write(str(type(left_pad)) + ",")
        # sys.stderr.write(str(type(num_embeddings)))
        # sys.stderr.write("\n")
        m = SinusoidalPositionalEmbedding(
            embedding_dim=embedding_dim,
            padding_idx=padding_idx,
            init_size=num_embeddings + padding_idx + 1,
        )
    return m
Example No. 9
    def __init__(self,
                 n_layers,
                 n_heads,
                 d_model,
                 attn_dropout,
                 relu_dropout,
                 emb_dropout,
                 res_dropout,
                 attn_mask,
                 scale_embedding=True):
        super(CrossmodalTransformer, self).__init__()
        self.attn_mask = attn_mask
        self.emb_scale = math.sqrt(d_model) if scale_embedding else 1.0
        self.pos_emb = SinusoidalPositionalEmbedding(d_model, 0, init_size=128)
        self.dropout = nn.Dropout(emb_dropout)

        layer = TransformerEncoderBlock(d_model=d_model,
                                        n_heads=n_heads,
                                        d_feedforward=d_model * 4,
                                        attn_dropout=attn_dropout,
                                        res_dropout=res_dropout,
                                        relu_dropout=relu_dropout)
        self.layers = _get_clones(layer, n_layers)
Example No. 10
    def __init__(
        self,
        d_model,
        nhead,
        emb_dropout,
        attn_dropout,
        res_dropout,
        relu_dropout,
        n_layer,
        attn_mask,
        scale_embedding=True,
    ):
        super(CrossmodalTransformer, self).__init__()
        self.attn_mask = attn_mask
        self.emb_scale = math.sqrt(d_model) if scale_embedding else 1.0
        self.pos = SinusoidalPositionalEmbedding(d_model, 0, init_size=128)
        self.emb_dropout = emb_dropout
        self.layers = nn.ModuleList([])
        for layer in range(n_layer):
            new_layer = TransformerEncoderBlock(d_model, nhead, d_model * 4,
                                                attn_dropout, res_dropout,
                                                relu_dropout)
            self.layers.append(new_layer)
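
Examples 9 and 10 build the same stack; the only difference is that _get_clones deep-copies a template block, so every layer still gets independent parameters, while this example constructs each block explicitly. A sketch of that helper, mirroring the one in torch.nn.modules.transformer:

import copy
import torch.nn as nn

def _get_clones(module, n):
    # independent copies, each with its own parameters
    return nn.ModuleList([copy.deepcopy(module) for _ in range(n)])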
Example No. 11
class TransformerDecoder(FairseqIncrementalDecoder):
    """Transformer decoder."""
    def __init__(self,
                 dictionary,
                 embed_dim=256,
                 max_positions=1024,
                 pos="learned",
                 num_layers=2,
                 num_heads=8,
                 filter_size=256,
                 hidden_size=256,
                 dropout=0.1,
                 attention_dropout=0.1,
                 relu_dropout=0.1,
                 share_embed=False,
                 rank_scale=0.0):
        super().__init__(dictionary)
        self.register_buffer('version', torch.Tensor([2]))
        assert pos == "learned" or pos == "timing" or pos == "nopos"

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions,
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_TARGET)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_TARGET)

        self.layers = num_layers

        self.self_attention_blocks = nn.ModuleList()
        self.encdec_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()
        self.norm3_blocks = nn.ModuleList()
        for i in range(num_layers):
            self.self_attention_blocks.append(
                MultiheadAttentionDecoder(hidden_size,
                                          hidden_size,
                                          hidden_size,
                                          num_heads,
                                          rank_scale=rank_scale))
            self.ffn_blocks.append(
                FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
            self.norm1_blocks.append(LayerNormalization(hidden_size))
            self.norm2_blocks.append(LayerNormalization(hidden_size))
            self.norm3_blocks.append(LayerNormalization(hidden_size))
            self.encdec_attention_blocks.append(
                MultiheadAttention(hidden_size,
                                   hidden_size,
                                   hidden_size,
                                   num_heads,
                                   rank_scale=rank_scale))
        self.out_norm = LayerNormalization(hidden_size)
        out_embed_dim = hidden_size
        if share_embed:
            assert out_embed_dim == embed_dim, \
                "Shared embed weights implies same dimensions " \
                " out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim)
            self.out_embed = nn.Linear(hidden_size, num_embeddings)
            self.out_embed.weight = self.embed_tokens.weight
        else:
            self.out_embed = Linear(hidden_size,
                                    num_embeddings,
                                    dropout=dropout)

    def forward(self, input_tokens, encoder_out, incremental_state=None):
        # split and transpose encoder outputs

        input_to_padding = attention_bias_ignore_padding(
            input_tokens, self.dictionary.pad())
        decoder_self_attention_bias = encoder_attention_bias(input_to_padding)
        decoder_self_attention_bias += attention_bias_lower_triangle(
            input_tokens)
        # embed positions

        positions = self.embed_positions(input_tokens, incremental_state)
        if incremental_state is not None:
            input_tokens = input_tokens[:, -1:]
            decoder_self_attention_bias = decoder_self_attention_bias[:, -1:, :]

        # embed tokens and positions
        x = self.embed_tokens(input_tokens) + positions
        x = F.dropout(x, p=self.dropout, training=self.training)

        avg_attn_scores = None
        num_attn_layers = len(self.encdec_attention_blocks)
        for self_attention, encdec_attention, ffn, norm1, norm2, norm3 in zip(
                self.self_attention_blocks, self.encdec_attention_blocks,
                self.ffn_blocks, self.norm1_blocks, self.norm2_blocks,
                self.norm3_blocks):
            y = self_attention(norm1(x), None, decoder_self_attention_bias,
                               incremental_state)
            x = residual(x, y, self.dropout, self.training)

            if incremental_state is not None:
                y, attn_scores = encdec_attention(norm2(x), encoder_out, None,
                                                  True)
                attn_scores = attn_scores / self.layers
                if avg_attn_scores is None:
                    avg_attn_scores = attn_scores
                else:
                    avg_attn_scores.add_(attn_scores)
            else:
                y = encdec_attention(norm2(x), encoder_out, None)
            x = residual(x, y, self.dropout, self.training)

            y = ffn(norm3(x))
            x = residual(x, y, self.dropout, self.training)
        x = self.out_embed(self.out_norm(x))
        return x, avg_attn_scores

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        return self.embed_positions.max_positions()

    def upgrade_state_dict(self, state_dict):
        if state_dict.get('decoder.version', torch.Tensor([1]))[0] < 2:
            # old models use incorrect weight norm dimension
            for i, conv in enumerate(self.convolutions):
                # reconfigure weight norm
                nn.utils.remove_weight_norm(conv)
                self.convolutions[i] = nn.utils.weight_norm(conv, dim=0)
            state_dict['decoder.version'] = torch.Tensor([1])
        return state_dict
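
The decoder combines a padding bias with a causal bias before self-attention. The helper attention_bias_lower_triangle is not shown here; presumably it returns zeros on and below the diagonal and a large negative value above it, so each position can attend only to itself and earlier positions. A minimal sketch under that assumption:

import torch

def causal_attention_bias(seq_len, neg=-1e9):
    mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1)  # 1s above the diagonal
    return (mask * neg).unsqueeze(0)  # (1, seq_len, seq_len), added to attention logits

bias = causal_attention_bias(4)       # future positions get -1e9 before the softmax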
Example No. 12
class DualPathEncoder(FairseqEncoder):
    """Transformer encoder."""
    def __init__(self,
                 dictionary,
                 embed_dim=256,
                 max_positions=1024,
                 pos="learned",
                 num_layers=2,
                 num_heads=8,
                 filter_size=256,
                 hidden_size=256,
                 dropout=0.1,
                 attention_dropout=0.1,
                 relu_dropout=0.1,
                 convolutions=((256, 3), ) * 4):
        super().__init__(dictionary)
        assert pos == "learned" or pos == "timing" or pos == "nopos"

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions,
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)

        self.layers = num_layers

        self.self_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()
        for i in range(num_layers):
            self.self_attention_blocks.append(
                MultiheadAttention(hidden_size, hidden_size, hidden_size,
                                   num_heads))
            self.ffn_blocks.append(
                FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
            self.norm1_blocks.append(LayerNormalization(hidden_size))
            self.norm2_blocks.append(LayerNormalization(hidden_size))
        self.out_norm = LayerNormalization(hidden_size)

        in_channels = convolutions[0][0]
        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        for (out_channels, kernel_size) in convolutions:
            pad = (kernel_size - 1) // 2  # integer padding required by ConvTBC
            self.projections.append(
                Linear(in_channels, out_channels
                       ) if in_channels != out_channels else None)
            self.convolutions.append(
                ConvTBC(in_channels,
                        out_channels * 2,
                        kernel_size,
                        padding=pad,
                        dropout=dropout))
            in_channels = out_channels
        self.fc2 = Linear(in_channels, embed_dim)

    def forward(self, src_tokens, src_lengths):
        # embed tokens plus positions
        input_to_padding = attention_bias_ignore_padding(
            src_tokens, self.dictionary.pad())
        encoder_self_attention_bias = encoder_attention_bias(input_to_padding)
        encoder_input = self.embed_tokens(src_tokens)
        if self.pos != "nopos":
            encoder_input += self.embed_positions(src_tokens)

        x = F.dropout(encoder_input, p=self.dropout, training=self.training)
        z = x

        for self_attention, ffn, norm1, norm2 in zip(
                self.self_attention_blocks, self.ffn_blocks, self.norm1_blocks,
                self.norm2_blocks):
            y = self_attention(norm1(x), None, encoder_self_attention_bias)
            x = residual(x, y, self.dropout, self.training)
            y = ffn(norm2(x))
            x = residual(x, y, self.dropout, self.training)
        x = self.out_norm(x)

        z = self.fc1(z)
        z = z.transpose(0, 1)
        for proj, conv in zip(self.projections, self.convolutions):
            r = z if proj is None else proj(z)  # residual comes from the conv-path input
            z = F.dropout(z, p=self.dropout, training=self.training)
            z = conv(z)
            z = F.glu(z, dim=2)
            z = (z + r) * math.sqrt(0.5)
        z = z.transpose(1, 0)
        z = self.fc2(z)
        return (x, z)

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        if self.pos == "learned":
            return self.embed_positions.max_positions()
        else:
            return 1024
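
In the convolutional path above, each ConvTBC emits 2 * out_channels so that the GLU halves it back, and the residual is scaled by sqrt(0.5) to keep variances roughly constant. The sketch below reproduces one such block with nn.Conv1d standing in for ConvTBC (batch-first rather than time-first), with the input projection omitted:

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

channels, kernel_size = 256, 3
conv = nn.Conv1d(channels, 2 * channels, kernel_size,
                 padding=(kernel_size - 1) // 2)

z = torch.randn(8, channels, 20)   # (batch, channels, time)
r = z                              # residual branch
z = F.glu(conv(z), dim=1)          # gated linear unit halves the channels again
z = (z + r) * math.sqrt(0.5)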
Example No. 13
    def __init__(self,
                 dictionary,
                 embed_dim=256,
                 max_positions=1024,
                 pos="learned",
                 num_layers=2,
                 num_heads=8,
                 filter_size=256,
                 hidden_size=256,
                 dropout=0.1,
                 attention_dropout=0.1,
                 relu_dropout=0.1,
                 convolutions=((256, 3), ) * 4):
        super().__init__(dictionary)
        assert pos == "learned" or pos == "timing" or pos == "nopos"

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions,
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)

        self.layers = num_layers

        self.self_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()
        for i in range(num_layers):
            self.self_attention_blocks.append(
                MultiheadAttention(hidden_size, hidden_size, hidden_size,
                                   num_heads))
            self.ffn_blocks.append(
                FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
            self.norm1_blocks.append(LayerNormalization(hidden_size))
            self.norm2_blocks.append(LayerNormalization(hidden_size))
        self.out_norm = LayerNormalization(hidden_size)

        in_channels = convolutions[0][0]
        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        for (out_channels, kernel_size) in convolutions:
            pad = (kernel_size - 1) // 2  # integer padding required by ConvTBC
            self.projections.append(
                Linear(in_channels, out_channels
                       ) if in_channels != out_channels else None)
            self.convolutions.append(
                ConvTBC(in_channels,
                        out_channels * 2,
                        kernel_size,
                        padding=pad,
                        dropout=dropout))
            in_channels = out_channels
        self.fc2 = Linear(in_channels, embed_dim)
Example No. 14
def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx, left_pad):
    m = SinusoidalPositionalEmbedding(embedding_dim, padding_idx, left_pad,
                                      num_embeddings + padding_idx + 1)
    return m
Example No. 15
    def __init__(self,
                 args,
                 dictionary,
                 embed_tokens,
                 left_pad=False,
                 final_norm=True):
        super().__init__(dictionary)
        self.padding_idx = embed_tokens.padding_idx
        self.dropout = args.dropout
        self.share_input_output_embed = args.share_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.embed_dim
        output_embed_dim = args.output_dim

        padding_idx = embed_tokens.padding_idx

        self.embed_tokens = embed_tokens
        #self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim

        self.max_positions = args.max_positions + 1

        self.embed_segment = nn.Embedding(
            args.num_segment,
            embed_dim,
            self.padding_idx,
        ) if args.num_segment > 0 else None

        self.project_in_dim = nn.Linear(
            input_embed_dim, embed_dim,
            bias=False) if embed_dim != input_embed_dim else None
        self.prediction_word_embedding = nn.Parameter(
            torch.Tensor(1, 1, embed_dim).zero_())
        self.embed_positions = PositionalEmbedding(
            self.max_positions,
            embed_dim,
            padding_idx,
            left_pad=left_pad,
        ) if not args.no_token_positional_embeddings else None

        def make_layers(args, layers, needs_key_values):
            if args.universal:
                layers = [
                    ShuffleTransformerDecoderLayer(
                        args, needs_key_values=needs_key_values)
                ] * layers
            else:
                layers = [
                    ShuffleTransformerDecoderLayer(
                        args, needs_key_values=needs_key_values)
                    for _ in range(layers)
                ]
            return nn.ModuleList(layers)

        self.stacked_decoder = args.stacked_decoder
        self.encoder_layers = make_layers(args,
                                          args.encoder_layers,
                                          needs_key_values=True)
        self.decoder_layers = make_layers(
            args, args.decoder_layers,
            needs_key_values=False) if args.asymmetric else self.encoder_layers

        if not args.stacked_decoder and args.encoder_layers != args.decoder_layers:
            raise ValueError(
                "If not using stacked-decoder, encoder and decoder must have the same number of layers"
            )
        if not args.asymmetric and args.encoder_layers != args.decoder_layers:
            raise ValueError(
                "If not using asymmetric, encoder and decoder must have the same number of layers"
            )

        if args.relative_position == 'sinusoidal':
            num_positions = self.max_positions
            sinusoidal_positions = SinusoidalPositionalEmbedding.get_embedding(
                num_positions, args.embed_dim // args.attention_heads)
            sinusoidal_relative_positions = []
            for i in range(num_positions):
                sinusoidal_relative_positions.append(
                    torch.cat([
                        sinusoidal_positions[num_positions - i:],
                        sinusoidal_positions[:num_positions - i]
                    ], 0))
                # Make sentinel token have same relative position to everything
                sinusoidal_relative_positions[-1][0] = 0
                assert sinusoidal_relative_positions[-1].size(
                ) == sinusoidal_positions.size()
            sinusoidal_relative_positions = torch.stack(
                sinusoidal_relative_positions, 0)
            self.sinusoidal_relative_positions = nn.Parameter(
                sinusoidal_relative_positions)

            assert sinusoidal_relative_positions.size() == (
                num_positions, num_positions,
                args.embed_dim // args.attention_heads)
            #assert (sinusoidal_relative_positions[0] == sinusoidal_positions).all()
            assert (sinusoidal_relative_positions[7, 7] ==
                    sinusoidal_relative_positions[11, 11]).all()
            assert (sinusoidal_relative_positions[5, 11] ==
                    sinusoidal_relative_positions[6, 12]).all()
        else:
            self.sinusoidal_relative_positions = None

        self.adaptive_softmax = None

        self.project_out_dim = nn.Linear(embed_dim, output_embed_dim, bias=False) \
            if embed_dim != output_embed_dim and not args.tie_adaptive_weights else None

        self.load_softmax = not getattr(args, 'remove_head', False)
        if self.load_softmax:
            if args.adaptive_softmax_cutoff is not None:
                self.adaptive_softmax = AdaptiveSoftmax(
                    len(dictionary),
                    output_embed_dim,
                    options.eval_str_list(args.adaptive_softmax_cutoff,
                                          type=int),
                    dropout=args.adaptive_softmax_dropout,
                    adaptive_inputs=embed_tokens
                    if args.tie_adaptive_weights else None,
                    factor=args.adaptive_softmax_factor,
                    tie_proj=args.tie_adaptive_proj,
                )
            elif not self.share_input_output_embed:
                self.embed_out = nn.Parameter(
                    torch.Tensor(len(dictionary), output_embed_dim))
                nn.init.normal_(self.embed_out,
                                mean=0,
                                std=output_embed_dim**-0.5)

            #if args.sentence_class_num > 0:
            #    self.sentence_projection_layer = Linear(embed_dim, args.sentence_class_num, bias=False)

        self.normalize = args.normalize_before and final_norm
        if self.normalize:
            self.layer_norm = BertLayerNorm(embed_dim)

        self.apply(self.init_bert_weights)
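
The 'sinusoidal' relative-position branch above stacks row-rotated copies of the absolute table so that entry [i, j] depends only on the offset j - i (sentinel handling omitted here). The invariants it asserts hold for any per-position table, as this standalone sketch shows with a random stand-in for SinusoidalPositionalEmbedding.get_embedding:

import torch

n, dim_per_head = 16, 8
table = torch.randn(n, dim_per_head)   # stand-in for the sinusoidal table
relative = torch.stack([torch.roll(table, shifts=i, dims=0) for i in range(n)], 0)
# relative[i, j] holds the row for offset (j - i) mod n, hence:
assert torch.equal(relative[7, 7], relative[11, 11])
assert torch.equal(relative[5, 11], relative[6, 12])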
Example No. 16
class TransformerEncoder(FairseqEncoder):
    """Transformer encoder."""
    def __init__(self,
                 dictionary,
                 embed_dim=256,
                 max_positions=1024,
                 pos="learned",
                 num_layers=2,
                 num_heads=8,
                 filter_size=256,
                 hidden_size=256,
                 dropout=0.1,
                 attention_dropout=0.1,
                 relu_dropout=0.1,
                 rank_scale=0.0):
        super().__init__(dictionary)
        assert pos == "learned" or pos == "timing" or pos == "nopos"

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions,
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)

        self.layers = num_layers

        self.self_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()
        for i in range(num_layers):
            self.self_attention_blocks.append(
                MultiheadAttention(hidden_size,
                                   hidden_size,
                                   hidden_size,
                                   num_heads,
                                   rank_scale=rank_scale))
            self.ffn_blocks.append(
                FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
            self.norm1_blocks.append(LayerNormalization(hidden_size))
            self.norm2_blocks.append(LayerNormalization(hidden_size))
        self.out_norm = LayerNormalization(hidden_size)

    def forward(self, src_tokens, src_lengths):
        # embed tokens plus positions
        input_to_padding = attention_bias_ignore_padding(
            src_tokens, self.dictionary.pad())
        encoder_self_attention_bias = encoder_attention_bias(input_to_padding)
        encoder_input = self.embed_tokens(src_tokens)
        if self.pos != "nopos":
            encoder_input += self.embed_positions(src_tokens)

        x = F.dropout(encoder_input, p=self.dropout, training=self.training)

        for self_attention, ffn, norm1, norm2 in zip(
                self.self_attention_blocks, self.ffn_blocks, self.norm1_blocks,
                self.norm2_blocks):
            y = self_attention(norm1(x), None, encoder_self_attention_bias)
            x = residual(x, y, self.dropout, self.training)
            y = ffn(norm2(x))
            x = residual(x, y, self.dropout, self.training)
        x = self.out_norm(x)
        return x

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        if self.pos == "learned":
            return self.embed_positions.max_positions()
        else:
            return 1024
Example No. 17
    def __init__(self,
                 dictionary,
                 embed_dim=256,
                 max_positions=1024,
                 pos="learned",
                 num_layers=2,
                 num_heads=8,
                 filter_size=256,
                 hidden_size=256,
                 dropout=0.1,
                 attention_dropout=0.1,
                 relu_dropout=0.1,
                 share_embed=False,
                 rank_scale=0.0):
        super().__init__(dictionary)
        self.register_buffer('version', torch.Tensor([2]))
        assert pos == "learned" or pos == "timing" or pos == "nopos"

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions,
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_TARGET)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_TARGET)

        self.layers = num_layers

        self.self_attention_blocks = nn.ModuleList()
        self.encdec_attention_blocks = nn.ModuleList()
        self.ffn_blocks = nn.ModuleList()
        self.norm1_blocks = nn.ModuleList()
        self.norm2_blocks = nn.ModuleList()
        self.norm3_blocks = nn.ModuleList()
        for i in range(num_layers):
            self.self_attention_blocks.append(
                MultiheadAttentionDecoder(hidden_size,
                                          hidden_size,
                                          hidden_size,
                                          num_heads,
                                          rank_scale=rank_scale))
            self.ffn_blocks.append(
                FeedForwardNetwork(hidden_size, filter_size, relu_dropout))
            self.norm1_blocks.append(LayerNormalization(hidden_size))
            self.norm2_blocks.append(LayerNormalization(hidden_size))
            self.norm3_blocks.append(LayerNormalization(hidden_size))
            self.encdec_attention_blocks.append(
                MultiheadAttention(hidden_size,
                                   hidden_size,
                                   hidden_size,
                                   num_heads,
                                   rank_scale=rank_scale))
        self.out_norm = LayerNormalization(hidden_size)
        out_embed_dim = hidden_size
        if share_embed:
            assert out_embed_dim == embed_dim, \
                "Shared embed weights implies same dimensions " \
                " out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim)
            self.out_embed = nn.Linear(hidden_size, num_embeddings)
            self.out_embed.weight = self.embed_tokens.weight
        else:
            self.out_embed = Linear(hidden_size,
                                    num_embeddings,
                                    dropout=dropout)
Example No. 18
class DPEncoder(FairseqEncoder):
    """Transformer encoder."""
    def __init__(self,
                 dictionary,
                 embed_dim=256,
                 max_positions=1024,
                 pos="learned",
                 num_layers=2,
                 num_heads=8,
                 filter_size=256,
                 hidden_size=256,
                 dropout=0.1,
                 attention_dropout=0.1,
                 relu_dropout=0.1,
                 convolutions=4):
        super().__init__(dictionary)
        assert pos == "learned" or pos == "timing" or pos == "nopos"

        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.relu_dropout = relu_dropout
        self.pos = pos

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        if self.pos == "learned":
            self.embed_positions = PositionalEmbedding(
                max_positions,
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)
        if self.pos == "timing":
            self.embed_positions = SinusoidalPositionalEmbedding(
                embed_dim,
                padding_idx,
                left_pad=LanguagePairDataset.LEFT_PAD_SOURCE)

        self.layers = num_layers
        self.attnpath = AttnPathEncoder(self.layers,
                                        num_heads=num_heads,
                                        filter_size=filter_size,
                                        hidden_size=hidden_size,
                                        dropout=dropout,
                                        attention_dropout=attention_dropout,
                                        relu_dropout=relu_dropout)
        self.cnnpath = CNNPathEncoder(self.layers,
                                      hidden_size=hidden_size,
                                      dropout=dropout,
                                      in_embed=hidden_size,
                                      out_embed=hidden_size)

    def forward(self, src_tokens, src_lengths):
        # embed tokens plus positions
        input_to_padding = attention_bias_ignore_padding(
            src_tokens, self.dictionary.pad())
        encoder_self_attention_bias = encoder_attention_bias(input_to_padding)
        encoder_input = self.embed_tokens(src_tokens)
        if self.pos != "nopos":
            encoder_input += self.embed_positions(src_tokens)

        x = F.dropout(encoder_input, p=self.dropout, training=self.training)

        attn_x = self.attnpath(x)
        cnn_x = self.cnnpath(x)

        return (attn_x, cnn_x)

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        if self.pos == "learned":
            return self.embed_positions.max_positions()
        else:
            return 1024