def __init__(self, args):
        super().__init__()
        self.embed_dim = args['model']['encoder_embed_dim']
        if args['model']['multihead_attention_version'] == 'pytorch':
            from ncc.modules.attention.pytorch_multihead_attention import PytorchMultiheadAttention
            self.self_attn = PytorchMultiheadAttention(
                self.embed_dim,
                args['model']['encoder_attention_heads'],
                dropout=args['model']['attention_dropout'])
        elif args['model']['multihead_attention_version'] == 'ncc':
            from ncc.modules.attention.ncc_multihead_attention import NccMultiheadAttention
            self.self_attn = NccMultiheadAttention(
                self.embed_dim,
                args['model']['encoder_attention_heads'],
                dropout=args['model']['attention_dropout'],
            )

        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
        self.dropout = args['model']['dropout']
        self.activation_fn = get_activation(
            activation_string=args['model'].get('activation_fn', 'relu'))
        self.activation_dropout = args['model']['activation_dropout']
        if self.activation_dropout == 0:
            # for backwards compatibility with models that use args.relu_dropout
            self.activation_dropout = args['model']['relu_dropout']
        self.normalize_before = args['model']['encoder_normalize_before']
        self.fc1 = Linear(self.embed_dim,
                          args['model']['encoder_ffn_embed_dim'])
        self.fc2 = Linear(args['model']['encoder_ffn_embed_dim'],
                          self.embed_dim)
        self.final_layer_norm = LayerNorm(self.embed_dim)
Example 2
    def __init__(
        self,
        embed_dim,
        num_heads,
        kdim=None,
        vdim=None,
        max_positions=None,
        dropout=0.0,
    ):
        super(MultiheadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert (self.head_dim * num_heads == self.embed_dim
                ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        # lower-triangular (causal) mask, shaped [1, 1, max_positions, max_positions]
        self.bias = nn.Parameter(
            torch.tril(torch.ones(max_positions, max_positions))
            .view(1, 1, max_positions, max_positions))

        self.k_proj = Linear(self.kdim, embed_dim)
        self.v_proj = Linear(self.vdim, embed_dim)
        self.q_proj = Linear(embed_dim, embed_dim)
        self.out_proj = Linear(embed_dim, embed_dim)
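
Example 2 only constructs the projections, the `head_dim ** -0.5` scaling and a lower-triangular mask; the forward pass is not shown. Below is a minimal illustrative sketch, not taken from the repository, of how such a causal mask and scaling are typically applied in scaled dot-product attention.

# Illustrative only: how a causal mask like `self.bias` and the
# `head_dim ** -0.5` scaling are typically used at attention time.
import torch
import torch.nn.functional as F

def causal_attention(q, k, v, causal_mask, scaling, dropout_p=0.0, training=False):
    # q, k, v: [bsz, num_heads, seq_len, head_dim]
    # causal_mask: [1, 1, max_positions, max_positions], ones on/below the diagonal
    seq_len = q.size(-2)
    attn_weights = torch.matmul(q, k.transpose(-1, -2)) * scaling
    attn_weights = attn_weights.masked_fill(
        causal_mask[:, :, :seq_len, :seq_len] == 0, float('-inf'))
    attn_probs = F.softmax(attn_weights, dim=-1)
    attn_probs = F.dropout(attn_probs, p=dropout_p, training=training)
    return torch.matmul(attn_probs, v)   # [bsz, num_heads, seq_len, head_dim]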
Example 3
 def __init__(self,
              input_embed_dim,
              source_embed_dim,
              output_embed_dim,
              bias=False):
     super().__init__()
     self.input_proj = Linear(input_embed_dim, source_embed_dim, bias=bias)
     self.output_proj = Linear(input_embed_dim + source_embed_dim,
                               output_embed_dim,
                               bias=bias)
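
Example 3 builds a fairseq-style attention layer from two projections. The following sketch shows one common (Luong-style) way such an `input_proj`/`output_proj` pair is used; the shapes and the forward wiring here are assumptions, not the repository's actual code.

# Hypothetical forward for the attention layer above (Luong/fairseq style).
import torch
import torch.nn.functional as F

def attention_forward(layer, query, source_hids, encoder_padding_mask=None):
    # query: [bsz, input_embed_dim], source_hids: [src_len, bsz, source_embed_dim]
    x = layer.input_proj(query)                                   # [bsz, source_embed_dim]
    attn_scores = (source_hids * x.unsqueeze(0)).sum(dim=2)       # [src_len, bsz]
    if encoder_padding_mask is not None:                          # mask: [src_len, bsz], True at pads
        attn_scores = attn_scores.masked_fill(encoder_padding_mask, float('-inf'))
    attn_scores = F.softmax(attn_scores, dim=0)
    context = (attn_scores.unsqueeze(2) * source_hids).sum(dim=0)  # [bsz, source_embed_dim]
    out = torch.tanh(layer.output_proj(torch.cat((context, query), dim=1)))
    return out, attn_scores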
Example 4
    def __init__(
        self, dictionary, src_modalities=['code'], embed_dim=512, hidden_size=512, out_embed_dim=512,
        num_layers=1, dropout_in=0.1, dropout_out=0.1, attention=True,
        encoder_output_units=512, pretrained_embed=None,
        share_input_output_embed=False, adaptive_softmax_cutoff=None,
        max_target_positions=DEFAULT_MAX_TARGET_POSITIONS
    ):
        super().__init__(dictionary)
        self.src_modalities = src_modalities
        self.dropout_in = dropout_in
        self.dropout_out = dropout_out
        self.hidden_size = hidden_size
        self.share_input_output_embed = share_input_output_embed
        self.need_attn = True
        self.max_target_positions = max_target_positions

        self.adaptive_softmax = None
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        if pretrained_embed is None:
            self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        else:
            self.embed_tokens = pretrained_embed

        self.encoder_output_units = encoder_output_units
        if encoder_output_units != hidden_size and encoder_output_units != 0:
            self.encoder_hidden_proj = Linear(encoder_output_units, hidden_size)
            self.encoder_cell_proj = Linear(encoder_output_units, hidden_size)
        else:
            self.encoder_hidden_proj = self.encoder_cell_proj = None

        # disable input feeding if there is no encoder
        # input feeding is described in arxiv.org/abs/1508.04025
        input_feed_size = 0 if encoder_output_units == 0 else hidden_size
        self.layers = nn.ModuleList([
            LSTMCell(
                input_size=input_feed_size + embed_dim if layer == 0 else hidden_size,
                hidden_size=hidden_size,
            )
            for layer in range(num_layers)
        ])
        if attention:
            # TODO make bias configurable
            # self.attention = AttentionLayer(hidden_size, encoder_output_units, hidden_size, bias=False)
            self.attention = None
        else:
            self.attention = None
        if hidden_size != out_embed_dim:
            self.additional_fc = Linear(hidden_size, out_embed_dim)
        # if adaptive_softmax_cutoff is not None:
        #     # setting adaptive_softmax dropout to dropout_out for now but can be redefined
        #     self.adaptive_softmax = AdaptiveSoftmax(num_embeddings, hidden_size, adaptive_softmax_cutoff,
        #                                             dropout=dropout_out)
        # With the adaptive-softmax branch commented out above, the output projection
        # needs its own `if`; an `elif` would wrongly chain it to the hidden/out check.
        if not self.share_input_output_embed:
            self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out)
Example 5
    def __init__(
        self,
        embed_dim,
        num_heads,
        kdim=None,
        vdim=None,
        dropout=0.0,
        bias=True,
        add_bias_kv=False,
        add_zero_attn=False,
        self_attention=False,
        encoder_decoder_attention=False,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert (
            self.head_dim * num_heads == self.embed_dim
        ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim ** -0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert not self.self_attention or self.qkv_same_dim, (
            "Self-attention requires query, key and " "value to be of the same size"
        )

        self.k_proj = Linear(
            self.kdim, embed_dim, bias=bias,
            weight_initializer=trunc_normal(mean=.0, std=.02),
        )
        self.v_proj = Linear(
            self.vdim, embed_dim, bias=bias,
            weight_initializer=trunc_normal(mean=.0, std=.02),
        )
        self.q_proj = Linear(
            embed_dim, embed_dim, bias=bias,
            weight_initializer=trunc_normal(mean=.0, std=.02),
        )
        self.out_proj = Linear(
            embed_dim, embed_dim, bias=bias,
            weight_initializer=trunc_normal(mean=.0, std=.02),
        )
        self.add_zero_attn = add_zero_attn
Example 6
    def __init__(self,
                 args,
                 no_encoder_attn=False,
                 add_bias_kv=False,
                 add_zero_attn=False):
        super().__init__()
        self.embed_dim = args['model']['decoder_embed_dim']
        self.cross_self_attention = args['model']['cross_self_attention']
        self.self_attn = NccMultiheadAttention(
            embed_dim=self.embed_dim,
            num_heads=args['model']['decoder_attention_heads'],
            dropout=args['model']['attention_dropout'],
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
            self_attention=not self.cross_self_attention,
            # maximum_relative_position=args['model']['decoder_max_relative_len'],
        )
        self.dropout = args['model']['dropout']
        self.activation_fn = get_activation(args['model']['activation_fn'])
        self.activation_dropout = args['model']['activation_dropout']
        if self.activation_dropout == 0:
            # for backwards compatibility with models that use args.relu_dropout
            self.activation_dropout = args['model']['relu_dropout']
        self.normalize_before = args['model']['decoder_normalize_before']

        self.self_attn_layer_norm = LayerNorm(self.embed_dim)

        if no_encoder_attn:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None
        else:
            self.encoder_attn = NccMultiheadAttention(
                self.embed_dim,
                args['model']['decoder_attention_heads'],
                kdim=args['model']['encoder_embed_dim'],
                vdim=args['model']['encoder_embed_dim'],
                dropout=args['model']['attention_dropout'],
                encoder_decoder_attention=True,
            )
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)

        self.fc1 = Linear(self.embed_dim,
                          args['model']['decoder_ffn_embed_dim'])
        self.fc2 = Linear(args['model']['decoder_ffn_embed_dim'],
                          self.embed_dim)

        self.final_layer_norm = LayerNorm(self.embed_dim)
        self.need_attn = True
Example 7
 def __init__(
     self,
     dictionary,
     embed_dim=512,
     hidden_size=512,
     num_layers=1,
     bidirectional=False,
     dropout=0.5,
     pretrained_embed=None,
     shared_embedding=False,
 ):
     super(LSTMDecoder, self).__init__(dictionary)
     if pretrained_embed is None:
         self.embed_tokens = Embedding(len(dictionary),
                                       embed_dim,
                                       padding_idx=dictionary.pad())
     else:
         self.embed_tokens = pretrained_embed
     self.rnn = LSTM(
         embed_dim,
         hidden_size,
         num_layers=num_layers,
         dropout=dropout,
         batch_first=True,
         bidirectional=False,  # in prediction task, cannot set bidirectional True
     )
     # self.dropout = dropout
     # self.bidirectional = bidirectional
     # if bidirectional:
     #     self.proj = Linear(hidden_size * 2, hidden_size)
     self.fc_out = Linear(hidden_size, len(dictionary))
     if shared_embedding:
         self.fc_out.weight = self.embed_tokens.weight
Example 8
 def __init__(self, embed_dim, attention_heads, dropout, ffn_embed_dim, activation_fn):
     super().__init__()
     self.dropout = dropout
     self.self_attn = MultiheadAttention(
         embed_dim=embed_dim, num_heads=attention_heads, dropout=dropout,
     )
     self.self_attn_layer_norm = LayerNorm(embed_dim)
     self.fc1 = Linear(
         embed_dim, ffn_embed_dim,
         weight_initializer=trunc_normal(mean=.0, std=.02),
     )
     self.fc2 = Linear(
         ffn_embed_dim, embed_dim,
         weight_initializer=trunc_normal(mean=.0, std=.02),
     )
     self.ff_layer_norm = LayerNorm(embed_dim)
     self.activation_fn = get_activation(activation_fn)
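
Example 8 assembles a Transformer encoder layer (self-attention, two feed-forward projections, two layer norms). The sketch below illustrates a standard post-norm forward pass over these sub-modules; it assumes a fairseq-style `self_attn` signature `forward(query, key, value, key_padding_mask=...)` returning `(output, attention weights)`, which may differ from the repository's implementation.

# Sketch of a post-norm encoder-layer forward over the modules built above.
import torch.nn.functional as F

def encoder_layer_forward(layer, x, self_attn_padding_mask=None):
    # x: [seq_len, bsz, embed_dim]
    residual = x
    x, _ = layer.self_attn(query=x, key=x, value=x,
                           key_padding_mask=self_attn_padding_mask)
    x = F.dropout(x, p=layer.dropout, training=layer.training)
    x = layer.self_attn_layer_norm(residual + x)

    residual = x
    x = layer.activation_fn(layer.fc1(x))
    x = layer.fc2(x)
    x = F.dropout(x, p=layer.dropout, training=layer.training)
    x = layer.ff_layer_norm(residual + x)
    return x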
Example 9
    def __init__(
        self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False
    ):
        super().__init__()
        self.embed_dim = args['model']['decoder_embed_dim']
        self.cross_self_attention = args['model']['cross_self_attention']
        self.self_attn = RelativeMultiheadAttention(
            embed_dim=self.embed_dim,
            num_heads=args['model']['decoder_attention_heads'],
            dropout=args['model']['attention_dropout'],
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
            self_attention=not self.cross_self_attention,
            maximum_relative_position=args['model']['decoder_max_relative_len'],
        )
        self.dropout = args['model']['dropout']
        self.activation_fn = get_activation(args['model']['activation_fn'])
        self.activation_dropout = args['model']['activation_dropout']
        if self.activation_dropout == 0:
            # for backwards compatibility with models that use args.relu_dropout
            self.activation_dropout = args['model']['relu_dropout']

        # use LayerNorm rather than FusedLayerNorm for exporting;
        # char_inputs can be used to determine this.
        # TODO: remove this once we update apex with the fix
        self.self_attn_layer_norm = LayerNorm(self.embed_dim)

        if no_encoder_attn:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None
        else:
            self.encoder_attn = RelativeMultiheadAttention(
                self.embed_dim,
                args['model']['decoder_attention_heads'],
                kdim=args['model']['encoder_embed_dim'],
                vdim=args['model']['encoder_embed_dim'],
                dropout=args['model']['attention_dropout'],
                encoder_decoder_attention=True,
            )
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)

        self.fc1 = Linear(self.embed_dim, args['model']['decoder_ffn_embed_dim'])
        self.fc2 = Linear(args['model']['decoder_ffn_embed_dim'], self.embed_dim)

        self.ff_layer_norm = LayerNorm(self.embed_dim)
        self.need_attn = True
Example 10
    def __init__(self,
                 dictionary,
                 embed_dim=400,
                 pos_len=100,
                 pos_dim=50,
                 hidden_size=400,
                 out_embed_dim=400,
                 num_layers=1,
                 dropout_in=0.5,
                 dropout_out=0.5,
                 encoder_output_units=400,
                 pretrained_embed=None,
                 share_input_output_embed=False,
                 max_target_positions=DEFAULT_MAX_TARGET_POSITIONS):
        super().__init__(dictionary)
        self.dropout_in = dropout_in
        self.dropout = dropout_out
        self.hidden_size = hidden_size
        self.share_input_output_embed = share_input_output_embed
        self.max_target_positions = max_target_positions

        num_embeddings = len(dictionary)
        if pretrained_embed is None:
            self.embed_tokens = Embedding(num_embeddings,
                                          embed_dim,
                                          padding_idx=dictionary.pad())
        else:
            self.embed_tokens = pretrained_embed

        self.pos_len = pos_len + 1
        self.pos_dim = pos_dim
        self.pos_embed = Embedding(self.pos_len, pos_dim)

        # disable input feeding if there is no encoder
        # input feeding is described in arxiv.org/abs/1508.04025
        # self.layers = nn.ModuleList([
        #     LSTMCell(
        #         # input_size=encoder_output_units + pos_dim if layer == 0 else hidden_size,
        #         input_size=encoder_output_units if layer == 0 else hidden_size,
        #         hidden_size=hidden_size,
        #     )
        #     for layer in range(num_layers)
        # ])
        self.layers = nn.ModuleList([
            LSTM(
                in_dim=(encoder_output_units + pos_dim) if layer == 0 else hidden_size,
                # in_dim=encoder_output_units if layer == 0 else hidden_size,
                out_dim=hidden_size,
            ) for layer in range(num_layers)
        ])

        # W_H(h)+W_T(t) => fc_out
        self.W_H = nn.Linear(self.hidden_size, self.hidden_size)
        self.W_T = nn.Linear(self.hidden_size, self.hidden_size)

        if not self.share_input_output_embed:
            self.fc_out = Linear(out_embed_dim, num_embeddings)
Example 11
    def __init__(self, dictionary, embed_dim, out_channels, kernel_size,
                 **kwargs):
        super().__init__(dictionary)
        # word embedding + positional embedding
        self.embed = Embedding(
            len(dictionary), embed_dim)  # , padding_idx=self.dictionary.pad())

        self.position_encoding = kwargs.get('position_encoding', None)
        if self.position_encoding == 'learned':
            self.position_embed = Parameter(1,
                                            kwargs['max_tokens'],
                                            embed_dim,
                                            initializer=trunc_normal(mean=0.,
                                                                     std=0.02))
        else:
            self.position_embed = None
        # pooling
        pooling = kwargs.get('pooling', None)
        self.pooling = pooling1d(pooling)
        if pooling and 'weighted' in pooling:
            self.weight_layer = Linear(embed_dim, 1, bias=False)
        else:
            self.weight_layer = None
        # conv1d
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        # padding mode = ['valid'(default), 'same']
        self.padding = kwargs.get('padding', 'valid')
        if self.padding == 'same':
            self.padding_size = []
            for kernel_sz in self.kernel_size:
                padding_right = (kernel_sz - 1) // 2
                padding_left = kernel_sz - 1 - padding_right
                self.padding_size.append((
                    0,
                    0,
                    padding_left,
                    padding_right,
                ))
        self.conv_layers = nn.ModuleList([])
        # input: [bsz, 1, seq_len, embed_dim]
        # filters = 1 -> embed_dim
        # kernel_size = (kernel_width, embed_dim)
        # =>  output: [bsz, embed_dim, seq_len - kernel_width + 1]
        for idx, kernel_sz in enumerate(self.kernel_size):
            self.conv_layers.append(
                Conv2d(in_channels=1,
                       out_channels=embed_dim,
                       kernel_size=(kernel_sz, embed_dim)))

        self.residual = kwargs.get('residual', False)  # residual
        self.dropout = kwargs.get('dropout', None)
        activation_fn = kwargs.get('activation_fn', None)
        self.activation_fn = get_activation(
            activation_fn) if activation_fn else None
Example 12
    def __init__(self,
                 args,
                 no_encoder_attn=False,
                 add_bias_kv=False,
                 add_zero_attn=False):
        super().__init__()
        self.embed_dim = args['model']['decoder_embed_dim']
        self.dropout = args['model']['dropout']

        self.cross_self_attention = args['model'].get('cross_self_attention',
                                                      False)
        self.self_attn = self.build_self_attention(
            self.embed_dim,
            args,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
        )
        self.activation_fn = get_activation(args['model'].get(
            'activation_fn', 'relu'))
        self.activation_dropout = args['model'].get('activation_dropout', 0.)
        if self.activation_dropout == 0:
            # for backwards compatibility with models that use args.relu_dropout
            self.activation_dropout = args['model'].get('relu_dropout', 0.)
        self.normalize_before = args['model']['decoder_normalize_before']

        self.self_attn_layer_norm = LayerNorm(self.embed_dim)

        if no_encoder_attn:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None
        else:
            self.encoder_attn = self.build_encoder_attention(
                self.embed_dim, args)
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)

        self.fc1 = Linear(self.embed_dim,
                          args['model']['decoder_ffn_embed_dim'])
        self.fc2 = Linear(args['model']['decoder_ffn_embed_dim'],
                          self.embed_dim)

        self.final_layer_norm = LayerNorm(self.embed_dim)
        self.need_attn = True
Example 13
 def __init__(
     self,
     dictionary,
     embed_dim,
     # rnn config
     rnn_cell,
     rnn_hidden_dim,
     rnn_dropout=None,
     rnn_num_layers=2,
     rnn_bidirectional=False,
     # auxiliary input
     aux_dim=2,
     inner_dim=32,
     out_dim=2,
 ):
     super(DeepTuneEncoder, self).__init__(dictionary)
     self.embed = Embedding(len(dictionary), embed_dim)
     # LSTM
     self.rnn_dropout = rnn_dropout
     self.rnn = getattr(nn, str.upper(rnn_cell))(
         embed_dim,
         rnn_hidden_dim,
         num_layers=rnn_num_layers,
         dropout=self.rnn_dropout or 0.,  # rnn inner dropout between layers (None treated as 0)
         bidirectional=rnn_bidirectional,
         batch_first=True,
     )
     self.src_out_proj = nn.Sequential(
         Linear(rnn_hidden_dim, out_dim),
         nn.Sigmoid(),
     )
     # Auxiliary inputs. wgsize and dsize
     self.bn = BatchNorm1d(rnn_hidden_dim + aux_dim)
     self.hybrid_out_proj = nn.Sequential(
         Linear(rnn_hidden_dim + aux_dim, inner_dim),
         nn.ReLU(),
         Linear(inner_dim, out_dim),
         nn.Sigmoid(),
     )
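
Example 13 wires up a DeepTune-style hybrid model: an RNN over token embeddings plus auxiliary inputs (wgsize, dsize) fed through batch norm. A hedged sketch of how these pieces are usually combined follows; the encoder's actual forward method is not shown above, so treat the wiring as an assumption.

# Illustrative DeepTune-style forward over the modules above.
import torch

def deeptune_forward(encoder, src_tokens, aux_input):
    # src_tokens: [bsz, seq_len] token ids, aux_input: [bsz, aux_dim] (e.g. wgsize, dsize)
    x = encoder.embed(src_tokens)                     # [bsz, seq_len, embed_dim]
    rnn_out, _ = encoder.rnn(x)                       # [bsz, seq_len, rnn_hidden_dim]
    last_hidden = rnn_out[:, -1, :]                   # final time step
    src_only = encoder.src_out_proj(last_hidden)      # source-code-only prediction
    hybrid = encoder.bn(torch.cat([last_hidden, aux_input], dim=-1))
    return src_only, encoder.hybrid_out_proj(hybrid)  # hybrid prediction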
Example 14
 def __init__(self, args):
     super().__init__()
     self.embed_dim = args['model']['encoder_embed_dim']
     self.self_attn = RelativeMultiheadAttention(
         self.embed_dim,
         args['model']['encoder_attention_heads'],
         dropout=args['model']['attention_dropout'],
         self_attention=True,
         maximum_relative_position=args['model']['encoder_max_relative_len'])
     self.self_attn_layer_norm = LayerNorm(self.embed_dim)
     self.dropout = args['model']['dropout']
     self.activation_fn = get_activation(args['model']['activation_fn'])
     self.activation_dropout = args['model']['activation_dropout']
     if self.activation_dropout == 0:
         # for backwards compatibility with models that use args.relu_dropout
         self.activation_dropout = args['model']['relu_dropout']
     self.fc1 = Linear(self.embed_dim,
                       args['model']['encoder_ffn_embed_dim'])
     self.fc2 = Linear(args['model']['encoder_ffn_embed_dim'],
                       self.embed_dim)
     self.ff_layer_norm = LayerNorm(self.embed_dim)
Example 15
    def __init__(self,
                 dictionary, embed_dim, token_types, max_positions,
                 self_attn_layers, attention_heads, ffn_embed_dim, activation_fn,
                 dropout, **kwargs,
                 ):
        super(SelfAttnEncoder, self).__init__(dictionary)
        # word embedding
        self.embed = Embedding(
            len(dictionary), embed_dim, padding_idx=self.dictionary.pad(),
            initializer=trunc_normal(mean=.0, std=.02),
        )
        # type embedding
        if token_types is not None:
            self.type_embed = Embedding(
                token_types, embed_dim,
                initializer=trunc_normal(mean=.0, std=.02),
            )
        else:
            self.type_embed = None
        # positional embedding
        if max_positions is not None:
            self.positional_embed = Parameter(
                1, max_positions, embed_dim,
                initializer=trunc_normal(mean=.0, std=.02),
            )
        else:
            self.positional_embed = None
        # layer norm for embedding
        self.embed_layer_norm = LayerNorm(embed_dim)
        self.dropout = dropout

        # self attn
        self.num_layers = self_attn_layers
        self.layers = nn.ModuleList(
            [TransformerEncoderLayer(embed_dim, attention_heads, dropout, ffn_embed_dim, activation_fn)
             for _ in range(self_attn_layers)]
        )

        # pooling
        pooling = kwargs.get('pooling', None)
        self.pooling = pooling1d(pooling)
        if pooling and 'weighted' in pooling:
            self.weight_layer = Linear(embed_dim, 1, bias=False, weight_initializer=xavier_uniform())
        else:
            self.weight_layer = None
Example 16
 def __init__(
     self,
     dictionary,
     embed_dim,
     pooling='weighted_mean',
     dropout=0.1,
     **kwargs,
 ):
     super().__init__(dictionary)
     self.padding_idx = self.dictionary.pad()
     self.embed = Embedding(len(dictionary),
                            embed_dim,
                            padding_idx=self.padding_idx,
                            initializer=xavier_uniform())
     self.dropout = dropout
     self.pooling = pooling1d(pooling)
     if self.pooling:
         self.weight_layer = Linear(embed_dim, 1, bias=False, weight_initializer=xavier_uniform()) \
             if 'weighted' in pooling else None
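
Example 16 (like several encoders above) pairs a `pooling1d` helper with a one-dimensional `weight_layer` for weighted pooling. The repository's `pooling1d` is not shown here; the sketch below is one plausible weighted-mean implementation for such a weight layer, offered only as an illustration.

# One plausible weighted-mean pooling given a Linear(embed_dim, 1) weight layer;
# the repository's pooling1d helper may implement this differently.
import torch.nn.functional as F

def weighted_mean_pool(weight_layer, embeddings, padding_mask=None):
    # embeddings: [bsz, seq_len, embed_dim], padding_mask: [bsz, seq_len] True at pads
    scores = weight_layer(embeddings).squeeze(-1)      # [bsz, seq_len]
    if padding_mask is not None:
        scores = scores.masked_fill(padding_mask, float('-inf'))
    weights = F.softmax(scores, dim=-1).unsqueeze(-1)  # [bsz, seq_len, 1]
    return (weights * embeddings).sum(dim=1)           # [bsz, embed_dim]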
Example 17
    def __init__(
        self,
        dictionary,
        embed_dim,
        embed_out,
        dropout,
        edge_types,
        # scoring/transform MLPs
        out_dropout,
        dim_inner,
        dim_out,
    ):
        super(PoemEncoder, self).__init__(dictionary)
        # embedding block
        if dictionary is not None:
            self.embed = Embedding(len(dictionary), embed_dim)
        else:
            self.embed = None
        self.embed_modules = nn.Sequential(
            Linear(embed_dim, embed_out, bias=False), nn.ReLU(),
            nn.Dropout(dropout))
        # MLP-GNN
        self.gnn_modules = GNNEncoder(edge_types, dim_in=embed_out, dim_inner=dim_out, dim_out=embed_out, \
                                      dropout=dropout)

        # scoring MLP
        def get_mlp():
            return nn.Sequential(
                nn.Dropout(out_dropout),
                nn.Linear(embed_dim + embed_out, dim_inner, bias=False),
                nn.ReLU(),
                nn.Linear(dim_inner, dim_out, bias=False),
                nn.ReLU(),
            )

        self.score_mlp = get_mlp()
        self.transform_mlp = get_mlp()
        self.out_linear = nn.Sequential(
            nn.Linear(dim_out, 2),
            nn.Sigmoid(),
        )
Example 18
 def __init__(
         self,
         dictionary,
         embed_dim,
         dropout,
         # rnn config
         rnn_cell,
         rnn_hidden_dim,
         rnn_dropout,
         rnn_num_layers=1,
         rnn_bidirectional=False,
         **kwargs):
     super().__init__(dictionary)
     # word embedding + positional embedding
     self.embed = Embedding(len(dictionary),
                            embed_dim,
                            initializer=xavier_uniform())
     self.dropout = dropout
     # pooling
     pooling = kwargs.get('pooling', None)
     self.pooling = pooling1d(pooling)
     if pooling and 'weighted' in pooling:
         self.weight_layer = Linear(embed_dim,
                                    1,
                                    bias=False,
                                    weight_initializer=xavier_uniform())
     else:
         self.weight_layer = None
     # rnn
     self.rnn_dropout = rnn_dropout
     self.rnn_num_layers = rnn_num_layers
     self.rnn_bidirectional = rnn_bidirectional
     self.rnn = getattr(nn, str.upper(rnn_cell))(
         embed_dim,
         rnn_hidden_dim,
         num_layers=rnn_num_layers,
         dropout=self.rnn_dropout,  # rnn inner dropout between layers
         bidirectional=rnn_bidirectional,
         batch_first=True,
     )
Example 19
    def __init__(self,
                 dictionary,
                 embed_dim=512,
                 hidden_size=512,
                 out_embed_dim=512,
                 num_layers=1,
                 dropout_in=0.1,
                 dropout_out=0.1,
                 attention=True,
                 encoder_output_units=512,
                 pretrained_embed=None,
                 share_input_output_embed=False,
                 adaptive_softmax_cutoff=None,
                 max_target_positions=DEFAULT_MAX_TARGET_POSITIONS):
        super().__init__(dictionary)
        self.dropout_in = dropout_in
        self.dropout_out = dropout_out
        self.hidden_size = hidden_size
        self.share_input_output_embed = share_input_output_embed
        self.need_attn = True
        self.max_target_positions = max_target_positions

        self.adaptive_softmax = None
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        if pretrained_embed is None:
            self.embed_tokens = Embedding(num_embeddings, embed_dim,
                                          padding_idx)
        else:
            self.embed_tokens = pretrained_embed

        self.encoder_output_units = encoder_output_units
        self.lstm = LSTM(hidden_size,
                         hidden_size,
                         dropout=dropout_in,
                         batch_first=True)
        self.fc_out = Linear(out_embed_dim, num_embeddings, bias=False)
Example 20
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))
        self._future_mask = torch.empty(0)

        self.dropout = args['model']['dropout']
        self.decoder_layerdrop = args['model']['decoder_layerdrop']
        self.share_input_output_embed = args['model'][
            'share_decoder_input_output_embed']

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args['model']['decoder_embed_dim']
        self.embed_dim = embed_dim
        self.output_embed_dim = args['model']['decoder_output_dim']

        self.padding_idx = dictionary.pad()  # embed_tokens.padding_idx TODO
        self.max_target_positions = args['task']['max_target_positions']

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args['model'][
            'no_scale_embedding'] else math.sqrt(embed_dim)

        self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                               if embed_dim != input_embed_dim else None)

        offset_positions_by_padding = args['model'].get(
            'offset_positions_by_padding', True)
        if args['model']['decoder_positional_embeddings']:
            self.embed_positions = None
        else:
            # Option 1
            if args['model'][
                    'decoder_position_encoding_version'] == 'ncc_sinusoidal':
                self.embed_positions = SinusoidalPositionalEmbedding(
                    self.embed_dim,
                    padding_idx=self.padding_idx if offset_positions_by_padding else None,
                    init_size=args['model']['max_target_positions'] + self.padding_idx + 1 \
                        if offset_positions_by_padding else args['model']['max_target_positions'],
                )
            # Option 2
            elif args['model'][
                    'decoder_position_encoding_version'] == 'ncc_learned':
                num_embeddings = args['model']['max_target_positions']
                if offset_positions_by_padding:
                    num_embeddings += self.padding_idx + 1
                m = LearnedPositionalEmbedding(
                    num_embeddings,
                    self.embed_dim,
                    padding_idx=self.padding_idx
                    if offset_positions_by_padding else None)
                nn.init.normal_(m.weight, mean=0, std=self.embed_dim**-0.5)
                if self.padding_idx is not None:
                    nn.init.constant_(m.weight[self.padding_idx], 0)
                self.embed_positions = m

        self.cross_self_attention = args['model']['cross_self_attention']
        self.layer_wise_attention = args['model']['layer_wise_attention']

        self.layers = nn.ModuleList([
            NccTransformerDecoderLayer(args, no_encoder_attn)
            for _ in range(args['model']['decoder_layers'])
        ])
        self.num_layers = len(self.layers)

        self.project_out_dim = (
            Linear(embed_dim, self.output_embed_dim, bias=False)
            if embed_dim != self.output_embed_dim
            and not args['model']['tie_adaptive_weights'] else None)

        self.out_generator = Linear(
            embed_dim,
            len(dictionary),
            bias=args['model']['decoder_out_embed_bias'])
        if self.share_input_output_embed:
            self.out_generator.weight = self.embed_tokens.weight

        if args['model']['decoder_normalize_before'] and not args['model'][
                'no_decoder_final_norm']:
            self.layer_norm = nn.LayerNorm(embed_dim)
        else:
            self.layer_norm = None
        if args['model']['layernorm_embedding']:
            self.layernorm_embedding = nn.LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None
Example 21
 def get_mlp():
     return nn.Sequential(
         Linear(dim_in, dim_inner, bias=False),
         nn.ReLU(),
         Linear(dim_inner, dim_out, bias=False),
     )
Example 22
 def __init__(self, input_size: int, hidden_size: int) -> None:
     super(NaryTreeLSTMCell, self).__init__()
     self.W_iou = Linear(input_size, 3 * hidden_size, bias=False)
     self.U_iou = Linear(2 * hidden_size, 3 * hidden_size, bias=False)
     self.b_iou = nn.Parameter(torch.zeros(1, 3 * hidden_size))
     self.U_f = Linear(2 * hidden_size, 2 * hidden_size)
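
Example 22 declares the parameters of an N-ary (binary) Tree-LSTM cell in the style of Tai et al. (2015). A sketch of the corresponding node update follows; how the actual encoder batches nodes and whether internal nodes also receive a token input x are assumptions here, not the repository's code.

# Sketch of the binary (N=2) Tree-LSTM node update these parameters support.
import torch

def nary_tree_lstm_node(cell, x, h_children, c_children):
    # x: [bsz, input_size]; h_children, c_children: [bsz, 2, hidden_size]
    bsz, _, hidden_size = h_children.shape
    h_cat = h_children.reshape(bsz, 2 * hidden_size)
    iou = cell.W_iou(x) + cell.U_iou(h_cat) + cell.b_iou
    i, o, u = torch.chunk(iou, 3, dim=1)
    i, o, u = torch.sigmoid(i), torch.sigmoid(o), torch.tanh(u)
    f = torch.sigmoid(cell.U_f(h_cat)).reshape(bsz, 2, hidden_size)  # per-child forget gates
    c = i * u + (f * c_children).sum(dim=1)
    h = o * torch.tanh(c)
    return h, c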
Example 23
    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
        self.args = args
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))
        self._future_mask = torch.empty(0)

        self.dropout = args['model']['dropout']
        self.decoder_layerdrop = args['model']['decoder_layerdrop']
        self.share_input_output_embed = args['model'][
            'share_decoder_input_output_embed']

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args['model']['decoder_embed_dim']
        self.embed_dim = embed_dim
        self.output_embed_dim = args['model']['decoder_output_dim']

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args['model']['max_target_positions']

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args['model'][
            'no_scale_embedding'] else math.sqrt(embed_dim)

        self.project_in_dim = (Linear(input_embed_dim, embed_dim, bias=False)
                               if embed_dim != input_embed_dim else None)

        self.embed_positions = (PositionalEmbedding(
            args['model']['max_target_positions'],
            embed_dim,
            self.padding_idx,
            learned=args['model']['decoder_learned_pos'],
        ) if not args['model']['no_token_positional_embeddings'] else None)

        if args['model']['layernorm_embedding']:
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

        self.cross_self_attention = args['model'].get('cross_self_attention',
                                                      False)

        if self.decoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        self.layers.extend([
            self.build_decoder_layer(args, no_encoder_attn)
            for _ in range(args['model']['decoder_layers'])
        ])
        self.num_layers = len(self.layers)

        if args['model']['decoder_normalize_before'] and not args['model'].get(
                "no_decoder_final_norm", False):
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None

        self.project_out_dim = (Linear(
            embed_dim, self.output_embed_dim, bias=False) if
                                embed_dim != self.output_embed_dim else None)

        self.adaptive_softmax = None
        self.output_projection = None
        if args['model']['adaptive_softmax_cutoff'] is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                eval(args['model']['adaptive_softmax_cutoff']),
                dropout=args['model']['adaptive_softmax_dropout'],
                adaptive_inputs=embed_tokens
                if args['model']['tie_adaptive_weights'] else None,
                factor=args['model']['adaptive_softmax_factor'],
                tie_proj=args['model']['tie_adaptive_proj'],
            )
        elif self.share_input_output_embed:
            self.output_projection = Linear(
                self.embed_tokens.weight.shape[1],
                self.embed_tokens.weight.shape[0],
                bias=False,
            )
            self.output_projection.weight = self.embed_tokens.weight
        else:
            self.output_projection = Linear(self.output_embed_dim,
                                            len(dictionary),
                                            bias=False)