Example #1
    def __init__(self, d_model, n_head, d_k=64, d_v=64, res_dropout=0.1):
        super(MultiHeadAttention, self).__init__()

        self.w_qs = nn.ModuleList(
            [Linear(d_model, d_k, bias=False) for _ in range(n_head)])
        self.w_ks = nn.ModuleList(
            [Linear(d_model, d_k, bias=False) for _ in range(n_head)])
        self.w_vs = nn.ModuleList(
            [Linear(d_model, d_v, bias=False) for _ in range(n_head)])

        self.attention = ScaledDotProductAttention(d_model)
        self.layer_norm = LayerNormalization(d_model)
        self.proj = Linear(n_head * d_v, d_model)
        self.dropout = nn.Dropout(res_dropout)
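A minimal sketch of how per-head projection lists like these are typically consumed in a forward pass: each head projects the input, runs scaled dot-product attention, and the per-head outputs are concatenated and passed through the final projection. ScaledDotProductAttention and LayerNormalization are not shown in the example, so the sketch below inlines plain softmax attention and omits the residual, layer-norm, and dropout steps.

import torch
import torch.nn as nn
import torch.nn.functional as F

d_model, n_head, d_k, d_v = 512, 8, 64, 64
w_qs = nn.ModuleList([nn.Linear(d_model, d_k, bias=False) for _ in range(n_head)])
w_ks = nn.ModuleList([nn.Linear(d_model, d_k, bias=False) for _ in range(n_head)])
w_vs = nn.ModuleList([nn.Linear(d_model, d_v, bias=False) for _ in range(n_head)])
proj = nn.Linear(n_head * d_v, d_model)

x = torch.randn(2, 10, d_model)                           # (batch, seq_len, d_model)

head_outputs = []
for w_q, w_k, w_v in zip(w_qs, w_ks, w_vs):
    q, k, v = w_q(x), w_k(x), w_v(x)                      # (batch, seq_len, d_k or d_v)
    scores = torch.matmul(q, k.transpose(1, 2)) / d_k ** 0.5
    attn = F.softmax(scores, dim=-1)
    head_outputs.append(torch.matmul(attn, v))            # (batch, seq_len, d_v)

out = proj(torch.cat(head_outputs, dim=-1))               # (batch, seq_len, d_model)
print(out.shape)                                          # torch.Size([2, 10, 512])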
Example #2
    def __init__(
            self, n_src_vocab, n_tgt_vocab, n_max_seq, n_layers=6, n_head=8,
            d_word_vec=512, d_model=512, d_inner_hid=1024, d_k=64, d_v=64,
            dropout=0.1, proj_share_weight=True, embs_share_weight=True):

        super(Transformer, self).__init__()
        self.encoder = Encoder(
            n_src_vocab, n_max_seq, n_layers=n_layers, n_head=n_head,
            d_word_vec=d_word_vec, d_model=d_model,
            d_inner_hid=d_inner_hid, dropout=dropout)
        self.decoder = Decoder(
            n_tgt_vocab, n_max_seq, n_layers=n_layers, n_head=n_head,
            d_word_vec=d_word_vec, d_model=d_model,
            d_inner_hid=d_inner_hid, dropout=dropout)
        self.tgt_word_proj = Linear(d_model, n_tgt_vocab, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.padding_bottleneck = PaddingBottleneck()
        # Store the padding tensor here so it can be retrieved after a call to forward:
        self.padding = None
        self.padding_amount = self.padding_bottleneck.padding_amount
        assert d_model == d_word_vec, \
            'To facilitate the residual connections, ' \
            'the dimensions of all module outputs shall be the same.'

        if proj_share_weight:
            # Share the weight matrix between tgt word embedding/projection
            assert d_model == d_word_vec
            self.tgt_word_proj.weight = self.decoder.tgt_word_emb.weight

        if embs_share_weight:
            # Share the weight matrix between src/tgt word embeddings
            # assume the src/tgt word vec size are the same
            assert n_src_vocab == n_tgt_vocab, \
            "To share word embedding table, the vocabulary size of src/tgt shall be the same."
            self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight
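The weight tying guarded by proj_share_weight and embs_share_weight works because nn.Embedding(n, d).weight and nn.Linear(d, n, bias=False).weight both have shape (n, d), so once d_model == d_word_vec the same Parameter can back both modules. A standalone illustration (the vocabulary size is arbitrary; PaddingBottleneck, Encoder, and Decoder from the example are not reproduced):

import torch.nn as nn

n_tgt_vocab, d_model = 1000, 512
tgt_word_emb = nn.Embedding(n_tgt_vocab, d_model)
tgt_word_proj = nn.Linear(d_model, n_tgt_vocab, bias=False)

print(tgt_word_emb.weight.shape)    # torch.Size([1000, 512])
print(tgt_word_proj.weight.shape)   # torch.Size([1000, 512])

# Tie the parameters: both modules now share (and update) the same tensor.
tgt_word_proj.weight = tgt_word_emb.weight
assert tgt_word_proj.weight is tgt_word_emb.weight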
Example #3
    def __init__(
            self, n_src_vocab, n_tgt_vocab, n_max_seq, n_layers=6, n_head=8,
            d_word_vec=512, d_model=512, d_inner_hid=1024, d_k=64, d_v=64,
            dropout=0.1, proj_share_weight=True, embs_share_weight=True):

        super(Transformer, self).__init__()
        self.encoder = Encoder(
            n_src_vocab, n_max_seq, n_layers=n_layers, n_head=n_head,
            d_word_vec=d_word_vec, d_model=d_model,
            d_inner_hid=d_inner_hid, dropout=dropout)
        self.decoder = Decoder(
            n_tgt_vocab, n_max_seq, n_layers=n_layers, n_head=n_head,
            d_word_vec=d_word_vec, d_model=d_model,
            d_inner_hid=d_inner_hid, dropout=dropout)
        self.tgt_word_proj = Linear(d_model, n_tgt_vocab, bias=False)
        self.dropout = nn.Dropout(dropout)

        if proj_share_weight:
            # Share the weight matrix between tgt word embedding/projection
            assert d_model == d_word_vec
            self.tgt_word_proj.weight = self.decoder.tgt_word_emb.weight

        if embs_share_weight:
            # Share the weight matrix between src/tgt word embeddings
            # assume the src/tgt word vec size are the same
            assert n_src_vocab == n_tgt_vocab, \
            "To share word embedding table, the vocabulary size of src/tgt shall be the same."
            self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight
Example #4
    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        """初始化多头
        
        Arguments:
            n_head {int} -- 头的数量
            d_model {int} -- 模型总维度
            d_k {int} -- Query和Key分别的子头维度
            d_v {int} -- Value的子头维度
    
        """
        super(MultiHeadAttention, self).__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
        self.w_ks = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
        self.w_vs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_v))

        self.attention = ScaledDotProductAttention(d_model)
        self.layer_norm = LayerNormalization(d_model)
        self.proj = Linear(n_head * d_v, d_model)

        self.dropout = nn.Dropout(dropout)

        init.xavier_normal_(self.w_qs)
        init.xavier_normal_(self.w_ks)
        init.xavier_normal_(self.w_vs)
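With the projections stored as (n_head, d_model, d_k) parameter tensors, all heads are usually projected at once with a single batched matrix multiply instead of a per-head loop. A sketch of just that projection step, assuming the shapes above (the rest of the attention computation is omitted):

import torch

n_head, d_model, d_k = 8, 512, 64
batch, seq_len = 2, 10

w_qs = torch.empty(n_head, d_model, d_k)
torch.nn.init.xavier_normal_(w_qs)

q = torch.randn(batch, seq_len, d_model)

# Repeat the input once per head and project every head with one bmm:
# (n_head, batch*seq_len, d_model) x (n_head, d_model, d_k) -> (n_head, batch*seq_len, d_k)
q_rep = q.repeat(n_head, 1, 1).view(n_head, -1, d_model)
q_proj = torch.bmm(q_rep, w_qs).view(n_head * batch, seq_len, d_k)
print(q_proj.shape)                 # torch.Size([16, 10, 64])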
Example #5
    def __init__(self,
                 n_head,
                 d_model,
                 d_k,
                 d_v,
                 dropout=0.1,
                 enc_output=None):
        '''Initialize the multi-head attention module.

        :param n_head: number of attention heads
        :param d_model: model (input) dimension
        :param d_k: per-head dimension of the queries and keys
        :param d_v: per-head dimension of the values
        :param dropout: dropout rate
        '''
        super().__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
        self.w_ks = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
        self.w_vs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_v))

        self.attention = ScaledDotProductAttention()
        self.layer_norm = LayerNormalization(d_model)
        self.proj = Linear(n_head * d_v, d_model)

        self.dropout = nn.Dropout(dropout)

        init.xavier_normal_(self.w_qs)
        init.xavier_normal_(self.w_ks)
        init.xavier_normal_(self.w_vs)
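ScaledDotProductAttention is constructed without arguments here and its definition is not part of the example; assuming it implements the standard softmax(Q K^T / sqrt(d_k)) V computation, a minimal version could look like this:

import torch
import torch.nn as nn
import torch.nn.functional as F

class ScaledDotProductAttention(nn.Module):
    """softmax(Q K^T / sqrt(d_k)) V, with an optional boolean mask (True marks positions to ignore)."""

    def __init__(self, attn_dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(attn_dropout)

    def forward(self, q, k, v, mask=None):
        d_k = q.size(-1)
        scores = torch.matmul(q, k.transpose(-2, -1)) / d_k ** 0.5
        if mask is not None:
            scores = scores.masked_fill(mask, float('-inf'))
        attn = self.dropout(F.softmax(scores, dim=-1))
        return torch.matmul(attn, v), attn

q = k = v = torch.randn(16, 10, 64)        # (n_head * batch, seq_len, d_k)
out, attn = ScaledDotProductAttention()(q, k, v)
print(out.shape, attn.shape)               # torch.Size([16, 10, 64]) torch.Size([16, 10, 10])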
Example #6
    def __init__(self,
                 lda_mat,
                 n_src_dim,
                 encoder_max_len,
                 d_model=256,
                 dropout=0.1,
                 contexts=[[0]]):

        super(EncoderTest, self).__init__()
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)

        self.trans_pos_enc = nn.Embedding(encoder_max_len,
                                          d_model,
                                          padding_idx=constants.PAD)
        self.trans_pos_enc.weight.data = position_encoding_init(
            encoder_max_len, d_model)
        self.trans_pos_enc.weight.requires_grad = False

        # project the spliced source features to the model dimension
        lda_concat_index = [-2, -1, 0, 1, 2]
        self.concat = ConcatLayer(lda_concat_index)
        self.lda_layer = LDALayer(lda_mat)
        self.src_projection = Linear(n_src_dim * len(lda_concat_index),
                                     d_model,
                                     bias=False)
        #self.tdnn0 = TDNNLayer(n_src_dim * len(lda_concat_index), d_model, contexts[0], dropout=dropout)
        self.tdnn_stack = nn.ModuleList([
            TDNNLayer(d_model, d_model, context, dropout=dropout)
            for context in contexts
        ])
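position_encoding_init is not defined in this example; implementations that follow this pattern typically build the fixed sinusoidal table from the Transformer paper and then freeze it via requires_grad = False, zeroing row 0 so it can serve as the padding index. A sketch under that assumption:

import numpy as np
import torch

def position_encoding_init(n_position, d_model):
    # Sinusoidal position encoding table of shape (n_position, d_model).
    position_enc = np.array([
        [pos / np.power(10000, 2 * (i // 2) / d_model) for i in range(d_model)]
        if pos != 0 else np.zeros(d_model)
        for pos in range(n_position)])
    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])   # even dimensions
    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])   # odd dimensions
    return torch.from_numpy(position_enc).float()

print(position_encoding_init(100, 256).shape)                  # torch.Size([100, 256])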
Example #7
    def __init__(self,
                 n_tgt_vocab,
                 n_max_seq,
                 n_layers=4,
                 n_head=8,
                 d_word_vec=64,
                 d_model=64,
                 d_inner_hid=200,
                 dropout=0.1):

        super(Decoder, self).__init__()
        n_position = n_max_seq + 1
        self.n_max_seq = n_max_seq
        self.d_model = d_model

        self.position_enc = nn.Embedding(n_position,
                                         d_word_vec,
                                         padding_idx=Constants.PAD)
        self.position_enc.weight.data = position_encoding_init(
            n_position, d_word_vec)

        self.tgt_word_emb = nn.Embedding(n_tgt_vocab,
                                         d_word_vec,
                                         padding_idx=Constants.PAD)
        self.tgt_word_proj = Linear(d_model, n_tgt_vocab, bias=False)
        self.dropout = nn.Dropout(dropout)

        self.layer_stack = nn.ModuleList([
            DecoderLayer(d_model, d_inner_hid, n_head) for _ in range(n_layers)
        ])
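With tgt_word_emb and position_enc defined as above, the decoder input is usually the sum of the word embedding and the position embedding of each token. A standalone sketch of that step (the DecoderLayer stack and masking are omitted, and Constants.PAD is assumed to be 0):

import torch
import torch.nn as nn

PAD = 0                                         # assumed value of Constants.PAD
n_tgt_vocab, n_max_seq, d_word_vec = 1000, 50, 64
n_position = n_max_seq + 1

tgt_word_emb = nn.Embedding(n_tgt_vocab, d_word_vec, padding_idx=PAD)
position_enc = nn.Embedding(n_position, d_word_vec, padding_idx=PAD)

tgt_seq = torch.tensor([[5, 42, 7, PAD]])       # (batch, seq_len) token ids
tgt_pos = torch.tensor([[1, 2, 3, PAD]])        # 1-based positions, PAD on padding

dec_input = tgt_word_emb(tgt_seq) + position_enc(tgt_pos)
print(dec_input.shape)                          # torch.Size([1, 4, 64])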
Example #8
    def __init__(self,
                 n_src_vocab,
                 n_tgt_vocab,
                 n_max_seq,
                 emb_path=None,
                 n_layers=6,
                 n_head=6,
                 d_word_vec=300,
                 d_model=300,
                 d_inner_hid=500,
                 d_k=50,
                 d_v=50,
                 dropout=0.1,
                 proj_share_weight=True,
                 embs_share_weight=True):

        super(Transformer, self).__init__()
        self.encoder = Encoder(n_src_vocab,
                               n_max_seq,
                               emb_path=emb_path,
                               n_layers=n_layers,
                               n_head=n_head,
                               d_word_vec=d_word_vec,
                               d_model=d_model,
                               d_k=d_k,
                               d_v=d_v,
                               d_inner_hid=d_inner_hid,
                               dropout=dropout)
        self.decoder = Decoder(n_tgt_vocab,
                               n_max_seq,
                               emb_path=emb_path,
                               n_layers=n_layers,
                               n_head=n_head,
                               d_word_vec=d_word_vec,
                               d_model=d_model,
                               d_k=d_k,
                               d_v=d_v,
                               d_inner_hid=d_inner_hid,
                               dropout=dropout)
        # Projection from d_model to n_tgt_vocab that produces the final predictions.
        self.tgt_word_proj = Linear(d_model, n_tgt_vocab, bias=False)
        self.dropout = nn.Dropout(dropout)

        assert d_model == d_word_vec, \
            'To facilitate the residual connections, ' \
            'the dimensions of all module outputs shall be the same.'

        if proj_share_weight:
            # Share the weight matrix between tgt word embedding/projection
            assert d_model == d_word_vec
            self.tgt_word_proj.weight = self.decoder.tgt_word_emb.weight

        if embs_share_weight:
            # Share the weight matrix between src/tgt word embeddings
            # assume the src/tgt word vec size are the same
            assert n_src_vocab == n_tgt_vocab, \
                "To share word embedding table, the vocabulary size of src/tgt shall be the same."
            self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight
Example #9
    def __init__(self,
                 n_tgt_vocab,
                 decoder_max_len,
                 n_layers=2,
                 n_head=3,
                 sub_sequence=(-1, 1),
                 d_k=64,
                 d_v=64,
                 en_d_model=256,
                 de_d_model=128,
                 d_inner_hid=128,
                 dropout=0.1):

        super(Decoder, self).__init__()
        self.sub = sub_sequence

        self.en_d_model = en_d_model
        self.de_d_model = de_d_model
        self.dropout = nn.Dropout(dropout)
        self.position_enc = nn.Embedding(decoder_max_len,
                                         de_d_model,
                                         padding_idx=constants.PAD)
        self.position_enc.weight.data = position_encoding_init(
            decoder_max_len, de_d_model)
        self.position_enc.weight.requires_grad = False

        self.tgt_word_emb = nn.Embedding(n_tgt_vocab,
                                         de_d_model,
                                         padding_idx=constants.PAD)
        self.tgt_word_proj = Linear(de_d_model, n_tgt_vocab, bias=False)
        self.layer_stack = nn.ModuleList([
            DecoderLayer(de_d_model,
                         d_inner_hid,
                         n_head,
                         d_k,
                         d_v,
                         dropout=dropout) for _ in range(n_layers)
        ])

        # project the encoder output to the decoder dimension
        self.enc_dec_projection = Linear(en_d_model, de_d_model, bias=False)
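Because this encoder/decoder pair runs at en_d_model=256 and de_d_model=128, the encoder output has to be projected down before the decoder can attend over it; a sketch of that step in isolation:

import torch
import torch.nn as nn

en_d_model, de_d_model = 256, 128
enc_dec_projection = nn.Linear(en_d_model, de_d_model, bias=False)

enc_output = torch.randn(2, 75, en_d_model)        # (batch, enc_len, en_d_model)
enc_output_proj = enc_dec_projection(enc_output)   # (batch, enc_len, de_d_model)
print(enc_output_proj.shape)                       # torch.Size([2, 75, 128])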
Example #10
    def __init__(self,
                 n_src_dim,
                 n_tgt_vocab,
                 n_max_seq,
                 n_layers=6,
                 n_head=8,
                 d_model=512,
                 d_inner_hid=1024,
                 d_k=64,
                 d_v=64,
                 dropout=0.1,
                 proj_share_weight=True,
                 embs_share_weight=True):

        super(Transformer, self).__init__()
        self.encoder = Encoder(n_max_seq,
                               n_layers=n_layers,
                               n_head=n_head,
                               d_model=d_model,
                               d_inner_hid=d_inner_hid,
                               dropout=dropout)
        self.decoder = Decoder(n_max_seq,
                               n_layers=n_layers,
                               n_head=n_head,
                               d_model=d_model,
                               d_inner_hid=d_inner_hid,
                               dropout=dropout)

        # project the source features to the model dimension
        self.src_projection = Linear(n_src_dim, d_model, bias=False)
        self.tgt_word_emb = nn.Embedding(n_tgt_vocab,
                                         d_model,
                                         padding_idx=Constants.PAD)
        self.tgt_word_proj = Linear(d_model, n_tgt_vocab, bias=False)

        self.dropout = nn.Dropout(dropout)
Example #11
    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
        self.w_ks = nn.Parameter(torch.FloatTensor(n_head, d_model, d_k))
        self.w_vs = nn.Parameter(torch.FloatTensor(n_head, d_model, d_v))

        self.attention = ScaledDotProductAttention(d_model)
        self.layer_norm = LayerNormalization(d_model)
        self.proj = Linear(n_head * d_v, d_model)

        self.dropout = nn.Dropout(dropout)

        init.xavier_normal_(self.w_qs)
        init.xavier_normal_(self.w_ks)
        init.xavier_normal_(self.w_vs)
Example #12
    def __init__(self,
                 n_src_dim,
                 encoder_max_len,
                 n_layers=2,
                 n_head=3,
                 sub_sequence=(-1, 1),
                 d_k=64,
                 d_v=64,
                 d_model=256,
                 d_inner_hid=256,
                 dropout=0.1):

        super(Encoder, self).__init__()
        self.sub = sub_sequence

        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)

        self.position_enc = nn.Embedding(encoder_max_len,
                                         d_model,
                                         padding_idx=constants.PAD)
        self.position_enc.weight.data = position_encoding_init(
            encoder_max_len, d_model)
        self.position_enc.weight.requires_grad = False

        self.trans_pos_enc = nn.Embedding(encoder_max_len,
                                          d_model,
                                          padding_idx=constants.PAD)
        self.trans_pos_enc.weight.data = position_encoding_init(
            encoder_max_len, d_model)
        self.trans_pos_enc.weight.requires_grad = False

        # project the source features to the model dimension
        self.src_projection = Linear(n_src_dim, d_model, bias=False)
        self.layer_stack = nn.ModuleList([
            EncoderLayer(d_model,
                         d_inner_hid,
                         n_head,
                         d_k,
                         d_v,
                         dropout=dropout) for _ in range(n_layers)
        ])
Example #13
    def __init__(self,
                 user_size,
                 kernel_size=3,
                 n_layers=1,
                 n_head=1,
                 d_k=32,
                 d_v=32,
                 d_word_vec=32,
                 d_model=32,
                 d_inner_hid=32,
                 dropout=0.1,
                 finit=0):

        super(Decoder, self).__init__()
        self.d_model = d_model
        self.user_size = user_size

        self.user_emb = nn.Embedding(user_size,
                                     d_word_vec,
                                     padding_idx=Constants.PAD)
        self.tgt_user_proj = Linear(d_model, user_size, bias=False)

        self.dropout = nn.Dropout(dropout)
        self.conv = nn.Conv1d(d_model,
                              user_size,
                              kernel_size,
                              padding=kernel_size - 1,
                              bias=True)
        self.padding = kernel_size - 1
        self.finit = finit

        self.layer_stack = nn.ModuleList([
            DecoderLayer(d_model,
                         d_inner_hid,
                         n_head,
                         d_k,
                         d_v,
                         dropout=dropout) for _ in range(n_layers)
        ])
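The Conv1d above pads by kernel_size - 1 and the constructor keeps that amount in self.padding; in a causal convolution of this kind the surplus frames are typically trimmed from the right of the output so that position t never depends on inputs after t. A standalone sketch of that pattern (variable names here are illustrative):

import torch
import torch.nn as nn

d_model, user_size, kernel_size = 32, 100, 3
conv = nn.Conv1d(d_model, user_size, kernel_size, padding=kernel_size - 1, bias=True)
trim = kernel_size - 1

x = torch.randn(2, 20, d_model)                 # (batch, seq_len, d_model)
y = conv(x.transpose(1, 2))                     # Conv1d expects (batch, channels, seq_len)
y = y[:, :, :-trim] if trim > 0 else y          # drop the right-side padding -> causal
print(y.transpose(1, 2).shape)                  # torch.Size([2, 20, 100])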
Example #14
    def __init__(self,
                 n_src_vocab,
                 n_tgt_vocab,
                 n_max_seq,
                 n_layers=6,
                 n_head=8,
                 d_word_vec=512,
                 d_model=512,
                 d_inner_hid=1024,
                 d_k=64,
                 d_v=64,
                 dropout=0.1,
                 proj_share_weight=True,
                 embs_share_weight=True,
                 use_ctx=False):

        super(Transformer, self).__init__()

        self.use_ctx = use_ctx
        self.encoder = Encoder(n_src_vocab,
                               n_max_seq,
                               n_layers=n_layers,
                               n_head=n_head,
                               d_word_vec=d_word_vec,
                               d_model=d_model,
                               d_inner_hid=d_inner_hid,
                               dropout=dropout)

        if use_ctx:
            self.encoder_ctx = Encoder(n_src_vocab,
                                       n_max_seq,
                                       n_layers=n_layers,
                                       n_head=n_head,
                                       d_word_vec=d_word_vec,
                                       d_model=d_model,
                                       d_inner_hid=d_inner_hid,
                                       dropout=dropout)

            # Share the word embeddings between the src encoder and the ctx encoder
            self.encoder_ctx.src_word_emb.weight = self.encoder.src_word_emb.weight

        self.decoder = Decoder(n_tgt_vocab,
                               n_max_seq,
                               n_layers=n_layers,
                               n_head=n_head,
                               d_word_vec=d_word_vec,
                               d_model=d_model,
                               d_inner_hid=d_inner_hid,
                               dropout=dropout,
                               use_ctx=use_ctx)
        self.tgt_word_proj = Linear(d_model, n_tgt_vocab, bias=False)
        self.dropout = nn.Dropout(dropout)

        assert d_model == d_word_vec, \
            'To facilitate the residual connections, the dimensions of all module outputs shall be the same.'

        if proj_share_weight:
            # Share the weight matrix between tgt word embedding/projection
            assert d_model == d_word_vec
            self.tgt_word_proj.weight = self.decoder.tgt_word_emb.weight

        if embs_share_weight:
            # Share the weight matrix between src/tgt word embeddings
            # assume the src/tgt word vec size are the same
            assert n_src_vocab == n_tgt_vocab, \
            "To share word embedding table, the vocabulary size of src/tgt shall be the same."
            self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight