Example #1
    def __init__(self,
                 vocab_size,
                 max_len,
                 n_layers=6,
                 d_model=512,
                 d_emb=512,
                 d_hidden=1024,
                 n_heads=8,
                 d_k=64,
                 d_v=64,
                 dropout=0.1,
                 pad_id=0):
        super(Encoder, self).__init__()

        self.pos_enc = PositionalEncoding(max_len + 1,
                                          d_emb,
                                          padding_idx=pad_id)
        self.emb = nn.Embedding(vocab_size, d_emb, padding_idx=pad_id)
        self.layers = nn.ModuleList([
            EncoderLayer(d_model=d_model,
                         d_hidden=d_hidden,
                         n_heads=n_heads,
                         d_k=d_k,
                         d_v=d_v,
                         dropout=dropout) for _ in range(n_layers)
        ])
Example #2
    def __init__(self,
                 vocab_size,
                 embedding_rank,
                 inner_rank=None,
                 ffward_rank=None):
        super().__init__()
        self.vocab = vocab_size
        layer = EncoderLayer(
            tconfig.layer_dimension,
            MultiHeadedAttention(tconfig.num_attention_heads,
                                 tconfig.layer_dimension,
                                 rank=inner_rank),
            PositionwiseFeedForward(tconfig.layer_dimension,
                                    tconfig.inner_layer_dimension,
                                    tconfig.dropout,
                                    rank=ffward_rank),
            tconfig.dropout,
        )

        self.layers = clones(layer, tconfig.num_layers)
        self.norm = LayerNorm(layer.size)
        self.src_embed = nn.Sequential(
            FactorizedEmbeddings(vocab_size, tconfig.layer_dimension,
                                 embedding_rank),
            PositionalEncoding(
                tconfig.layer_dimension,
                tconfig.dropout,
            ),
        )
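FactorizedEmbeddings itself is not shown in this example. Below is a minimal sketch of an ALBERT-style factorized embedding, assuming embedding_rank is the width of a low-rank lookup table that is then projected up to the layer dimension; the class body is an assumption for illustration, not the project's actual implementation.

import torch.nn as nn


class FactorizedEmbeddings(nn.Module):
    """Hypothetical sketch: vocab -> low-rank embedding -> model dimension."""

    def __init__(self, vocab_size, d_model, rank):
        super().__init__()
        self.lookup = nn.Embedding(vocab_size, rank)  # V x r table instead of V x d_model
        self.project = nn.Linear(rank, d_model)       # project r up to d_model

    def forward(self, token_ids):
        return self.project(self.lookup(token_ids))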
Example #3
    def __init__(self,
                 vocab_size: int,
                 embed_dim: int,
                 n_head: int,
                 d_ff: int,
                 pad_idx: int,
                 n_layers: int,
                 dropout: float = 0.1):
        """
        Embedding Parameters
        :param vocab_size: the size of the source vocabulary
        :param embed_dim: embedding dimension. 512 is used in the paper
        :param n_head: the number of multi head. (split the embed_dim to 8.. such that 8 * 64 = 512)
        :param d_ff: inner dimension of position-wise feed-forward
        :param pad_idx: padding index
        :param n_layers: the number of sub-layers

        Flow
           1. embedding layer
           2. positional encoding
           3. residual dropout(0.1)
           4. iterate sub-layers (6 layers are used in paper)
        """
        super(Encoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.position_enc = PositionalEncoding(embed_dim)
        self.dropout = nn.Dropout(dropout)  # Residual Dropout(0.1) in paper

        self.layer_stack: nn.ModuleList = nn.ModuleList([
            EncoderLayer(embed_dim, n_head, d_ff=d_ff, dropout=dropout)
            for _ in range(n_layers)
        ])

        self.layer_norm = NormLayer(embed_dim)
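The forward pass is not part of this excerpt. Here is a minimal sketch of one that follows the documented flow, assuming the padding mask is derived from the embedding's padding_idx and that EncoderLayer accepts (x, mask); both are assumptions, not the project's actual code.

    def forward(self, src_tokens):
        # Hypothetical forward pass; the EncoderLayer call signature is an assumption.
        pad_mask = (src_tokens != self.embed.padding_idx).unsqueeze(1)  # (batch, 1, seq_len)
        x = self.dropout(self.position_enc(self.embed(src_tokens)))     # steps 1-3 of the docstring
        for layer in self.layer_stack:                                   # step 4: stacked encoder layers
            x = layer(x, mask=pad_mask)
        return self.layer_norm(x)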
Example #4
    def test_size(self):
        d_hidden = 20
        n_words = 100
        d_model = 32
        d_emb = d_model
        d_k = 4
        d_v = 5
        n_heads = 8
        batch_size = 32
        max_len = 15
        enc = EncoderLayer(d_model, d_hidden, n_heads, d_k, d_v)
        enc_emb = nn.Embedding(n_words, d_emb)
        enc_input = Variable(
            torch.LongTensor(
                np.random.randint(0, n_words, (batch_size, max_len))))
        enc_embedded = enc_emb(enc_input)

        enc_output, enc_attn = enc(enc_embedded)
        self.assertEqual(enc_output.size(),
                         torch.Size([batch_size, max_len, d_model]))

        dec = DecoderLayer(d_model, d_hidden, n_heads, d_k, d_v)

        dec_emb = nn.Embedding(n_words, d_emb)
        dec_input = Variable(
            torch.LongTensor(
                np.random.randint(0, n_words, (batch_size, max_len))))
        dec_embedded = dec_emb(dec_input)

        dec_output, dec_slf_attn, inter_attn = dec(dec_embedded, enc_output)
        self.assertEqual(dec_output.size(),
                         torch.Size([batch_size, max_len, d_model]))
Example #5
    def __init__(self, n_layers, d_model, d_ffn, n_heads,
                 max_seq_len, src_vocab_size, dropout=0.1):
        super(Encoder, self).__init__()
        self.d_model = d_model
        self.src_emb = Embeddings(d_model, src_vocab_size)
        self.pos_enc = PositionalEncoding(d_model, dropout=dropout, max_seq_len=max_seq_len)
        self.dropout_emb = nn.Dropout(dropout)
        self.layers = nn.ModuleList(
            [EncoderLayer(d_model, n_heads, d_ffn, dropout) for _ in range(n_layers)])
Example #6
    def __init__(self, input_vocab_num, max_seq_len, pad_idx=0):
        """
        :param input_vocab_num: 全部输入序列的词典的单词数
        :param max_seq_len: 输入序列最大长度
        :param pad_idx: pad的填充位置,默认为0
        """
        super(Encoder, self).__init__()
        self.word_embedding = nn.Embedding(input_vocab_num, config.d_model, padding_idx=pad_idx)  # 词向量层 N*D
        self.pos_encoding = PositionalEncoding(max_seq_len + 1, config.d_model, pad_idx)  # 位置向量层 (N+1)*D
        self.encoder_layers = nn.ModuleList([EncoderLayer() for _ in range(config.layers)])  # 堆叠n层encoder_layer

        self.pad_obj = Mask()  # mask对象
        self.tool = Tools()  # 工具对象
Example #7
    def __init__(self,
                 w_emb,
                 p_emb_w,
                 p_emb_s,
                 d_model=256,
                 d_ff=1024,
                 h=8,
                 dropout=0.1,
                 N=6):
        super(LocalEncoder, self).__init__()
        self.w_emb = w_emb
        self.p_emb_w = p_emb_w
        self.p_emb_s = p_emb_s
        self.dropout = nn.Dropout(dropout)

        attn = MultiHeadedAttention(h, d_model)
        ff = PositionwiseFeedForward(d_model, d_ff, dropout)

        self.encoder = Encoder(EncoderLayer(d_model, attn, ff, dropout), N)
Example #8
    def __init__(self, config: TransformerConfig):
        super(Encoder, self).__init__()

        self.config = config
        self.word_emb = nn.Embedding(self.config.vocab_size,
                                     self.config.word_vec_size,
                                     padding_idx=self.config.pad_idx)
        self.position_encoder = PositionalEncoding(
            self.config.word_vec_size, n_position=self.config.n_position)
        self.dropout = nn.Dropout(p=self.config.dropout)
        self.layer_stack = nn.ModuleList([
            EncoderLayer(d_model=self.config.d_model,
                         d_inner=self.config.d_inner,
                         n_head=self.config.n_head,
                         d_k=self.config.d_k,
                         d_v=self.config.d_v,
                         dropout=self.config.dropout)
            for _ in range(self.config.encoder_n_layers)
        ])  # note: d_model == word_vec_size, i.e. the size of the token embeddings
        self.layer_norm = nn.LayerNorm(self.config.d_model, eps=1e-6)
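Here the embedding's padding_idx and the encoder's attention mask both come from the same pad token. A small standalone illustration of the usual mask construction; the (batch, 1, seq_len) shape is a common convention, not necessarily this project's.

import torch

pad_idx = 0
src = torch.tensor([[5, 7, 9, 0, 0]])     # one sequence with two trailing pads
pad_mask = (src != pad_idx).unsqueeze(1)  # (batch, 1, seq_len), broadcast over query positions
print(pad_mask)                           # tensor([[[ True,  True,  True, False, False]]])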
Example #9
    def __init__(self, n_src_vocab, len_max_seq, d_word_vec,
                 n_layers, n_head, d_k, d_v, d_model, d_inner, dropout=0.1):
        super().__init__()

        n_position = len_max_seq + 1

        self.d_model = d_model

        self.src_word_emb = nn.Embedding(
            n_src_vocab, d_word_vec, padding_idx=config.PAD)

        self.position_enc = nn.Embedding.from_pretrained(
            positional_encoding(n_position, d_word_vec, padding_idx=1),
            freeze=True)

        self.layer_stack = nn.ModuleList([
            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)])

        self.fc = nn.Linear(d_model, 2 * config.hidden_dim, bias=False)
Example #10
    def test_size(self):
        d_hidden = 20
        n_words = 100
        d_model = 32
        d_emb = d_model
        d_k = 4
        d_v = 5
        n_heads = 8
        batch_size = 32
        max_len = 15
        enc = EncoderLayer(d_model, d_hidden, n_heads, d_k, d_v)
        emb = nn.Embedding(n_words, d_emb)
        x = Variable(
            torch.LongTensor(
                np.random.randint(0, n_words, (batch_size, max_len))))
        embedded = emb(x)

        output, attn = enc(embedded)
        self.assertEqual(output.size(),
                         torch.Size([batch_size, max_len, d_model]))
Example #11
def Transformer(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    """
    Helper: Construct a model from hyperparameters.
    """
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, deepcopy(attn), deepcopy(ff), dropout), N),
        Decoder(DecoderLayer(d_model, deepcopy(attn), deepcopy(attn), deepcopy(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), deepcopy(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), deepcopy(position)),
        Generator(d_model, tgt_vocab)
    )
    # Initialize parameters with Glorot / fan_avg; this was important in the
    # original authors' code.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    return model
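As a quick sanity check, the helper can be instantiated directly; the vocabulary sizes and the reduced depth below are arbitrary toy values, not anything the source prescribes.

model = Transformer(src_vocab=1000, tgt_vocab=1000, N=2)
n_params = sum(p.numel() for p in model.parameters())
print(f"{n_params / 1e6:.1f}M parameters")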
Example #12
def LevenshteinTransformerModel(src_vocab,
                                tgt_vocab,
                                PAD,
                                BOS,
                                EOS,
                                UNK,
                                criterion,
                                d_model=512,
                                n=6,
                                h=8,
                                d_ff=2048,
                                dropout=0.0,
                                input_dropout=0.1):
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, input_dropout)
    model = LevenshteinEncodeDecoder(
        Encoder(EncoderLayer(d_model, deepcopy(attn), deepcopy(ff), dropout),
                n),
        LevenshteinDecoder(DecoderLayer(d_model, deepcopy(attn),
                                        deepcopy(attn), deepcopy(ff), dropout),
                           n=n,
                           output_embed_dim=d_model,
                           tgt_vocab=tgt_vocab),
        nn.Sequential(Embeddings(d_model, src_vocab), deepcopy(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), deepcopy(position)),
        Generator(d_model, tgt_vocab),
        pad=PAD,
        bos=BOS,
        eos=EOS,
        unk=UNK,
        criterion=criterion)
    # Initialize parameters with Glorot / fan_avg; this was important in the
    # original authors' code.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)

    return model
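Both model builders rely on nn.init.xavier_uniform_, which fills a tensor with samples from U(-a, a) where a = gain * sqrt(6 / (fan_in + fan_out)) and gain defaults to 1. A short standalone check of that bound:

import math

import torch
import torch.nn as nn

w = torch.empty(512, 2048)            # nn.Linear-style weight: fan_out=512, fan_in=2048
nn.init.xavier_uniform_(w)
bound = math.sqrt(6.0 / (512 + 2048))
print(w.abs().max().item() <= bound)  # True: every entry lies inside the Glorot bound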