def __init__(self, vocab_size, max_len, n_layers=6, d_model=512, d_emb=512, d_hidden=1024,
             n_heads=8, d_k=64, d_v=64, dropout=0.1, pad_id=0):
    super(Encoder, self).__init__()
    self.pos_enc = PositionalEncoding(max_len + 1, d_emb, padding_idx=pad_id)
    self.emb = nn.Embedding(vocab_size, d_emb, padding_idx=pad_id)
    self.layers = nn.ModuleList([
        EncoderLayer(d_model=d_model, d_hidden=d_hidden, n_heads=n_heads,
                     d_k=d_k, d_v=d_v, dropout=dropout)
        for _ in range(n_layers)
    ])
def __init__(self, vocab_size, embedding_rank, inner_rank=None, ffward_rank=None):
    super().__init__()
    self.vocab = vocab_size
    layer = EncoderLayer(
        tconfig.layer_dimension,
        MultiHeadedAttention(tconfig.num_attention_heads, tconfig.layer_dimension, rank=inner_rank),
        PositionwiseFeedForward(tconfig.layer_dimension, tconfig.inner_layer_dimension,
                                tconfig.dropout, rank=ffward_rank),
        tconfig.dropout,
    )
    self.layers = clones(layer, tconfig.num_layers)
    self.norm = LayerNorm(layer.size)
    self.src_embed = nn.Sequential(
        FactorizedEmbeddings(vocab_size, tconfig.layer_dimension, embedding_rank),
        PositionalEncoding(tconfig.layer_dimension, tconfig.dropout),
    )
def __init__(self, vocab_size: int, embed_dim: int, n_head: int, d_ff: int, pad_idx: int,
             n_layers: int, dropout: float = 0.1):
    """
    Embedding parameters
    :param vocab_size: size of the source vocabulary
    :param embed_dim: embedding dimension (512 is used in the paper)
    :param n_head: number of attention heads (embed_dim is split across heads, e.g. 8 * 64 = 512)
    :param d_ff: inner dimension of the position-wise feed-forward network
    :param pad_idx: padding index
    :param n_layers: number of encoder sub-layers
    :param dropout: residual dropout rate (0.1 in the paper)

    Flow
    1. embedding layer
    2. positional encoding
    3. residual dropout (0.1)
    4. iterate over the sub-layers (6 layers are used in the paper)
    """
    super(Encoder, self).__init__()
    self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
    self.position_enc = PositionalEncoding(embed_dim)
    self.dropout = nn.Dropout(dropout)  # residual dropout (0.1) in the paper
    self.layer_stack = nn.ModuleList([  # stack of EncoderLayer modules
        EncoderLayer(embed_dim, n_head, d_ff=d_ff, dropout=dropout) for _ in range(n_layers)
    ])
    self.layer_norm = NormLayer(embed_dim)
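# A minimal sketch of the forward pass implied by the "Flow" notes in the docstring
# above; this is an assumption, not part of the original snippet, and the
# EncoderLayer call signature (x, mask) is likewise assumed.
def forward(self, src, src_mask=None):
    x = self.embed(src)             # 1. embedding layer
    x = self.position_enc(x)        # 2. add positional encoding
    x = self.dropout(x)             # 3. residual dropout
    for layer in self.layer_stack:  # 4. iterate over the sub-layers
        x = layer(x, src_mask)
    return self.layer_norm(x)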
def test_size(self):
    d_hidden = 20
    n_words = 100
    d_model = 32
    d_emb = d_model
    d_k = 4
    d_v = 5
    n_heads = 8
    batch_size = 32
    max_len = 15

    # Encoder layer: output should keep the (batch, len, d_model) shape.
    enc = EncoderLayer(d_model, d_hidden, n_heads, d_k, d_v)
    enc_emb = nn.Embedding(n_words, d_emb)
    enc_input = Variable(
        torch.LongTensor(
            np.random.randint(0, n_words, (batch_size, max_len))))
    enc_embedded = enc_emb(enc_input)
    enc_output, enc_attn = enc(enc_embedded)
    self.assertEqual(enc_output.size(),
                     torch.Size([batch_size, max_len, d_model]))

    # Decoder layer: consumes its own embeddings plus the encoder output.
    dec = DecoderLayer(d_model, d_hidden, n_heads, d_k, d_v)
    dec_emb = nn.Embedding(n_words, d_emb)
    dec_input = Variable(
        torch.LongTensor(
            np.random.randint(0, n_words, (batch_size, max_len))))
    dec_embedded = dec_emb(dec_input)
    dec_output, dec_slf_attn, inter_attn = dec(dec_embedded, enc_output)
    self.assertEqual(dec_output.size(),
                     torch.Size([batch_size, max_len, d_model]))
def __init__(self, n_layers, d_model, d_ffn, n_heads, max_seq_len, src_vocab_size, dropout=0.1):
    super(Encoder, self).__init__()
    self.d_model = d_model
    self.src_emb = Embeddings(d_model, src_vocab_size)
    self.pos_enc = PositionalEncoding(d_model, dropout=dropout, max_seq_len=max_seq_len)
    self.dropout_emb = nn.Dropout(dropout)
    self.layers = nn.ModuleList(
        [EncoderLayer(d_model, n_heads, d_ffn, dropout) for _ in range(n_layers)])
def __init__(self, input_vocab_num, max_seq_len, pad_idx=0):
    """
    :param input_vocab_num: vocabulary size over all input sequences
    :param max_seq_len: maximum length of an input sequence
    :param pad_idx: padding index, defaults to 0
    """
    super(Encoder, self).__init__()
    self.word_embedding = nn.Embedding(input_vocab_num, config.d_model, padding_idx=pad_idx)  # word embedding layer, N x D
    self.pos_encoding = PositionalEncoding(max_seq_len + 1, config.d_model, pad_idx)  # positional encoding layer, (N+1) x D
    self.encoder_layers = nn.ModuleList([EncoderLayer() for _ in range(config.layers)])  # stack of n encoder layers
    self.pad_obj = Mask()  # masking helper
    self.tool = Tools()  # utility helper
def __init__(self, w_emb, p_emb_w, p_emb_s, d_model=256, d_ff=1024, h=8, dropout=0.1, N=6):
    super(LocalEncoder, self).__init__()
    self.w_emb = w_emb
    self.p_emb_w = p_emb_w
    self.p_emb_s = p_emb_s
    self.dropout = nn.Dropout(dropout)
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.encoder = Encoder(EncoderLayer(d_model, attn, ff, dropout), N)
def __init__(self, config: TransformerConfig):
    super(Encoder, self).__init__()
    self.config = config
    self.word_emb = nn.Embedding(self.config.vocab_size, self.config.word_vec_size,
                                 padding_idx=self.config.pad_idx)
    self.position_encoder = PositionalEncoding(
        self.config.word_vec_size, n_position=self.config.n_position)
    self.dropout = nn.Dropout(p=self.config.dropout)
    self.layer_stack = nn.ModuleList([
        EncoderLayer(d_model=self.config.d_model, d_inner=self.config.d_inner,
                     n_head=self.config.n_head, d_k=self.config.d_k, d_v=self.config.d_v,
                     dropout=self.config.dropout)
        for _ in range(self.config.encoder_n_layers)
    ])
    # note: d_model == word_vec_size, i.e. the size of the token embedding vectors
    self.layer_norm = nn.LayerNorm(self.config.d_model, eps=1e-6)
def __init__(self, n_src_vocab, len_max_seq, d_word_vec, n_layers, n_head,
             d_k, d_v, d_model, d_inner, dropout=0.1):
    super().__init__()

    n_position = len_max_seq + 1
    self.d_model = d_model

    self.src_word_emb = nn.Embedding(
        n_src_vocab, d_word_vec, padding_idx=config.PAD)
    self.position_enc = nn.Embedding.from_pretrained(
        positional_encoding(n_position, d_word_vec, padding_idx=1),
        freeze=True)
    self.layer_stack = nn.ModuleList([
        EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
        for _ in range(n_layers)])
    self.fc = nn.Linear(d_model, 2 * config.hidden_dim, bias=False)
def test_size(self):
    d_hidden = 20
    n_words = 100
    d_model = 32
    d_emb = d_model
    d_k = 4
    d_v = 5
    n_heads = 8
    batch_size = 32
    max_len = 15

    enc = EncoderLayer(d_model, d_hidden, n_heads, d_k, d_v)
    emb = nn.Embedding(n_words, d_emb)
    x = Variable(
        torch.LongTensor(
            np.random.randint(0, n_words, (batch_size, max_len))))
    embedded = emb(x)
    output, attn = enc(embedded)
    self.assertEqual(output.size(),
                     torch.Size([batch_size, max_len, d_model]))
def Transformer(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    """Helper: construct a model from hyperparameters."""
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, deepcopy(attn), deepcopy(ff), dropout), N),
        Decoder(DecoderLayer(d_model, deepcopy(attn), deepcopy(attn), deepcopy(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), deepcopy(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), deepcopy(position)),
        Generator(d_model, tgt_vocab)
    )

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
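# Hypothetical usage of the helper above; vocabulary sizes, model dimensions, and the
# pad id (0) are placeholder assumptions, and encode() assumes the EncoderDecoder class
# exposes the usual encode(src, src_mask) convenience method of this code style.
import torch

tiny_model = Transformer(src_vocab=1000, tgt_vocab=1000, N=2, d_model=128, d_ff=512, h=4)
src = torch.randint(1, 1000, (2, 10))        # (batch, src_len) of token ids
src_mask = (src != 0).unsqueeze(-2)          # padding mask, assuming pad id 0
memory = tiny_model.encode(src, src_mask)    # expected shape: (2, 10, 128)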
def LevenshteinTransformerModel(src_vocab, tgt_vocab, PAD, BOS, EOS, UNK, criterion,
                                d_model=512, n=6, h=8, d_ff=2048, dropout=0.0, input_dropout=0.1):
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, input_dropout)
    model = LevenshteinEncodeDecoder(
        Encoder(EncoderLayer(d_model, deepcopy(attn), deepcopy(ff), dropout), n),
        LevenshteinDecoder(DecoderLayer(d_model, deepcopy(attn), deepcopy(attn), deepcopy(ff), dropout),
                           n=n, output_embed_dim=d_model, tgt_vocab=tgt_vocab),
        nn.Sequential(Embeddings(d_model, src_vocab), deepcopy(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), deepcopy(position)),
        Generator(d_model, tgt_vocab),
        pad=PAD, bos=BOS, eos=EOS, unk=UNK,
        criterion=criterion)

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model