def __init__(
        self, n_src_vocab, len_max_seq, d_word_vec,
        n_layers, n_head, d_k, d_v,
        d_model, d_inner, dropout=0.1, pretrained_embeddings=None):
    super().__init__()

    n_position = len_max_seq + 1

    if pretrained_embeddings is None:
        self.src_word_emb = nn.Embedding(
            n_src_vocab, d_word_vec, padding_idx=Constants.PAD)
    else:
        self.src_word_emb = nn.Embedding.from_pretrained(
            pretrained_embeddings, padding_idx=Constants.PAD, freeze=True)

    self.position_enc = nn.Embedding.from_pretrained(
        get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0),
        freeze=True)

    self.segment_enc = nn.Embedding(int(n_position / 2), d_word_vec, padding_idx=0)

    self.layer_stack = nn.ModuleList([
        EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
        for _ in range(n_layers)])

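# Several of these encoders call a get_sinusoid_encoding_table helper that is not shown here.
# A minimal sketch of such a helper, assuming the standard sinusoidal encoding from
# "Attention Is All You Need"; the actual helper in the source repo may differ in detail.
import numpy as np
import torch

def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
    """Build an (n_position, d_hid) sinusoidal position-encoding table."""
    def cal_angle(position, hid_idx):
        return position / np.power(10000, 2 * (hid_idx // 2) / d_hid)

    table = np.array([[cal_angle(pos, j) for j in range(d_hid)]
                      for pos in range(n_position)])
    table[:, 0::2] = np.sin(table[:, 0::2])  # even dimensions: sin
    table[:, 1::2] = np.cos(table[:, 1::2])  # odd dimensions: cos
    if padding_idx is not None:
        table[padding_idx] = 0.              # zero vector for the padding position
    return torch.FloatTensor(table)
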
def __init__(self, bert, hidden_size, num_hidden_layers, num_attention_heads, dropout):
    super().__init__()
    self.N = num_hidden_layers
    self.bert = bert
    self.pe = PositionalEncoder(hidden_size, dropout=dropout)
    self.layers = get_clones(EncoderLayer(hidden_size, num_attention_heads, dropout),
                             num_hidden_layers)
    self.norm = Norm(hidden_size)

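# Several of these encoders rely on a get_clones helper that is not shown.
# A minimal sketch, assuming it simply deep-copies a layer N times into a ModuleList:
import copy
import torch.nn as nn

def get_clones(module, N):
    """Return N independent copies of `module` as an nn.ModuleList."""
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
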
def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
             maximum_position_encoding, rate=0.1):
    super(ImageCaptioningEncoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    # self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    # self.pos_encoding = positional_encoding(maximum_position_encoding,
    #                                         self.d_model)
    self.affine = tf.keras.layers.Dense(d_model)

    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                       for _ in range(num_layers)]

    self.dropout = tf.keras.layers.Dropout(rate)

def __init__(self, vocab_size, d_model, N, heads, dropout):
    super().__init__()
    self.N = N
    self.embed = Embedder(vocab_size, d_model)
    self.pe = PositionalEncoder(d_model, dropout=dropout)
    self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
    self.norm = Norm(d_model)

def __init__(self, n_src_vocab, len_max_seq, d_word_vec, n_layers, n_heads,
             d_k, d_v, d_model, d_inner, dropout=0.1):
    super().__init__()

    n_position = len_max_seq + 1

    self.src_word_emb = nn.Embedding(n_src_vocab, d_word_vec, padding_idx=config.pad_id)

    self.position_enc = nn.Embedding.from_pretrained(
        get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0),
        freeze=True)

    self.layer_stack = nn.ModuleList([
        EncoderLayer(d_model, d_inner, n_heads, d_k, d_v, dropout=dropout)
        for _ in range(n_layers)])

def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # Judging from the original code, initializing parameters with Glorot / fan_avg matters.
    # Apply uniform Xavier initialization to every weight matrix.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

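# A hedged usage sketch for make_model above, assuming the supporting classes it references
# (EncoderDecoder, MultiHeadedAttention, PositionwiseFeedForward, etc.) are in scope.
# `tmp_model` and the vocabulary sizes are illustrative values, not from the original code.
tmp_model = make_model(src_vocab=10, tgt_vocab=10, N=2)
print(sum(p.numel() for p in tmp_model.parameters()))  # rough parameter-count check
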
def __init__(self, d_model, N_layers, heads, dropout):
    super().__init__()
    self.N_layers = N_layers
    # self.embed = Embedder(vocab_size, d_model)
    # self.pe = PositionalEncoder(d_model, dropout=dropout)
    # self.attn = MultiHeadAttention(heads, d_model, dropout=dropout)
    self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N_layers)
    self.norm = Norm(d_model)

def __init__(self, vocab_size, d_model, N, heads, dropout, device):
    super().__init__()
    self.N = N
    # We need to use the embedder
    # self.embed = Embedder(vocab_size, d_model)
    # self.embed = nn.Linear(vocab_size, d_model)
    self.pe = PositionalEncoder(d_model, dropout=dropout, device=device)
    self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)
    self.norm = Norm(d_model)

def __init__(self, vocab_size, d_model, N, heads, dropout, field, word_emb, opt):
    super().__init__()
    self.N = N
    self.word_emb = word_emb
    self.opt = opt  # unused, just for querying
    self.embed = Embedder(vocab_size, d_model, word_emb, field)
    self.pe = PositionalEncoder(d_model, dropout=dropout)
    self.layers = get_clones(EncoderLayer(d_model, heads, dropout), N)  # attention
    self.norm = Norm(d_model)

def __init__(self, bpe_size, h, d_model, p, d_ff):
    super(Transformer, self).__init__()
    self.bpe_size = bpe_size
    self.word_emb = nn.Embedding(bpe_size, d_model, padding_idx=0)
    self.pos_emb = PositionalEncoding(d_model, p)
    self.encoder = nn.ModuleList([EncoderLayer(h, d_model, p, d_ff) for _ in range(6)])
    self.decoder = nn.ModuleList([DecoderLayer(h, d_model, p, d_ff) for _ in range(6)])
    self.generator = nn.Linear(d_model, bpe_size, bias=False)
    # Tie weights between the word embedding and the generator.
    self.generator.weight = self.word_emb.weight
    self.logsoftmax = nn.LogSoftmax(dim=-1)
    # Pre-compute a mask that hides future positions in the decoder self-attention.
    # Registered as a buffer so it is not recreated and moved to the GPU on every call.
    mask = torch.ByteTensor(np.triu(np.ones((512, 512)), k=1).astype('uint8'))
    self.register_buffer('mask', mask)

def __init__(self, embed_size, n_layers, n_head, d_k, d_v, d_model, d_inner, dropout=0.1):
    """Load the pretrained ResNet-152 and replace the top fc layer."""
    super(Encoder, self).__init__()
    resnet = models.resnet152(pretrained=True)
    modules = list(resnet.children())[:-1]  # drop the final fc layer
    self.resnet = nn.Sequential(*modules)
    self.linear = nn.Linear(resnet.fc.in_features, embed_size)
    self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)
    self.dropout = nn.Dropout(p=dropout)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.layer_stack = nn.ModuleList([
        EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
        for _ in range(n_layers)])

def __init__(self, word_emb, rela_emb, max_len, n_layers=6, n_head=8,
             d_k=64, d_v=64, d_word_vec=512, d_model=512,
             d_inner_hid=1024, dropout=0.1):
    super(Encoder, self).__init__()

    n_position = max_len + 1
    self.max_len = max_len
    self.d_model = d_model

    self.position_enc = nn.Embedding(n_position, d_word_vec, padding_idx=Constants.PAD)
    self.position_enc.weight.data = position_encoding_init(n_position, d_word_vec)

    # Word embedding layer, initialized from pretrained vectors and frozen.
    self.word_embedding = nn.Embedding(word_emb.shape[0], word_emb.shape[1])
    self.word_embedding.weight = nn.Parameter(torch.from_numpy(word_emb).float())
    self.word_embedding.weight.requires_grad = False  # fix the embedding matrix

    # Relation embedding layer, initialized from pretrained vectors and frozen.
    self.rela_embedding = nn.Embedding(rela_emb.shape[0], rela_emb.shape[1])
    self.rela_embedding.weight = nn.Parameter(torch.from_numpy(rela_emb).float())
    self.rela_embedding.weight.requires_grad = False  # fix the embedding matrix

    self.layer_stack = nn.ModuleList([
        EncoderLayer(d_model, d_inner_hid, n_head, d_k, d_v, dropout=dropout)
        for _ in range(n_layers)])

def __init__(self, len_seq, d_word_vec, n_layers, n_head, d_k, d_v, d_inner, dropout=0.1):
    super(Encoder, self).__init__()

    n_position = len_seq  # no "+ 1" for an SOS token: not required for continuous inputs

    # Sinusoid table loaded as a pretrained embedding; padding_idx=0 is reserved for SOS.
    # The table width is d_k * n_head (the true feature size) rather than d_word_vec.
    # freeze=True keeps the table fixed and shared between encoder and decoder.
    self.position_enc = nn.Embedding.from_pretrained(
        get_sinusoid_encoding_table(n_position, d_k * n_head, padding_idx=0),
        freeze=True)

    self.layer_stack = nn.ModuleList([
        EncoderLayer(d_inner, n_head, d_k, d_v, dropout=dropout)
        for _ in range(n_layers)])

def __init__(self, d_model=512, n_head=8, n_layers=6, d_inner=2048,
             n_positions=200, dropout=0.1):
    super().__init__()

    self.d_k = self.d_v = self.d_q = d_model // n_head
    self.position_enc = PositionalEncoding(d_model, n_position=n_positions)
    # self.dropout = nn.Dropout(p=dropout)
    self.layer_stack = nn.ModuleList([
        EncoderLayer(d_model, d_inner, n_head, self.d_k, self.d_v, dropout=dropout)
        for _ in range(n_layers)])
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)

def __init__(self, n_src_vocab, n_max_seq, n_layers=3, n_head=3,
             d_k=64, d_v=64, d_word_vec=128, d_model=128,
             d_inner_hid=128, dropout=0.8):
    super(Encoder, self).__init__()

    n_position = n_max_seq + 1
    self.n_max_seq = n_max_seq
    self.d_model = d_model

    self.position_enc = nn.Embedding(n_position, d_word_vec, padding_idx=Constants_PAD)
    self.position_enc.weight.data = position_encoding_init(n_position, d_word_vec)

    self.src_word_emb = nn.Embedding(n_src_vocab, d_word_vec, padding_idx=Constants_PAD)

    self.layer_stack = nn.ModuleList([
        EncoderLayer(d_model, d_inner_hid, n_head, d_k, d_v, dropout=dropout)
        for _ in range(n_layers)])

    self.output = nn.Linear(800 * 128, 19)

def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=.1):
    """Construct a model from hyper-parameters."""
    c = copy.deepcopy
    attn_rpr = MultiHeadedAttention_RPR(d_model, h, max_relative_position=5)
    attn = MultiHeadedAttention(d_model, h)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn_rpr), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn_rpr), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

def __init__(self, d_input, d_model, N, heads, dropout):
    super().__init__()
    self.N = N
    self.layers = get_clones(EncoderLayer(d_input, d_model, heads, dropout), N)
    self.norm = nn.LayerNorm(d_model)