def __init__(self, config, src_vocab, target_vocab, s_v, t_v, u):
    super(Transformer, self).__init__()
    self.config = config
    h, N, dropout = self.config.h, self.config.N, self.config.dropout
    d_model, d_ff = self.config.d_model, self.config.d_ff
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    attncross = MultiHeadedAttention(h, d_model * 2)
    ffcross = PositionwiseFeedForward(d_model * 2, d_ff, dropout)
    positioncross = PositionalEncoding(d_model * 2, dropout)
    self.encoder = Encoder(
        EncoderLayer(config.d_model, deepcopy(attn), deepcopy(ff), dropout), N)
    self.encoder_cross = EncoderCross(
        EncoderLayerCross(config.d_model * 2, deepcopy(attncross),
                          deepcopy(ffcross), dropout), N)
    self.src_embed = nn.Sequential(
        Embeddings(config.d_model, src_vocab, s_v, u),
        deepcopy(position))  # Embeddings followed by PE
    # self.src_embed.weight.data.copy_(src_vocab.vectors)
    self.target_embed = nn.Sequential(
        Embeddings(config.d_model, target_vocab, t_v, u),
        deepcopy(position))
    # self.target_embed.weight.data.copy_(target_vocab.vectors)
    # Fully-Connected Layer
    self.fc = nn.Linear(self.config.d_model, self.config.output_size)
    self.sigmoid = nn.Sigmoid()
    self.cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    self.softmax = nn.Softmax()

def __init__(self, d_model, d_ff, nheads, drop_prob=0.2):
    super(DecoderLayer, self).__init__()
    self.dec_attn_layer = MultiHeadedAttention(d_model, nheads, drop_prob)
    self.enc_dec_attn_layer = MultiHeadedAttention(d_model, nheads, drop_prob)
    self.ffn = PositionWiseFeedForwardNetwork(d_model, d_ff, drop_prob)
    self.layer_norm1 = LayerNorm(d_model)
    self.layer_norm2 = LayerNorm(d_model)
    self.layer_norm3 = LayerNorm(d_model)
    self.dropout = nn.Dropout(drop_prob)

def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):
    """
    :param hidden: hidden size of the transformer
    :param attn_heads: number of heads in multi-head attention
    :param feed_forward_hidden: hidden size of the feed-forward layer, usually 4*hidden
    :param dropout: dropout rate
    """
    super().__init__()
    self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
    self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout)
    self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
    self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
    self.dropout = nn.Dropout(p=dropout)

def load_model(self, path):
    V = len(self.vocab.char2id)
    d_model = 256
    d_ff = 1024
    h = 4
    n_encoders = 4
    self_attn = MultiHeadedAttention(h=h, d_model=d_model, d_k=d_model // h,
                                     d_v=d_model // h, dropout=0.1)
    feed_forward = FullyConnectedFeedForward(d_model=d_model, d_ff=d_ff)
    position = PositionalEncoding(d_model, dropout=0.1)
    embedding = nn.Sequential(Embeddings(d_model=d_model, vocab=V), position)
    encoder = Encoder(self_attn=self_attn, feed_forward=feed_forward,
                      size=d_model, dropout=0.1)
    generator = Generator3(d_model=d_model, vocab_size=V)
    model = Bert(encoder=encoder, embedding=embedding, generator=generator,
                 n_layers=n_encoders)
    checkpoint = torch.load(path, map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    return model

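# The checkpoint read by load_model above is assumed to follow the common
# PyTorch convention of a dict holding a 'model_state_dict' entry. A minimal
# sketch of the matching save side (save_checkpoint and the optimizer argument
# are illustrative assumptions, not part of the snippet above):
import torch


def save_checkpoint(model, optimizer, path):
    # Store model and optimizer state so training can be resumed later;
    # load_model above only reads the 'model_state_dict' entry.
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, path)
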
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        encoder=Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        decoder=Decoder(
            DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        src_embed=nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        tgt_embed=nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        generator=Generator(d_model, tgt_vocab))
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

def __init__(self, config, src_vocab):
    super(Transformer, self).__init__()
    self.config = config
    self.src_vocab = src_vocab
    # Hyperparameters:
    # h is the number of attention heads, N is the number of layers, dropout is the dropout rate
    h, N, dropout = self.config.h, self.config.N, self.config.dropout
    # Word-embedding dimension and feed-forward dimension
    d_model, d_ff = self.config.d_model, self.config.d_ff
    # Multi-head attention layer
    attn = MultiHeadedAttention(h, d_model)
    # Position-wise feed-forward layer
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    # Positional encoding
    position = PositionalEncoding(d_model, dropout)
    self.encoder = Encoder(
        EncoderLayer(config.d_model, deepcopy(attn), deepcopy(ff), dropout), N)
    self.src_embed = nn.Sequential(
        Embedding(self.config.d_model, self.src_vocab),
        deepcopy(position))  # embedding with position encoding
    self.fc = nn.Linear(self.config.d_model, self.config.output_size)
    self.softmax = nn.Softmax()

class TransformerBlock(nn.Module):
    """
    Bidirectional Encoder = Transformer (self-attention)
    Transformer = MultiHead_Attention + Feed_Forward with sublayer connection
    """

    def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):
        """
        :param hidden: hidden size of the transformer
        :param attn_heads: number of heads in multi-head attention
        :param feed_forward_hidden: hidden size of the feed-forward layer, usually 4*hidden
        :param dropout: dropout rate
        """
        super().__init__()
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
        self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout)
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):
        x = self.input_sublayer(
            x, lambda _x: self.attention.forward(_x, _x, _x, mask=mask))
        x = self.output_sublayer(x, self.feed_forward)
        return self.dropout(x)

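# SublayerConnection is referenced but not defined in the block above. A minimal
# sketch of the pre-norm residual wrapper it is commonly assumed to implement
# (the class name here is illustrative):
import torch.nn as nn


class SublayerConnectionSketch(nn.Module):
    """Apply a residual connection around any sublayer: x + dropout(sublayer(norm(x)))."""

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = nn.LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        # Normalize, run the sublayer (attention or feed-forward), add the residual.
        return x + self.dropout(sublayer(self.norm(x)))
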
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

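# For comparison only: a self-contained sketch of the same recipe (N layers,
# d_model, h heads, d_ff, dropout, Xavier init) expressed with torch.nn
# primitives instead of the custom classes above. Token embeddings, positional
# encoding, and the output generator are intentionally omitted here.
import torch.nn as nn


def make_core_transformer(N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    model = nn.Transformer(d_model=d_model, nhead=h,
                           num_encoder_layers=N, num_decoder_layers=N,
                           dim_feedforward=d_ff, dropout=dropout)
    # Glorot / fan_avg initialization, mirroring the loop in make_model above.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
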
def __init__(self, config, src_vocab):
    super(Transformer, self).__init__()
    self.config = config
    h, N, dropout = self.config.h, self.config.N, self.config.dropout
    d_model, d_ff = self.config.d_model, self.config.d_ff
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.encoder = Encoder(
        EncoderLayer(config.d_model, deepcopy(attn), deepcopy(ff), dropout), N)
    self.src_embed = nn.Sequential(Embeddings(config.d_model, src_vocab))
    self.fc = nn.Linear(self.config.d_model, self.config.output_size)
    self.softmax = nn.Softmax()

def __init__(self, config):
    super(Transformer, self).__init__()
    self.config = config
    h, N, dropout = self.config.h, self.config.N, self.config.dropout
    d_model, d_ff = self.config.d_model, self.config.d_ff
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    self.encoder = Encoder(
        EncoderLayer(config.d_model, deepcopy(attn), deepcopy(ff), dropout), N)
    # self.src_embed = nn.Sequential(Embeddings(config.d_model, src_vocab),
    #                                deepcopy(position))  # Embeddings followed by PE
    # Fully-Connected Layer
    self.fc = nn.Linear(self.config.d_model, self.config.output_size)

def __init__(self, attention_heads, dropout):
    layer_norm_residual_fn = lambda module, input_size, output_size: \
        LayerNormResidual(module, input_size, output_size, dropout)
    self.attention_fn = lambda query_size, mem_size, output_size: \
        layer_norm_residual_fn(
            MultiHeadedAttention(query_size, mem_size, mem_size, output_size,
                                 attention_heads),
            query_size, output_size)
    self.feed_forward_fn = lambda input_size, output_size: \
        layer_norm_residual_fn(
            nn.Sequential(
                nn.Linear(input_size, output_size * 4),
                nn.ReLU(),
                nn.Linear(output_size * 4, output_size)
            ),
            input_size, output_size)
    super(TransformerLayer, self).__init__()

def __init__(self, config, pre_train_weight, embedding_size):
    super(Transformer, self).__init__()
    self.config = config
    self.pre_train_weight = pre_train_weight
    self.embedding_size = embedding_size
    # Hyperparameters:
    # h is the number of attention heads, N is the number of layers, dropout is the dropout rate
    h, N, dropout = self.config.h, self.config.N, self.config.dropout
    # Word-embedding dimension and feed-forward dimension
    d_model, d_ff = self.config.d_model, self.config.d_ff
    # Multi-head attention layer
    attn = MultiHeadedAttention(h, d_model)
    # Position-wise feed-forward layer
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    # Positional encoding
    position = PositionalEncoding(d_model, dropout)
    self.encoder = Encoder(
        EncoderLayer(config.d_model, deepcopy(attn), deepcopy(ff), dropout), N)
    self.src_embed = nn.Sequential(
        Embedding(self.config.d_model, self.pre_train_weight, self.embedding_size),
        deepcopy(position))  # embedding with position encoding

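# The Embedding class used above is not shown. A minimal sketch of an embedding
# layer initialized from a pretrained weight matrix, which is what the
# pre_train_weight argument suggests (class and variable names are illustrative;
# embedding_size is assumed to equal d_model):
import math
import torch
import torch.nn as nn


class PretrainedEmbeddingSketch(nn.Module):
    def __init__(self, d_model, pre_train_weight):
        super().__init__()
        # pre_train_weight: array of shape (vocab_size, d_model)
        weight = torch.as_tensor(pre_train_weight, dtype=torch.float)
        self.lut = nn.Embedding.from_pretrained(weight, freeze=False)
        self.d_model = d_model

    def forward(self, x):
        # Scale embeddings by sqrt(d_model), as in the original Transformer.
        return self.lut(x) * math.sqrt(self.d_model)
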
def __init__(self, config, src_vocab):
    super(Transformer, self).__init__()
    self.config = config
    h, N, dropout = self.config.h, self.config.N, self.config.dropout
    d_model, d_ff = self.config.d_model, self.config.d_ff
    self.src_vocab = src_vocab
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.encoder_layer = EncoderLayer(config.d_model, deepcopy(attn), deepcopy(ff), dropout)
    self.encoder = Encoder(self.encoder_layer, N)
    self.src_word_emb = nn.Embedding(src_vocab, config.d_model, padding_idx=0)
    # self.pos_bias = nn.Embedding(src_vocab, config.d_model, padding_idx=0)
    # self.pos_bias = nn.Embedding.from_pretrained(get_sinusoid_encoding_table_dim(src_vocab, config.d_model, padding_idx=0), freeze=True)
    # self.pos_bias = nn.Embedding.from_pretrained(get_sinusoid_encoding_table_vocab(src_vocab, config.d_model, padding_idx=0), freeze=True)
    # self.pos_bias = nn.Embedding(1, config.d_model, padding_idx=0)
    # self.pos_bias = nn.Embedding(src_vocab, 1, padding_idx=0)
    # self.position_enc = nn.Embedding(src_vocab, config.d_model, padding_idx=0)
    self.position_enc = nn.Embedding.from_pretrained(
        get_sinusoid_encoding_table(src_vocab, config.d_model, padding_idx=0),
        freeze=False)
    # position_enc = torch.randn(1000, config.d_model)
    # position_enc = position_enc.unsqueeze(0)
    # self.register_buffer('position_enc', position_enc)
    self.drop = nn.Dropout(p=dropout)
    self.fc = nn.Linear(self.config.d_model, self.config.output_size)
    self.softmax = nn.Softmax()

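# get_sinusoid_encoding_table is referenced but not defined in the snippet
# above. A common implementation of the sinusoidal table it is assumed to
# return (the function name here is illustrative):
import numpy as np
import torch


def sinusoid_encoding_table(n_position, d_model, padding_idx=None):
    # Standard positional table: sin on even dimensions, cos on odd dimensions.
    table = np.array([
        [pos / np.power(10000, 2 * (i // 2) / d_model) for i in range(d_model)]
        for pos in range(n_position)
    ])
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])
    if padding_idx is not None:
        table[padding_idx] = 0.0  # zero vector for the padding position
    return torch.FloatTensor(table)
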
def make_transformer_model(src_vocab, tgt_vocab, config):
    "Make a transformer model based on config."
    c = copy.deepcopy
    attn = MultiHeadedAttention(config['h'], config['d_model'])
    ff = PositionwiseFeedForward(config['d_model'], config['d_ff'], config['dropout'])
    position = PositionalEncoding(config['d_model'], config['dropout'])
    # word_embed = nn.Sequential(Embeddings(config['d_model'], src_vocab), c(position))
    embed, position = Embeddings(config['d_model'], src_vocab), c(position)
    model = EncoderDecoder(
        Encoder(EncoderLayer(config['d_model'], c(attn), c(ff), config['dropout']),
                config['num_layer']),
        Decoder(DecoderLayer(config['d_model'], c(attn), c(attn), c(ff), config['dropout']),
                config['num_layer'], config['d_model'], tgt_vocab, config['pointer_gen']),
        embed, position, embed, position,
        config['tie_weights'])

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

def __init__(self, layer, N, d_model, vocab, pointer_gen, ner_last):
    super(Decoder, self).__init__()
    self.layers = clones(layer, N)
    if pointer_gen:
        print('pointer_gen')
        self.bptr = nn.Parameter(torch.FloatTensor(1, 1))
        self.Wh = nn.Linear(d_model, 1)
        self.Wx = nn.Linear(d_model, 1)
        self.pointer_gen = True
    else:
        self.pointer_gen = False
    self.sm = nn.Softmax(dim=-1)
    self.ner_last = ner_last
    if self.ner_last:
        # if last layer is NER -> 2 * d_model (NER features concatenated with x)
        self.proj = nn.Linear(2 * d_model, vocab)
        self.norm = LayerNorm(2 * layer.size)
        self.Ws = nn.Linear(2 * d_model, 1)
        self.ner_attn = MultiHeadedAttention(1, d_model, 0.3)
    else:
        self.proj = nn.Linear(d_model, vocab)
        self.norm = LayerNorm(layer.size)
        self.Ws = nn.Linear(d_model, 1)

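# The bptr / Wh / Wx / Ws parameters above feed a pointer-generator gate. A
# minimal sketch of the final-distribution mixing such decoders typically
# perform (tensor names are illustrative, not taken from the snippet):
import torch


def mix_pointer_distribution(p_gen, vocab_dist, attn_weights, src_ids):
    # p_gen:        (batch, 1)       generate-vs-copy gate in [0, 1]
    # vocab_dist:   (batch, vocab)   softmax over the fixed vocabulary
    # attn_weights: (batch, src_len) attention over source positions
    # src_ids:      (batch, src_len) LongTensor of source token ids
    final = p_gen * vocab_dist
    # Scatter the copy probabilities onto their vocabulary ids and add them in.
    return final.scatter_add(1, src_ids, (1.0 - p_gen) * attn_weights)
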
def create_batch(batch_size, n_batches):
    for _ in range(n_batches):
        chars = torch.from_numpy(
            np.random.randint(2, 28, size=(batch_size, 10))).long()
        batch = Batch(chars, None, pad_token)
        yield batch


V = 26 + 1 + 1
d_model = 256
h = 8
self_attn = MultiHeadedAttention(h=h, d_model=d_model, d_k=d_model // h,
                                 d_v=d_model // h, dropout=0.)
feed_forward = FullyConnectedFeedForward(d_model=d_model, d_ff=1024)
embedding = Embeddings(d_model=d_model, vocab=V)
encoder = Encoder(self_attn=self_attn, feed_forward=feed_forward,
                  size=d_model, dropout=0.)
generator = Generator(d_model=d_model, vocab_size=V)
model = Bert(encoder=encoder, embedding=embedding, generator=generator,
             n_layers=4)

def __init__(self, opt):
    """Basic model building blocks."""
    nn.Module.__init__(self)
    self.opt = opt

    if self.opt.arc_combine_method == 'attention':
        # self.attention = MultiHeadedAttention(h=self.opt.attn_heads, d_model=hidden)
        print(f'\n\nAttn_Input \n{self.opt.inputSize + self.opt.keySize}\n\n')
        self.attention = MultiHeadedAttention(
            h=1, d_model=self.opt.inputSize + self.opt.keySize)
    else:
        self.attention = None

    if self.opt.grapheme_combination != 'None':
        self.is_graphemic = True
        if self.opt.grapheme_encoding:
            self.grapheme_encoder = GraphemeEncoder(self.opt)
            self.grapheme_attention = LuongAttention(
                attn_type=self.opt.grapheme_combination,
                num_features=self.opt.grapheme_hidden_size * 2,
                initialisation=self.opt.init_grapheme)
            self.has_grapheme_encoding = True
        else:
            self.grapheme_attention = LuongAttention(
                attn_type=self.opt.grapheme_combination,
                num_features=self.opt.grapheme_features,
                initialisation=self.opt.init_grapheme)
            self.has_grapheme_encoding = False
    else:
        self.is_graphemic = False

    num_directions = 2 if self.opt.bidirectional else 1
    if self.opt.encoder_type == 'ATTENTION':
        self.model_intermediate = Encoder(
            self.opt.inputSize, self.opt.hiddenSize, self.opt.hiddenSize,
            self.opt.init_word, self.opt.nLSTMLayers, use_bias=True,
            birdirectional=self.opt.bidirectional, attention=self.attention,
            attention_order=self.opt.attention_order,
            attention_key=self.opt.attention_key,
            dropout=self.opt.intermediate_dropout)
    else:
        self.model_intermediate = LSTM(
            LSTMCell, self.opt.inputSize, self.opt.hiddenSize,
            self.opt.nLSTMLayers, use_bias=True,
            bidirectional=self.opt.bidirectional, attention=self.attention)

    self.model_output = DNN_output(num_directions * self.opt.hiddenSize,
                                   self.opt.linearSize, 1, self.opt.nFCLayers,
                                   self.opt.init_word, use_bias=True, logit=True)

def __init__(self, d_model, vocab_size, nheads):
    super(PointerGeneratorWithCoverage, self).__init__()
    self.fc = nn.Linear(2 * d_model, vocab_size)
    self.p_gen_fc = nn.Linear(2 * d_model, 1)
    self.single_head_attn1 = MultiHeadedAttention(d_model, 1)
    self.single_head_attn2 = AttentionWithCoverage(d_model, 400)

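# AttentionWithCoverage above is not shown; coverage-based attention is usually
# trained with a coverage penalty of the following form (a sketch with
# illustrative names; both inputs have shape (batch, src_len)):
import torch


def coverage_loss(attn_weights, coverage):
    # Penalize re-attending to source positions that are already covered
    # (coverage is the running sum of previous attention distributions).
    return torch.sum(torch.min(attn_weights, coverage), dim=-1)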