def decoder(self, inputs):
    decoder_inputs, encoder_encodings, encoder_masks = inputs
    if K.dtype(decoder_inputs) != 'int32':
        decoder_inputs = K.cast(decoder_inputs, 'int32')

    decoder_masks = K.equal(decoder_inputs, 0)
    # Embeddings
    embeddings = K.gather(self.embeddings, decoder_inputs)
    embeddings *= self._model_dim ** 0.5  # Scale
    # Position Encodings
    position_encodings = PositionEncoding(self._model_dim)(embeddings)
    # Embeddings + Position-encodings
    encodings = embeddings + position_encodings
    # Dropout
    encodings = K.dropout(encodings, self._dropout_rate)

    for i in range(self._decoder_stack):
        # Masked Multi-head Attention
        masked_attention = MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads, future=True)
        masked_attention_input = [encodings, encodings, encodings, decoder_masks]
        masked_attention_out = masked_attention(masked_attention_input)
        # Add & Norm
        masked_attention_out += encodings
        masked_attention_out = LayerNormalization()(masked_attention_out)
        # Multi-head Attention
        attention = MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads)
        attention_input = [masked_attention_out, encoder_encodings, encoder_encodings, encoder_masks]
        attention_out = attention(attention_input)
        # Add & Norm
        attention_out += masked_attention_out
        attention_out = LayerNormalization()(attention_out)
        # Feed-Forward
        ff = PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
        ff_out = ff(attention_out)
        # Add & Norm
        ff_out += attention_out
        encodings = LayerNormalization()(ff_out)

    # Pre-softmax projection shares its weights with the embedding matrix
    linear_projection = K.dot(encodings, K.transpose(self.embeddings))
    outputs = K.softmax(linear_projection)
    return outputs
def __init__(self, vocab_size, emb_dim, num_heads, num_fields, num_neighbors, padding_val):
    super(Model, self).__init__()
    # Text embedding layer for neighbors
    self.text_emb = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=emb_dim, mask_zero=True)
    # Embedding layer for candidate positions
    self.cand_pos_emb = tf.keras.layers.Dense(emb_dim)
    # Embedding layer for relative positions of neighbors
    self.neigh_pos_emb = pos_embedding(dim=emb_dim)
    # Field ID embedding layer
    self.field_emb = tf.keras.layers.Embedding(input_dim=num_fields, output_dim=emb_dim, mask_zero=False)
    # Self-attention layer
    self.num_heads = num_heads
    self.mha = MultiHeadAttention(2 * emb_dim, self.num_heads)
    # Linear projection layer for neighborhood encoding + candidate position embedding
    self.projection = tf.keras.layers.Dense(emb_dim)
    # Max pooling layer for the neighborhood embedding
    self.max_pool = tf.keras.layers.MaxPool1D(strides=num_neighbors, padding='same')
    # Cosine similarity
    self.cosine_sim = tf.keras.losses.CosineSimilarity(axis=1, reduction='none')
    self.padding_val = padding_val
def build_model():
    """Build and compile the model."""
    inputs = Input(shape=(MAX_LEN, 20), name='Input')
    masking = Masking(mask_value=0.0, input_shape=(MAX_LEN, 20), name='Masking')(inputs)
    hidden = Bidirectional(LSTM(512, use_bias=True, dropout=0.5, return_sequences=True),
                           name='Bidirectional-LSTM')(masking)
    hidden = MultiHeadAttention(head_num=32, activation='relu', use_bias=True,
                                return_multi_attention=False, name='Multi-Head-Attention')(hidden)
    hidden = Dropout(0.2, name='Dropout_1')(hidden)
    hidden = Attention(name='Attention')(hidden)
    prediction = Dense(1, activation='sigmoid', name='Output')(hidden)
    model = Model(inputs=inputs, outputs=prediction)
    adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)  # best
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
    return model
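# A minimal usage sketch for build_model() (assumptions: MAX_LEN is defined at module
# level and numpy is available; the random inputs and labels below are placeholders only):
import numpy as np

model = build_model()
model.summary()
X = np.random.rand(4, MAX_LEN, 20).astype('float32')  # e.g. 4 sequences of 20 per-position features
y = np.random.randint(0, 2, size=(4, 1))              # binary labels
model.fit(X, y, epochs=1, batch_size=2)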
def encoder(self, inputs):
    if K.dtype(inputs) != 'int32':
        inputs = K.cast(inputs, 'int32')

    masks = K.equal(inputs, 0)
    # Embeddings
    embeddings = K.gather(self.embeddings, inputs)
    embeddings *= self._model_dim ** 0.5  # Scale
    # Position Encodings
    position_encodings = PositionEncoding(self._model_dim)(embeddings)
    # Embeddings + Position-encodings
    encodings = embeddings + position_encodings
    # Dropout
    encodings = K.dropout(encodings, self._dropout_rate)

    for i in range(self._encoder_stack):
        # Multi-head Attention
        attention = MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads)
        attention_input = [encodings, encodings, encodings, masks]
        attention_out = attention(attention_input)
        # Add & Norm
        attention_out += encodings
        attention_out = LayerNormalization()(attention_out)
        # Feed-Forward
        ff = PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
        ff_out = ff(attention_out)
        # Add & Norm
        ff_out += attention_out
        encodings = LayerNormalization()(ff_out)

    return encodings, masks
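# A minimal sketch of how encoder() and decoder() could be wired together in the
# surrounding Transformer's call() (assumption: the model takes
# [encoder_inputs, decoder_inputs]; the actual call() is not shown in this section):
def call(self, inputs):
    encoder_inputs, decoder_inputs = inputs
    encoder_encodings, encoder_masks = self.encoder(encoder_inputs)
    return self.decoder([decoder_inputs, encoder_encodings, encoder_masks])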
def __init__(self, h=8, d_model=512, d_ff=2048, p_dropout=0.1, max_len=128):
    super().__init__()
    self.self_attn = MultiHeadAttention(h, d_model)
    self.dropout1 = Dropout(p_dropout)
    self.norm1 = LayerNormalization()
    self.src_tgt_attn = MultiHeadAttention(h, d_model)
    self.dropout2 = Dropout(p_dropout)
    self.norm2 = LayerNormalization()
    self.ff = FFN(d_model, d_ff)
    self.dropout3 = Dropout(p_dropout)
    self.norm3 = LayerNormalization()
def __init__(self, h=8, d_model=512, d_ff=2048, p_dropout=0.1, max_len=128, device='cpu'):
    super().__init__()
    self.self_attn = MultiHeadAttention(h, d_model)
    self.dropout1 = nn.Dropout(p_dropout)
    self.norm1 = nn.LayerNorm(d_model)
    self.src_tgt_attn = MultiHeadAttention(h, d_model)
    self.dropout2 = nn.Dropout(p_dropout)
    self.norm2 = nn.LayerNorm(d_model)
    self.ff = FFN(d_model, d_ff)
    self.dropout3 = nn.Dropout(p_dropout)
    self.norm3 = nn.LayerNorm(d_model)
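# A hedged forward() sketch for this decoder layer (assumptions: self_attn and
# src_tgt_attn take (query, key, value, mask) and the layer uses the standard
# post-norm order from "Attention Is All You Need"; the real forward() is not shown here):
def forward(self, x, memory, tgt_mask=None, src_mask=None):
    h = self.self_attn(x, x, x, tgt_mask)                # masked self-attention
    x = self.norm1(x + self.dropout1(h))                 # Add & Norm
    h = self.src_tgt_attn(x, memory, memory, src_mask)   # encoder-decoder attention
    x = self.norm2(x + self.dropout2(h))                 # Add & Norm
    h = self.ff(x)                                       # position-wise feed-forward
    return self.norm3(x + self.dropout3(h))              # Add & Norm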
def build(self, input_shape):
    self.embeddings = self.add_weight(
        shape=(self._vocab_size, self._model_dim),
        initializer='glorot_uniform',
        trainable=True,
        name="embeddings")

    self.EncoderPositionEncoding = PositionEncoding(self._model_dim)
    self.EncoderMultiHeadAttetions = [
        MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads)
        for _ in range(self._encoder_stack)
    ]
    self.EncoderLayerNorms0 = [
        LayerNormalization() for _ in range(self._encoder_stack)
    ]
    self.EncoderPositionWiseFeedForwards = [
        PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
        for _ in range(self._encoder_stack)
    ]
    self.EncoderLayerNorms1 = [
        LayerNormalization() for _ in range(self._encoder_stack)
    ]

    self.DecoderPositionEncoding = PositionEncoding(self._model_dim)
    self.DecoderMultiHeadAttetions0 = [
        MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads, future=True)
        for _ in range(self._decoder_stack)
    ]
    self.DecoderLayerNorms0 = [
        LayerNormalization() for _ in range(self._decoder_stack)
    ]
    self.DecoderMultiHeadAttetions1 = [
        MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads)
        for _ in range(self._decoder_stack)
    ]
    self.DecoderLayerNorms1 = [
        LayerNormalization() for _ in range(self._decoder_stack)
    ]
    self.DecoderPositionWiseFeedForwards = [
        PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
        for _ in range(self._decoder_stack)
    ]
    self.DecoderLayerNorms2 = [
        LayerNormalization() for _ in range(self._decoder_stack)
    ]

    super(Transformer, self).build(input_shape)
def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
    super(EncoderLayer, self).__init__()
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
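# A hedged forward() sketch for this EncoderLayer (assumptions: slf_attn returns
# (output, attention_weights) and accepts a mask keyword, as in common Transformer
# encoder implementations; the actual forward() is not shown in this section):
def forward(self, enc_input, slf_attn_mask=None):
    enc_output, enc_slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask)
    enc_output = self.pos_ffn(enc_output)
    return enc_output, enc_slf_attn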
def __init__(self, input_dim, num_heads, feed_forward_hidden=512, **kwargs):
    super().__init__(**kwargs)
    self.mha = MultiHeadAttention(n_heads=num_heads, d_model=input_dim, name='MHA')
    self.ff1 = tf.keras.layers.Dense(feed_forward_hidden, name='ff1')
    self.ff2 = tf.keras.layers.Dense(input_dim, name='ff2')
def build(self, input_shape):
    self.MHA_sublayer = ResidualBlock_BN(
        SelfAttention(
            MultiHeadAttention(n_heads=self.n_heads, embed_dim=input_shape[2])  # input_shape[2] = embed_dim = 128
        ),
        tf.keras.layers.BatchNormalization()
    )
    self.FF_sublayer = ResidualBlock_BN(
        tf.keras.models.Sequential([
            tf.keras.layers.Dense(self.FF_hidden, activation=self.activation),
            tf.keras.layers.Dense(input_shape[2])
        ]),
        tf.keras.layers.BatchNormalization()
    )
    super().build(input_shape)
def __init__(self, input_dim, num_heads, feed_forward_hidden=512, **kwargs):
    super().__init__(**kwargs)
    self.mha = MultiHeadAttention(n_heads=num_heads, d_model=input_dim, name='MHA')
    self.bn1 = tf.keras.layers.BatchNormalization(name='bn1', trainable=True)
    self.bn2 = tf.keras.layers.BatchNormalization(name='bn2', trainable=True)
    self.ff1 = tf.keras.layers.Dense(feed_forward_hidden, name='ff1')
    self.ff2 = tf.keras.layers.Dense(input_dim, name='ff2')
def __init__(self, embed_dim=128, n_heads=8, clip=10., **kwargs):
    super().__init__(**kwargs)
    self.Wk1 = nn.Linear(embed_dim, embed_dim, bias=False)
    self.Wv = nn.Linear(embed_dim, embed_dim, bias=False)
    self.Wk2 = nn.Linear(embed_dim, embed_dim, bias=False)
    self.Wq_fixed = nn.Linear(embed_dim, embed_dim, bias=False)
    self.Wout = nn.Linear(embed_dim, embed_dim, bias=False)
    self.Wq_step = nn.Linear(embed_dim + 1, embed_dim, bias=False)
    self.MHA = MultiHeadAttention(n_heads=n_heads, embed_dim=embed_dim, need_W=False)
    # SHA ==> Single-Head Attention: this layer uses n_heads = 1, so there is no need to split heads
    self.SHA = DotProductAttention(clip=clip, return_logits=True, head_depth=embed_dim)
    self.env = Env
def __init__(self, n_heads=8, FF_hidden=512, embed_dim=128, **kwargs):
    super().__init__(**kwargs)
    self.n_heads = n_heads
    self.FF_hidden = FF_hidden
    self.BN1 = Normalization(embed_dim, normalization='batch')
    self.BN2 = Normalization(embed_dim, normalization='batch')
    self.MHA_sublayer = ResidualBlock_BN(
        SelfAttention(
            MultiHeadAttention(n_heads=self.n_heads, embed_dim=embed_dim, need_W=True)),
        self.BN1)
    self.FF_sublayer = ResidualBlock_BN(
        nn.Sequential(nn.Linear(embed_dim, FF_hidden, bias=True),
                      nn.ReLU(),
                      nn.Linear(FF_hidden, embed_dim, bias=True)),
        self.BN2)
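# A hedged forward() sketch for this encoder layer (assumptions: ResidualBlock_BN
# applies "sublayer output + residual, then batch normalization" and the attention
# sublayer accepts a mask keyword; the real signatures are defined elsewhere):
def forward(self, x, mask=None):
    x = self.MHA_sublayer(x, mask=mask)  # multi-head self-attention + residual + BN
    return self.FF_sublayer(x)           # feed-forward + residual + BN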
def __init__(self, model_config, name=''):
    super(ModelAttention, self).__init__(name=name)
    print(model_config)
    d_fc = model_config['d_fc']
    d_model = model_config['d_model']
    num_heads = model_config['num_heads']
    attn_score_type = model_config['attn_score_type']
    dropout_rate = model_config.get('dropout_rate', 0.0)
    # self.proj_1d = tf.keras.layers.Dense(d_model, activation='linear')
    act = tf.nn.relu
    self.variant_encoding = tf.keras.models.Sequential(
        [tf.keras.layers.Dense(w, activation=act) for w in [d_fc]])
    self.neighbor_encoding = tf.keras.models.Sequential(
        [tf.keras.layers.Dense(w, activation=act) for w in [d_fc]])
    if model_config['pairwise_type'] != 'none':
        self.pairwise_encoding = tf.keras.models.Sequential(
            [tf.keras.layers.Dense(w, activation=act) for w in [d_fc]])
    # num_species: 125 when fish are excluded
    self.evol_encoder = EvolEncoder2(
        num_species=200,
        weighting_schema=model_config['weighting_schema'],
        pairwise_type=model_config['pairwise_type'])
    self.logit_layer = tf.keras.layers.Dense(1)
    self.mha = MultiHeadAttention(
        d_model,
        num_heads,
        attn_score_type=attn_score_type,
        # use_pairwise=model_config['pairwise_type'] != 'none'
        use_pairwise=True)
    self.gru = tf.keras.layers.GRUCell(d_model,
                                       activation='tanh',
                                       recurrent_dropout=dropout_rate,
                                       dropout=dropout_rate)
def build(self, input_shape):
    self.MHA_sublayer = ResidualBlock_BN(
        SelfAttention(
            MultiHeadAttention(
                n_heads=self.n_heads,
                embed_dim=input_shape[2],
                need_W=True)  # input_shape[2] = embed_dim = 128
        ),
        self.BN1)
    self.FF_sublayer = ResidualBlock_BN(
        tf.keras.models.Sequential([
            # tf.keras.layers.Dense(self.FF_hidden, use_bias=True, activation=self.activation, kernel_initializer=init, bias_initializer=init),
            # tf.keras.layers.Dense(input_shape[2], use_bias=True, kernel_initializer=init, bias_initializer=init)
            tf.keras.layers.Dense(self.FF_hidden, use_bias=True, activation=self.activation),
            tf.keras.layers.Dense(input_shape[2], use_bias=True)
        ]),
        self.BN2)
    super().build(input_shape)
def build_attention():
    """Build the model architecture for attention output."""
    inputs = Input(shape=(MAX_LEN, 20), name='Input')
    masking = Masking(mask_value=0.0, input_shape=(MAX_LEN, 20), name='Masking')(inputs)
    hidden = Bidirectional(LSTM(512, use_bias=True, dropout=0.5, return_sequences=True),
                           name='Bidirectional-LSTM')(masking)
    hidden = MultiHeadAttention(head_num=32, activation='relu', use_bias=True,
                                return_multi_attention=False, name='Multi-Head-Attention')(hidden)
    hidden = Dropout(0.2, name='Dropout_1')(hidden)
    hidden = Attention(return_attention=True, name='Attention')(hidden)
    model = Model(inputs=inputs, outputs=hidden)
    return model
def __init__(self, vocabs, word_dim, pos_dim, hidden_size, rnn_layers, dropout_rate,
             device, bidirectional=True, use_crf=False, embedding=None):
    super(LabelAttention, self).__init__()
    word2id, tag2id, label2id = vocabs  # vocabs == (word2id, tag2id, label2id)
    output_size = hidden_size * 2 if bidirectional else hidden_size  # doubled when bidirectional

    # Word embedding (dimension == 100); if a pretrained embedding matrix is given, copy it into the layer
    self.word_embeddings = nn.Embedding(len(word2id), word_dim)
    if embedding is not None:
        self.word_embeddings.weight.data.copy_(torch.from_numpy(embedding))

    # Tag and label embeddings are not pretrained
    self.tag_embeddings = nn.Embedding(len(tag2id), pos_dim)
    self.label_embeddings = nn.Embedding(len(label2id), output_size)

    # LSTM + label-attention stack; LSTM input size is word_dim + pos_dim == 150
    self.lstm1 = nn.LSTM(word_dim + pos_dim, hidden_size, 1, batch_first=True,
                         bidirectional=bidirectional, dropout=dropout_rate)
    self.label_attn1 = MultiHeadAttention(input_size=output_size, hidden_size=hidden_size,
                                          n_head=8, dropout=dropout_rate, device=device)
    self.lstm2 = nn.LSTM(hidden_size, hidden_size, 1, batch_first=True,
                         bidirectional=bidirectional, dropout=dropout_rate)
    self.label_attn2 = MultiHeadAttention(input_size=output_size, hidden_size=hidden_size,
                                          n_head=1, dropout=dropout_rate, device=device)

    # Output projection: output_size is hidden_size * 2 if bidirectional, else hidden_size
    self.linear = nn.Linear(output_size, len(label2id))

    # Dropout setting
    self.dropout_rate = dropout_rate

    # Optional CRF layer over the label indices
    self.use_crf = use_crf
    if use_crf:
        self.crf = CRF(len(label2id), batch_first=True)

    # Loss function: cross entropy
    self.cross_entropy = nn.CrossEntropyLoss(reduction='none')
    # Total number of labels
    self.label_size = len(label2id)
    self.device = device
def build(self, input_shape):
    context_shape, nodes_shape = input_shape
    self.prep_attention_layer = MultiHeadAttention(n_heads=self.n_heads, embed_dim=nodes_shape[2])
    self.final_attention_layer = DotProductAttention(return_logits=True, clip=self.clip)
    super().build(input_shape)
def get_age_model(DATA):
    feed_forward_size = 2048
    max_seq_len = 150
    model_dim = 256 + 256 + 64 + 32 + 8 + 16

    input_creative_id = Input(shape=(max_seq_len,), name='creative_id')
    x1 = Embedding(input_dim=NUM_creative_id + 1,
                   output_dim=256,
                   weights=[DATA['creative_id_emb']],
                   trainable=args.not_train_embedding,
                   # trainable=False,
                   input_length=150,
                   mask_zero=True)(input_creative_id)
    # encodings = PositionEncoding(model_dim)(x1)
    # encodings = Add()([embeddings, encodings])

    input_ad_id = Input(shape=(max_seq_len,), name='ad_id')
    x2 = Embedding(input_dim=NUM_ad_id + 1,
                   output_dim=256,
                   weights=[DATA['ad_id_emb']],
                   trainable=args.not_train_embedding,
                   # trainable=False,
                   input_length=150,
                   mask_zero=True)(input_ad_id)

    input_product_id = Input(shape=(max_seq_len,), name='product_id')
    x3 = Embedding(input_dim=NUM_product_id + 1,
                   output_dim=32,
                   weights=[DATA['product_id_emb']],
                   trainable=args.not_train_embedding,
                   # trainable=False,
                   input_length=150,
                   mask_zero=True)(input_product_id)

    input_advertiser_id = Input(shape=(max_seq_len,), name='advertiser_id')
    x4 = Embedding(input_dim=NUM_advertiser_id + 1,
                   output_dim=64,
                   weights=[DATA['advertiser_id_emb']],
                   trainable=args.not_train_embedding,
                   # trainable=False,
                   input_length=150,
                   mask_zero=True)(input_advertiser_id)

    input_industry = Input(shape=(max_seq_len,), name='industry')
    x5 = Embedding(input_dim=NUM_industry + 1,
                   output_dim=16,
                   weights=[DATA['industry_emb']],
                   trainable=True,
                   # trainable=False,
                   input_length=150,
                   mask_zero=True)(input_industry)

    input_product_category = Input(shape=(max_seq_len,), name='product_category')
    x6 = Embedding(input_dim=NUM_product_category + 1,
                   output_dim=8,
                   weights=[DATA['product_category_emb']],
                   trainable=True,
                   # trainable=False,
                   input_length=150,
                   mask_zero=True)(input_product_category)

    # (bs, max_seq_len, model_dim)
    encodings = layers.Concatenate(axis=2)([x1, x2, x3, x4, x5, x6])
    # (bs, max_seq_len)
    masks = tf.equal(input_creative_id, 0)

    # (bs, max_seq_len, model_dim); 8 heads * 79 dims per head == model_dim == 632
    attention_out = MultiHeadAttention(8, 79)([encodings, encodings, encodings, masks])

    # Add & Norm
    attention_out += encodings
    attention_out = LayerNormalization()(attention_out)
    # Feed-Forward
    ff = PositionWiseFeedForward(model_dim, feed_forward_size)
    ff_out = ff(attention_out)
    # Add & Norm
    # ff_out is (bs, 100, 128) but attention_out is (bs, 100, 256); with the dims above, both are actually (bs, 150, 632)
    ff_out += attention_out
    encodings = LayerNormalization()(ff_out)
    encodings = GlobalMaxPooling1D()(encodings)
    encodings = Dropout(0.2)(encodings)

    # output_gender = Dense(2, activation='softmax', name='gender')(encodings)
    output_age = Dense(10, activation='softmax', name='age')(encodings)

    model = Model(inputs=[
        input_creative_id, input_ad_id, input_product_id,
        input_advertiser_id, input_industry, input_product_category
    ], outputs=[output_age])

    model.compile(
        optimizer=optimizers.Adam(2.5e-4),
        loss={
            # 'gender': losses.CategoricalCrossentropy(from_logits=False),
            'age': losses.CategoricalCrossentropy(from_logits=False)
        },
        # loss_weights=[0.4, 0.6],
        metrics=['accuracy'])
    return model
def setUp(self):
    self.attn = MultiHeadAttention(128, 4)
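# A hedged companion test sketch (assumptions: this MultiHeadAttention is a PyTorch
# module whose constructor takes (d_model=128, n_heads=4), is called as attn(q, k, v),
# and preserves the model dimension; adapt to the actual MultiHeadAttention API):
def test_output_shape(self):
    import torch
    q = k = v = torch.randn(2, 10, 128)               # (batch, seq_len, d_model)
    out = self.attn(q, k, v)
    self.assertEqual(tuple(out.shape), (2, 10, 128))  # attention should preserve the shape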
def __init__(self, embedding, vocab_size, heads, embedding_size, encoder_size,
             conv_num=4, attn_num=1, drop_out=0.2):
    super(MwAN, self).__init__()
    self.drop_out = drop_out
    self.conv_num = conv_num
    self.attn_num = attn_num
    self.c = copy.deepcopy
    # self.embedding = nn.Embedding(vocab_size + 1, embedding_dim=embedding_size)
    self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding), freeze=False)
    self.position_encoding = PositionalEncoding(embedding_size, self.drop_out, max_len=500)
    # Projection
    self.proj_a = nn.Linear(embedding_size, encoder_size)
    self.proj_p = nn.Linear(embedding_size, encoder_size)
    self.proj_q = nn.Linear(embedding_size, encoder_size)
    # Depthwise separable convolution
    self.sep_conv = DepthwiseSeparableConv(encoder_size, encoder_size, k=5, dim=1)
    self.conv_block = Conv_block(Convsublayer(encoder_size, self.c(self.sep_conv), self.drop_out), N=2)
    # Encoder using self-attention
    self.attn = MultiHeadAttention(heads, encoder_size, self.drop_out, projection=False)
    self.ffn = PointwiseFeedForward(encoder_size, encoder_size * 4)
    self.encoder = Encoder(EncoderSublayer(encoder_size, self.c(self.attn), self.c(self.ffn), self.drop_out), N=3)
    self.a_proj = nn.Linear(encoder_size, embedding_size)
    self.a_attention = nn.Linear(embedding_size, 1, bias=False)
    # Concat attention
    self.Wc1 = nn.Linear(encoder_size, encoder_size, bias=False)
    self.Wc2 = nn.Linear(encoder_size, encoder_size, bias=False)
    self.vc = nn.Linear(encoder_size, 1, bias=False)
    # Bilinear attention
    self.Wb = nn.Linear(encoder_size, encoder_size, bias=False)
    # Dot attention
    self.Wd = nn.Linear(encoder_size, encoder_size, bias=False)
    self.vd = nn.Linear(encoder_size, 1, bias=False)
    # Minus attention
    self.Wm = nn.Linear(encoder_size, encoder_size, bias=False)
    self.vm = nn.Linear(encoder_size, 1, bias=False)
    # Dot attention between query words
    self.Ws = nn.Linear(encoder_size, encoder_size, bias=False)
    self.vs = nn.Linear(encoder_size, 1, bias=False)
    # QANet
    self.Wqa1 = nn.Linear(encoder_size, 1, bias=False)
    self.Wqa2 = nn.Linear(encoder_size, 1, bias=False)
    self.Wqa3 = nn.Linear(encoder_size, encoder_size, bias=False)
    # Modeling layer with an added highway connection
    self.aggWH = nn.Linear(8 * encoder_size, 8 * encoder_size)
    self.aggWT = nn.Linear(8 * encoder_size, 8 * encoder_size)
    self.agg_linear = nn.Linear(8 * encoder_size, encoder_size * 2)
    self.agg_sep_conv = DepthwiseSeparableConv(encoder_size * 8, encoder_size * 8, k=5, dim=1)
    self.agg_conv_block = Conv_block(Convsublayer(encoder_size * 8, self.c(self.agg_sep_conv), self.drop_out), N=2)
    # Aggregation encoder using self-attention
    self.agg_attn = MultiHeadAttention(heads, encoder_size * 8, self.drop_out, projection=False)
    self.agg_ffn = PointwiseFeedForward(encoder_size * 8, encoder_size * 4)
    self.agg_encoder = Encoder(EncoderSublayer(encoder_size * 8, self.c(self.agg_attn), self.c(self.agg_ffn), self.drop_out), N=4)
    # Prediction layer
    self.Wq = nn.Linear(encoder_size, encoder_size, bias=False)
    self.vq = nn.Linear(encoder_size, 1, bias=False)
    self.Wp1 = nn.Linear(encoder_size * 8, encoder_size, bias=False)
    self.Wp2 = nn.Linear(encoder_size, encoder_size, bias=False)
    self.vp = nn.Linear(encoder_size, 1, bias=False)
    self.prediction = nn.Linear(encoder_size * 8, embedding_size, bias=False)
    self.initiation()