def __init__(self, num_layers, num_heads, embed_dim, ff_dim, dropout=0., norm_in=True):
    super(Decoder, self).__init__()
    # One self-attention, one encoder-decoder attention, one position-wise
    # feed-forward module and three LayerNorms per decoder layer.
    self.self_atts = nn.ModuleList([])
    self.enc_dec_atts = nn.ModuleList([])
    self.pos_ffs = nn.ModuleList([])
    self.lnorms = nn.ModuleList([])
    for i in range(num_layers):
        self.self_atts.append(
            Attention(embed_dim, num_heads, dropout=dropout))
        self.enc_dec_atts.append(
            Attention(embed_dim, num_heads, dropout=dropout))
        self.pos_ffs.append(
            PositionWiseFeedForward(embed_dim, ff_dim, dropout=dropout))
        self.lnorms.append(
            nn.ModuleList(
                [nn.LayerNorm(embed_dim, eps=1e-6) for _ in range(3)]))
    # Final LayerNorm is only applied when norm_in=True.
    self.last_lnorm = nn.LayerNorm(embed_dim, eps=1e-6) if norm_in else None
    self.dropout = dropout
    self.num_layers = num_layers
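# For reference, a forward pass wiring these sub-modules together might look like
# the sketch below. This is a hypothetical sketch, not part of the original code:
# it assumes a pre-norm layout (the three per-layer LayerNorms plus last_lnorm)
# and assumes Attention is called as attention(query, key, value, mask); the real
# signatures in this codebase may differ.
import torch.nn.functional as F

def forward(self, x, enc_out, self_mask=None, enc_mask=None):
    for i in range(self.num_layers):
        # Masked self-attention + residual
        h = self.lnorms[i][0](x)
        x = x + F.dropout(self.self_atts[i](h, h, h, self_mask),
                          p=self.dropout, training=self.training)
        # Encoder-decoder attention + residual
        h = self.lnorms[i][1](x)
        x = x + F.dropout(self.enc_dec_atts[i](h, enc_out, enc_out, enc_mask),
                          p=self.dropout, training=self.training)
        # Position-wise feed-forward + residual
        h = self.lnorms[i][2](x)
        x = x + F.dropout(self.pos_ffs[i](h),
                          p=self.dropout, training=self.training)
    # Final LayerNorm only when norm_in=True (last_lnorm is not None)
    if self.last_lnorm is not None:
        x = self.last_lnorm(x)
    return x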
def encoder(self, inputs):
    if K.dtype(inputs) != 'int32':
        inputs = K.cast(inputs, 'int32')

    # Padding mask: positions equal to 0 are padding
    masks = K.equal(inputs, 0)
    # Embeddings
    embeddings = K.gather(self.embeddings, inputs)
    embeddings *= self._model_dim ** 0.5  # Scale
    # Position encodings
    position_encodings = self.EncoderPositionEncoding(embeddings)
    # Embeddings + position encodings
    encodings = embeddings + position_encodings
    # Dropout
    encodings = K.dropout(encodings, self._dropout_rate)

    for i in range(self._encoder_stack):
        # Multi-head attention (reuse the layers created in build())
        attention = self.EncoderMultiHeadAttetions[i]
        attention_input = [encodings, encodings, encodings, masks]
        attention_out = attention(attention_input)
        # Add & Norm
        attention_out += encodings
        attention_out = self.EncoderLayerNorms0[i](attention_out)
        # Feed-forward
        ff = self.EncoderPositionWiseFeedForwards[i]
        ff_out = ff(attention_out)
        # Add & Norm
        ff_out += attention_out
        encodings = self.EncoderLayerNorms1[i](ff_out)

    return encodings, masks
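# Note on the dropout above: K.dropout applies dropout unconditionally, i.e. at
# inference time as well as during training. A small helper like the one below
# (a sketch assuming the tf.keras backend, not part of the original code) gates
# it on the learning phase and could replace the bare K.dropout call in both
# encoder() and decoder():
from tensorflow.keras import backend as K

def phase_aware_dropout(x, rate):
    # Dropped-out tensor during training, identity at inference.
    return K.in_train_phase(K.dropout(x, rate), x, training=None)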
def build(self, input_shape):
    self.embeddings = self.add_weight(
        shape=(self._vocab_size, self._model_dim),
        initializer='glorot_uniform',
        trainable=True,
        name="embeddings")

    self.EncoderPositionEncoding = PositionEncoding(self._model_dim)
    self.EncoderMultiHeadAttetions = [
        MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads)
        for _ in range(self._encoder_stack)
    ]
    self.EncoderLayerNorms0 = [
        LayerNormalization() for _ in range(self._encoder_stack)
    ]
    self.EncoderPositionWiseFeedForwards = [
        PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
        for _ in range(self._encoder_stack)
    ]
    self.EncoderLayerNorms1 = [
        LayerNormalization() for _ in range(self._encoder_stack)
    ]

    self.DecoderPositionEncoding = PositionEncoding(self._model_dim)
    self.DecoderMultiHeadAttetions0 = [
        MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads, future=True)
        for _ in range(self._decoder_stack)
    ]
    self.DecoderLayerNorms0 = [
        LayerNormalization() for _ in range(self._decoder_stack)
    ]
    self.DecoderMultiHeadAttetions1 = [
        MultiHeadAttention(self._n_heads, self._model_dim // self._n_heads)
        for _ in range(self._decoder_stack)
    ]
    self.DecoderLayerNorms1 = [
        LayerNormalization() for _ in range(self._decoder_stack)
    ]
    self.DecoderPositionWiseFeedForwards = [
        PositionWiseFeedForward(self._model_dim, self._feed_forward_size)
        for _ in range(self._decoder_stack)
    ]
    self.DecoderLayerNorms2 = [
        LayerNormalization() for _ in range(self._decoder_stack)
    ]

    super(Transformer, self).build(input_shape)
def decoder(self, inputs):
    decoder_inputs, encoder_encodings, encoder_masks = inputs
    if K.dtype(decoder_inputs) != 'int32':
        decoder_inputs = K.cast(decoder_inputs, 'int32')

    # Padding mask for the decoder inputs
    decoder_masks = K.equal(decoder_inputs, 0)
    # Embeddings
    embeddings = K.gather(self.embeddings, decoder_inputs)
    embeddings *= self._model_dim ** 0.5  # Scale
    # Position encodings
    position_encodings = self.DecoderPositionEncoding(embeddings)
    # Embeddings + position encodings
    encodings = embeddings + position_encodings
    # Dropout
    encodings = K.dropout(encodings, self._dropout_rate)

    for i in range(self._decoder_stack):
        # Masked multi-head attention (future positions blocked)
        masked_attention = self.DecoderMultiHeadAttetions0[i]
        masked_attention_input = [
            encodings, encodings, encodings, decoder_masks
        ]
        masked_attention_out = masked_attention(masked_attention_input)
        # Add & Norm
        masked_attention_out += encodings
        masked_attention_out = self.DecoderLayerNorms0[i](masked_attention_out)

        # Encoder-decoder attention
        attention = self.DecoderMultiHeadAttetions1[i]
        attention_input = [
            masked_attention_out, encoder_encodings, encoder_encodings,
            encoder_masks
        ]
        attention_out = attention(attention_input)
        # Add & Norm
        attention_out += masked_attention_out
        attention_out = self.DecoderLayerNorms1[i](attention_out)

        # Feed-forward
        ff = self.DecoderPositionWiseFeedForwards[i]
        ff_out = ff(attention_out)
        # Add & Norm
        ff_out += attention_out
        encodings = self.DecoderLayerNorms2[i](ff_out)

    # Pre-softmax projection shares its weights with the embedding matrix
    linear_projection = K.dot(encodings, K.transpose(self.embeddings))
    outputs = K.softmax(linear_projection)
    return outputs
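# For completeness, a minimal call() wiring the two halves together might look
# like the sketch below. This wrapper is assumed, not taken from the original
# code; it simply feeds the (encodings, masks) pair returned by encoder() into
# decoder().
def call(self, inputs):
    encoder_inputs, decoder_inputs = inputs
    encoder_encodings, encoder_masks = self.encoder(encoder_inputs)
    decoder_outputs = self.decoder(
        [decoder_inputs, encoder_encodings, encoder_masks])
    return decoder_outputs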
def get_age_model(DATA):
    feed_forward_size = 2048
    max_seq_len = 150
    model_dim = 256 + 256 + 64 + 32 + 8 + 16  # 632

    input_creative_id = Input(shape=(max_seq_len, ), name='creative_id')
    x1 = Embedding(
        input_dim=NUM_creative_id + 1,
        output_dim=256,
        weights=[DATA['creative_id_emb']],
        trainable=args.not_train_embedding,
        # trainable=False,
        input_length=150,
        mask_zero=True)(input_creative_id)
    # encodings = PositionEncoding(model_dim)(x1)
    # encodings = Add()([embeddings, encodings])

    input_ad_id = Input(shape=(max_seq_len, ), name='ad_id')
    x2 = Embedding(
        input_dim=NUM_ad_id + 1,
        output_dim=256,
        weights=[DATA['ad_id_emb']],
        trainable=args.not_train_embedding,
        # trainable=False,
        input_length=150,
        mask_zero=True)(input_ad_id)

    input_product_id = Input(shape=(max_seq_len, ), name='product_id')
    x3 = Embedding(
        input_dim=NUM_product_id + 1,
        output_dim=32,
        weights=[DATA['product_id_emb']],
        trainable=args.not_train_embedding,
        # trainable=False,
        input_length=150,
        mask_zero=True)(input_product_id)

    input_advertiser_id = Input(shape=(max_seq_len, ), name='advertiser_id')
    x4 = Embedding(
        input_dim=NUM_advertiser_id + 1,
        output_dim=64,
        weights=[DATA['advertiser_id_emb']],
        trainable=args.not_train_embedding,
        # trainable=False,
        input_length=150,
        mask_zero=True)(input_advertiser_id)

    input_industry = Input(shape=(max_seq_len, ), name='industry')
    x5 = Embedding(
        input_dim=NUM_industry + 1,
        output_dim=16,
        weights=[DATA['industry_emb']],
        trainable=True,
        # trainable=False,
        input_length=150,
        mask_zero=True)(input_industry)

    input_product_category = Input(shape=(max_seq_len, ), name='product_category')
    x6 = Embedding(
        input_dim=NUM_product_category + 1,
        output_dim=8,
        weights=[DATA['product_category_emb']],
        trainable=True,
        # trainable=False,
        input_length=150,
        mask_zero=True)(input_product_category)

    # Concatenate the six embeddings: (bs, 150, 632)
    encodings = layers.Concatenate(axis=2)([x1, x2, x3, x4, x5, x6])
    # Padding mask: (bs, 150)
    masks = tf.equal(input_creative_id, 0)

    # Multi-head attention: 8 heads x 79 dims per head = 632 = model_dim
    attention_out = MultiHeadAttention(
        8, 79)([encodings, encodings, encodings, masks])

    # Add & Norm
    attention_out += encodings
    attention_out = LayerNormalization()(attention_out)
    # Feed-forward
    ff = PositionWiseFeedForward(model_dim, feed_forward_size)
    ff_out = ff(attention_out)
    # Add & Norm: ff_out and attention_out are both (bs, 150, 632)
    ff_out += attention_out
    encodings = LayerNormalization()(ff_out)

    encodings = GlobalMaxPooling1D()(encodings)
    encodings = Dropout(0.2)(encodings)

    # output_gender = Dense(2, activation='softmax', name='gender')(encodings)
    output_age = Dense(10, activation='softmax', name='age')(encodings)

    model = Model(inputs=[
        input_creative_id, input_ad_id, input_product_id, input_advertiser_id,
        input_industry, input_product_category
    ],
                  outputs=[output_age])

    model.compile(
        optimizer=optimizers.Adam(2.5e-4),
        loss={
            # 'gender': losses.CategoricalCrossentropy(from_logits=False),
            'age': losses.CategoricalCrossentropy(from_logits=False)
        },
        # loss_weights=[0.4, 0.6],
        metrics=['accuracy'])
    return model
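# A hypothetical training call for get_age_model (X_train, y_train_age, and the
# batch size / epoch count are assumed placeholders, not part of the original
# code): each input is a zero-padded id sequence of length 150, keyed by the
# corresponding Input layer name, and the target is a one-hot age vector over
# 10 classes.
model = get_age_model(DATA)
model.summary()
model.fit(
    {
        'creative_id': X_train['creative_id'],  # (N, 150) int ids, 0 = padding
        'ad_id': X_train['ad_id'],
        'product_id': X_train['product_id'],
        'advertiser_id': X_train['advertiser_id'],
        'industry': X_train['industry'],
        'product_category': X_train['product_category'],
    },
    {'age': y_train_age},  # (N, 10) one-hot age labels
    validation_split=0.1,
    batch_size=256,
    epochs=5)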