def build(self, input_shape):
    self.input_spec = tf.keras.layers.InputSpec(shape=input_shape)
    attention_head_size = int(self.hidden_size / self.num_attention_heads)
    # self-attention sub-layer
    self.attention_layer = MultiAttentionLayer(
        num_attention_heads=self.num_attention_heads,
        size_per_head=attention_head_size,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        initializer_range=self.initializer_range,
        do_return_2d_tensor=True,
        batch_size=self.batch_size,
        from_seq_length=self.seq_length,
        to_seq_length=self.seq_length,
        name="self")
    # projection applied to the attention output
    self.attention_output_layer = tf.keras.layers.Dense(
        self.hidden_size,
        kernel_initializer=create_initializer(self.initializer_range),
        name="dense")
    # feed-forward: expand to intermediate size, then project back down
    self.inter_output = tf.keras.layers.Dense(
        self.intermediate_size,
        activation=self.intermediate_act_fn,
        kernel_initializer=create_initializer(self.initializer_range),
        name="dense")
    self.layer_out = tf.keras.layers.Dense(
        self.hidden_size,
        kernel_initializer=create_initializer(self.initializer_range),
        name="dense")
    self.dropout = tf.keras.layers.Dropout(self.hidden_dropout_prob)
    self.layer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, name="LayerNorm")
    self.out_layer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, name="LayerNorm")
    self.built = True
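# Hedged sketch of the post-LN residual wiring these sub-layers imply:
# attention -> output dense -> dropout -> LayerNorm(x + sub-layer), then
# the intermediate/output feed-forward with the same residual pattern.
# This is not the repo's actual call(); a Dense stands in for
# MultiAttentionLayer so the example is self-contained and runnable.
import tensorflow as tf

hidden, intermediate = 16, 64
attention = tf.keras.layers.Dense(hidden)        # stand-in for self-attention
attention_output_layer = tf.keras.layers.Dense(hidden)
inter_output = tf.keras.layers.Dense(intermediate, activation="gelu")
layer_out = tf.keras.layers.Dense(hidden)
dropout = tf.keras.layers.Dropout(0.1)
layer_norm = tf.keras.layers.LayerNormalization(axis=-1)
out_layer_norm = tf.keras.layers.LayerNormalization(axis=-1)

x = tf.random.normal([2, 5, hidden])
att = attention_output_layer(attention(x))
att = layer_norm(dropout(att, training=True) + x)         # residual 1
ffn = layer_out(inter_output(att))
out = out_layer_norm(dropout(ffn, training=True) + att)   # residual 2
print(out.shape)  # (2, 5, 16)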
def build(self, input_shape):
    hidden_size = input_shape[-1]
    # feed-forward expansion to 4x the hidden size, as in GPT-2
    self.c_fc = tf.keras.layers.Dense(
        hidden_size * 4,
        name="c_fc",
        kernel_initializer=create_initializer(self.initializer_range))
    self.c_proj = tf.keras.layers.Dense(
        hidden_size,
        name="c_proj",
        kernel_initializer=create_initializer(self.initializer_range))
    self.act = get_activation('gelu')
    self.dropout = tf.keras.layers.Dropout(self.resid_pdrop_rate)
    self.built = True  # added for consistency with the other build() methods
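# Hedged sketch of the forward pass this MLP build() supports:
# c_fc -> GELU -> c_proj -> residual dropout, the standard GPT-2 MLP
# recipe. The wiring here is an assumption, not this repo's call().
import tensorflow as tf

hidden_size = 16
c_fc = tf.keras.layers.Dense(hidden_size * 4)
c_proj = tf.keras.layers.Dense(hidden_size)
act = tf.keras.activations.gelu
dropout = tf.keras.layers.Dropout(0.1)

x = tf.random.normal([2, 5, hidden_size])
h = act(c_fc(x))               # expand to 4*hidden and apply GELU
h = c_proj(h)                  # project back to hidden size
h = dropout(h, training=True)  # residual dropout
print(h.shape)                 # (2, 5, 16)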
def build(self, input_shape):
    self.token_embedding = WTEmbedding(
        vocab_size=self.vocab_size,
        embedding_size=self.hidden_size,
        initializer_range=self.initializer_range,
        word_embedding_name="embedding",
        name="wte")
    # learned position embedding
    self.posembedding = tf.keras.layers.Embedding(
        self.max_position_length,
        self.hidden_size,
        embeddings_initializer=create_initializer(self.initializer_range),
        name="wpe")
    self.embedding_drop = tf.keras.layers.Dropout(self.embedding_drop_rate)
    self.encoder_layers = []
    for layer_idx in range(self.num_hidden_layers):
        encoder_layer = GPT2Transformer(
            num_attention_heads=self.num_attention_heads,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            initializer_range=self.initializer_range,
            epsilon=self.layer_norm_epsilon,
            resid_out_rate=self.resid_out_rate,
            name="h{}".format(layer_idx))
        self.encoder_layers.append(encoder_layer)
    self.ln_f = tf.keras.layers.LayerNormalization(
        epsilon=self.layer_norm_epsilon, name='ln_f')
    # self.ln_f = normalization.GPTNorm(epsilon=self.layer_norm_epsilon, name='ln_f')
    self.built = True
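# Hedged sketch of the forward pass implied by this build(): token
# embeddings plus learned position embeddings, embedding dropout, the
# stack of h0..hN-1 blocks, then the final LayerNorm. Dense layers stand
# in for GPT2Transformer blocks to keep the sketch self-contained; the
# real call() may also thread attention masks and past-key caches.
import tensorflow as tf

vocab, max_pos, hidden, n_layer = 100, 32, 16, 2
wte = tf.keras.layers.Embedding(vocab, hidden)
wpe = tf.keras.layers.Embedding(max_pos, hidden)
drop = tf.keras.layers.Dropout(0.1)
blocks = [tf.keras.layers.Dense(hidden) for _ in range(n_layer)]
ln_f = tf.keras.layers.LayerNormalization(epsilon=1e-5)

input_ids = tf.constant([[3, 7, 42, 9]])
positions = tf.range(tf.shape(input_ids)[1])[tf.newaxis, :]
h = drop(wte(input_ids) + wpe(positions), training=True)
for block in blocks:   # each would be a GPT2Transformer ("h{idx}")
    h = block(h)
h = ln_f(h)
print(h.shape)         # (1, 4, 16)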
def build(self, input_shape):
    self.size_per_head = int(input_shape[-1] / self.num_attention_heads)
    # fused projection producing Q, K and V in one matmul:
    # num_heads * size_per_head (e.g. 12 * 64), times 3
    self.c_att = tf.keras.layers.Dense(
        self.num_attention_heads * self.size_per_head * 3,
        name="c_attn",
        kernel_initializer=create_initializer(self.initializer_range))
    self.c_proj = tf.keras.layers.Dense(
        self.num_attention_heads * self.size_per_head,
        name="c_proj",
        kernel_initializer=create_initializer(self.initializer_range))
    self.resid_out = tf.keras.layers.Dropout(self.resid_out_rate)
    self.drop_out = tf.keras.layers.Dropout(self.attention_probs_dropout_prob)
    self.built = True
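# Hedged sketch of what the fused c_attn projection enables: one Dense
# produces Q, K and V together, which are split and reshaped per head.
# The scaling and causal mask below are the standard GPT-2 recipe,
# assumed rather than copied from this repo's call().
import tensorflow as tf

batch, seq, heads, size_per_head = 2, 5, 4, 8
hidden = heads * size_per_head

c_attn = tf.keras.layers.Dense(hidden * 3)
x = tf.random.normal([batch, seq, hidden])

qkv = c_attn(x)                                   # [batch, seq, 3*hidden]
q, k, v = tf.split(qkv, 3, axis=-1)

def split_heads(t):
    t = tf.reshape(t, [batch, seq, heads, size_per_head])
    return tf.transpose(t, [0, 2, 1, 3])          # [batch, heads, seq, head]

q, k, v = split_heads(q), split_heads(k), split_heads(v)
scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(float(size_per_head))

# lower-triangular mask so position i only attends to positions <= i
causal = tf.linalg.band_part(tf.ones([seq, seq]), -1, 0)
scores += (1.0 - causal) * -1e9
probs = tf.nn.softmax(scores, axis=-1)            # attention dropout goes here
context = tf.matmul(probs, v)                     # [batch, heads, seq, head]
print(context.shape)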
def build(self, input_shape):
    self.input_spec = tf.keras.layers.InputSpec(shape=input_shape)
    size_per_head = int(input_shape[2] / self.num_attention_heads)
    self.q = dense.DenseLayer3d(self.num_attention_heads, size_per_head,
                                create_initializer(self.initializer_range),
                                self.query_act, self.use_einsum, "query")
    self.k = dense.DenseLayer3d(self.num_attention_heads, size_per_head,
                                create_initializer(self.initializer_range),
                                self.key_act, self.use_einsum, "key")
    self.v = dense.DenseLayer3d(self.num_attention_heads, size_per_head,
                                create_initializer(self.initializer_range),
                                self.value_act, self.use_einsum, "value")
    self.drop_out = tf.keras.layers.Dropout(self.attention_probs_dropout_prob)
    self.built = True
def build(self, input_shape):
    self.nx = input_shape[-1]
    self.weight = self.add_weight(
        "w",
        shape=[1, self.nx, self.nf],
        initializer=create_initializer(self.initializer_range))
    self.bias = self.add_weight(
        "b", shape=[self.nf], initializer=tf.zeros_initializer())
    self.built = True
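# Hedged sketch of how a GPT-2 style "Conv1D" weight of shape
# [1, nx, nf] is typically applied: flatten the input, matmul to map
# the last dimension from nx to nf, add the bias, reshape back. The
# shapes below are illustrative assumptions.
import tensorflow as tf

nx, nf = 8, 24
w = tf.random.normal([1, nx, nf])
b = tf.zeros([nf])
x = tf.random.normal([2, 5, nx])             # [batch, seq, nx]
x2 = tf.reshape(x, [-1, nx])                 # flatten to [batch*seq, nx]
y = tf.matmul(x2, tf.reshape(w, [nx, nf])) + b
y = tf.reshape(y, [2, 5, nf])                # back to [batch, seq, nf]
print(y.shape)                               # (2, 5, 24)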
def build(self, input_shape):
    self.input_spec = tf.keras.layers.InputSpec(shape=input_shape)
    self.embedding_table = self.add_weight(
        name=self.word_embedding_name,
        dtype=tf.keras.backend.floatx(),
        shape=[self.vocab_size, self.embedding_size],
        initializer=create_initializer(self.initializer_range),
        trainable=True)
    self.built = True
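# Hedged sketch of how an embedding table built with add_weight is
# usually consumed: tf.gather for ordinary lookup, or a one-hot matmul
# (the use_one_hot_embedding path elsewhere in this file), which can be
# faster on TPUs. Sizes are illustrative assumptions.
import tensorflow as tf

vocab_size, embedding_size = 100, 16
table = tf.random.normal([vocab_size, embedding_size])
input_ids = tf.constant([[3, 7, 42]])

gathered = tf.gather(table, input_ids)                  # [1, 3, 16]
one_hot = tf.one_hot(input_ids, depth=vocab_size)       # [1, 3, 100]
flat = tf.reshape(one_hot, [-1, vocab_size])
via_one_hot = tf.reshape(tf.matmul(flat, table), [1, 3, embedding_size])
print(bool(tf.reduce_all(tf.abs(gathered - via_one_hot) < 1e-5)))  # True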
def build(self, input_shape):
    self.input_spec = tf.keras.layers.InputSpec(shape=input_shape)
    self.attention = ALBERTAttention(
        num_attention_heads=self.num_attention_heads,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        initializer_range=self.initializer_range,
        use_einsum=True,
        name='self')
    self.dense_layer_3d_proj = dense.DenseLayer3dProj(
        self.hidden_size,
        self.attention_head_size,
        create_initializer(self.initializer_range),
        None,
        use_einsum=self.use_einsum,
        name="dense")
    self.dense_layer_2d = dense.DenseLayer2d(
        self.intermediate_size,
        create_initializer(self.initializer_range),
        self.intermediate_act_fn,
        use_einsum=self.use_einsum,
        num_attention_heads=self.num_attention_heads,
        name="dense")
    self.out_dense_layer_2d = dense.DenseLayer2d(
        self.hidden_size,
        create_initializer(self.initializer_range),
        None,
        use_einsum=self.use_einsum,
        num_attention_heads=self.num_attention_heads,
        name="dense")
    self.attdropout = tf.keras.layers.Dropout(self.hidden_dropout_prob)
    self.ffdropout = tf.keras.layers.Dropout(self.hidden_dropout_prob)
    self.attlayer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, name="LayerNorm")
    self.ffnlayer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, name="LayerNorm")
    self.built = True
def build(self, input_shape):
    self.input_spec = tf.keras.layers.InputSpec(shape=input_shape)
    # `query_layer` = [B*F, N*H]
    self._query_layer = tf.keras.layers.Dense(
        self.num_attention_heads * self.size_per_head,
        activation=self.query_act,
        name="query",
        kernel_initializer=create_initializer(self.initializer_range))
    # `key_layer` = [B*T, N*H]
    self._key_layer = tf.keras.layers.Dense(
        self.num_attention_heads * self.size_per_head,
        activation=self.key_act,
        name="key",
        kernel_initializer=create_initializer(self.initializer_range))
    # `value_layer` = [B*T, N*H]
    self._value_layer = tf.keras.layers.Dense(
        self.num_attention_heads * self.size_per_head,
        activation=self.value_act,
        name="value",
        kernel_initializer=create_initializer(self.initializer_range))
    self.drop_out = tf.keras.layers.Dropout(self.attention_probs_dropout_prob)
    self.built = True
def build(self, input_shape):
    self.input_spec = tf.keras.layers.InputSpec(shape=input_shape)
    self.token_type_table = self.add_weight(
        name=self.token_type_embedding_name,
        shape=[self.token_type_vocab_size, input_shape[2]],
        dtype=tf.keras.backend.floatx(),
        initializer=create_initializer(self.initializer_range),
        trainable=True)
    self.full_position_embeddings = self.add_weight(
        name=self.position_embedding_name,
        shape=[self.max_position_embeddings, input_shape[2]],
        dtype=tf.keras.backend.floatx(),
        initializer=create_initializer(self.initializer_range),
        trainable=True)
    self.drop_out = tf.keras.layers.Dropout(self.hidden_dropout_prob)
    self.layer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, name="LayerNorm")
    self.built = True
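# Hedged sketch of how the two tables built above are normally applied
# in BERT: token-type rows are looked up per token, position rows are
# sliced to the sequence length and broadcast over the batch, both are
# added to the word embeddings, then dropout + LayerNorm. All values
# here are illustrative assumptions, not this layer's actual call().
import tensorflow as tf

batch, seq, width, type_vocab, max_pos = 2, 5, 16, 2, 32
word_emb = tf.random.normal([batch, seq, width])
token_type_ids = tf.zeros([batch, seq], dtype=tf.int32)
token_type_table = tf.random.normal([type_vocab, width])
full_position_embeddings = tf.random.normal([max_pos, width])

out = word_emb + tf.gather(token_type_table, token_type_ids)
out += full_position_embeddings[tf.newaxis, :seq, :]  # broadcast over batch
out = tf.keras.layers.LayerNormalization(axis=-1)(
    tf.keras.layers.Dropout(0.1)(out, training=True))
print(out.shape)  # (2, 5, 16)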
def build(self, input_shape):
    self.token_embedding = WDEmbedding(
        vocab_size=self.vocab_size,
        embedding_size=self.embedding_size,
        initializer_range=self.initializer_range,
        word_embedding_name="word_embeddings",
        use_one_hot_embedding=self.use_one_hot_embedding,
        name="embeddings")
    # segment (token-type) and position embedding
    self.segposembedding = SegPosEmbedding(
        use_token_type=True,
        hidden_dropout_prob=self.hidden_dropout_prob,
        token_type_vocab_size=self.type_vocab_size,
        token_type_embedding_name="token_type_embeddings",
        use_position_embeddings=True,
        position_embedding_name="position_embeddings",
        initializer_range=self.initializer_range,
        max_position_embeddings=self.max_position_embeddings,
        use_one_hot_embedding=self.use_one_hot_embedding,
        name="embeddings")
    # project the factorized embedding (embedding_size) up to hidden_size
    self.shape_change = dense.DenseLayer2d(
        self.hidden_size,
        create_initializer(self.initializer_range),
        None,
        use_einsum=self.use_einsum,
        name="embedding_hidden_mapping_in")
    # a single transformer block, shared across all hidden layers
    self.encoder_layer = AlbertTransformer(
        hidden_size=self.hidden_size,
        num_attention_heads=self.num_attention_heads,
        attention_head_size=self.attention_head_size,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        intermediate_size=self.intermediate_size,
        intermediate_act_fn=get_activation(self.hidden_act),
        initializer_range=self.initializer_range,
        hidden_dropout_prob=self.hidden_dropout_prob,
        use_einsum=True,
        name="inner_group_0")
    self.pool_out = tf.keras.layers.Dense(
        self.hidden_size,
        activation=tf.tanh,
        # was a commented-out `kernel_constraint=`; every other Dense in
        # this file initializes its kernel this way
        kernel_initializer=create_initializer(self.initializer_range),
        name="dense")
    self.built = True
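# Hedged sketch of the defining ALBERT trick this build() sets up: only
# one transformer block ("inner_group_0") is created, and call() is
# expected to run it num_hidden_layers times so every depth shares the
# same weights. A Dense stand-in keeps the example self-contained.
import tensorflow as tf

hidden, num_hidden_layers = 16, 4
shared_layer = tf.keras.layers.Dense(hidden)  # stand-in for AlbertTransformer

h = tf.random.normal([2, 5, hidden])
for _ in range(num_hidden_layers):
    h = shared_layer(h)        # same weights applied at every depth
print(h.shape, len(shared_layer.weights))     # one kernel + one bias total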