def build(self): """ALBert模型构建函数""" input_ids = tf.keras.layers.Input(shape=(self.max_seq_len, ), name='Input-Token') model_inputs = [input_ids] if self.use_token_type: token_type_ids = tf.keras.layers.Input(shape=(self.max_seq_len, ), name='Input-Segment') model_inputs.append(token_type_ids) else: token_type_ids = tf.keras.layers.Lambda( lambda x: create_token_type_ids(x), name='Input-Segment')(input_ids) # attention_mask和bert中的input_mask一致 attention_mask = tf.keras.layers.Lambda( lambda x: get_input_mask(x), name="Attention-Mask")(input_ids) embeddings = self._embeddings(input_ids, token_type_ids) # embedding 因式分解 if self.embedding_size != self.hidden_size: embeddings = tf.keras.layers.Dense( self.hidden_size, kernel_initializer=get_initializer(self.initializer_range), name="Factor-Dense")(embeddings) # 主要Transformer Encoder部分 self.all_layer_outputs = [] prev_output = embeddings attention_name = 'Encoder-MultiHeadSelfAttention' feed_forward_name = 'Encoder-FeedForward' layers = self.main_layer(attention_name, feed_forward_name) for i in range(self.num_hidden_layers): encoder_output = self.transformer_block( inputs=prev_output, attention_mask=attention_mask, layers= layers # 层复用,tensorflow代码中采用的variable_scope来复用变量,keras直接复用就行 ) self.all_layer_outputs.append(encoder_output) prev_output = encoder_output # pooler,取[CLS]的输出做一次线性变换,用于句子或者句对的分类 sequence_output = self.all_layer_outputs[-1] first_token_tensor = tf.keras.layers.Lambda( lambda x: x[:, 0], name='Pooler')(sequence_output) self.pooler_output = tf.keras.layers.Dense( self.hidden_size, activation='tanh', kernel_initializer=get_initializer(self.initializer_range), name="Pooler-Dense")(first_token_tensor) # sequence_output, pooler_output outputs = [sequence_output, self.pooler_output] self.model = tf.keras.Model(model_inputs, outputs) for layer in self.model.layers: layer.trainable = self._trainable(layer)
def __init__(self, config, trainable=True, training=True,
             max_seq_len=None, **kwargs):
    super(ALBertForPretraining, self).__init__(config, trainable, training,
                                               max_seq_len, **kwargs)
    self.bert = ALBertModel(config,
                            trainable=trainable,
                            training=training,
                            max_seq_len=max_seq_len,
                            **kwargs)
    # Reuse the input token embeddings for the MLM output projection.
    self.input_embeddings = self.bert.get_token_embeddings()
    # NSP head
    self.seq_relationship = tf.keras.layers.Dense(
        2,
        kernel_initializer=get_initializer(config.initializer_range),
        name='NSP')
    # MLM head
    self.mlm_dense = tf.keras.layers.Dense(
        config.embedding_size,
        kernel_initializer=get_initializer(config.initializer_range),
        name='MLM-Dense')
    self.transform_act_fn = ACT2FN[config.hidden_act]
    self.LayerNorm = LayerNormalization(epsilon=config.layer_norm_eps,
                                        name='MLM-Norm')
    self.bais_add = BiasAdd(initializer_range=config.initializer_range,
                            name='MLM-Proba')
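# Sketch of how these MLM pieces typically compose (an assumption; the
# original call() is not shown, and this presumes get_token_embeddings()
# returns a tf.keras.layers.Embedding whose weights are tied here):
#
# def mlm_logits(self, sequence_output):
#     hidden = self.mlm_dense(sequence_output)
#     hidden = self.transform_act_fn(hidden)
#     hidden = self.LayerNorm(hidden)
#     # Weight tying: project onto the transposed token-embedding matrix.
#     logits = tf.matmul(hidden, self.input_embeddings.embeddings,
#                        transpose_b=True)
#     return self.bais_add(logits)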
def __init__(self, config, trainable=True, training=False,
             max_seq_len=None, **kwargs):
    super(DistillBertForSequenceClassification, self).__init__(
        config, trainable, training, max_seq_len, **kwargs)
    self.bert = DistillBertModel(config,
                                 trainable=trainable,
                                 training=training,
                                 max_seq_len=max_seq_len,
                                 **kwargs)
    num_labels = int(kwargs.pop('num_labels', 2))
    # Classification head: Dense + ReLU, dropout, then a softmax output.
    self.pre_classifier = tf.keras.layers.Dense(
        config.hidden_size,
        kernel_initializer=get_initializer(config.initializer_range),
        activation='relu',
        name='pre_classifier')
    self.dropout = tf.keras.layers.Dropout(
        rate=config.sequence_classif_dropout_prob, name='classifier-drop')
    self.classifier = tf.keras.layers.Dense(
        units=num_labels,
        activation='softmax',
        kernel_regularizer=tf.keras.regularizers.l2(0.01),
        kernel_initializer=get_initializer(config.initializer_range),
        name='classifier')
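# Sketch of the classification forward pass these layers imply (an
# assumption; the original call() is not shown, and this presumes
# DistillBertModel exposes a pooled output like the BERT/ALBERT builders
# above):
#
# def call(self, inputs):
#     _, pooler_output = self.bert.model(inputs)
#     hidden = self.pre_classifier(pooler_output)
#     hidden = self.dropout(hidden)
#     return self.classifier(hidden)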
def _embeddings(self, input_ids, position_ids=None):
    """Token + position embeddings, followed by LayerNorm and dropout."""
    self.share_token_embeddings = tf.keras.layers.Embedding(
        input_dim=self.vocab_size,
        output_dim=self.embedding_size,
        embeddings_initializer=get_initializer(self.initializer_range),
        name='Embedding-Token')
    self.token_embeddings = self.share_token_embeddings(input_ids)
    if position_ids is None:
        position_ids = tf.keras.layers.Lambda(
            lambda x: create_position_ids(x),
            name='Input-Position')(input_ids)
    position_embeddings = tf.keras.layers.Embedding(
        input_dim=self.max_position_embeddings,
        output_dim=self.embedding_size,
        embeddings_initializer=get_initializer(self.initializer_range),
        name='Embedding-Position')(position_ids)
    embeddings = tf.keras.layers.Add(name='Embedding-Add')(
        [self.token_embeddings, position_embeddings])
    embeddings = LayerNormalization(epsilon=self.layer_norm_eps,
                                    name='Embedding-Norm')(embeddings)
    embeddings = tf.keras.layers.Dropout(
        rate=self.hidden_dropout_prob,
        name='Embedding-Dropout')(embeddings)
    return embeddings
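# A minimal sketch of what `create_position_ids` might compute (an
# assumption; the repo's actual helper is not shown): one position index per
# token position, broadcast across the batch.
#
# def create_position_ids(input_ids):
#     batch_size = tf.shape(input_ids)[0]
#     seq_len = tf.shape(input_ids)[1]
#     position_ids = tf.range(seq_len, dtype=tf.int32)[tf.newaxis, :]
#     return tf.tile(position_ids, [batch_size, 1])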
def build(self, input_shape):
    super(FeedForward, self).build(input_shape)
    # The activation is only applied to the "intermediate" hidden layer.
    self.intermediate = tf.keras.layers.Dense(
        self.intermediate_size,
        kernel_initializer=get_initializer(self.initializer_range))
    self.intermediate_act_fn = ACT2FN[self.hidden_act]
    # Down-project back to `hidden_size` then add the residual.
    self.down_project = tf.keras.layers.Dense(
        self.hidden_size,
        kernel_initializer=get_initializer(self.initializer_range))
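# Sketch of the matching forward pass (an assumption; the original call() is
# not shown): project up to intermediate_size, apply the activation, then
# project back down to hidden_size. Whether the residual add and LayerNorm
# live here or in the surrounding transformer block depends on the rest of
# the repo.
#
# def call(self, hidden_states):
#     intermediate_output = self.intermediate_act_fn(
#         self.intermediate(hidden_states))
#     return self.down_project(intermediate_output)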
def __init__(self, config, trainable=True, training=True,
             max_seq_len=None, **kwargs):
    super(DistillBertForPretraining, self).__init__(config, trainable,
                                                    training, max_seq_len,
                                                    **kwargs)
    self.distillbert = DistillBertModel(config,
                                        trainable=trainable,
                                        training=training,
                                        max_seq_len=max_seq_len,
                                        **kwargs)
    # MLM head: dense transform, activation, LayerNorm, then an output bias.
    self.transform = tf.keras.layers.Dense(
        config.hidden_size,
        kernel_initializer=get_initializer(config.initializer_range),
        name='MLM-Dense')
    self.transform_act_fn = ACT2FN[config.hidden_act]
    self.LayerNorm = LayerNormalization(epsilon=config.layer_norm_eps,
                                        name='MLM-Norm')
    self.bais_add = BiasAdd(initializer_range=config.initializer_range,
                            name='MLM-Proba')
def build(self, input_shape):
    super(ALBertMultiHeadSelfAttention, self).build(input_shape)
    # Q/K/V projections, each of size all_head_size.
    self.query = tf.keras.layers.Dense(
        self.all_head_size,
        kernel_initializer=get_initializer(self.initializer_range),
        name='query')
    self.key = tf.keras.layers.Dense(
        self.all_head_size,
        kernel_initializer=get_initializer(self.initializer_range),
        name='key')
    self.value = tf.keras.layers.Dense(
        self.all_head_size,
        kernel_initializer=get_initializer(self.initializer_range),
        name='value')
    self.dropout = tf.keras.layers.Dropout(
        rate=self.attention_probs_dropout_prob)
    # Output projection applied after the heads are merged back together.
    self.linear = tf.keras.layers.Dense(
        self.all_head_size,
        kernel_initializer=get_initializer(self.initializer_range),
        name='linear')
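# Sketch of the standard scaled dot-product attention these projections feed
# (an assumption; the original call() is not shown). `transpose_for_scores`,
# `merge_heads`, and `head_size` are hypothetical names used only for this
# illustration.
#
# def call(self, hidden_states, attention_mask):
#     q = self.transpose_for_scores(self.query(hidden_states))
#     k = self.transpose_for_scores(self.key(hidden_states))
#     v = self.transpose_for_scores(self.value(hidden_states))
#     scores = tf.matmul(q, k, transpose_b=True)
#     scores /= tf.math.sqrt(float(self.head_size))
#     scores += (1.0 - attention_mask) * -10000.0  # mask padded positions
#     probs = self.dropout(tf.nn.softmax(scores, axis=-1))
#     context = tf.matmul(probs, v)
#     return self.linear(self.merge_heads(context))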
def build(self): """Bert模型构建函数""" # 设置输入 input_ids = tf.keras.layers.Input(shape=(self.max_seq_len, ), name='Input-Token') model_inputs = [input_ids] if self.use_token_type: token_type_ids = tf.keras.layers.Input(shape=(self.max_seq_len, ), name='Input-Segment') model_inputs.append(token_type_ids) else: token_type_ids = tf.keras.layers.Lambda( lambda x: create_token_type_ids(x), name='Input-Segment')(input_ids) embeddings = self._embeddings(input_ids, token_type_ids) # 主要Transformer Encoder部分 attention_mask = tf.keras.layers.Lambda( lambda x: get_input_mask(x), name="Attention-Mask")(input_ids) self.all_layer_outputs = [] prev_output = embeddings for i in range(self.num_hidden_layers): attention_name = 'Encoder-%d-MultiHeadSelfAttention' % (i + 1) feed_forward_name = 'Encoder-%d-FeedForward' % (i + 1) encoder_output = self.transformer_block( inputs=prev_output, attention_mask=attention_mask, attention_name=attention_name, feed_forward_name=feed_forward_name) self.all_layer_outputs.append(encoder_output) prev_output = encoder_output # pooler,取[CLS]的输出做一次线性变换,用于句子或者句队的分类 sequence_output = self.all_layer_outputs[-1] first_token_tensor = tf.keras.layers.Lambda( lambda x: x[:, 0], name='Pooler')(sequence_output) self.pooler_output = tf.keras.layers.Dense( self.hidden_size, activation='tanh', kernel_initializer=get_initializer(self.initializer_range), name="Pooler-Dense")(first_token_tensor) # sequence_output, pooler_output outputs = [self.all_layer_outputs[-1], self.pooler_output] self.model = tf.keras.Model(model_inputs, outputs) for layer in self.model.layers: layer.trainable = self._trainable(layer)
def __init__(self, config, trainable=True, training=False,
             max_seq_len=None, **kwargs):
    super(ALBertForQuestionAnswering, self).__init__(config, trainable,
                                                     training, max_seq_len,
                                                     **kwargs)
    self.bert = ALBertModel(config,
                            trainable=trainable,
                            training=training,
                            max_seq_len=max_seq_len,
                            **kwargs)
    num_labels = int(kwargs.pop('num_labels', 2))
    # QA head: per-token logits for answer-span start/end positions.
    self.qa_outputs = tf.keras.layers.Dense(
        units=num_labels,
        activation='softmax',
        kernel_regularizer=tf.keras.regularizers.l2(0.001),
        kernel_initializer=get_initializer(config.initializer_range),
        name='qa_outputs')
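# Sketch of span prediction with this head (an assumption; the original
# call() is not shown): per-token logits are split into start and end scores
# along the last axis, following the usual SQuAD-style setup.
#
# def call(self, inputs):
#     sequence_output, _ = self.bert.model(inputs)
#     logits = self.qa_outputs(sequence_output)   # (batch, seq_len, 2)
#     start_logits, end_logits = tf.split(logits, 2, axis=-1)
#     return tf.squeeze(start_logits, -1), tf.squeeze(end_logits, -1)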