def __init__(self, config: BertConfig):
    """Build the transformer feed-forward sublayer.

    Structure: expand to ``intermediate_size`` -> GELU -> project back
    down to ``hidden_size``.

    Args:
        config: model hyperparameters (sizes and initializer range).
    """
    super(PositionwiseFeedForward, self).__init__()
    # Expansion layer (hidden -> intermediate).
    self.intermediate = tf.keras.layers.Dense(
        units=config.intermediate_size,
        kernel_initializer=create_initializer(config.initializer_range))
    # Non-linearity between the two projections.
    self.gelu = tf.keras.layers.Activation(gelu)
    # Contraction layer (intermediate -> hidden).
    self.linear = tf.keras.layers.Dense(
        units=config.hidden_size,
        kernel_initializer=create_initializer(config.initializer_range))
def __init__(self, config: BertConfig):
    """Build the next-sentence-prediction head.

    Pools the hidden state through a dense + tanh layer, then projects
    to a 2-way (is-next / not-next) logit pair via an explicit weight
    matrix and bias.

    Args:
        config: model hyperparameters (hidden size, initializer range).
    """
    super(NextSentencePrediction, self).__init__()
    self.pooler = tf.keras.layers.Dense(
        units=config.hidden_size,
        kernel_initializer=create_initializer(config.initializer_range))
    self.pooler_activation = tf.keras.layers.Activation('tanh')
    # Binary classification matrix: [hidden_size, 2].
    self.output_weight = self.add_weight(
        name='output_weight',
        shape=[config.hidden_size, 2],
        initializer=create_initializer(config.initializer_range))
    # Fix: the bias must start at zero, matching MaskedLanguageModel's
    # output_bias in this file and the reference BERT implementation.
    # Random (truncated-normal) bias initialization was almost certainly
    # unintended copy-paste from the weight above.
    self.output_bias = self.add_weight(
        name='output_bias',
        shape=[2],
        initializer=tf.zeros_initializer())
def __init__(self, config: BertConfig):
    """Create the three BERT embedding tables: word, position, segment.

    Args:
        config: model hyperparameters (vocab/position/type sizes,
            hidden size, initializer range).
    """
    super(BertEmbedding, self).__init__()
    # Kept on the instance for use by the lookup logic elsewhere.
    self.vocab_size = config.vocab_size
    self.max_position_embeddings = config.max_position_embeddings
    self.type_vocab_size = config.type_vocab_size

    def make_table(name, num_rows):
        # Every table has hidden_size columns and the same
        # truncated-normal initializer; only name and row count differ.
        return self.add_weight(
            name=name,
            shape=[num_rows, config.hidden_size],
            initializer=create_initializer(config.initializer_range))

    self.word_embedding = make_table('word_embedding', config.vocab_size)
    self.position_embedding = make_table(
        'position_embedding', config.max_position_embeddings)
    self.token_type_embedding = make_table(
        'token_type_embedding', config.type_vocab_size)
def __init__(self, config: BertConfig, word_embedding):
    """Build the masked-language-model head.

    Args:
        config: model hyperparameters (hidden size, vocab size,
            initializer range).
        word_embedding: the embedding matrix shared with the input
            layer — presumably used as the tied output projection;
            the call path is not visible here, so confirm in the
            forward pass.
    """
    super(MaskedLanguageModel, self).__init__()
    self.word_embedding = word_embedding
    # Hidden-to-hidden transform applied before the vocabulary
    # projection.
    self.transform = tf.keras.layers.Dense(
        units=config.hidden_size,
        kernel_initializer=create_initializer(config.initializer_range))
    self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-12)
    # Per-vocabulary-token output bias, zero-initialized.
    self.output_bias = self.add_weight(
        name='output_bias',
        shape=[config.vocab_size],
        initializer=tf.zeros_initializer())
def __init__(self, config: BertConfig):
    """Build multi-head self-attention projections (Q, K, V, output).

    Args:
        config: model hyperparameters (hidden size, head count,
            attention dropout probability, initializer range).

    Raises:
        ValueError: if ``hidden_size`` is not evenly divisible by
            ``num_attention_heads``.
    """
    super(SelfAttention, self).__init__()
    self.num_attention_heads = config.num_attention_heads
    # Fix: validate *before* deriving the per-head size — the original
    # computed size_per_head first, so the guard ran on an already
    # rounded value. Also use integer floor division instead of the
    # int(a / b) float round-trip.
    if config.hidden_size % self.num_attention_heads != 0:
        raise ValueError(
            f'The hidden size {config.hidden_size} is not a multiple '
            f'of the number of attention heads {self.num_attention_heads}')
    self.size_per_head = config.hidden_size // self.num_attention_heads
    # Q/K/V all project hidden_size -> hidden_size; heads are split
    # by reshaping downstream.
    self.query = tf.keras.layers.Dense(
        units=config.hidden_size,
        kernel_initializer=create_initializer(config.initializer_range),
        name='query')
    self.key = tf.keras.layers.Dense(
        units=config.hidden_size,
        kernel_initializer=create_initializer(config.initializer_range),
        name='key')
    self.value = tf.keras.layers.Dense(
        units=config.hidden_size,
        kernel_initializer=create_initializer(config.initializer_range),
        name='value')
    # Dropout on the attention probabilities.
    self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob)
    # Output projection after head concatenation.
    self.linear = tf.keras.layers.Dense(
        units=config.hidden_size,
        kernel_initializer=create_initializer(config.initializer_range))