import tensorflow as tf

# Note: MultiHeadAttention is assumed to be the project's own multi-head attention layer
# (constructed with (d_model, num_heads) and called with q/k/v/mask, unlike
# tf.keras.layers.MultiHeadAttention); its project-specific import is not shown here.


def block_net(units: int, d_model: int, num_heads: int, dropout: float,
              d_type: tf.dtypes.DType = tf.float32, name: str = "block_net") -> tf.keras.Model:
    """
    BlockNet: an attention block with a residual feed-forward sub-layer.

    :param units: hidden size of the feed-forward layer
    :param d_model: model depth, i.e. the word-embedding dimension
    :param num_heads: number of attention heads
    :param dropout: dropout rate
    :param d_type: computation dtype
    :param name: model name
    :return: BlockNet as a tf.keras.Model
    """
    query = tf.keras.Input(shape=(None, d_model), dtype=d_type, name="{}_query".format(name))
    key = tf.keras.Input(shape=(None, d_model), dtype=d_type, name="{}_key".format(name))
    padding_mask = tf.keras.Input(shape=(1, 1, None), dtype=d_type, name="{}_padding_mask".format(name))

    # Cross-attention: the query attends over the key, which also serves as the value.
    attention, _ = MultiHeadAttention(d_model, num_heads)(q=query, k=key, v=key, mask=padding_mask)
    # attention, _ = scaled_dot_product_attention(q=query, k=key, v=key)
    attention = tf.keras.layers.Dropout(
        rate=dropout, dtype=d_type, name="{}_attention_dropout".format(name))(attention)
    # Residual connection around the attention sub-layer, followed by layer normalization.
    attention = tf.keras.layers.LayerNormalization(
        epsilon=1e-6, dtype=d_type, name="{}_attention_layer_norm".format(name))(query + attention)

    # Position-wise feed-forward network: expand to `units`, then project back to d_model.
    outputs = tf.keras.layers.Dense(
        units=units, activation="relu", dtype=d_type, name="{}_dense_act".format(name))(attention)
    outputs = tf.keras.layers.Dense(units=d_model, dtype=d_type, name="{}_dense".format(name))(outputs)
    outputs = tf.keras.layers.Dropout(
        rate=dropout, dtype=d_type, name="{}_outputs_dropout".format(name))(outputs)
    # Residual connection around the feed-forward sub-layer, followed by layer normalization.
    outputs = tf.keras.layers.LayerNormalization(
        epsilon=1e-6, dtype=d_type, name="{}_outputs_layer_norm".format(name))(attention + outputs)

    return tf.keras.Model(inputs=[query, key, padding_mask], outputs=outputs, name=name)
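
# Illustrative usage sketch for block_net (added for documentation; the dimensions and
# helper below are example assumptions, not part of the original module). It builds a
# block and runs it on random tensors to show the expected input/output shapes.
def _demo_block_net() -> tf.Tensor:
    batch, q_len, k_len, d_model = 2, 5, 7, 128
    block = block_net(units=512, d_model=d_model, num_heads=8, dropout=0.1)
    q = tf.random.uniform((batch, q_len, d_model))
    k = tf.random.uniform((batch, k_len, d_model))
    # Padding mask broadcasts over heads and query positions; all zeros means no padding.
    mask = tf.zeros((batch, 1, 1, k_len))
    return block([q, k, mask])  # shape: (batch, q_len, d_model)
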
def encoder_layer(units: int, d_model: int, num_heads: int, dropout: float,
                  d_type: tf.dtypes.DType = tf.float32, name: str = "encoder_layer") -> tf.keras.Model:
    """
    A single Transformer encoder layer: self-attention followed by a feed-forward sub-layer.

    :param units: hidden size of the feed-forward layer
    :param d_model: model depth, i.e. the word-embedding dimension
    :param num_heads: number of attention heads
    :param dropout: dropout rate
    :param d_type: computation dtype
    :param name: model name
    :return: encoder layer as a tf.keras.Model
    """
    inputs = tf.keras.Input(shape=(None, d_model), dtype=d_type, name="{}_inputs".format(name))
    padding_mask = tf.keras.Input(shape=(1, 1, None), dtype=d_type, name="{}_padding_mask".format(name))

    # Self-attention: the inputs serve as query, key and value.
    attention, _ = MultiHeadAttention(d_model, num_heads)(q=inputs, k=inputs, v=inputs, mask=padding_mask)
    attention = tf.keras.layers.Dropout(
        rate=dropout, dtype=d_type, name="{}_attention_dropout".format(name))(attention)
    # Residual connection around the attention sub-layer, followed by layer normalization.
    attention = tf.keras.layers.LayerNormalization(
        epsilon=1e-6, dtype=d_type, name="{}_attention_layer_norm".format(name))(inputs + attention)

    # Position-wise feed-forward network: expand to `units`, then project back to d_model.
    outputs = tf.keras.layers.Dense(
        units=units, activation="relu", dtype=d_type, name="{}_dense_act".format(name))(attention)
    outputs = tf.keras.layers.Dense(units=d_model, dtype=d_type, name="{}_dense".format(name))(outputs)
    outputs = tf.keras.layers.Dropout(
        rate=dropout, dtype=d_type, name="{}_outputs_dropout".format(name))(outputs)
    # Residual connection around the feed-forward sub-layer, followed by layer normalization.
    outputs = tf.keras.layers.LayerNormalization(
        epsilon=1e-6, dtype=d_type, name="{}_outputs_layer_norm".format(name))(attention + outputs)

    return tf.keras.Model(inputs=[inputs, padding_mask], outputs=outputs, name=name)
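
# Illustrative usage sketch for encoder_layer (added for documentation; the token ids,
# shapes and mask convention below are assumptions, not part of the original module).
# It shows one common way to derive the (batch, 1, 1, seq_len) padding mask from token
# ids, with 1.0 marking padded positions, before feeding an embedded sequence through.
def _demo_encoder_layer() -> tf.Tensor:
    batch, seq_len, d_model = 2, 6, 128
    layer = encoder_layer(units=512, d_model=d_model, num_heads=8, dropout=0.1)
    # Example token ids with trailing padding (id 0).
    token_ids = tf.constant([[5, 9, 3, 2, 0, 0],
                             [7, 1, 4, 0, 0, 0]])
    # Assumed mask convention: 1.0 where the token is padding, broadcastable over heads.
    mask = tf.cast(tf.math.equal(token_ids, 0), tf.float32)[:, tf.newaxis, tf.newaxis, :]
    embedded = tf.random.uniform((batch, seq_len, d_model))  # stand-in for an embedding lookup
    return layer([embedded, mask])  # shape: (batch, seq_len, d_model)
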