def transformer_block(x, projection_dim, num_heads, dropout, prefix):
    """Transformer encoder block for MobileViT.

    See the official PyTorch implementation:
    https://github.com/apple/ml-cvnets/blob/main/cvnets/modules/transformer.py
    """
    # Layer normalization 1.
    x1 = LayerNormalization(epsilon=1e-6, name=prefix + '_LN1')(x)
    # Multi-head self-attention layer.
    attention_output = MultiHeadAttention(num_heads=num_heads,
                                          key_dim=projection_dim,
                                          dropout=dropout,
                                          name=prefix + '_attention')(x1, x1)
    # Skip connection 1.
    x2 = Add(name=prefix + '_add1')([attention_output, x])
    # Layer normalization 2.
    x3 = LayerNormalization(epsilon=1e-6, name=prefix + '_LN2')(x2)
    # Feed-forward network.
    x3 = feedforward(x3,
                     hidden_units=[x.shape[-1] * 2, x.shape[-1]],
                     dropout_rate=dropout,
                     name=prefix + '_ff')
    # Skip connection 2.
    x = Add(name=prefix + '_add2')([x3, x2])
    return x
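# The block above calls a `feedforward` helper that is not defined in this snippet.
# A minimal sketch of what it is assumed to look like (an MLP with Dropout after each
# Dense layer; the 'swish' activation is an assumption, not taken from the original code):
from tensorflow.keras.layers import Dense, Dropout

def feedforward(x, hidden_units, dropout_rate, name):
    # Hypothetical helper: apply each Dense layer in `hidden_units`, followed by Dropout.
    for i, units in enumerate(hidden_units):
        x = Dense(units, activation='swish', name='{}_dense{}'.format(name, i))(x)
        x = Dropout(dropout_rate, name='{}_drop{}'.format(name, i))(x)
    return x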
def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
    super(TransformerBlock, self).__init__()
    self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.ffn = keras.Sequential(
        [Dense(ff_dim, activation="relu"), Dense(embed_dim)])
    self.layernorm1 = LayerNormalization(epsilon=1e-6)
    self.layernorm2 = LayerNormalization(epsilon=1e-6)
    self.dropout1 = Dropout(rate)
    self.dropout2 = Dropout(rate)
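# The __init__ above only creates the sub-layers; a forward pass is still needed.
# A sketch of a matching call() method, assuming the post-norm residual wiring used in
# the standard Keras text-classification Transformer example (an assumption, since the
# original call() is not shown):
def call(self, inputs, training=False):
    # Self-attention sub-layer with dropout, residual connection and layer norm.
    attn_output = self.att(inputs, inputs)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(inputs + attn_output)
    # Feed-forward sub-layer with dropout, residual connection and layer norm.
    ffn_output = self.ffn(out1)
    ffn_output = self.dropout2(ffn_output, training=training)
    return self.layernorm2(out1 + ffn_output)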
def transformer(layer, num_heads, key_dim, activation='relu'):
    _in = Dense(X.shape[1], activation=activation,
                bias_initializer=tf.keras.initializers.glorot_uniform)(layer)
    mha = MultiHeadAttention(num_heads=num_heads,
                             key_dim=key_dim,
                             attention_axes=1,
                             bias_initializer=tf.keras.initializers.glorot_uniform)
    res = mha(_in, _in)
    return tf.keras.layers.ReLU(negative_slope=.01)(layer + res)
def ViT_block(V, num_heads, key_dim, filter_num_MLP, activation='GELU', name='ViT'):
    '''
    Vision transformer (ViT) block.

    ViT_block(V, num_heads, key_dim, filter_num_MLP, activation='GELU', name='ViT')

    ----------
    Dosovitskiy, A., Beyer, L., Kolesnikov, A., Weissenborn, D., Zhai, X., Unterthiner, T.,
    Dehghani, M., Minderer, M., Heigold, G., Gelly, S. and Uszkoreit, J., 2020.
    An image is worth 16x16 words: Transformers for image recognition at scale.
    arXiv preprint arXiv:2010.11929.

    Input
    ----------
        V: embedded input features.
        num_heads: number of attention heads.
        key_dim: dimension of the attention key (equals the embedded dimension).
        filter_num_MLP: a list that defines the number of nodes for each MLP layer.
                        The last MLP layer must have as many nodes as the key dimension.
        activation: activation of MLP nodes.
        name: prefix of the created keras layers.

    Output
    ----------
        V: output tensor.
    '''
    # Multi-head self-attention (MSA)
    V_atten = V  # <--- skip
    V_atten = LayerNormalization(name='{}_layer_norm_1'.format(name))(V_atten)
    V_atten = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim,
                                 name='{}_atten'.format(name))(V_atten, V_atten)
    # Skip connection
    V_add = add([V_atten, V], name='{}_skip_1'.format(name))  # <--- skip

    # MLP
    V_MLP = V_add  # <--- skip
    V_MLP = LayerNormalization(name='{}_layer_norm_2'.format(name))(V_MLP)
    V_MLP = ViT_MLP(V_MLP, filter_num_MLP, activation, name='{}_mlp'.format(name))
    # Skip connection
    V_out = add([V_MLP, V_add], name='{}_skip_2'.format(name))  # <--- skip

    return V_out
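# ViT_block depends on a ViT_MLP helper that is not reproduced here. A minimal sketch,
# assuming it is a stack of Dense layers each followed by the requested activation
# (layer names and choices below are illustrative, not the original implementation):
from tensorflow.keras.layers import Dense, Activation

def ViT_MLP(X, filter_num_MLP, activation='GELU', name='mlp'):
    # Hypothetical helper: one Dense layer per entry of `filter_num_MLP`.
    for i, f in enumerate(filter_num_MLP):
        X = Dense(f, name='{}_dense_{}'.format(name, i))(X)
        X = Activation(activation.lower(), name='{}_activation_{}'.format(name, i))(X)
    return X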
def __init__(self, name='AttentionBlock', num_heads=2, head_size=128, ff_dim=None,
             dropout=0, **kwargs):
    super().__init__(name=name, **kwargs)

    if ff_dim is None:
        ff_dim = head_size

    self.attention = MultiHeadAttention(num_heads=num_heads, head_size=head_size,
                                        dropout=dropout)
    self.attention_dropout = keras.layers.Dropout(dropout)
    self.attention_norm = keras.layers.LayerNormalization(epsilon=1e-6)

    self.ff_conv1 = keras.layers.Conv1D(filters=ff_dim, kernel_size=1, activation='relu')
    # self.ff_conv2 at build()
    self.ff_dropout = keras.layers.Dropout(dropout)
    self.ff_norm = keras.layers.LayerNormalization(epsilon=1e-6)
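# The "self.ff_conv2 at build()" comment refers to a layer whose filter count must match
# the input's channel dimension, which is only known once build() receives the input shape.
# A sketch of the remaining build()/call() methods, assuming a post-norm residual layout
# and that `self.attention` is called with (query, value) like
# tf.keras.layers.MultiHeadAttention (assumptions; the original methods are not shown):
def build(self, input_shape):
    # Project the feed-forward output back to the input channel count.
    self.ff_conv2 = keras.layers.Conv1D(filters=input_shape[-1], kernel_size=1)

def call(self, inputs, training=False):
    # Self-attention sub-layer with residual connection.
    x = self.attention(inputs, inputs)
    x = self.attention_dropout(x, training=training)
    x = self.attention_norm(inputs + x)
    # Position-wise feed-forward sub-layer with residual connection.
    y = self.ff_conv1(x)
    y = self.ff_conv2(y)
    y = self.ff_dropout(y, training=training)
    return self.ff_norm(x + y)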
import tensorflow as tf
from tensorflow.keras.layers import MultiHeadAttention


def attention(inputs):
    layer = MultiHeadAttention(num_heads=8, key_dim=2, value_dim=3, output_shape=(5,))
    target = tf.keras.Input(shape=[1000, 23])
    source = tf.keras.Input(shape=[1000, 23])
    output_tensor = layer(target, source)
    return output_tensor


if __name__ == '__main__':
    # layer = MultiHeadAttention(num_heads=8, key_dim=2, value_dim=2)  # , attention_axes=(0))
    layer = MultiHeadAttention(num_heads=8, key_dim=2, value_dim=2, attention_axes=(0, 1))
    input_tensor = tf.keras.Input(shape=(8, 87,))
    output_tensor = layer(input_tensor, input_tensor)
    print(output_tensor.shape)
def __init__(self, n_slots, n_heads, head_size, n_blocks=1, n_layers=3,
             activation='tanh', recurrent_activation='hard_sigmoid', forget_bias=1.0,
             kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal',
             bias_initializer='zeros', kernel_regularizer=None,
             recurrent_regularizer=None, bias_regularizer=None,
             kernel_constraint=None, recurrent_constraint=None,
             bias_constraint=None, **kwargs):
    """Initialization method.

    Args:
        n_slots (int): Number of memory slots.
        n_heads (int): Number of attention heads.
        head_size (int): Size of each attention head.
        n_blocks (int): Number of feed-forward networks.
        n_layers (int): Amount of layers per feed-forward network.
        activation (str): Output activation function.
        recurrent_activation (str): Recurrent step activation function.
        forget_bias (float): Forget gate bias values.
        kernel_initializer (str): Kernel initializer function.
        recurrent_initializer (str): Recurrent kernel initializer function.
        bias_initializer (str): Bias initializer function.
        kernel_regularizer (str): Kernel regularizer function.
        recurrent_regularizer (str): Recurrent kernel regularizer function.
        bias_regularizer (str): Bias regularizer function.
        kernel_constraint (str): Kernel constraint function.
        recurrent_constraint (str): Recurrent kernel constraint function.
        bias_constraint (str): Bias constraint function.

    """
    super(RelationalMemoryCell, self).__init__(**kwargs)

    # Number of memory slots and their sizes
    self.n_slots = n_slots
    self.slot_size = n_heads * head_size

    # Number of attention heads and their sizes
    self.n_heads = n_heads
    self.head_size = head_size

    # Number of feed-forward network blocks and their sizes
    self.n_blocks = n_blocks
    self.n_layers = n_layers

    # Activation functions
    self.activation = activations.get(activation)
    self.recurrent_activation = activations.get(recurrent_activation)

    # Forget gate bias value
    self.forget_bias = forget_bias

    # `W`, `U` and `b` initializers
    self.kernel_initializer = initializers.get(kernel_initializer)
    self.recurrent_initializer = initializers.get(recurrent_initializer)
    self.bias_initializer = initializers.get(bias_initializer)

    # `W`, `U` and `b` regularizers
    self.kernel_regularizer = regularizers.get(kernel_regularizer)
    self.recurrent_regularizer = regularizers.get(recurrent_regularizer)
    self.bias_regularizer = regularizers.get(bias_regularizer)

    # `W`, `U` and `b` constraints
    self.kernel_constraint = constraints.get(kernel_constraint)
    self.recurrent_constraint = constraints.get(recurrent_constraint)
    self.bias_constraint = constraints.get(bias_constraint)

    # Number of outputted units
    self.units = self.slot_size * n_slots

    # Number of outputted units from the gates
    self.n_gates = 2 * self.slot_size

    # Creating a layer for projecting the input
    self.projector = Dense(self.slot_size)

    # Creating the feed-forward network
    # It is composed of linear layers and normalization ones
    self.before_norm = LayerNormalization()
    self.linear = [
        Dense(self.slot_size, activation='relu') for _ in range(n_layers)
    ]
    self.after_norm = LayerNormalization()

    # Creating the Multi-Head Attention layer
    self.attn = MultiHeadAttention(self.slot_size, self.n_heads)
def simple_model(input_size, num_regions, cell_num):
    input_shape = (input_size, 4)
    inputs = Input(shape=input_shape)
    x = inputs
    x = Dropout(0.3)(x)
    x = resnet_v2(x, 8, 2)
    num_patches = 782
    x = Dropout(0.3)(x)

    # Encode patches.
    encoded_patches = PatchEncoder(num_patches, projection_dim)(x)

    # Create multiple layers of the Transformer block.
    for i in range(transformer_layers):
        # Layer normalization 1.
        x1 = LayerNormalization(epsilon=1e-6, name="ln_" + str(i) + "_1")(encoded_patches)
        # Create a multi-head attention layer.
        attention_output = MultiHeadAttention(num_heads=num_heads,
                                              key_dim=projection_dim,
                                              dropout=0.1,
                                              name="mha_" + str(i))(x1, x1)
        # Skip connection 1.
        x2 = Add()([attention_output, encoded_patches])
        # Layer normalization 2.
        x3 = LayerNormalization(epsilon=1e-6, name="ln_" + str(i) + "_2")(x2)
        # MLP.
        x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1, name="mlp_" + str(i))
        # Skip connection 2.
        encoded_patches = Add()([x3, x2])

    # Create a [batch_size, projection_dim] tensor.
    representation = LayerNormalization(epsilon=1e-6, name="ln_rep")(encoded_patches)
    representation = Flatten()(representation)
    representation = Dropout(0.2)(representation)

    # Compress
    compress_dim = 4000
    x = Dense(compress_dim, name="latent_vector")(representation)
    x = LeakyReLU(alpha=0.1)(x)
    x = Dropout(0.5, input_shape=(None, compress_dim))(x)

    outs = []
    for i in range(cell_num):
        ol = Dense(num_regions, name="out_row_" + str(i))(x)
        ol = LeakyReLU(alpha=0.1, name="act_out_row_" + str(i))(ol)
        outs.append(ol)
        if i % 50 == 0:
            print(i, end=" ")

    # x = Dense(cell_num * num_regions)(representation)
    # x = LeakyReLU(alpha=0.1)(x)
    # outputs = Reshape((cell_num, num_regions))(x)
    outputs = tf.stack(outs, axis=1)
    print(outputs)

    model = Model(inputs, outputs, name="model")
    print("\nModel constructed")
    return model
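# simple_model relies on a PatchEncoder layer and an mlp helper (as well as resnet_v2 and
# globals such as projection_dim, transformer_layers, num_heads and transformer_units)
# defined elsewhere. Minimal sketches of PatchEncoder and mlp in the style of the Keras
# Vision Transformer example (an assumption; the original definitions may differ):
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Embedding, Layer

def mlp(x, hidden_units, dropout_rate, name=None):
    # Hypothetical helper: GELU MLP with Dropout after every Dense layer.
    for units in hidden_units:
        x = Dense(units, activation=tf.nn.gelu)(x)
        x = Dropout(dropout_rate)(x)
    return x

class PatchEncoder(Layer):
    # Hypothetical layer: linear projection of patches plus learned position embeddings.
    def __init__(self, num_patches, projection_dim):
        super().__init__()
        self.num_patches = num_patches
        self.projection = Dense(units=projection_dim)
        self.position_embedding = Embedding(input_dim=num_patches, output_dim=projection_dim)

    def call(self, patch):
        positions = tf.range(start=0, limit=self.num_patches, delta=1)
        return self.projection(patch) + self.position_embedding(positions)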
def build(self, input_shape):
    self.layer_normalization_1 = LayerNormalization()  # layer_norm_mha
    self.multi_head_attention = MultiHeadAttention(num_heads=4, key_dim=12,
                                                   dropout=0.2, attention_axes=None)
    self.dropout = Dropout(0.2)
    self.layer_normalization_2 = LayerNormalization()  # layer_norm_ffn
    self.positionwise_convolution = PositionwiseConvolution(self.units)
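# build() above only instantiates the sub-layers. A sketch of a matching call(), assuming
# a pre-norm residual layout around the attention and the position-wise convolution blocks
# (an assumption; the original call() and the PositionwiseConvolution layer are not shown):
def call(self, inputs, training=False):
    # Self-attention sub-layer.
    x = self.layer_normalization_1(inputs)
    x = self.multi_head_attention(x, x, training=training)
    x = self.dropout(x, training=training)
    attention_out = inputs + x
    # Position-wise convolutional feed-forward sub-layer.
    y = self.layer_normalization_2(attention_out)
    y = self.positionwise_convolution(y)
    return attention_out + y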