def __init__(self, embed_dim, ff_dim, num_heads, **kwargs):
    super().__init__(**kwargs)
    self.embed_dim = embed_dim
    self.ff_dim = ff_dim
    self.num_heads = num_heads
    self.attention_1 = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim, dropout=0.1
    )
    self.attention_2 = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim, dropout=0.1
    )
    self.ffn_layer_1 = layers.Dense(ff_dim, activation="relu")
    self.ffn_layer_2 = layers.Dense(embed_dim)
    self.layernorm_1 = layers.LayerNormalization()
    self.layernorm_2 = layers.LayerNormalization()
    self.layernorm_3 = layers.LayerNormalization()
    self.embedding = PositionalEmbedding(
        embed_dim=EMBED_DIM, sequence_length=SEQ_LENGTH, vocab_size=VOCAB_SIZE
    )
    self.out = layers.Dense(VOCAB_SIZE, activation="softmax")
    self.dropout_1 = layers.Dropout(0.3)
    self.dropout_2 = layers.Dropout(0.5)
    self.supports_masking = True

def __init__(self, embed_dim, ff_dim=2048, num_heads=4, rate=0.1):
    super(TransformerDecoderLayer, self).__init__()
    self.relation_att = layers.MultiHeadAttention(num_heads, embed_dim)
    self.align_att = layers.MultiHeadAttention(num_heads, embed_dim)
    self.ffn = tf.keras.Sequential(
        [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
    )
    self.dropout1 = layers.Dropout(rate)
    self.dropout2 = layers.Dropout(rate)
    self.dropout3 = layers.Dropout(rate)
    self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)

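# A minimal `call` sketch to accompany the decoder layer above; it is not part
# of the original snippet. It assumes the usual decoder wiring (self-attention
# over the target sequence via `relation_att`, cross-attention over encoder
# outputs via `align_att`, then the FFN), with post-layer-norm residuals.
# The argument names `targets`, `enc_output`, and `training` are illustrative.
def call(self, targets, enc_output, training=False):
    attn1 = self.relation_att(targets, targets)  # self-attention
    out1 = self.layernorm1(targets + self.dropout1(attn1, training=training))
    attn2 = self.align_att(out1, enc_output)  # cross-attention over encoder output
    out2 = self.layernorm2(out1 + self.dropout2(attn2, training=training))
    ffn_out = self.ffn(out2)  # position-wise feedforward network
    return self.layernorm3(out2 + self.dropout3(ffn_out, training=training))
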
def _attention_builder(query, value):
    # `head_num`, `key_dim`, `trainable`, and `name` are captured from the
    # enclosing scope; this builder is intended to be used as a closure.
    return layers.MultiHeadAttention(
        num_heads=head_num,
        key_dim=key_dim,
        trainable=trainable,
        name=name,
    )(query, value)

def create_transformer_module(
    latent_dim,
    projection_dim,
    num_heads,
    num_transformer_blocks,
    ffn_units,
    dropout_rate,
):
    # input_shape: [1, latent_dim, projection_dim]
    inputs = layers.Input(shape=(latent_dim, projection_dim))
    x0 = inputs
    # Create multiple layers of the Transformer block.
    for _ in range(num_transformer_blocks):
        # Apply layer normalization 1.
        x1 = layers.LayerNormalization(epsilon=1e-6)(x0)
        # Create a multi-head self-attention layer.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(x1, x1)
        # Skip connection 1.
        x2 = layers.Add()([attention_output, x0])
        # Apply layer normalization 2.
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        # Apply the feedforward network.
        ffn = create_ffn(hidden_units=ffn_units, dropout_rate=dropout_rate)
        x3 = ffn(x3)
        # Skip connection 2.
        x0 = layers.Add()([x3, x2])
    # Create the Keras model.
    model = keras.Model(inputs=inputs, outputs=x0)
    return model

def create_model(self):
    inputs = layers.Input(shape=(self.num_of_frames, self.frame_size))
    # Attention and Normalization
    res = inputs
    x = layers.MultiHeadAttention(key_dim=256, num_heads=32, dropout=0.5)(inputs, inputs)
    x = layers.Dropout(0.5)(x)
    x = layers.LayerNormalization()(x)
    res += x
    # Feed Forward Part
    x = layers.Conv1D(filters=512, kernel_size=1)(res)
    x = layers.PReLU()(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Conv1D(filters=res.shape[-1], kernel_size=1)(x)
    x = layers.Dropout(0.5)(x)
    x = layers.LayerNormalization()(x)
    x += res
    x = layers.GlobalAveragePooling1D()(x)
    outputs = layers.Dense(self.num_of_classes, activation='softmax')(x)
    self.model = tf.keras.Model(inputs, outputs)

def bert_module(query, key, value, i):
    # Multi-headed self-attention
    attention_output = layers.MultiHeadAttention(
        num_heads=config.NUM_HEAD,
        key_dim=config.EMBED_DIM // config.NUM_HEAD,
        name="encoder_{}/multiheadattention".format(i),
    )(query, key, value)
    attention_output = layers.Dropout(
        0.1, name="encoder_{}/att_dropout".format(i)
    )(attention_output)
    attention_output = layers.LayerNormalization(
        epsilon=1e-6, name="encoder_{}/att_layernormalization".format(i)
    )(query + attention_output)
    # Feed-forward layer
    ffn = keras.Sequential(
        [
            layers.Dense(config.FF_DIM, activation="relu"),
            layers.Dense(config.EMBED_DIM),
        ],
        name="encoder_{}/ffn".format(i),
    )
    ffn_output = ffn(attention_output)
    ffn_output = layers.Dropout(
        0.1, name="encoder_{}/ffn_dropout".format(i)
    )(ffn_output)
    sequence_output = layers.LayerNormalization(
        epsilon=1e-6, name="encoder_{}/ffn_layernormalization".format(i)
    )(attention_output + ffn_output)
    return sequence_output

def create_decoder(num_layers=DEC_LAYERS, num_heads=DEC_NUM_HEADS, image_size=IMAGE_SIZE):
    inputs = layers.Input((NUM_PATCHES, ENC_PROJECTION_DIM))
    x = layers.Dense(DEC_PROJECTION_DIM)(inputs)
    for _ in range(num_layers):
        # Layer normalization 1.
        x1 = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(x)
        # Create a multi-head attention layer.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=DEC_PROJECTION_DIM, dropout=0.1
        )(x1, x1)
        # Skip connection 1.
        x2 = layers.Add()([attention_output, x])
        # Layer normalization 2.
        x3 = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(x2)
        # MLP.
        x3 = mlp(x3, hidden_units=DEC_TRANSFORMER_UNITS, dropout_rate=0.1)
        # Skip connection 2.
        x = layers.Add()([x3, x2])
    x = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(x)
    x = layers.Flatten()(x)
    pre_final = layers.Dense(units=image_size * image_size * 3, activation="sigmoid")(x)
    outputs = layers.Reshape((image_size, image_size, 3))(pre_final)
    return keras.Model(inputs, outputs, name="mae_decoder")

def create_encoder(num_heads=ENC_NUM_HEADS, num_layers=ENC_LAYERS):
    inputs = layers.Input((None, ENC_PROJECTION_DIM))
    x = inputs
    for _ in range(num_layers):
        # Layer normalization 1.
        x1 = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(x)
        # Create a multi-head attention layer.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=ENC_PROJECTION_DIM, dropout=0.1
        )(x1, x1)
        # Skip connection 1.
        x2 = layers.Add()([attention_output, x])
        # Layer normalization 2.
        x3 = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(x2)
        # MLP.
        x3 = mlp(x3, hidden_units=ENC_TRANSFORMER_UNITS, dropout_rate=0.1)
        # Skip connection 2.
        x = layers.Add()([x3, x2])
    outputs = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(x)
    return keras.Model(inputs, outputs, name="mae_encoder")

def __init__(self, embed_dim, num_heads, feed_forward_dim, rate=0.1):
    super().__init__()
    self.self_att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.enc_att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.ffn = keras.Sequential(
        [
            layers.Dense(feed_forward_dim, activation="relu"),
            layers.Dense(embed_dim),
        ]
    )
    self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
    self.self_dropout = layers.Dropout(rate)
    self.enc_dropout = layers.Dropout(rate)
    self.ffn_dropout = layers.Dropout(rate)

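# A causal-mask sketch to pair with the decoder layer above; it is not part of
# the original snippet. A lower-triangular mask like this is what would
# normally be passed as `attention_mask` to `self_att` so that position i
# cannot attend to later positions. All names and shapes are illustrative.
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    i = tf.range(n_dest)[:, None]
    j = tf.range(n_src)
    mask = tf.cast(i >= j - n_src + n_dest, dtype)  # (n_dest, n_src), lower-triangular
    mask = tf.reshape(mask, [1, n_dest, n_src])
    return tf.tile(mask, [batch_size, 1, 1])  # (batch_size, n_dest, n_src)
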
def transformer_encoder(
    x,
    embedding_dim,
    mlp_dim,
    num_heads,
    dim_coefficient,
    attention_dropout,
    projection_dropout,
    attention_type="external_attention",
):
    residual_1 = x
    x = layers.LayerNormalization(epsilon=1e-5)(x)
    if attention_type == "external_attention":
        x = external_attention(
            x,
            embedding_dim,
            num_heads,
            dim_coefficient,
            attention_dropout,
            projection_dropout,
        )
    elif attention_type == "self_attention":
        x = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embedding_dim, dropout=attention_dropout
        )(x, x)
    x = layers.add([x, residual_1])
    residual_2 = x
    x = layers.LayerNormalization(epsilon=1e-5)(x)
    x = mlp(x, embedding_dim, mlp_dim)
    x = layers.add([x, residual_2])
    return x

def classify_branch(
    input_shape=(256, 256, 32),
    roi_pool_size=[10, 10],
    num_bbox=400,
    chan_num=3,
    projection_dim=100,
    transformer_layers=4,
    num_heads=4,
    crypt_class=False,
):
    Input_bbox = Input(shape=(num_bbox, 4))
    fmap = Input(shape=input_shape)
    # Transformer part =========
    pooled_features = ROIPoolingLayer(roi_pool_size[0], roi_pool_size[1])([fmap, Input_bbox])
    c_p_f = PatchEncoder_w_position(num_bbox, projection_dim, 128)([pooled_features, Input_bbox])
    # Create multiple layers of the Transformer block.
    for _ in range(transformer_layers):
        # Layer normalization 1.
        x1 = layers.LayerNormalization(epsilon=1e-6)(c_p_f)
        # Create a multi-head attention layer.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.15
        )(x1, x1)
        # Skip connection 1.
        x2 = layers.Add()([attention_output, c_p_f])
        # Layer normalization 2.
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        # MLP.
        x3 = mlp(x3, hidden_units=[projection_dim * 2, projection_dim], dropout_rate=0.15)
        # Skip connection 2.
        c_p_f = layers.Add()([x3, x2])
    # Create a [batch_size, projection_dim] tensor.
    c_p_f = layers.LayerNormalization(epsilon=1e-6)(c_p_f)
    c_p_f = layers.Dropout(0.3)(c_p_f)  # increased from 0.2
    clone = layers.Dense(1)(c_p_f)
    partial = layers.Dense(1)(c_p_f)
    fufi = layers.Dense(1)(c_p_f)
    clone = layers.Activation('sigmoid', dtype='float32', name='clone')(clone)
    partial = layers.Activation('sigmoid', dtype='float32', name='partial')(partial)
    fufi = layers.Activation('sigmoid', dtype='float32', name='fufi')(fufi)
    if crypt_class:
        crypt = layers.Dense(1)(c_p_f)
        crypt = layers.Activation('sigmoid', dtype='float32', name='crclass')(crypt)
        just_trnsf = Model(
            inputs=[fmap, Input_bbox],
            outputs=[clone, partial, fufi, crypt],
            name="cpf",
        )
    else:
        just_trnsf = Model(
            inputs=[fmap, Input_bbox],
            outputs=[clone, partial, fufi],
            name="cpf",
        )
    return just_trnsf

def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
    super(FNetDecoder, self).__init__(**kwargs)
    self.embed_dim = embed_dim
    self.latent_dim = latent_dim
    self.num_heads = num_heads
    self.attention_1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.attention_2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.dense_proj = keras.Sequential(
        [
            layers.Dense(latent_dim, activation="relu"),
            layers.Dense(embed_dim),
        ]
    )
    self.layernorm_1 = layers.LayerNormalization()
    self.layernorm_2 = layers.LayerNormalization()
    self.layernorm_3 = layers.LayerNormalization()
    self.supports_masking = True

def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
    super().__init__(**kwargs)
    self.embed_dim = embed_dim
    self.dense_dim = dense_dim
    self.num_heads = num_heads
    self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.dense_proj = layers.Dense(embed_dim, activation="relu")
    self.layernorm_1 = layers.LayerNormalization()

def __init__(self, d_model, heads, d_ff, dropout):
    super(TransformerEncoderLayer, self).__init__()
    self.self_attn = layers.MultiHeadAttention(num_heads=heads, key_dim=d_model, dropout=dropout)
    self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
    self.layer_norm = layers.LayerNormalization(epsilon=1e-6)
    self.dropout = layers.Dropout(dropout)

def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
    super(TransformerBlock, self).__init__()
    self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.ffn = keras.Sequential(
        [
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ]
    )
    self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = layers.Dropout(rate)
    self.dropout2 = layers.Dropout(rate)

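# A `call` sketch matching the block above; it is not part of the original
# snippet. It assumes the common post-norm ordering: attention, dropout,
# residual + LayerNorm, then FFN, dropout, residual + LayerNorm.
def call(self, inputs, training=False):
    attn_output = self.att(inputs, inputs)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(inputs + attn_output)
    ffn_output = self.ffn(out1)
    ffn_output = self.dropout2(ffn_output, training=training)
    return self.layernorm2(out1 + ffn_output)
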
def attention(
    input_shape,
    outputs,
    size_scale=1,
):
    if isinstance(input_shape, tuple):
        mask = keras.Input(shape=(input_shape[0], input_shape[0]))
        inputs = keras.Input(shape=input_shape)
    else:
        # input_shape is already a tensor; use it directly as the input node
        mask = keras.Input(shape=(input_shape.shape[1], input_shape.shape[1]))
        inputs = input_shape
    net = layers.MultiHeadAttention(num_heads=4, key_dim=32 * size_scale)(
        inputs, inputs, attention_mask=mask
    )
    net = layers.MultiHeadAttention(num_heads=4, key_dim=32 * size_scale)(net, net)
    net = layers.Flatten()(net)
    outputs = layers.Dense(outputs, activation="linear")(net)
    return [inputs, mask], outputs

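# Hypothetical usage of `attention` above; it is not part of the original
# snippet, and the shapes and output size are placeholders. With a tuple
# input_shape of (timesteps, features), the function returns the input and
# mask placeholders plus the final dense output, ready to wrap in a model.
model_inputs, model_output = attention(input_shape=(64, 16), outputs=10)
model = keras.Model(inputs=model_inputs, outputs=model_output)
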
def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
    super().__init__()
    self.att = L.MultiHeadAttention(num_heads, key_dim=embed_dim)
    self.ffn = M.Sequential(
        [
            L.Dense(ff_dim, activation='relu'),
            L.Dense(embed_dim),
        ]
    )
    self.layernorm1 = L.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = L.LayerNormalization(epsilon=1e-6)
    self.dropout1 = L.Dropout(rate)
    self.dropout2 = L.Dropout(rate)

def __init__(self, embed_dim, num_heads, ffn, dropout_rate=0.1):
    super(TransformerBlock, self).__init__()
    self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    # The ffn can be either a standard feedforward network or a switch
    # layer with a Mixture of Experts.
    self.ffn = ffn
    self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = layers.Dropout(dropout_rate)
    self.dropout2 = layers.Dropout(dropout_rate)

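# Hypothetical construction of the block above with the two kinds of `ffn`
# mentioned in its comment; none of this is in the original snippet. The dense
# stack is a plain feedforward network; `Switch` stands in for a
# Mixture-of-Experts layer assumed to be defined elsewhere.
dense_ffn = keras.Sequential(
    [layers.Dense(4 * 64, activation="relu"), layers.Dense(64)]
)
dense_block = TransformerBlock(embed_dim=64, num_heads=4, ffn=dense_ffn)
# moe_block = TransformerBlock(embed_dim=64, num_heads=4,
#                              ffn=Switch(num_experts=4, embed_dim=64))  # assumed API
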
def __init__(self, num_heads=8, embed_dim=64, dense_dim=512, **kwargs):
    super().__init__(**kwargs)
    self.attention = layers.MultiHeadAttention(num_heads, embed_dim)
    self.dense_proj = keras.Sequential(
        [
            layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim),
        ]
    )
    self.layernorm_1 = layers.LayerNormalization()
    self.layernorm_2 = layers.LayerNormalization()
    self.supports_masking = True

def create_vit_classifier(
    input_shape,
    patch_size,
    num_patches,
    transformer_layers,
    num_heads,
    transformer_units,
    num_classes,
    mlp_head_units,
    data_augmentation,
    projection_dim,
):
    inputs = layers.Input(shape=input_shape)
    # Augment data.
    augmented = data_augmentation(inputs)
    # Create patches.
    patches = Patches(patch_size)(augmented)
    # Encode patches.
    encoded_patches = PatchEncoder(num_patches, projection_dim)(patches)
    # Create multiple layers of the Transformer block.
    for _ in range(transformer_layers):
        # Layer normalization 1.
        x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
        # Create a multi-head attention layer.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(x1, x1)
        # Skip connection 1.
        x2 = layers.Add()([attention_output, encoded_patches])
        # Layer normalization 2.
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        # MLP.
        x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1)
        # Skip connection 2.
        encoded_patches = layers.Add()([x3, x2])
    # Create a [batch_size, projection_dim] tensor.
    representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
    representation = layers.Flatten()(representation)
    representation = layers.Dropout(0.5)(representation)
    # Add MLP.
    features = mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.5)
    # Classify outputs.
    logits = layers.Dense(num_classes)(features)
    # Create the Keras model.
    model = keras.Model(inputs=inputs, outputs=logits)
    return model

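# A hypothetical call to `create_vit_classifier`; the values are placeholders,
# and `data_augmentation`, `Patches`, `PatchEncoder`, and `mlp` are assumed to
# be defined elsewhere, as in the function body above.
vit_classifier = create_vit_classifier(
    input_shape=(32, 32, 3),
    patch_size=4,
    num_patches=(32 // 4) ** 2,
    transformer_layers=8,
    num_heads=4,
    transformer_units=[128, 64],  # [projection_dim * 2, projection_dim]
    num_classes=10,
    mlp_head_units=[2048, 1024],
    data_augmentation=data_augmentation,
    projection_dim=64,
)
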
def __init__(self, embed_dim, num_heads, ffn_dim, dropout_rate=0.1):
    super(TransformerBlock, self).__init__()
    self.att = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim, dropout=dropout_rate
    )
    self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
    self.leakyrelu = layers.LeakyReLU()
    self.ffn = layers.Dense(units=ffn_dim)
    self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = layers.Dropout(dropout_rate)
    self.dropout2 = layers.Dropout(dropout_rate)
    self.out = layers.Flatten()

def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
    super().__init__()  # initialize the base Layer class
    self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    self.ffn = keras.Sequential(
        [
            layers.Dense(ff_dim, activation='relu'),
            layers.Dense(embed_dim),
        ]
    )
    self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = layers.Dropout(rate)
    self.dropout2 = layers.Dropout(rate)

def __init__(
    self, num_heads=8, embed_dim=64, dense_dim=512, batch_size=32, **kwargs
):
    super().__init__(**kwargs)
    self.partition_padding = PartitionPadding(batch_size)
    self.attention = layers.MultiHeadAttention(num_heads, embed_dim)
    self.dense_proj = keras.Sequential(
        [
            layers.Dense(dense_dim, activation="relu"),
            layers.Dense(embed_dim),
        ]
    )
    self.layernorm_1 = layers.LayerNormalization()
    self.layernorm_2 = layers.LayerNormalization()
    self.average_pooling = layers.GlobalAveragePooling1D()

def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
    super().__init__(**kwargs)
    self.embed_dim = embed_dim
    self.dense_dim = dense_dim
    self.num_heads = num_heads
    self.attention = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_dim, dropout=0.3
    )
    self.dense_proj = keras.Sequential(
        [
            layers.Dense(dense_dim, activation=tf.nn.gelu),
            layers.Dense(embed_dim),
        ]
    )
    self.layernorm_1 = layers.LayerNormalization()
    self.layernorm_2 = layers.LayerNormalization()

def create_vivit_classifier(
    tubelet_embedder,
    positional_encoder,
    input_shape=INPUT_SHAPE,
    transformer_layers=NUM_LAYERS,
    num_heads=NUM_HEADS,
    embed_dim=PROJECTION_DIM,
    layer_norm_eps=LAYER_NORM_EPS,
    num_classes=NUM_CLASSES,
):
    # Get the input layer
    inputs = layers.Input(shape=input_shape)
    # Create patches.
    patches = tubelet_embedder(inputs)
    # Encode patches.
    encoded_patches = positional_encoder(patches)
    # Create multiple layers of the Transformer block.
    for _ in range(transformer_layers):
        # Layer normalization and MHSA
        x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim // num_heads, dropout=0.1
        )(x1, x1)
        # Skip connection
        x2 = layers.Add()([attention_output, encoded_patches])
        # Layer normalization and MLP
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        x3 = keras.Sequential(
            [
                layers.Dense(units=embed_dim * 4, activation=tf.nn.gelu),
                layers.Dense(units=embed_dim, activation=tf.nn.gelu),
            ]
        )(x3)
        # Skip connection
        encoded_patches = layers.Add()([x3, x2])
    # Layer normalization and global average pooling.
    representation = layers.LayerNormalization(epsilon=layer_norm_eps)(encoded_patches)
    representation = layers.GlobalAvgPool1D()(representation)
    # Classify outputs.
    outputs = layers.Dense(units=num_classes, activation="softmax")(representation)
    # Create the Keras model.
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

def __init__(self, units=512, num_heads=8, key_dim=64, drop_prob=0):
    super(TransformerBlock, self).__init__()
    self.drop_prob = drop_prob
    self.att = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=key_dim, dropout=self.drop_prob
    )
    self.add0 = layers.Add()
    self.layer_norm0 = layers.LayerNormalization()
    self.feed_fwd = FeedForward(units_w1=2048, units_w2=units)
    self.add1 = layers.Add()
    self.layer_norm1 = layers.LayerNormalization()

def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
    super(TransformerBlock, self).__init__()
    # Divide the key dimension by the number of heads so that each head
    # projects to a lower dimension; the per-head outputs are concatenated
    # back to embed_dim.
    self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim // num_heads)
    self.ffn = keras.Sequential(
        [
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ]
    )
    self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = layers.Dropout(dropout)
    self.dropout2 = layers.Dropout(dropout)

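# A small shape check illustrating the per-head sizing described in the
# comment above; the numbers are placeholders and this is not part of the
# original snippet.
embed_dim, num_heads = 128, 8
mha = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim // num_heads)
x = tf.random.normal((2, 10, embed_dim))  # (batch, seq_len, embed_dim)
y = mha(x, x)
print(y.shape)  # (2, 10, 128): 8 heads of size 16, concatenated and re-projected
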
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    # Attention and Normalization
    x = layers.MultiHeadAttention(
        key_dim=head_size, num_heads=num_heads, dropout=dropout
    )(inputs, inputs)
    x = layers.Dropout(dropout)(x)
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    res = x + inputs
    # Feed Forward Part
    x = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(res)
    x = layers.Dropout(dropout)(x)
    x = layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(x)
    x = layers.LayerNormalization(epsilon=1e-6)(x)
    return x + res

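# A sketch of stacking `transformer_encoder` into a classifier; it is not part
# of the original snippet, and every size below is a placeholder choice.
def build_model(input_shape, num_classes, num_blocks=4):
    inputs = keras.Input(shape=input_shape)
    x = inputs
    for _ in range(num_blocks):
        x = transformer_encoder(x, head_size=256, num_heads=4, ff_dim=4, dropout=0.25)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dense(128, activation="relu")(x)
    outputs = layers.Dense(num_classes, activation="softmax")(x)
    return keras.Model(inputs, outputs)
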
def build(self, input_shape):
    self.attention = layers.MultiHeadAttention(
        num_heads=1,
        key_dim=self.dimensions,
        dropout=0.2,
    )
    self.layer_norm1 = layers.LayerNormalization(epsilon=1e-6)
    self.layer_norm2 = layers.LayerNormalization(epsilon=1e-6)
    self.layer_norm3 = layers.LayerNormalization(epsilon=1e-6)
    self.mlp = keras.Sequential(
        [
            layers.Dense(units=self.dimensions, activation=tf.nn.gelu),
            layers.Dropout(0.2),
            layers.Dense(units=self.dimensions, activation=tf.nn.gelu),
        ]
    )
    self.dense = layers.Dense(units=self.num_classes)
    self.flatten = layers.Flatten()

def transformer_block(x, transformer_layers, projection_dim, num_heads=2):
    for _ in range(transformer_layers):
        # Layer normalization 1.
        x1 = layers.LayerNormalization(epsilon=1e-6)(x)
        # Create a multi-head attention layer.
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(x1, x1)
        # Skip connection 1.
        x2 = layers.Add()([attention_output, x])
        # Layer normalization 2.
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        # MLP.
        x3 = mlp(x3, hidden_units=[x.shape[-1] * 2, x.shape[-1]], dropout_rate=0.1)
        # Skip connection 2.
        x = layers.Add()([x3, x2])
    return x