def helper_test_coupling(my_activation, tf_activation, loss, inputs, y_true, units):
    """Check that ``NN.Layer`` agrees with a Keras ``Dense`` layer.

    A ``Dense`` layer and an ``NN.Layer`` are created under the same random
    seed (42), run on ``inputs``, and compared on both the forward output and
    the gradients with respect to the inputs, the kernel and the bias.

    :param my_activation: activation handed to ``NN.Layer``.
    :param tf_activation: activation handed to ``Dense``.
    :param loss: callable ``loss(y_true, y_pred)`` used on the Keras side.
    :param inputs: input tensor; its ``shape`` is used to build both layers.
    :param y_true: targets passed to ``loss``.
    :param units: number of output units of both layers.
    :raises AssertionError: if outputs or any gradient differ (``np.allclose``).
    """
    # --- Reference side: Keras Dense + automatic differentiation ---
    tf.random.set_seed(42)
    reference_layer = Dense(units, activation=tf_activation)
    reference_layer.build(inputs.shape)
    watched = [inputs, *reference_layer.trainable_weights]
    with tf.GradientTape(persistent=True) as tape:
        tape.watch(watched)
        pred_tf = reference_layer(inputs)
        loss_value = loss(y_true, pred_tf)
    # The last requested gradient (w.r.t. the prediction) is the upstream
    # gradient fed into the hand-written backprop below.
    *grads_tf, dY = tape.gradient(loss_value, [*watched, pred_tf])

    # --- Side under test: hand-written layer, same seed ---
    tf.random.set_seed(42)
    candidate_layer = NN.Layer(units, my_activation)
    candidate_layer.build(inputs.shape)
    pred_my = candidate_layer(inputs)
    dX, [dW, dB] = candidate_layer.backprop(dY)

    assert np.allclose(pred_my, pred_tf)
    for grad_my, grad_tf in zip([dX, dW, dB], grads_tf):
        assert np.allclose(grad_my, grad_tf)
class FB_Classifier(Model):
    """Classifier that uses sentence embeddings to classify sentence -> DDC.

    The model is a sentence encoder loaded through ``hub.KerasLayer`` followed
    by two dense layers: a ReLU layer with ``ENCODER_OUTDIM`` units and a
    linear output layer with ``output_dim`` units.
    """

    def __init__(self, input_dim=(1, None), output_dim=None):
        """Create the encoder and the dense head.

        :param input_dim: kept for interface compatibility; not used here.
        :param output_dim: number of units of the final dense layer.
        """
        # Zero-argument super(): `super(self.__class__, self)` recurses
        # forever as soon as this class is subclassed.
        super().__init__()
        self.embed = hub.KerasLayer(PARAGRAPH_ENCODER, input_shape=(1,),
                                    trainable=TRAIN_ENCODER)
        # TODO: possibly move the dense part into its own layer
        # (https://www.tensorflow.org/guide/keras/custom_layers_and_models#the_model_class)
        self.d1 = Dense(ENCODER_OUTDIM, activation='relu')
        self.d2 = Dense(output_dim)

    def call(self, inputs, training=True):
        """Forward pass: encoder -> ReLU dense -> linear dense.

        The logic lives in ``call`` (not in an overridden ``__call__``) so
        that Keras' own ``__call__`` keeps handling building and tracking.

        :param inputs: batch of sentences fed to the encoder.
        :param training: accepted for the Keras interface; not used here.
        :return: output of the final dense layer, ``(batch, output_dim)``.
        """
        # Shape notes assume a batch of 32 and ENCODER_OUTDIM == 512 -- TODO confirm
        x = self.embed(inputs)  # inputs: (32,)
        x = self.d1(x)          # x: (32, 512)
        return self.d2(x)       # (32, output_dim)

    # TODO: custom loss - see
    # https://www.tensorflow.org/guide/keras/custom_layers_and_models#the_add_loss_method

    def summary(self, *args, **kwargs):
        """Build the dense head if necessary and print the Keras summary.

        Extra arguments are forwarded to ``Model.summary``.
        """
        self.build([])
        # Both dense layers receive ENCODER_OUTDIM features: d1 consumes the
        # encoder output (assumed to be ENCODER_OUTDIM wide -- TODO confirm)
        # and d2 consumes d1's ENCODER_OUTDIM units.
        dense_input_shape = (None, ENCODER_OUTDIM)
        # Only build layers that have no weights yet, so that calling
        # summary() on a trained model never re-creates its variables.
        if not self.d1.built:
            self.d1.build(dense_input_shape)
        if not self.d2.built:
            self.d2.build(dense_input_shape)
        # Delegate to Keras; calling self.summary() here recursed forever.
        super().summary(*args, **kwargs)
class SpanBegin(Layer):
    """Scores each position as a span start.

    Takes ``[merged_context, modeled_passage]``, concatenates them on the last
    axis, applies a single-unit dense layer per time step and soft-maxes the
    resulting scores.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring dense layer for the concatenated feature size."""
        context_shape = input_shape[0]
        passage_shape = input_shape[1]
        concat_dim = context_shape[-1] + passage_shape[-1]
        dense_input_shape = context_shape[:-1] + (concat_dim, )

        self.dense_1 = Dense(units=1)
        self.dense_1.build(dense_input_shape)
        self.trainable_weights = self.dense_1.trainable_weights
        super().build(input_shape)

    def call(self, inputs):
        """Return the softmax over per-position start scores."""
        merged_context, modeled_passage = inputs
        features = K.concatenate([merged_context, modeled_passage])
        scores = TimeDistributed(self.dense_1)(features)
        # Drop the trailing unit axis before normalising over positions.
        flat_scores = K.squeeze(scores, axis=-1)
        return Softmax()(flat_scores)

    def compute_output_shape(self, input_shape):
        """Output has the shape of ``merged_context`` without its last axis."""
        merged_context_shape, _ = input_shape
        return merged_context_shape[:-1]

    def get_config(self):
        return super().get_config()
def vanilla_export(self):
    """Return a plain ``Dense`` layer carrying this layer's effective weights.

    The exported kernel is this layer's kernel multiplied by
    ``self._get_coef()``; the bias (when used) is copied unchanged. Note that
    ``self._kwargs`` is updated in place with the layer name.
    """
    self._kwargs["name"] = self.name
    exported = Dense(
        units=self.units,
        activation=self.activation,
        use_bias=self.use_bias,
        kernel_initializer="glorot_uniform",
        bias_initializer="zeros",
        kernel_regularizer=None,
        bias_regularizer=None,
        activity_regularizer=None,
        kernel_constraint=None,
        bias_constraint=None,
        **self._kwargs)
    exported.build(self.input_shape)

    scaled_kernel = self.kernel.numpy() * self._get_coef()
    exported.kernel.assign(scaled_kernel)
    if self.use_bias:
        exported.bias.assign(self.bias.numpy())
    return exported
class CumulativeSetAttentionLayer(tf.keras.layers.Layer):
    """Multi-head attention scores computed from a cumulative set encoding.

    Each element is encoded by ``psi``, aggregated with a cumulative segment
    mean, transformed by ``rho`` and concatenated with the raw input. Keys are
    projected from that concatenation and matched against one learned query
    per head; the scaled dot products are returned (no softmax is applied).
    """

    # Options shared by every dense layer created by this class.
    dense_options = {'activation': 'relu', 'kernel_initializer': 'he_uniform'}

    def __init__(self, n_layers=2, width=128, latent_width=128,
                 aggregation_function='mean', dot_prod_dim=64, n_heads=4,
                 attn_dropout=0.3):
        super().__init__()
        assert aggregation_function == 'mean'
        self.width = width
        self.dot_prod_dim = dot_prod_dim
        self.attn_dropout = attn_dropout
        self.n_heads = n_heads

        # psi: per-element encoder; rho: transform of the aggregated encoding.
        self.psi = build_dense_dropout_model(
            n_layers, width, 0., self.dense_options)
        self.psi.add(Dense(latent_width, **self.dense_options))
        self.rho = Dense(latent_width, **self.dense_options)

    def build(self, input_shape):
        """Build ``psi``/``rho`` and create the key and query weights."""
        self.psi.build(input_shape)
        encoded_shape = self.psi.compute_output_shape(input_shape)
        self.rho.build(encoded_shape)

        key_input_dim = encoded_shape[-1] + input_shape[-1]
        key_output_dim = self.dot_prod_dim * self.n_heads
        self.W_k = self.add_weight(
            'W_k', (key_input_dim, key_output_dim), initializer='he_uniform')
        self.W_q = self.add_weight(
            'W_q', (self.n_heads, self.dot_prod_dim),
            initializer=tf.keras.initializers.Zeros())

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.n_heads)

    def call(self, inputs, segment_ids, training=None):
        """Return the pre-softmax attention score of every element per head."""
        if training is None:
            training = tf.keras.backend.learning_phase()

        encoded = self.psi(inputs)
        # Cumulative mean aggregation over each segment.
        aggregated = cumulative_segment_mean(encoded, segment_ids)
        aggregated = self.rho(aggregated)

        key_input = tf.concat([inputs, aggregated], axis=-1)
        flat_keys = tf.matmul(key_input, self.W_k)
        per_head_keys = tf.split(flat_keys, self.n_heads, -1)
        keys = tf.stack(per_head_keys, 1)
        # should have shape (el, heads, 1, dot_prod_dim)
        keys = tf.expand_dims(keys, axis=2)

        # should have shape (1, heads, dot_prod_dim, 1)
        queries = tf.expand_dims(self.W_q, -1)
        queries = tf.expand_dims(queries, 0)

        scale = tf.sqrt(float(self.dot_prod_dim))
        preattn = tf.matmul(keys, queries) / scale
        preattn = tf.squeeze(preattn, -1)
        preattn = tf.squeeze(preattn, -1)
        return preattn
def build(config, classes, softmax=True, scale_adjust_wb=None):
    """Construct the convolutional classification network.

    Five ``Conv2D`` (ReLU) + ``MaxPooling2D`` stages are followed by dropout,
    flattening and a dense head ending in ``classes`` outputs.

    :param config: object exposing ``input_shape`` for the first convolution.
    :param classes: number of outputs of the final dense layer.
    :param softmax: when true, append a softmax activation.
    :param scale_adjust_wb: optional ``(scale, offset)`` pair; when given, a
        frozen dense layer with kernel ``np.diag(scale)`` and bias ``offset``
        is appended.
    :return: the constructed ``Sequential`` model.
    """
    model = Sequential()

    # (filters, kernel size) of the five CONV => RELU => POOL stages.
    conv_stages = (
        (48, (7, 7)),
        (96, (5, 5)),
        (128, (3, 3)),
        (96, (3, 3)),
        (64, (1, 1)),
    )
    for stage_index, (filters, kernel_size) in enumerate(conv_stages):
        # Only the very first layer declares the model input shape.
        if stage_index == 0:
            conv = Conv2D(filters, kernel_size, activation='relu',
                          input_shape=config.input_shape)
        else:
            conv = Conv2D(filters, kernel_size, activation='relu')
        model.add(conv)
        model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Dropout(0.3))
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(16, activation='relu'))

    # final layer
    model.add(Dense(classes))
    if softmax:
        model.add(Activation("softmax"))

    if scale_adjust_wb is not None:
        # A Lambda layer (scale * x + offset) doesn't save/load well because
        # it is a custom object, so an equivalent frozen dense layer is used.
        scale_shape = (None, classes)
        scale_layer = Dense(
            classes,
            trainable=False,
            input_shape=scale_shape,
        )
        scale_layer.build(input_shape=scale_shape)
        scale_kernel = np.diag(scale_adjust_wb[0])
        scale_layer.set_weights([scale_kernel, scale_adjust_wb[1]])
        model.add(scale_layer)

    # return the constructed network architecture
    return model
def begin_insert_layer(self, layer_dim):
    """Insert a new ReLU dense layer in front of the current first layer.

    The new layer takes ``layer_dim`` input features and outputs as many
    units as the current first layer's kernel has rows.

    :param layer_dim: number of input features of the new first layer.
    """
    # `get_weights()` -> [weights, bias]; row count of the kernel is the
    # number of units the new layer has to produce.
    first_kernel = self.layers[0].get_weights()[0]
    new_first_layer = Dense(
        units=first_kernel.shape[0],
        activation=tf.nn.relu,
        kernel_regularizer=regularizers.l1_l2(l1=self.l1, l2=self.l2),
        kernel_initializer=initializers.GlorotNormal(seed=self.seed),
        bias_initializer=initializers.Zeros())
    new_first_layer.build(input_shape=(None, layer_dim))
    self.layers.insert(0, new_first_layer)
def build(self, input_shape):
    """Creates the layer neurons and connections.

    Weight creation is delegated to ``Dense.build``; afterwards
    ``self.init_neurons`` is called with the same ``input_shape``.

    Parameters
    ----------

    input_shape: Union[list, tuple, Any]
        Shape of the future layer input, used for weight shape computations.
        It is forwarded unchanged to both ``Dense.build`` and
        ``self.init_neurons``.
    """
    Dense.build(self, input_shape)
    self.init_neurons(input_shape)
def _create_dense_layer(self, _, normalized_weights, num_classes):
    """Create a frozen, bias-free dense layer holding ``normalized_weights``.

    :param _: ignored positional argument.
    :param normalized_weights: variable whose ``read_value()`` becomes the
        kernel of the new layer.
    :param num_classes: number of output units.
    :return: the built ``Dense`` layer (expects 512 input features).
    """
    embedding_dim = 512
    classifier = Dense(
        input_shape=(embedding_dim, ),
        units=num_classes,
        use_bias=False,
        name='fully_connected_to_softmax_crossentropy',
        dtype='float32',
        trainable=False,
    )
    classifier.build(tf.TensorShape([None, embedding_dim]))
    kernel_value = normalized_weights.read_value()
    classifier.set_weights([kernel_value])
    return classifier
class SpanEnd(Layer):
    """Scores each position as a span end, conditioned on the span start.

    Inputs are ``[encoded_passage, merged_context, modeled_passage,
    span_begin_probabilities]``. The passage is summarised by the start
    probabilities, combined with the context features, run through a
    bidirectional LSTM and scored per position with a single-unit dense layer.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the BiLSTM and the scoring dense layer."""
        first_shape = input_shape[0]
        leading_dims = first_shape[:-1]
        emdim = first_shape[-1] // 2

        self.bilstm_1 = Bidirectional(LSTM(emdim, return_sequences=True))
        self.bilstm_1.build(leading_dims + (emdim * 14, ))

        self.dense_1 = Dense(units=1)
        self.dense_1.build(leading_dims + (emdim * 10, ))

        self.trainable_weights = (self.bilstm_1.trainable_weights +
                                  self.dense_1.trainable_weights)
        super().build(input_shape)

    def call(self, inputs):
        """Return the softmax over per-position end scores."""
        (encoded_passage, merged_context, modeled_passage,
         span_begin_probabilities) = inputs

        # Passage summary weighted by the predicted start distribution.
        begin_weights = K.expand_dims(span_begin_probabilities, axis=-1)
        weighted_sum = K.sum(begin_weights * modeled_passage, -2)
        passage_summary = K.expand_dims(weighted_sum, axis=1)

        # Repeat the summary along the time axis of the passage.
        passage_length = K.shape(encoded_passage)[1]
        tile_shape = K.concatenate([[1], [passage_length], [1]], axis=0)
        passage_summary = K.tile(passage_summary, tile_shape)

        interaction = modeled_passage * passage_summary
        representation = K.concatenate([
            merged_context, modeled_passage, passage_summary, interaction
        ])
        representation = self.bilstm_1(representation)

        scoring_input = K.concatenate([merged_context, representation])
        scores = TimeDistributed(self.dense_1)(scoring_input)
        flat_scores = K.squeeze(scores, axis=-1)
        return Softmax()(flat_scores)

    def compute_output_shape(self, input_shape):
        """Output has the shape of ``merged_context`` without its last axis."""
        _, merged_context_shape, _, _ = input_shape
        return merged_context_shape[:-1]

    def get_config(self):
        return super().get_config()
def build(self, input_shape):
    """Creates the layer neurons and connections.

    Weight creation is delegated to ``Dense.build``; the neurons are then
    initialised from the shape expressed as a plain list. When the
    ``bias_relaxation`` option of the ``cell`` config section is enabled,
    ``self.update_b()`` is called afterwards.

    Parameters
    ----------

    input_shape: Union[list, tuple, Any]
        Shape of the future layer input. Objects exposing ``as_list()`` (such
        as ``tf.TensorShape``) and plain lists/tuples are both accepted.
    """
    Dense.build(self, input_shape)
    # Plain tuples/lists have no `as_list`; previously they raised
    # AttributeError here even though Dense.build accepts them.
    if hasattr(input_shape, 'as_list'):
        shape_as_list = input_shape.as_list()
    else:
        shape_as_list = list(input_shape)
    self.init_neurons(shape_as_list)
    if self.config.getboolean('cell', 'bias_relaxation'):
        self.update_b()
class Highway(Layer):
    """Highway layer: a gated mix of a transformed input and the input itself.

    ``output = T(x) * H(x) + (1 - T(x)) * x`` where ``T`` is a sigmoid dense
    gate and ``H`` is a dense layer followed by ``activation``.
    """

    activation = None
    transform_gate_bias = None

    def __init__(self, activation='relu', transform_gate_bias=-1, **kwargs):
        """
        :param activation: activation applied to the transformed data.
        :param transform_gate_bias: constant used to initialise the gate bias.
        """
        self.activation = activation
        self.transform_gate_bias = transform_gate_bias
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the gate (``dense_1``) and transform (``dense_2``) layers."""
        dim = input_shape[-1]
        gate_bias_initializer = Constant(self.transform_gate_bias)

        self.dense_1 = Dense(units=dim, bias_initializer=gate_bias_initializer)
        self.dense_1.build(input_shape)

        self.dense_2 = Dense(units=dim)
        self.dense_2.build(input_shape)

        self.trainable_weights = (self.dense_1.trainable_weights +
                                  self.dense_2.trainable_weights)
        # Be sure to call this at the end
        super().build(input_shape)

    def call(self, x):
        """Apply the highway transformation to ``x``."""
        dim = K.int_shape(x)[-1]

        gate_logits = self.dense_1(x)
        transform_gate = Activation("sigmoid")(gate_logits)
        carry_gate = Lambda(lambda gate: 1.0 - gate,
                            output_shape=(dim, ))(transform_gate)

        transformed = self.dense_2(x)
        transformed = Activation(self.activation)(transformed)

        gated_transform = Multiply()([transform_gate, transformed])
        gated_identity = Multiply()([carry_gate, x])
        return Add()([gated_transform, gated_identity])

    def compute_output_shape(self, input_shape):
        return input_shape

    def get_config(self):
        config = super().get_config()
        config.update(
            activation=self.activation,
            transform_gate_bias=self.transform_gate_bias,
        )
        return config
class FullyConnected(ModuleNative):
    def __init__(self, out_features: int, activation=None):
        """
        A simple fully connected layer (aka Linear Layer or Dense).

        It computes Wx+b with an optional activation function.

        :param out_features: The number of output features.
        :param activation: The activation function that should be added after the fc layer.
        """
        super().__init__()
        self.out_features = out_features
        self.activation = Activation(activation)

    @RunOnlyOnce
    def _build_pytorch(self, features):
        """Create the torch linear layer from the last dimension of ``features``."""
        import torch
        from babilim.core.tensor_pt import Tensor as _Tensor

        linear = torch.nn.Linear(features.shape[-1], self.out_features)
        self.linear = linear
        # Expose the native parameters through babilim tensor wrappers.
        self.weight = _Tensor(data=None, trainable=True, native=linear.weight)
        self.bias = _Tensor(data=None, trainable=True, native=linear.bias)
        if torch.cuda.is_available():
            # FIXME shouldn't this be done automatically?
            self.linear = self.linear.to(torch.device("cuda"))

    def _call_pytorch(self, features):
        projected = self.linear(features)
        return self.activation(projected)

    @RunOnlyOnce
    def _build_tf(self, features):
        """Create and build the Keras dense layer for the shape of ``features``."""
        from tensorflow.keras.layers import Dense
        from babilim.core.tensor_tf import Tensor as _Tensor

        linear = Dense(self.out_features)
        self.linear = linear
        linear.build(features.shape)
        # Expose the native variables through babilim tensor wrappers.
        self.weight = _Tensor(data=None, trainable=True, native=linear.kernel)
        self.bias = _Tensor(data=None, trainable=True, native=linear.bias)

    def _call_tf(self, features):
        projected = self.linear(features)
        return self.activation(projected)
def wider(self, added_size=1, pos_layer=None):
    """Widen one layer by ``added_size`` units (Net2WiderNet style).

    The layer at ``pos_layer`` and the layer after it are replaced by freshly
    built dense layers whose weights come from ``net2wider``. Both
    replacements keep the activation of the layer they replace, so widening
    the layer in front of the output layer no longer turns the output
    activation into ReLU.

    :param added_size: number of units to add to the selected layer.
    :param pos_layer: index of the layer to widen, in
        ``[0, len(self.layers) - 2]``; defaults to the second-to-last layer.
    :raises ValueError: if there are fewer than two layers or ``pos_layer``
        is out of range.
    """
    layers_size = len(self.layers)
    if layers_size < 2:
        raise ValueError("Number of layers must be at least 2.")
    if pos_layer is None:
        pos_layer = max(layers_size - 2, 0)
    elif pos_layer >= layers_size - 1 or pos_layer < 0:
        raise ValueError(
            "pos_layer is expected less than length of layers (pos_layer in [0, layers_size-2])"
        )

    current_layer = self.layers[pos_layer]
    following_layer = self.layers[pos_layer + 1]

    # TODO: get biggest value to divide for new weights
    weights, bias = current_layer.get_weights()
    weights_next_layer, bias_next_layer = following_layer.get_weights()
    new_weights, new_bias, new_weights_next_layer = net2wider(
        weights, bias, weights_next_layer, added_size)

    src_units, des_units = weights.shape[0], weights.shape[1] + added_size
    next_des_units = weights_next_layer.shape[1]

    # input_shape = (batch_size, input_features).
    # input_features = number of units in layer = output of previous layer
    wider_layer = Dense(
        units=des_units,
        # Reuse the replaced layer's activation; fall back to ReLU when the
        # layer object does not expose one.
        activation=getattr(current_layer, 'activation', tf.nn.relu),
        kernel_regularizer=regularizers.l1_l2(l1=self.l1, l2=self.l2))
    wider_layer.build(input_shape=(None, src_units))
    wider_layer.set_weights([new_weights, new_bias])

    next_layer = Dense(
        units=next_des_units,
        activation=getattr(following_layer, 'activation', tf.nn.relu),
        kernel_regularizer=regularizers.l1_l2(l1=self.l1, l2=self.l2))
    next_layer.build(input_shape=(None, des_units))
    next_layer.set_weights([new_weights_next_layer, bias_next_layer])

    self.layers[pos_layer] = wider_layer
    self.layers[pos_layer + 1] = next_layer
class Attention(Layer):
    """
    Implementing attention Layer.

    The layer takes ``[features, hidden]`` and returns the attention-weighted
    features together with the attention weights:
    ``score = tanh(W1(features) + W2(hidden))``,
    ``weights = softmax(V(score), axis=1)``.

    References:
    1. https://androidkt.com/text-classification-using-attention-mechanism-in-keras/
    """

    def __init__(self, units=256, **kwargs):
        """
        :param units: width of the intermediate score representation.
        """
        # Initialise the base layer first so that sub-layers assigned below
        # are registered on a fully constructed object.
        super(Attention, self).__init__(**kwargs)
        self.units = units
        self.W1 = Dense(units, use_bias=False)
        self.W2 = Dense(units, use_bias=False)
        self.V = Dense(1, use_bias=False)

    def build(self, input_shape):
        """Build the three projections.

        :param input_shape: ``[features_shape, hidden_shape]``.
        """
        features_shape, hidden_shape = input_shape[0], input_shape[1]
        self.W1.build(features_shape)
        # `hidden` gets a time axis of length 1 in `call`.
        self.W2.build((hidden_shape[0], 1, hidden_shape[1]))
        # V is applied to the score, whose last dimension is `units` and not
        # the feature dimension.
        score_shape = tuple(features_shape[:-1]) + (self.units, )
        self.V.build(score_shape)

        # Register sub-layer weights that the base class does not already
        # report (some Keras versions track sub-layers automatically), one
        # variable at a time so the weight list stays flat.
        already_tracked = {id(w) for w in self.trainable_weights}
        for sublayer in (self.W1, self.W2, self.V):
            for weight in sublayer.trainable_weights:
                if id(weight) not in already_tracked:
                    self._trainable_weights.append(weight)
                    already_tracked.add(id(weight))

        super(Attention, self).build(input_shape)
        self.built = True

    def call(self, inputs, mask=None):
        """Return ``(context_vector, attention_weights)``."""
        features, hidden = inputs
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))
        attention_weights = tf.nn.softmax(self.V(score), axis=1)
        context_vector = attention_weights * features
        return tf.reshape(context_vector, tf.shape(features)), attention_weights

    def compute_mask(self, inputs, mask=None):
        """Pass the incoming mask through unchanged."""
        return mask
def build(self, input_shape) -> None:
    """Build the Graph Convolution layer.

    One dense layer is created and built per node-feature shape (every
    element of ``input_shape`` after the first). The optional dropout and L2
    normalisation layers are built with the last node-feature shape; when
    disabled they are replaced by identity functions.

    Parameters
    ------------------------------
    input_shape
        Shape of the output of the previous layer: the adjacency matrix shape
        followed by the node feature shape(s).

    Raises
    ------------------------------
    ValueError
        If ``input_shape`` has fewer than two elements.
    """
    provided = len(input_shape)
    if provided == 0:
        raise ValueError(
            "The provided input of the Graph Convolution layer "
            "is empty. It should contain exactly two elements, "
            "the adjacency matrix and the node features.")
    if provided == 1:
        raise ValueError(
            "The provided input of the Graph Convolution layer "
            "has a single element. It should contain exactly two elements, "
            "the adjacency matrix and the node features.")

    feature_shapes = input_shape[1:]
    for feature_shape in feature_shapes:
        projection = Dense(
            units=self._units,
            activation=self._activation,
        )
        projection.build(feature_shape)
        self._dense_layers.append(projection)

    # The auxiliary layers are built with the last node-feature shape.
    last_feature_shape = feature_shapes[-1]

    if self._dropout_rate is None:
        self._dropout_layer = lambda x: x
    else:
        self._dropout_layer = Dropout(self._dropout_rate)
        self._dropout_layer.build(last_feature_shape)

    if not self._apply_norm:
        self._l2_norm = lambda x: x
    else:
        self._l2_norm = L2Norm()
        self._l2_norm.build(last_feature_shape)

    super().build(input_shape)
def deeper(self, pos_layer=None):
    """Insert a new ReLU dense layer right after ``pos_layer``.

    The new layer has as many units as the selected layer has outputs and is
    initialised with the weights returned by ``net2deeper``.

    :param pos_layer: index of the layer to deepen, in
        ``[0, len(self.layers) - 2]``; defaults to the second-to-last layer.
    :raises ValueError: if an explicit ``pos_layer`` is out of range.
    """
    layers_size = len(self.layers)
    if pos_layer is None:
        pos_layer = max(layers_size - 2, 0)
    elif not 0 <= pos_layer < layers_size - 1:
        raise ValueError(
            "pos_layer is expected less than length of layers (pos_layer in [0, layers_size-2])."
        )

    weights, _ = self.layers[pos_layer].get_weights()
    new_weights, new_bias = net2deeper(weights)
    des_units = weights.shape[1]

    # TODO: add initial kernel
    inserted = Dense(
        units=des_units,
        activation=tf.nn.relu,
        kernel_regularizer=regularizers.l1_l2(l1=self.l1, l2=self.l2),
    )
    # The new layer maps des_units inputs onto des_units outputs.
    inserted.build(input_shape=(None, des_units))
    inserted.set_weights([new_weights, new_bias])
    self.layers.insert(pos_layer + 1, inserted)
def last_insert_layer(self, layer_dim):
    """Append a new sigmoid output layer with ``layer_dim`` units.

    The current last layer is replaced by a ReLU dense layer carrying the
    same weights, and a freshly initialised sigmoid layer is appended after
    it.

    :param layer_dim: number of units of the new last layer.
    """
    last_index = len(self.layers) - 1
    prev_weights, prev_bias = self.layers[last_index].get_weights()
    prev_inputs, prev_units = prev_weights.shape[0], prev_weights.shape[1]

    # Same weights as the old last layer, but with a ReLU activation.
    replacement = Dense(
        units=prev_units,
        activation=tf.nn.relu,
        kernel_regularizer=regularizers.l1_l2(l1=self.l1, l2=self.l2),
    )
    replacement.build(input_shape=(None, prev_inputs))
    replacement.set_weights([prev_weights, prev_bias])

    new_output = Dense(
        units=layer_dim,
        activation=tf.nn.sigmoid,
        kernel_regularizer=regularizers.l1_l2(l1=self.l1, l2=self.l2),
        kernel_initializer=initializers.GlorotNormal(seed=self.seed),
        bias_initializer=initializers.Zeros())
    new_output.build(input_shape=(None, prev_units))

    del self.layers[len(self.layers) - 1]
    for layer in (replacement, new_output):
        self.layers.append(layer)
def __init__(self, input_dim, output_dim=2, hidden_dims=None, l1=0.01, l2=0.01, seed=6):
    """Create the stack of dense layers.

    Every hidden layer is a ReLU dense layer; the final (latent /
    reconstruction) layer uses a sigmoid. All layers are built immediately
    and stored in ``self.layers``.

    :param input_dim: number of input features of the first layer.
    :param output_dim: number of units of the final layer.
    :param hidden_dims: iterable with the unit count of each hidden layer;
        ``None`` means no hidden layers.
    :param l1: L1 factor of the kernel regulariser.
    :param l2: L2 factor of the kernel regulariser.
    :param seed: seed of the Glorot-normal kernel initialisers.
    """
    super(PartCoder, self).__init__()
    self.l1 = l1
    self.l2 = l2
    self.seed = seed
    self.layers = []

    # The default `None` used to crash in the loop below.
    if hidden_dims is None:
        hidden_dims = []

    _input_dim = input_dim
    for dim in hidden_dims:
        layer = Dense(
            units=dim,
            activation=tf.nn.relu,
            kernel_regularizer=regularizers.l1_l2(l1=self.l1, l2=self.l2),
            kernel_initializer=initializers.GlorotNormal(seed=self.seed),
            bias_initializer=initializers.Zeros())
        layer.build(input_shape=(None, _input_dim))
        # The next layer consumes this layer's outputs.
        _input_dim = dim
        self.layers.append(layer)

    # Final, adding output_layer (latent/reconstruction layer). It uses the
    # configured seed like every other layer instead of a hard-coded 6
    # (identical for the default `seed=6`).
    layer = Dense(
        units=output_dim,
        activation=tf.nn.sigmoid,
        kernel_regularizer=regularizers.l1_l2(l1=self.l1, l2=self.l2),
        kernel_initializer=initializers.GlorotNormal(seed=self.seed),
        bias_initializer=initializers.Zeros())
    layer.build(input_shape=(None, _input_dim))
    self.layers.append(layer)
class MDN(Layer):
    """A Mixture Density Network Layer for Keras.

    The layer concatenates three dense heads: the mixture means, the mixture
    standard deviations (activation ``elu_plus_one_plus_epsilon``, which keeps
    them away from very small values) and the mixture weights.

    A loss function needs to be constructed with the same output dimension
    and number of mixtures. A sampling function is also provided to sample
    from the distribution parametrised by the MDN outputs.

    NOTE(review): the means use a sigmoid and the mixture weights a softmax
    activation here, while earlier comments described them as "no activation"
    and "logits" -- confirm that the loss/sampling code expects this.
    """

    def __init__(self, output_dimension, num_mixtures, **kwargs):
        """
        :param output_dimension: dimensionality of the modelled variable.
        :param num_mixtures: number of mixture components.
        """
        self.output_dim = output_dimension
        self.num_mix = num_mixtures
        per_mixture_outputs = self.num_mix * self.output_dim
        with tf.name_scope('MDN'):
            # mix*output values, sigmoid activation
            self.mdn_mus = Dense(
                per_mixture_outputs, name='mdn_mus', activation='sigmoid')
            # mix*output values, ELU + 1 + epsilon activation
            self.mdn_sigmas = Dense(
                per_mixture_outputs,
                activation=elu_plus_one_plus_epsilon,
                name='mdn_sigmas')
            # mix values, softmax activation
            self.mdn_pi = Dense(
                self.num_mix, name='mdn_pi', activation='softmax')
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Build the three heads and expose their weights on this layer."""
        self.mdn_mus.build(input_shape)
        self.mdn_sigmas.build(input_shape)
        self.mdn_pi.build(input_shape)
        self._trainable_weights = (self.mdn_mus.trainable_weights +
                                   self.mdn_sigmas.trainable_weights +
                                   self.mdn_pi.trainable_weights)
        self._non_trainable_weights = (self.mdn_mus.non_trainable_weights +
                                       self.mdn_sigmas.non_trainable_weights +
                                       self.mdn_pi.non_trainable_weights)
        super().build(input_shape)

    def call(self, x, mask=None):
        """Return the concatenated ``[mus, sigmas, pi]`` outputs."""
        with tf.name_scope('MDN'):
            heads = [self.mdn_mus(x), self.mdn_sigmas(x), self.mdn_pi(x)]
            return keras.layers.concatenate(heads, name='mdn_outputs')

    def compute_output_shape(self, input_shape):
        """Returns output shape, showing the number of mixture parameters."""
        parameter_count = (2 * self.output_dim * self.num_mix) + self.num_mix
        return (input_shape[0], parameter_count)

    def get_config(self):
        base_config = super().get_config()
        # Layer-specific entries override same-named base entries.
        return {
            **base_config,
            "output_dimension": self.output_dim,
            "num_mixtures": self.num_mix,
        }
class SnailAttention(Layer):
    """
    Causal attention block that appends the attention read-out to its input.

    Based on work of Mishra et al., 2018
    https://openreview.net/pdf?id=B1DmUzWAW
    Adopting code from
    https://github.com/philipperemy/keras-snail-attention/blob/master/attention.py
    """

    def __init__(self, dims, k_size, v_size, seq_len=None, **kwargs):
        """
        :param dims: feature size of the input the projections are built for.
        :param k_size: size of keys and queries.
        :param v_size: size of the values (added to the output feature size).
        :param seq_len: stored on the instance; not used by this class.
        """
        self.k_size = k_size
        self.seq_len = seq_len
        self.v_size = v_size
        self.dims = dims
        self.sqrt_k = math.sqrt(k_size)
        self.keys_fc = None
        self.queries_fc = None
        self.values_fc = None
        super().__init__(**kwargs)

    def _make_projection(self, units, name):
        """Create a dense projection, build it and register its weights."""
        projection = Dense(units, name=name)
        projection.build((None, self.dims))
        self._trainable_weights.extend(projection.trainable_weights)
        return projection

    def build(self, input_shape):
        """Create the key, query and value projections (in that order)."""
        # https://stackoverflow.com/questions/54194724/how-to-use-keras-layers-in-custom-keras-layer
        self.keys_fc = self._make_projection(self.k_size, "Keys_SnailAttn")
        self.queries_fc = self._make_projection(self.k_size, "Queries_SnailAttn")
        self.values_fc = self._make_projection(self.v_size, "Values_SnailAttn")

    def __call__(self, inputs, **kwargs):
        """Return ``inputs`` concatenated with the causal attention read-out."""
        if not self.built:
            self._maybe_build(inputs)
        # check that the implementation matches exactly py torch.
        keys = self.keys_fc(inputs)
        queries = self.queries_fc(inputs)
        values = self.values_fc(inputs)

        transposed_keys = K.permute_dimensions(keys, (0, 2, 1))
        logits = K.batch_dot(queries, transposed_keys)

        # -inf strictly above the diagonal, 0 elsewhere: a position cannot
        # attend to later positions.
        score_shape = logits.shape.as_list()[1:]
        causal_mask = np.triu((-np.inf) * np.ones(score_shape), k=1)
        mask = K.ones_like(logits) * causal_mask
        logits = mask + logits

        softmax = Softmax(axis=-1, name="Softmax_SnailAttn")
        probs = softmax(logits / self.sqrt_k)
        read = K.batch_dot(probs, values)
        return K.concatenate([inputs, read], axis=-1)

    def compute_output_shape(self, input_shape):
        """Same as the input shape with ``v_size`` added to the last axis."""
        *leading_dims, feature_dim = input_shape
        return (*leading_dims, feature_dim + self.v_size)
class Attention(Layer):
    """
    Layer for implementing two common types of attention mechanisms, i) global (soft) attention
    and ii) local (hard) attention, for two types of sequence tasks, i) many-to-one and
    ii) many-to-many.

    The setting use_bias=False converts the Dense() layers into annotation weight matrices. Softmax
    activation ensures that all weights sum up to 1. Read more here to make more sense of the code
    and implementations:
    i)   https://www.tensorflow.org/beta/tutorials/text/nmt_with_attention
    ii)  https://github.com/philipperemy/keras-attention-mechanism/issues/14
    iii) https://lilianweng.github.io/lil-log/2018/06/24/attention-attention.html

    SUGGESTION: If model doesn't converge or the test accuracy is lower than expected, try playing
    around with the hidden size of the recurrent layers, the batch size in training process, or the
    param @window_width if using a 'local' attention.

    NOTE: This implementation takes the hidden states associated with the last timestep of the input
    sequence as the target hidden state (h_t) as suggested by @felixhao28 in i) for many-to-one
    scenarios. Hence, when trying to predict what word (token) comes after sequence
    ['I', 'love', 'biscuits', 'and'], we take h('and') with shape (1, H) as the target hidden state.
    For many-to-many scenarios, it takes the hidden state associated with the timestep that is being
    currently iterated in the target sequence, usually by a decoder-like architecture.

    @param (str) context: the context of the problem at hand, specify 'many-to-many' for
           sequence-to-sequence tasks such as machine translation and question answering, or
           specify 'many-to-one' for tasks such as sentiment classification and language modelling
    @param (str) alignment_type: type of attention mechanism to be applied, 'local-m' corresponds to
           monotonic alignment where we take the last @window_width timesteps, 'local-p' corresponds
           to having a Gaussian distribution around the predicted aligned position, whereas
           'local-p*' corresponds to the newly proposed method to adaptively learning the unique
           timesteps to give attention (currently only works for many-to-one scenarios)
    @param (int) window_width: width for set of source hidden states in 'local' attention
    @param (str) score_function: alignment score function config; current implementations include
           the 'dot', 'general', and 'location' both by Luong et al. (2015), 'concat' by Bahdanau et
           al. (2015), and 'scaled_dot' by Vaswani et al. (2017)
    @param (str) model_api: specify to use TF's Sequential OR Functional API, note that attention
           weights are not outputted with the former as it only accepts single-output layers
    """

    def __init__(self, context='many-to-many', alignment_type='global', window_width=None,
                 score_function='general', model_api='functional', **kwargs):
        """Validate the configuration and store it on the layer.

        Raises ValueError for unknown option values and for the unsupported combinations
        (a window width with global attention, 'local-p*' in many-to-many scenarios).
        """
        if context not in ['many-to-many', 'many-to-one']:
            raise ValueError("Argument for param @context is not recognized")
        if alignment_type not in ['global', 'local-m', 'local-p', 'local-p*']:
            raise ValueError("Argument for param @alignment_type is not recognized")
        if alignment_type == 'global' and window_width is not None:
            raise ValueError("Can't use windowed approach with global attention")
        if context == 'many-to-many' and alignment_type == 'local-p*':
            raise ValueError("Can't use local-p* approach in many-to-many scenarios")
        if score_function not in ['dot', 'general', 'location', 'concat', 'scaled_dot']:
            raise ValueError("Argument for param @score_function is not recognized")
        if model_api not in ['sequential', 'functional']:
            raise ValueError("Argument for param @model_api is not recognized")
        super(Attention, self).__init__(**kwargs)
        self.context = context
        self.alignment_type = alignment_type
        self.window_width = window_width  # D
        self.score_function = score_function
        self.model_api = model_api

    def get_config(self):
        """Return the layer configuration, including every constructor option."""
        base_config = super(Attention, self).get_config()
        # `context` must be serialised too; otherwise a layer restored from its config
        # silently falls back to the default 'many-to-many'.
        base_config['context'] = self.context
        base_config['alignment_type'] = self.alignment_type
        base_config['window_width'] = self.window_width
        base_config['score_function'] = self.score_function
        base_config['model_api'] = self.model_api
        return base_config

    def build(self, input_shape):
        """Read the dimensions from ``input_shape`` and create the required weight matrices."""
        # Declare attributes for easy access to dimension values
        if self.context == 'many-to-many':
            self.input_sequence_length, self.hidden_dim = input_shape[0][1], input_shape[0][2]
            self.target_sequence_length = input_shape[1][1]
        elif self.context == 'many-to-one':
            self.input_sequence_length, self.hidden_dim = input_shape[0][1], input_shape[0][2]

        # Build weight matrices for different alignment types and score functions
        if 'local-p' in self.alignment_type:
            self.W_p = Dense(units=self.hidden_dim, use_bias=False)
            self.W_p.build(input_shape=(None, None, self.hidden_dim))    # (B, 1, H)
            self._trainable_weights += self.W_p.trainable_weights

            self.v_p = Dense(units=1, use_bias=False)
            self.v_p.build(input_shape=(None, None, self.hidden_dim))    # (B, 1, H)
            self._trainable_weights += self.v_p.trainable_weights

        if 'dot' not in self.score_function:  # weight matrix not utilized for 'dot' function
            self.W_a = Dense(units=self.hidden_dim, use_bias=False)
            self.W_a.build(input_shape=(None, None, self.hidden_dim))    # (B, S*, H)
            self._trainable_weights += self.W_a.trainable_weights

        if self.score_function == 'concat':  # define additional weight matrices
            self.U_a = Dense(units=self.hidden_dim, use_bias=False)
            self.U_a.build(input_shape=(None, None, self.hidden_dim))    # (B, 1, H)
            self._trainable_weights += self.U_a.trainable_weights

            self.v_a = Dense(units=1, use_bias=False)
            self.v_a.build(input_shape=(None, None, self.hidden_dim))    # (B, S*, H)
            self._trainable_weights += self.v_a.trainable_weights

        super(Attention, self).build(input_shape)

    def call(self, inputs):
        """Compute the context vector (and attention weights) for ``inputs``.

        ``inputs`` is a list ``[source_hidden_states, target_hidden_state]`` with an additional
        ``current_timestep`` third element in many-to-many scenarios. Returns
        ``(context_vector, attention_weights)`` for the functional API and only
        ``context_vector`` for the sequential API.
        """
        # Pass decoder output (prev. timestep) alongside encoder output for all scenarios
        if not isinstance(inputs, list):
            raise ValueError("Pass a list=[encoder_out (Tensor), decoder_out (Tensor)," +
                             "current_timestep (int)] for all scenarios")

        # Specify source and target states (and timestep if applicable) for easy access
        if self.context == 'many-to-one':
            # Get h_t, the current (target) hidden state as the last timestep of input sequence
            target_hidden_state = inputs[1]                               # (B, H)
            source_hidden_states = inputs[0]                              # (B, S, H)
        elif self.context == 'many-to-many':
            # Get h_t, the current (target) hidden state from the previous decoded hidden state
            target_hidden_state = inputs[1]                               # (B, H)
            current_timestep = inputs[2]
            source_hidden_states = inputs[0]                              # (B, S, H)

        # Add time axis to h_t
        target_hidden_state = tf.expand_dims(input=target_hidden_state, axis=1)  # (B, 1, H)

        # Get h_s, source hidden states through specified attention mechanism
        if self.alignment_type == 'global':                               # Global Approach
            pass  # all source hidden states are used as they are        # (B, S, H)

        elif 'local' in self.alignment_type:                              # Local Approach
            # Automatically set window width to default value (8 -> no real logic behind this value)
            self.window_width = 8 if self.window_width is None else self.window_width

            # Get aligned position (between inputs & targets) and derive a context window to focus
            if self.alignment_type == 'local-m':                          # Monotonic Alignment
                # Set alignment position
                if self.context == 'many-to-one':
                    aligned_position = self.input_sequence_length
                elif self.context == 'many-to-many':
                    aligned_position = current_timestep
                # Get window borders, clipped to the valid range [0, input_sequence_length]
                left = int(aligned_position - self.window_width
                           if aligned_position - self.window_width >= 0
                           else 0)
                right = int(aligned_position + self.window_width
                            if aligned_position + self.window_width <= self.input_sequence_length
                            else self.input_sequence_length)
                # Extract window
                source_hidden_states = Lambda(lambda x: x[:, left:right, :])(source_hidden_states)  # (B, S*=(D, 2xD), H)

            elif self.alignment_type == 'local-p':                        # Predictive Alignment
                aligned_position = self.W_p(target_hidden_state)          # (B, 1, H)
                aligned_position = Activation('tanh')(aligned_position)   # (B, 1, H)
                aligned_position = self.v_p(aligned_position)             # (B, 1, 1)
                aligned_position = Activation('sigmoid')(aligned_position)  # (B, 1, 1)
                aligned_position = aligned_position * self.input_sequence_length  # (B, 1, 1)

            elif self.alignment_type == 'local-p*':                       # Completely Predictive Alignment
                aligned_position = self.W_p(source_hidden_states)         # (B, S, H)
                aligned_position = Activation('tanh')(aligned_position)   # (B, S, H)
                aligned_position = self.v_p(aligned_position)             # (B, S, 1)
                aligned_position = Activation('sigmoid')(aligned_position)  # (B, S, 1)
                # Only keep top D values out of the sigmoid activation, and zero-out the rest
                aligned_position = tf.squeeze(aligned_position, axis=-1)  # (B, S)
                top_probabilities = tf.nn.top_k(input=aligned_position,   # (values:(B, D), indices:(B, D))
                                                k=self.window_width,
                                                sorted=False)
                onehot_vector = tf.one_hot(indices=top_probabilities.indices,
                                           depth=self.input_sequence_length)  # (B, D, S)
                onehot_vector = tf.reduce_sum(onehot_vector, axis=1)      # (B, S)
                aligned_position = Multiply()([aligned_position, onehot_vector])  # (B, S)
                aligned_position = tf.expand_dims(aligned_position, axis=-1)  # (B, S, 1)
                initial_source_hidden_states = source_hidden_states       # (B, S, H)
                source_hidden_states = Multiply()([source_hidden_states, aligned_position])  # (B, S*=S(D), H)
                # Scale back-to approximately original hidden state values
                aligned_position += tf.keras.backend.epsilon()            # (B, S, 1)
                source_hidden_states /= aligned_position                  # (B, S*=S(D), H)
                source_hidden_states = initial_source_hidden_states + source_hidden_states  # (B, S, H)

        # Compute alignment score through specified function
        if 'dot' in self.score_function:                                  # Dot Score Function
            attention_score = Dot(axes=[2, 2])([source_hidden_states, target_hidden_state])  # (B, S*, 1)
            if self.score_function == 'scaled_dot':
                attention_score *= 1 / np.sqrt(float(source_hidden_states.shape[2]))  # (B, S*, 1)

        elif self.score_function == 'general':                            # General Score Function
            weighted_hidden_states = self.W_a(source_hidden_states)       # (B, S*, H)
            attention_score = Dot(axes=[2, 2])([weighted_hidden_states, target_hidden_state])  # (B, S*, 1)

        elif self.score_function == 'location':                           # Location-based Score Function
            weighted_target_state = self.W_a(target_hidden_state)         # (B, 1, H)
            attention_score = Activation('softmax')(weighted_target_state)  # (B, 1, H)
            attention_score = RepeatVector(source_hidden_states.shape[1])(attention_score)  # (B, S*, H)
            attention_score = tf.reduce_sum(attention_score, axis=-1)     # (B, S*)
            attention_score = tf.expand_dims(attention_score, axis=-1)    # (B, S*, 1)

        elif self.score_function == 'concat':                             # Concat Score Function
            weighted_hidden_states = self.W_a(source_hidden_states)       # (B, S*, H)
            weighted_target_state = self.U_a(target_hidden_state)         # (B, 1, H)
            weighted_sum = weighted_hidden_states + weighted_target_state  # (B, S*, H)
            weighted_sum = Activation('tanh')(weighted_sum)               # (B, S*, H)
            attention_score = self.v_a(weighted_sum)                      # (B, S*, 1)

        # Compute attention weights
        attention_weights = Activation('softmax')(attention_score)        # (B, S*, 1)

        # Distribute weights around aligned position for local-p approach only
        if self.alignment_type == 'local-p':                              # Gaussian Distribution
            gaussian_estimation = lambda s: tf.exp(-tf.square(s - aligned_position) /
                                                   (2 * tf.square(self.window_width / 2)))
            gaussian_factor = gaussian_estimation(0)
            for i in range(1, self.input_sequence_length):
                gaussian_factor = Concatenate(axis=1)([gaussian_factor, gaussian_estimation(i)])  # (B, S*, 1)
            attention_weights = attention_weights * gaussian_factor       # (B, S*, 1)

        # Derive context vector
        context_vector = source_hidden_states * attention_weights         # (B, S*, H)

        if self.model_api == 'functional':
            return context_vector, attention_weights
        elif self.model_api == 'sequential':
            return context_vector
class SRFR(Model):
    """Model chaining an input convolution, a super-resolution network and a
    face-recognition network, plus classification heads used while training.

    A "synthetic" input branch is always created; a "natural" input branch
    (and its classification head) is created only when ``both`` is true.
    """

    def __init__(
        self,
        num_filters: int = 62,
        depth: int = 50,
        categories: int = 512,
        num_gc: int = 32,
        num_blocks: int = 23,
        residual_scailing: float = 0.2,
        training: bool = True,
        input_shape=(28, 28, 3),
        num_classes_syn: int = None,
        both: bool = False,
        num_classes_nat: int = None,
        scale: int = 64,
    ):
        """Create the sub-networks.

        Args:
            num_filters: Filters of the input convolution(s); also forwarded
                to ``GeneratorNetwork``.
            depth: Forwarded to ``ResNet``.
            categories: Forwarded to ``ResNet``; also the input width the
                classification heads are built with.
            num_gc: Forwarded to ``GeneratorNetwork``.
            num_blocks: Forwarded to ``GeneratorNetwork``.
            residual_scailing: Forwarded to ``GeneratorNetwork``.
            training: When true the classification head(s) are created; also
                forwarded to ``ResNet``.
            input_shape: ``input_shape`` argument of the input convolution(s).
            num_classes_syn: Units of the synthetic classification head.
            both: Whether to also create the natural input branch and head.
            num_classes_nat: Units of the natural classification head.
            scale: Factor applied to the normalized embeddings.
        """
        super(SRFR, self).__init__()
        self._training = training
        self.scale = scale
        if both:
            self._natural_input = Conv2D(
                input_shape=input_shape,
                filters=num_filters,
                kernel_size=(3, 3),
                strides=1,
                padding='same',
                name='natural_input',
                activation=mish,
            )
        self._synthetic_input = Conv2D(
            input_shape=input_shape,
            filters=num_filters,
            kernel_size=(3, 3),
            strides=1,
            padding='same',
            name='synthetic_input',
            activation=mish,
        )
        self._super_resolution = GeneratorNetwork(
            num_filters,
            num_gc,
            num_blocks,
            residual_scailing,
        )
        self._face_recognition = ResNet(
            depth,
            categories,
            training
        )
        if self._training:
            # The heads consume embeddings of width `categories`, so they are
            # built with that width rather than a hard-coded 512.
            head_input_shape = tf.TensorShape([None, categories])
            if both:
                self._fc_classification_nat = Dense(
                    input_shape=(categories,),
                    units=num_classes_nat,
                    activation=None,
                    use_bias=False,
                    dtype='float32',
                    name='fully_connected_to_softmax_crossentropy_nat',
                )
                self._fc_classification_nat.build(head_input_shape)
                self.net_type = 'nat'
            self._fc_classification_syn: Dense = Dense(
                input_shape=(categories,),
                units=num_classes_syn,
                activation=None,
                use_bias=False,
                dtype='float32',
                name='fully_connected_to_softmax_crossentropy_syn',
            )
            self._fc_classification_syn.build(head_input_shape)

    @tf.function
    def _call_evaluating(self, input_tensor, input_type: str = 'nat'):
        """Run the inference path.

        Args:
            input_tensor: Input batch for the selected branch.
            input_type: ``'syn'`` selects the synthetic input convolution;
                any other value selects the natural one (which only exists
                when the model was created with ``both=True``).

        Returns:
            Tuple ``(super_resolution_image, embeddings)``.
        """
        if input_type == 'syn':
            outputs = self._synthetic_input(input_tensor)
        else:
            outputs = self._natural_input(input_tensor)
        super_resolution_image = self._super_resolution(outputs)
        embeddings = self._face_recognition(super_resolution_image)
        return super_resolution_image, embeddings

    def _calculate_normalized_embeddings(self, embeddings,
                                         net_type: str = 'syn'):
        """Normalize head weights and embeddings, then classify.

        The normalized weights are written back into the selected head via
        ``merge_call`` on the current replica context, and the embeddings are
        normalized along axis 1 and multiplied by ``self.scale`` before being
        passed through that head.

        Args:
            embeddings: Output of the face-recognition network.
            net_type: ``'nat'`` or ``'syn'``; selects the head.

        Returns:
            Output of the selected classification head.
        """
        fc_weights = self.get_weights(net_type)
        normalized_weights = tf.Variable(
            normalize(fc_weights, name='weights_normalization'),
            aggregation=tf.VariableAggregation.NONE,
        )
        normalized_embeddings = normalize(
            embeddings, axis=1, name='embeddings_normalization') * self.scale
        replica = tf.distribute.get_replica_context()
        replica.merge_call(self.set_weights,
                           args=(normalized_weights, net_type))
        return self.call_fc_classification(normalized_embeddings, net_type)

    def _call_training(self, synthetic_images, natural_images=None):
        """Run the training path.

        Args:
            synthetic_images: Batch for the synthetic branch.
            natural_images: Optional batch for the natural branch.

        Returns:
            ``(synthetic_sr_images, synthetic_embeddings)`` or, when
            ``natural_images`` is given, that pair followed by
            ``(natural_sr_images, natural_embeddings)``. The "embeddings"
            returned here are the outputs of the classification head.
        """
        synthetic_outputs = self._synthetic_input(synthetic_images)
        synthetic_sr_images = self._super_resolution(synthetic_outputs)
        synthetic_embeddings = self._face_recognition(synthetic_sr_images)
        synthetic_embeddings = self._calculate_normalized_embeddings(
            synthetic_embeddings
        )
        # Compare against None explicitly: the truth value of a multi-element
        # tensor/array is not defined and raises.
        if natural_images is not None:
            natural_outputs = self._natural_input(natural_images)
            natural_sr_images = self._super_resolution(natural_outputs)
            natural_embeddings = self._face_recognition(natural_sr_images)
            natural_embeddings = self._calculate_normalized_embeddings(
                natural_embeddings
            )
            return (
                synthetic_sr_images,
                synthetic_embeddings,
                natural_sr_images,
                natural_embeddings,
            )
        return synthetic_sr_images, synthetic_embeddings

    def call(self, input_tensor_01, input_tensor_02=None,
             training: bool = True, input_type: str = 'nat'):
        """Dispatch to the training or the evaluation path.

        Args:
            input_tensor_01: Synthetic images when training, otherwise the
                batch to evaluate.
            input_tensor_02: Optional natural images (training only).
            training: Selects ``_call_training`` vs ``_call_evaluating``.
            input_type: Branch selector for evaluation.
        """
        if training:
            return self._call_training(input_tensor_01, input_tensor_02)
        return self._call_evaluating(input_tensor_01, input_type)

    def get_weights(self, net_type: str = 'syn'):
        """Return the weights of the selected classification head only."""
        if net_type == 'nat':
            return self._fc_classification_nat.get_weights()
        return self._fc_classification_syn.get_weights()

    def set_weights(self, _, weights, net_type: str = 'syn') -> None:
        """Write ``weights`` into the selected classification head.

        The first positional argument is ignored; this method is passed to
        ``merge_call`` (presumably receiving the distribution strategy
        there -- confirm against the TensorFlow version in use).
        """
        if net_type == 'nat':
            self._fc_classification_nat.set_weights([weights.read_value()])
        else:
            self._fc_classification_syn.set_weights([weights.read_value()])

    def call_fc_classification(self, input, net_type: str = 'syn'):
        """Apply the selected classification head to ``input``."""
        if net_type == 'nat':
            return self._fc_classification_nat(input)
        return self._fc_classification_syn(input)
class LocationSensitiveAttentionLayer(Layer):
    """Attention layer whose energies depend on the encoder memory, the
    (LSTM-processed) decoder query and convolutional features of the
    previous alignments.

    Hyper-parameters are read from the module-level ``hps`` object. The layer
    takes ``[encoder_out_seq, decoder_out_seq]`` and returns
    ``[context_vectors, alignments]``.
    """

    def __init__(self):
        super(LocationSensitiveAttentionLayer, self).__init__()
        self.units = hps.LSA_dim
        self.filters = hps.LSA_filters
        self.kernel = hps.LSA_kernel
        # When true, the state carried between steps is the running sum of
        # alignments rather than only the latest alignment.
        self._cumulate = True
        self.location_convolution = Conv1D(filters=self.filters,
                                           kernel_size=self.kernel,
                                           padding='same',
                                           bias_initializer='zeros')
        self.location_layer = Dense(self.units, use_bias=False)
        self.query_layer = Dense(self.units, use_bias=False)
        self.memory_layer = Dense(self.units, use_bias=False)
        self.rnn_cell = Decoderlstm()
        self.values = None
        self.keys = None

    def build(self, input_shape):
        """Create the energy weights and build every sub-layer.

        Args:
            input_shape: Pair ``(encoder_shape, decoder_shape)``.
        """
        enc_out_seq, dec_out_seq = input_shape
        self.v_a = self.add_weight(name='V_a',
                                   shape=(self.units, ),
                                   initializer='uniform',
                                   trainable=True)
        self.b_a = self.add_weight(name='b_a',
                                   shape=(self.units, ),
                                   initializer='uniform',
                                   trainable=True)
        if self.memory_layer:
            self.memory_layer.build(enc_out_seq)
            self._trainable_weights += self.memory_layer._trainable_weights
        if self.query_layer:
            if not self.query_layer.built:
                if self.rnn_cell:
                    # The query is the LSTM output, so size the projection
                    # from the cell's output shape.
                    self.query_layer.build(
                        self.rnn_cell.compute_output_shape(dec_out_seq)[0])
                else:
                    self.query_layer.build(dec_out_seq)
            self._trainable_weights += self.query_layer._trainable_weights
        if self.rnn_cell:
            # The cell is fed one step of [decoder features, context].
            rnn_input_shape = (enc_out_seq[0], 1,
                               dec_out_seq[-1] + enc_out_seq[-1])
            self.rnn_cell.build(rnn_input_shape)
            self._trainable_weights += self.rnn_cell.weights
        conv_input_shape = (enc_out_seq[0], enc_out_seq[1], 1)
        location_input_shape = (enc_out_seq[0], enc_out_seq[1], self.filters)
        self.location_convolution.build(conv_input_shape)
        self.location_layer.build(location_input_shape)
        self._trainable_weights += self.location_convolution._trainable_weights
        self._trainable_weights += self.location_layer._trainable_weights
        super(LocationSensitiveAttentionLayer, self).build(input_shape)

    def call(self, inputs, verbose=False):
        """Compute context vectors and alignments for every decoder step.

        Args:
            inputs: Pair ``[encoder_out_seq, decoder_out_seq]``.
            verbose: Unused.

        Returns:
            ``[c_outputs, e_outputs]``: the context vectors and the
            alignments produced by ``K.rnn`` over the decoder sequence.
        """
        encoder_out_seq, decoder_out_seq = inputs
        values = encoder_out_seq
        keys = self.memory_layer(values) if self.memory_layer else values

        def energy_step(query, states):
            """One ``K.rnn`` step: alignments for a single decoder step."""
            previous_alignments = states[0]
            if self.rnn_cell:
                c_i = states[1]
                cell_state = states[2:]
                lstm_input = K.concatenate([query, c_i])
                lstm_input = K.expand_dims(lstm_input, 1)
                lstm_out = self.rnn_cell(lstm_input, initial_state=cell_state)
                lstm_output, new_cell_state = lstm_out[0], lstm_out[1:]
                query = lstm_output
            processed_query = self.query_layer(
                query) if self.query_layer else query
            expanded_alignments = K.expand_dims(previous_alignments, axis=2)
            f = self.location_convolution(expanded_alignments)
            processed_location_features = self.location_layer(f)
            e_i = K.sum(
                self.v_a * K.tanh(keys + processed_query +
                                  processed_location_features + self.b_a), [2])
            e_i = K.softmax(e_i)
            if self._cumulate:
                next_state = e_i + previous_alignments
            else:
                next_state = e_i
            if self.rnn_cell:
                new_c_i, _ = context_step(e_i, [c_i])
                return e_i, [next_state, new_c_i, *new_cell_state]
            return e_i, [next_state]

        def context_step(inputs, states):
            """Context vector for one step: alignments times ``values``."""
            alignments = inputs
            expanded_alignments = K.expand_dims(alignments, 1)
            c_i = math_ops.matmul(expanded_alignments, values)
            c_i = K.squeeze(c_i, 1)
            return c_i, [c_i]

        def create_initial_state(inputs, hidden_size):
            """Zero tensor of shape (batch, hidden_size) derived from a
            rank-3 ``inputs`` so the batch size stays dynamic."""
            fake_state = K.zeros_like(inputs)
            fake_state = K.sum(fake_state, axis=[1, 2])
            fake_state = K.expand_dims(fake_state)
            fake_state = K.tile(fake_state, [1, hidden_size])
            return fake_state

        def get_fake_cell_input(fake_state_c):
            """Zero-valued single-step cell input used to obtain the cell's
            initial state."""
            fake_input = K.zeros_like(decoder_out_seq)[:, 0, :]
            fake_input = K.concatenate([fake_state_c, fake_input])
            fake_input = K.expand_dims(fake_input, 1)
            return fake_input

        fake_state_c = create_initial_state(values, values.shape[-1])
        fake_state_e = create_initial_state(values, K.shape(values)[1])
        if self.rnn_cell:
            cell_initial_state = self.rnn_cell.get_initial_state(
                get_fake_cell_input(fake_state_c))
            initial_states_e = [
                fake_state_e, fake_state_c, *cell_initial_state
            ]
        else:
            initial_states_e = [fake_state_e]
        _, e_outputs, _ = K.rnn(energy_step, decoder_out_seq,
                                initial_states_e)
        c_outputs = math_ops.matmul(e_outputs, values)
        return [c_outputs, e_outputs]

    def compute_output_shape(self, input_shape):
        """Return the shapes of ``[c_outputs, e_outputs]``.

        ``c_outputs`` is ``matmul(e_outputs, values)``, so its last dimension
        is the encoder feature dimension; ``e_outputs`` has one weight per
        encoder timestep for every decoder timestep.

        Args:
            input_shape: Pair ``(encoder_shape, decoder_shape)``.
        """
        encoder_shape, decoder_shape = input_shape[0], input_shape[1]
        return [(decoder_shape[0], decoder_shape[1], encoder_shape[2]),
                (decoder_shape[0], decoder_shape[1], encoder_shape[1])]

    # Backward-compatible alias for the original, misspelled method name.
    comute_output_shape = compute_output_shape
class PhasedLSTMCell(tf.keras.layers.Layer):
    """Phased LSTM recurrent network cell.

    Reference: https://arxiv.org/pdf/1610.09513v1.pdf
    """

    def __init__(self,
                 num_units,
                 use_peepholes=False,
                 leak=0.001,
                 ratio_on=0.1,
                 trainable_ratio_on=True,
                 period_init_min=0.5,
                 period_init_max=1000.0):
        """Set up the gate layers and the time-gate weights.

        Args:
            num_units: int, number of units in the cell.
            use_peepholes: bool, True to feed the previous cell state to the
                mask and output gates.
            leak: float or scalar float Tensor in [0, 1]; leak applied while
                the time gate is closed.
            ratio_on: float or scalar float Tensor in [0, 1]; fraction of the
                period during which the gate is open.
            trainable_ratio_on: bool, whether ``ratio_on`` is trainable.
            period_init_min: float or scalar float Tensor > 0; lower bound of
                the initial period. Periods are drawn from
                e^U(log(period_init_min), log(period_init_max)), U being the
                uniform distribution.
            period_init_max: float or scalar float Tensor > period_init_min;
                upper bound of the initial period.
        """
        super().__init__()
        self._num_units = num_units
        self._use_peepholes = use_peepholes
        self._leak = leak
        self._ratio_on = ratio_on
        self._trainable_ratio_on = trainable_ratio_on
        self._period_init_min = period_init_min
        self._period_init_max = period_init_max

        units = self._num_units
        # linear1 emits input and forget gates side by side.
        self.linear1 = Dense(2 * units,
                             use_bias=True,
                             activation='sigmoid',
                             name='MaskGates')
        self.linear2 = Dense(units, use_bias=True, activation='tanh')
        self.linear3 = Dense(units, use_bias=True, activation='sigmoid')

        period_initializer = _random_exp_initializer(self._period_init_min,
                                                     self._period_init_max)
        self.period = self.add_weight('period',
                                      shape=[units],
                                      initializer=period_initializer)
        # The phase is drawn uniformly between 0 and the initial period.
        phase_initializer = tf.initializers.random_uniform(
            0., self.period.initial_value)
        self.phase = self.add_weight('phase',
                                     shape=[units],
                                     initializer=phase_initializer)
        self.ratio_on = self.add_weight(
            "ratio_on", [units],
            initializer=tf.constant_initializer(self._ratio_on),
            trainable=self._trainable_ratio_on)

    def build(self, input_shapes):
        """Build the three dense layers.

        Args:
            input_shapes: Object exposing ``times`` and ``x`` shapes.
        """
        time_shape = input_shapes.times
        feature_dim = input_shapes.x[-1]
        batch = time_shape[0]
        if self._use_peepholes:
            gate_input_dim = 2 * self._num_units + feature_dim
        else:
            gate_input_dim = self._num_units + feature_dim
        self.linear1.build((batch, gate_input_dim))
        self.linear2.build((batch, self._num_units + feature_dim))
        self.linear3.build((batch, gate_input_dim))
        super().build(input_shapes)

    @property
    def state_size(self):
        return tf.nn.rnn_cell.LSTMStateTuple(self._num_units, self._num_units)

    @property
    def output_size(self):
        return self._num_units

    def _mod(self, x, y):
        """Modulo whose gradient with respect to ``x`` is the identity."""
        remainder_minus_x = tf.math.mod(x, y) - x
        return tf.stop_gradient(remainder_minus_x) + x

    def _get_cycle_ratio(self, time):
        """Return the position inside the period, computed in ``time``'s
        dtype and cast to float32."""
        dtype = time.dtype
        phase = tf.cast(self.phase, dtype=dtype)
        period = tf.cast(self.period, dtype=dtype)
        ratio = self._mod(time - phase, period) / period
        return tf.cast(ratio, dtype=tf.float32)

    def call(self, inputs, state):
        """Run one step of the Phased LSTM.

        Args:
            inputs: Object with ``times`` (Tensor [batch, 1], float32 or
                float64) and ``x`` (Tensor [batch, features_size], float32).
            state: LSTMStateTuple ``(c_prev, h_prev)`` from the previous
                timestep.

        Returns:
            Tuple ``(new_h, new_state)`` where ``new_h`` is a float32 Tensor
            [batch_size, num_units] and ``new_state`` is the
            LSTMStateTuple ``(new_c, new_h)``.
        """
        c_prev, h_prev = state
        time = inputs.times
        x = inputs.x

        gate_parts = [x, h_prev, c_prev] if self._use_peepholes else [x, h_prev]
        gate_input = tf.concat(gate_parts, axis=-1)

        input_gate, forget_gate = tf.split(self.linear1(gate_input),
                                           axis=1,
                                           num_or_size_splits=2)
        candidate = self.linear2(tf.concat([x, h_prev], axis=-1))
        new_c = (c_prev * forget_gate + input_gate * candidate)
        new_h = tf.tanh(new_c) * self.linear3(gate_input)

        # Time gate: rising for the first half of the open phase, falling for
        # the second half, leaky otherwise.
        cycle_ratio = self._get_cycle_ratio(time)
        k_up = 2 * cycle_ratio / self.ratio_on
        k_down = 2 - k_up
        k_closed = self._leak * cycle_ratio
        k = tf.where(cycle_ratio < self.ratio_on, k_down, k_closed)
        k = tf.where(cycle_ratio < 0.5 * self.ratio_on, k_up, k)

        new_c = k * new_c + (1 - k) * c_prev
        new_h = k * new_h + (1 - k) * h_prev
        return new_h, tf.nn.rnn_cell.LSTMStateTuple(new_c, new_h)
class CNNEncoder(tf.keras.layers.Layer):
    """
    CNNEncoder is a combination of multiple convolutional layers and max
    pooling layers. It is defined as a single layer to be consistent with
    other encoders in terms of input and output specifications.

    Input shape: (batch_size, sequence_length, input_dim).
    Output shape: (batch_size, output_dim).

    The CNN has one convolution layer per ngram filter size. Each convolution
    operation gives out a vector of size ``filters``. The number of times a
    convolution layer is applied depends on the ngram size:
    ``input_len - ngram_size + 1``. The max-pooling step aggregates all these
    outputs and keeps the max. This is repeated for every ngram size, so the
    dimensionality after max-pooling is ``len(kernel_sizes) * filters``. A
    fully connected layer then optionally projects it to ``units``.

    References: "A Sensitivity Analysis of (and Practitioners' Guide to)
    Convolutional Neural Networks for Sentence Classification", Zhang and
    Wallace 2016, particularly Figure 1.

    Args:
        filters: Integer, the output dim for each convolutional layer.
        kernel_sizes: An integer tuple or list, the kernel sizes of the
            convolutional layers.
        units: After doing convolutions, the collected features are projected
            into a vector of this size. If `None`, the result of the max
            pooling is returned as is.
        conv_layer_activation: string of convolutional layer `Activation`.
        l1_regularization: float.
        l2_regularization: float.
    """

    def __init__(self,
                 filters=100,
                 kernel_sizes=(2, 3, 4, 5),
                 conv_layer_activation='relu',
                 l1_regularization=None,
                 l2_regularization=None,
                 units=None,
                 **kwargs):
        self.filters = filters
        self.kernel_sizes = kernel_sizes
        self.units = units
        self.conv_layer_activation = conv_layer_activation
        self.l1_regularization = l1_regularization
        self.l2_regularization = l2_regularization
        self.regularizer = l1_l2(
            l1=l1_regularization if l1_regularization is not None else 0.0,
            l2=l2_regularization if l2_regularization is not None else 0.0)
        self.conv_layers = None
        self.projection_layer = None
        self.trainable_layers = None
        self.output_dim = None
        super(CNNEncoder, self).__init__(**kwargs)
        # Assigned after the base initializer, which resets the input spec;
        # setting it earlier would silently drop the rank-3 constraint.
        self.input_spec = [InputSpec(ndim=3)]

    def build(self, input_shape):
        """Create and build the convolutions and the optional projection."""
        self.conv_layers = [
            Conv1D(filters=self.filters,
                   kernel_size=kernel_size,
                   activation=self.conv_layer_activation,
                   kernel_regularizer=self.regularizer,
                   bias_regularizer=self.regularizer)
            for kernel_size in self.kernel_sizes
        ]
        for conv_layer in self.conv_layers:
            with K.name_scope(conv_layer.name):
                conv_layer.build(input_shape)
        maxpool_output_dim = self.filters * len(self.kernel_sizes)
        if self.units is not None:
            self.projection_layer = Dense(self.units)
            projection_input_shape = (input_shape[0], maxpool_output_dim)
            with K.name_scope(self.projection_layer.name):
                self.projection_layer.build(projection_input_shape)
            self.output_dim = self.units
            self.trainable_layers = self.conv_layers + [self.projection_layer]
        else:
            self.projection_layer = None
            self.output_dim = maxpool_output_dim
            self.trainable_layers = self.conv_layers
        super(CNNEncoder, self).build(input_shape)

    def call(self, inputs, mask=None):
        # Each convolution layer returns output of size
        # (batch_size, conv_length, filters), where
        # `conv_length = num_words - kernel_size + 1`. Max pooling over the
        # whole sequence (K.max on axis 1) gives (batch_size, filters) per
        # kernel size, which is concatenated and optionally projected.
        filter_outputs = [K.max(conv_layer.call(inputs), axis=1)
                          for conv_layer in self.conv_layers]
        maxpool_output = Concatenate()(filter_outputs) \
            if len(filter_outputs) > 1 else filter_outputs[0]
        if self.projection_layer:
            result = self.projection_layer.call(maxpool_output)
        else:
            result = maxpool_output
        return result

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_dim)

    def compute_mask(self, inputs, mask=None):
        # By default Keras propagates the mask from a layer that supports
        # masking. It is not needed downstream, so it is dropped here.
        return None

    def get_config(self):
        config = {"filters": self.filters,
                  "kernel_sizes": self.kernel_sizes,
                  "units": self.units,
                  "conv_layer_activation": self.conv_layer_activation,
                  "l1_regularization": self.l1_regularization,
                  "l2_regularization": self.l2_regularization
                  }
        base_config = super(CNNEncoder, self).get_config()
        config.update(base_config)
        return config

    @property
    def trainable_weights(self):
        """Trainable weights of the convolutions and the projection.

        Returns an empty list before ``build`` has run, when the sub-layers
        do not exist yet.
        """
        trainable_weights = []
        for layer in self.trainable_layers or []:
            trainable_weights.extend(layer.trainable_weights)
        return trainable_weights
class Attention(Layer):
    """
    Layer for implementing two common types of attention mechanisms:
    i) global (soft) attention, and ii) local (hard) attention.

    The setting use_bias=False converts the Dense() layers into annotation
    weight matrices. Softmax activation ensures that all weights sum up to 1.
    Read more here to make more sense of the code and implementations:
    i)   https://www.tensorflow.org/beta/tutorials/text/nmt_with_attention
    ii)  https://github.com/philipperemy/keras-attention-mechanism/issues/14
    iii) https://lilianweng.github.io/lil-log/2018/06/24/attention-attention.html

    SUGGESTION: If the model doesn't converge, increase either the hidden size
    of the RNN model, the batch size of the model, or the param @size. If test
    accuracy is low, decrease these hyperparameters instead.

    NOTE: This implementation takes the hidden state associated with the last
    timestep as the target hidden state (h_t) as suggested by @felixhao28 in
    ii), whereas originally attention was proposed for MANY-TO-MANY sequence
    tasks like machine translation. Hence, when trying to predict what word
    (token) comes after sequence ['I', 'love', 'biscuits', 'and'], we take
    h('and') with shape (1, H) as the target hidden state.

    @param size (int): size of the attention vector; number of hidden units
        the attention is decoded to by the final dense layer
    @param alignment_type (str): type of attention mechanism to be applied.
        'local-m' corresponds to monotonic alignment where the last
        @window_width timesteps are used, 'local-p' corresponds to having a
        Gaussian distribution around the predicted aligned position, whereas
        'local-p*' adaptively learns the timesteps to attend to
    @param window_width (int): width of the set of source hidden states in
        'local' attention
    @param score_function (str): alignment score function; one of 'dot',
        'general' and 'location' (Luong et al. 2015), 'concat' (Bahdanau et
        al. 2015) and 'scaled_dot' (Vaswani et al. 2017)
    """

    def __init__(self,
                 size,
                 alignment_type='global',
                 window_width=None,
                 score_function='general',
                 **kwargs):
        if alignment_type not in ['global', 'local-m', 'local-p', 'local-p*']:
            raise ValueError(
                "Argument for param @alignment_type is not recognized")
        if alignment_type == 'global':
            if window_width is not None:
                raise ValueError(
                    "Can't use windowed approach with global attention")
        if score_function not in [
                'dot', 'general', 'location', 'concat', 'scaled_dot'
        ]:
            raise ValueError(
                "Argument for param @score_function is not recognized")
        super(Attention, self).__init__(**kwargs)
        self.size = size
        self.alignment_type = alignment_type
        self.window_width = window_width  # 2*D
        self.score_function = score_function

    def get_config(self):
        base_config = super(Attention, self).get_config()
        base_config['size'] = self.size
        base_config['alignment_type'] = self.alignment_type
        base_config['window_width'] = self.window_width
        base_config['score_function'] = self.score_function
        return base_config

    def build(self, input_shape):
        """Build weight matrices for the trainable, adaptive parameters."""
        if 'local-p' in self.alignment_type:
            self.W_p = Dense(units=input_shape[2], use_bias=False)
            self.W_p.build(input_shape=(None, None,
                                        input_shape[2]))  # (B, 1, H)
            self._trainable_weights += self.W_p.trainable_weights
            self.v_p = Dense(units=1, use_bias=False)
            self.v_p.build(input_shape=(None, None,
                                        input_shape[2]))  # (B, 1, H)
            self._trainable_weights += self.v_p.trainable_weights
        if 'dot' not in self.score_function:
            # weight matrix not utilized for the 'dot' functions
            self.W_a = Dense(units=input_shape[2], use_bias=False)
            self.W_a.build(input_shape=(None, None,
                                        input_shape[2]))  # (B, S*, H)
            self._trainable_weights += self.W_a.trainable_weights
        if self.score_function == 'concat':
            # additional weight matrices for the 'concat' score
            self.U_a = Dense(units=input_shape[2], use_bias=False)
            self.U_a.build(input_shape=(None, None,
                                        input_shape[2]))  # (B, 1, H)
            self._trainable_weights += self.U_a.trainable_weights
            self.v_a = Dense(units=1, use_bias=False)
            self.v_a.build(input_shape=(None, None,
                                        input_shape[2]))  # (B, S*, H)
            self._trainable_weights += self.v_a.trainable_weights
        self.attention_vector = Dense(units=self.size,
                                      activation='tanh',
                                      use_bias=False)
        self.attention_vector.build(input_shape=(None,
                                                 2 * input_shape[2]))  # (B, 2*H)
        self._trainable_weights += self.attention_vector.trainable_weights
        super(Attention, self).build(input_shape)

    def call(self, inputs):
        """Compute the attention vector for a batch of hidden-state sequences.

        Args:
            inputs: Tensor of hidden states, indexed as (B, S, H).

        Returns:
            Tensor of shape (B, self.size).
        """
        sequence_length = inputs.shape[1]
        ## Get h_t, the current (target) hidden state ##
        target_hidden_state = Lambda(function=lambda x: x[:, -1, :])(
            inputs)  # (B, H)
        target_hidden_state_reshaped = Reshape(
            target_shape=(1, inputs.shape[2]))(target_hidden_state)  # (B, 1, H)
        ## Get h_s, source hidden states through specified mechanism ##
        if self.alignment_type == 'global':  ## Global Approach ##
            source_hidden_states = inputs  # (B, S*=S, H)
        elif 'local' in self.alignment_type:  ## Local Approach ##
            if self.window_width is None:  ## Automatically set window width ##
                self.window_width = sequence_length // 2
            if self.alignment_type == 'local-m':  ## Monotonic Alignment ##
                aligned_position = sequence_length
                left_border = max(aligned_position - self.window_width, 0)
                source_hidden_states = Lambda(
                    function=lambda x: x[:, left_border:, :])(
                        inputs)  # (B, S*=D, H)
            elif self.alignment_type == 'local-p':  ## Predictive Alignment ##
                aligned_position = self.W_p(target_hidden_state)  # (B, H)
                aligned_position = Activation('tanh')(
                    aligned_position)  # (B, H)
                aligned_position = self.v_p(aligned_position)  # (B, 1)
                aligned_position = Activation('sigmoid')(
                    aligned_position)  # (B, 1)
                aligned_position = aligned_position * sequence_length  # (B, 1)
                source_hidden_states = inputs  # (B, S, H)
            elif self.alignment_type == 'local-p*':
                ## Completely Predictive Alignment ##
                aligned_position = self.W_p(inputs)  # (B, S, H)
                aligned_position = Activation('tanh')(
                    aligned_position)  # (B, S, H)
                aligned_position = self.v_p(aligned_position)  # (B, S, 1)
                aligned_position = Activation('sigmoid')(
                    aligned_position)  # (B, S, 1)
                ## Only keep top D values out of the sigmoid activation,
                ## and zero-out the rest ##
                aligned_position = tf.squeeze(aligned_position,
                                              axis=-1)  # (B, S)
                top_probabilities = tf.nn.top_k(
                    input=aligned_position,
                    k=self.window_width,
                    sorted=False)  # (values:(B, D), indices:(B, D))
                onehot_vector = tf.one_hot(indices=top_probabilities.indices,
                                           depth=sequence_length)  # (B, D, S)
                onehot_vector = tf.reduce_sum(onehot_vector, axis=1)  # (B, S)
                aligned_position = Multiply()(
                    [aligned_position, onehot_vector])  # (B, S)
                aligned_position = tf.expand_dims(aligned_position,
                                                  axis=-1)  # (B, S, 1)
                source_hidden_states = Multiply()(
                    [inputs, aligned_position])  # (B, S*=S(D), H)
                ## Scale back to approximately original hidden state values ##
                aligned_position += 1  # (B, S, 1)
                source_hidden_states /= aligned_position  # (B, S*=S(D), H)

        ## Compute alignment score through specified function ##
        if 'dot' in self.score_function:
            attention_score = Dot(axes=[2, 1])(
                [source_hidden_states, target_hidden_state])  # (B, S*)
            if self.score_function == 'scaled_dot':
                attention_score = attention_score * (
                    1 / np.sqrt(float(inputs.shape[2])))  # (B, S*)
        elif self.score_function == 'general':
            weighted_hidden_states = self.W_a(
                source_hidden_states)  # (B, S*, H)
            attention_score = Dot(axes=[2, 1])(
                [weighted_hidden_states, target_hidden_state])  # (B, S*)
        elif self.score_function == 'location':
            weighted_target_state = self.W_a(target_hidden_state)  # (B, H)
            attention_score = Activation('softmax')(
                weighted_target_state)  # (B, H)
            # Repeat once per source timestep so the scores line up with
            # `source_hidden_states` (which is shorter than the input for
            # 'local-m').
            attention_score = RepeatVector(
                n=source_hidden_states.shape[1])(
                    attention_score)  # (B, S*, H)
            attention_score = tf.reduce_sum(attention_score,
                                            axis=-1)  # (B, S*)
        elif self.score_function == 'concat':
            weighted_hidden_states = self.W_a(
                source_hidden_states)  # (B, S*, H)
            weighted_target_state = self.U_a(
                target_hidden_state_reshaped)  # (B, 1, H)
            weighted_sum = weighted_hidden_states + weighted_target_state  # (B, S*, H)
            weighted_sum = Activation('tanh')(weighted_sum)  # (B, S*, H)
            attention_score = self.v_a(weighted_sum)  # (B, S*, 1)
            attention_score = attention_score[:, :, 0]  # (B, S*)

        attention_weights = Activation('softmax')(attention_score)  # (B, S*)

        if self.alignment_type == 'local-p':  ## Gaussian Distribution ##
            gaussian_estimation = lambda s: tf.exp(-tf.square(
                s - aligned_position) / (2 * tf.square(self.window_width / 2)))
            # One (B, 1) column per timestep, joined in a single concat
            # instead of growing the tensor pairwise.
            gaussian_factor = tf.concat(
                [gaussian_estimation(i) for i in range(sequence_length)],
                axis=-1)  # (B, S*)
            attention_weights = attention_weights * gaussian_factor  # (B, S*)

        context_vector = Dot(axes=[1, 1])(
            [source_hidden_states, attention_weights])  # (B, H)
        combined_information = Concatenate()(
            [context_vector, target_hidden_state])  # (B, 2*H)
        attention_vector = self.attention_vector(
            combined_information)  # (B, self.size)
        return attention_vector
class SetAttentionLayer(tf.keras.layers.Layer):
    """Multi-head attention over segmented set elements.

    Elements are encoded by ``psi``, aggregated per segment, transformed by
    ``rho`` and scattered back to the elements; keys are computed from the
    concatenation of the raw inputs and the scattered aggregate, and scored
    against one learned query per head.
    """

    dense_options = {'activation': 'relu', 'kernel_initializer': 'he_uniform'}

    def __init__(self,
                 n_layers=2,
                 width=128,
                 latent_width=128,
                 aggregation_function='mean',
                 dot_prod_dim=64,
                 n_heads=4,
                 attn_dropout=0.3):
        super().__init__()
        self.width = width
        self.dot_prod_dim = dot_prod_dim
        self.attn_dropout = attn_dropout
        self.n_heads = n_heads

        options = self.dense_options
        self.psi = build_dense_dropout_model(n_layers, width, 0., options)
        self.psi.add(Dense(latent_width, **options))
        self.psi_aggregation = SegmentAggregation(aggregation_function)
        self.rho = Dense(latent_width, **options)

    def build(self, input_shape):
        """Build the sub-models and create the key/query projections."""
        self.psi.build(input_shape)
        encoded_shape = self.psi.compute_output_shape(input_shape)
        self.rho.build(
            self.psi_aggregation.compute_output_shape(encoded_shape))

        key_input_dim = encoded_shape[-1] + input_shape[-1]
        key_output_dim = self.dot_prod_dim * self.n_heads
        self.W_k = self.add_weight('W_k', (key_input_dim, key_output_dim),
                                   initializer='he_uniform')
        # Queries start at zero.
        self.W_q = self.add_weight('W_q', (self.n_heads, self.dot_prod_dim),
                                   initializer=tf.keras.initializers.Zeros())

    def call(self, inputs, segment_ids, lengths, training=None):
        """Compute per-head attention weights, normalized within segments.

        Args:
            inputs: Element features.
            segment_ids: Segment index of every element.
            lengths: Unused in this method.
            training: Training flag; defaults to the Keras learning phase.

        Returns:
            List with one attention tensor per head.
        """
        if training is None:
            training = tf.keras.backend.learning_phase()

        def dropout_attn(input_tensor):
            # Dropped entries receive a large negative offset before the
            # segment softmax is applied.
            if self.attn_dropout > 0:
                keep_shape = tf.shape(input_tensor)[:-1]
                mask = (tf.random.uniform(keep_shape) < self.attn_dropout)
                penalty = tf.expand_dims(tf.cast(mask, tf.float32), -1) * -1e9
                return (input_tensor + penalty)
            return tf.identity(input_tensor)

        encoded = self.psi(inputs)
        aggregated = self.rho(self.psi_aggregation(encoded, segment_ids))
        agg_scattered = tf.gather_nd(aggregated,
                                     tf.expand_dims(segment_ids, -1))
        combined = tf.concat([inputs, agg_scattered], axis=-1)

        keys = tf.matmul(combined, self.W_k)
        keys = tf.stack(tf.split(keys, self.n_heads, -1), 1)
        # should have shape (el, heads, 1, dot_prod_dim)
        keys = tf.expand_dims(keys, axis=2)
        # should have shape (1, heads, dot_prod_dim, 1)
        queries = tf.expand_dims(tf.expand_dims(self.W_q, -1), 0)

        preattn = tf.matmul(keys, queries) / tf.sqrt(float(self.dot_prod_dim))
        preattn = tf.squeeze(preattn, -1)
        preattn = smart_cond(training,
                             lambda: dropout_attn(preattn),
                             lambda: tf.identity(preattn))

        return [
            segment_softmax(head_preattn, segment_ids)
            for head_preattn in tf.unstack(preattn, axis=1)
        ]

    def compute_output_shape(self, input_shape):
        return [*input_shape[:-1], self.n_heads]
class SRFR(Model):
    """Model chaining an input convolution, a super-resolution network and a
    face-recognition network, plus classification heads used while training.

    A "synthetic" input branch is always created; a "natural" input branch
    (and its classification head) is created only when ``both`` is true.

    NOTE: the normalized-embedding classification path
    (``_calculate_normalized_embeddings``) is disabled in this variant; the
    classification heads are applied directly to the embeddings.
    """

    def __init__(
        self,
        num_filters: int = 62,
        depth: int = 50,
        categories: int = 512,
        num_gc: int = 32,
        num_blocks: int = 23,
        residual_scailing: float = 0.2,
        training: bool = True,
        input_shape=(28, 28, 3),
        num_classes_syn: int = 2,
        both: bool = False,
        num_classes_nat: int = None,
        scale: int = 64,
    ):
        """Create the sub-networks.

        Args:
            num_filters: Filters of the input convolution(s); also forwarded
                to ``GeneratorNetwork``.
            depth: Forwarded to ``ResNet``.
            categories: Forwarded to ``ResNet``; also the input width the
                classification heads are built with.
            num_gc: Forwarded to ``GeneratorNetwork``.
            num_blocks: Forwarded to ``GeneratorNetwork``.
            residual_scailing: Forwarded to ``GeneratorNetwork``.
            training: When true the classification head(s) are created; also
                forwarded to ``ResNet``.
            input_shape: ``input_shape`` argument of the input convolution(s).
            num_classes_syn: Units of the synthetic (softmax) head.
            both: Whether to also create the natural input branch and head.
            num_classes_nat: Units of the natural head.
            scale: Stored on the instance; not used by the active code paths.
        """
        super(SRFR, self).__init__()
        self._training = training
        self.scale = scale
        if both:
            self._natural_input = Conv2D(
                input_shape=input_shape,
                filters=num_filters,
                kernel_size=(3, 3),
                strides=1,
                padding="same",
                name="natural_input",
                activation=mish,
            )
        self._synthetic_input = Conv2D(
            input_shape=input_shape,
            filters=num_filters,
            kernel_size=(3, 3),
            strides=1,
            padding="same",
            name="synthetic_input",
            activation=mish,
        )
        self._super_resolution = GeneratorNetwork(
            num_filters,
            num_gc,
            num_blocks,
            residual_scailing,
        )
        self._face_recognition = ResNet(depth, categories, training, None)
        if self._training:
            # The heads consume embeddings of width `categories`, so they are
            # built with that width rather than a hard-coded 512.
            head_input_shape = tf.TensorShape([None, categories])
            if both:
                self._fc_classification_nat = Dense(
                    input_shape=(categories, ),
                    units=num_classes_nat,
                    activation=None,
                    use_bias=False,
                    dtype="float32",
                    name="fully_connected_to_softmax_crossentropy_nat",
                )
                self._fc_classification_nat.build(head_input_shape)
                self.net_type = "nat"
            self._fc_classification_syn: Dense = Dense(
                input_shape=(categories, ),
                units=num_classes_syn,
                activation="softmax",
                use_bias=False,
                dtype="float32",
                name="fully_connected_to_softmax_crossentropy_syn",
            )
            self._fc_classification_syn.build(head_input_shape)

    @tf.function
    def _call_evaluating(self, input_tensor, input_type: str = "syn"):
        """Run the inference path.

        Args:
            input_tensor: Input batch for the selected branch.
            input_type: ``"syn"`` selects the synthetic input convolution;
                any other value selects the natural one (which only exists
                when the model was created with ``both=True``).

        Returns:
            Tuple ``(super_resolution_image, embeddings)``; no classification
            is computed on this path.
        """
        if input_type == "syn":
            outputs = self._synthetic_input(input_tensor)
        else:
            outputs = self._natural_input(input_tensor)
        super_resolution_image = self._super_resolution(outputs)
        embeddings = self._face_recognition(super_resolution_image)
        return super_resolution_image, embeddings

    def _call_training(self, synthetic_images, natural_images=None):
        """Run the training path.

        Args:
            synthetic_images: Batch for the synthetic branch.
            natural_images: Optional batch for the natural branch.

        Returns:
            ``(sr_images, embeddings, classification)`` for the synthetic
            branch, followed by the same triple for the natural branch when
            ``natural_images`` is given.
        """
        synthetic_outputs = self._synthetic_input(synthetic_images)
        synthetic_sr_images = self._super_resolution(synthetic_outputs)
        synthetic_embeddings = self._face_recognition(synthetic_sr_images)
        synthetic_classification = self._fc_classification_syn(
            synthetic_embeddings)
        # Compare against None explicitly: the truth value of a multi-element
        # tensor/array is not defined and raises.
        if natural_images is not None:
            natural_outputs = self._natural_input(natural_images)
            natural_sr_images = self._super_resolution(natural_outputs)
            natural_embeddings = self._face_recognition(natural_sr_images)
            natural_classification = self._fc_classification_nat(
                natural_embeddings)
            return (
                synthetic_sr_images,
                synthetic_embeddings,
                synthetic_classification,
                natural_sr_images,
                natural_embeddings,
                natural_classification,
            )
        return synthetic_sr_images, synthetic_embeddings, synthetic_classification

    def call(
        self,
        input_tensor_01,
        input_tensor_02=None,
        training: bool = True,
        input_type: str = "syn",
    ):
        """Dispatch to the training or the evaluation path.

        Args:
            input_tensor_01: Synthetic images when training, otherwise the
                batch to evaluate.
            input_tensor_02: Optional natural images (training only).
            training: Selects ``_call_training`` vs ``_call_evaluating``.
            input_type: Branch selector for evaluation.
        """
        if training:
            return self._call_training(input_tensor_01, input_tensor_02)
        return self._call_evaluating(input_tensor_01, input_type)
class BAC(Layer):
    """Cross-align a passage and a query that arrive stacked along axis 1.

    The rank-3 input is split at ``passage_len``: the first ``passage_len``
    steps along axis 1 are the passage, the remainder is the query. Both parts
    are projected by their own ``Dense`` layer, a scaled affinity matrix is
    computed between the projections, and each side is aligned to the other
    through an activation over that matrix. Concatenation, difference and
    product of the aligned and raw inputs are each passed through a
    ``Factorization_machine`` and concatenated on the last axis.
    """

    def __init__(self, passage_len=200, activation='softmax', nn_units=300,
                 emb_dim=600, **kwargs):
        """Store the layer configuration.

        Args:
            passage_len: Number of leading steps (axis 1) that form the
                passage.
            activation: Activation applied to the affinity matrix to obtain
                the alignment weights.
            nn_units: Units of the two ``Dense`` projections.
            emb_dim: The affinity matrix is multiplied by
                ``1 / sqrt(emb_dim)``.
            **kwargs: Forwarded to ``Layer.__init__``.
        """
        self.activation = activation
        self.nn_units = nn_units
        self.emb_dim = emb_dim
        self.passage_len = passage_len
        super(BAC, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create every sub-layer once and register the Dense weights.

        Args:
            input_shape: Shape of the stacked input; indices 0, 1 and -1 are
                read as batch, stacked length and feature size.
        """
        batch_dim, stacked_len, feature_dim = (
            input_shape[0], input_shape[1], input_shape[-1])
        # An unknown stacked length must not break shape arithmetic.
        query_len = None if stacked_len is None else stacked_len - self.passage_len
        passage_shape = (batch_dim, self.passage_len, feature_dim)
        query_shape = (batch_dim, query_len, feature_dim)

        self.dense_1 = Dense(self.nn_units, activation=relu, use_bias=True)
        self.dense_1.build(passage_shape)
        self.dense_2 = Dense(self.nn_units, activation=relu, use_bias=True)
        self.dense_2.build(query_shape)

        # Sub-layers are created here rather than in ``call`` so that repeated
        # calls reuse the same layer objects (and therefore the same weights).
        self._alignment = Activation(self.activation)
        self._fm_passage_concat = Factorization_machine(5, name='passage_concat_layer')
        self._fm_query_concat = Factorization_machine(5, name='query_concat_layer')
        self._fm_passage_diff = Factorization_machine(5, name='passage_diff_layer')
        self._fm_query_diff = Factorization_machine(5, name='query_diff_layer')
        self._fm_passage_mul = Factorization_machine(5, name='passage_mul_layer')
        self._fm_query_mul = Factorization_machine(5, name='query_mul_layer')
        # NOTE(review): the Factorization_machine layers are built lazily on
        # first use; on Keras versions without automatic sub-layer tracking
        # their weights are presumably still not registered on this layer -
        # confirm against Factorization_machine's implementation.

        dense_weights = (self.dense_1.trainable_weights
                         + self.dense_2.trainable_weights)
        try:
            # Attribute is ``trainable_weights`` (plural); a misspelt name
            # silently leaves the Dense weights unregistered.
            self.trainable_weights = dense_weights
        except AttributeError:
            # Some Keras versions expose ``trainable_weights`` as a read-only
            # property and track layers assigned as attributes themselves, so
            # there is nothing left to register.
            pass
        super(BAC, self).build(input_shape)  # Be sure to call this at the end

    def call(self, stack_input):
        """Compute the passage-side and query-side feature tensors.

        Args:
            stack_input: Rank-3 tensor holding the passage followed by the
                query along axis 1.

        Returns:
            Tuple ``(features_passage, features_query)``.
        """
        # unstack_input = tf.unstack(stack_input)
        passage_input = stack_input[:, :self.passage_len, :]
        query_input = stack_input[:, self.passage_len:, :]

        passage_dense = self.dense_1(passage_input)
        query_dense = self.dense_2(query_input)

        # Affinity between every passage step and every query step.
        affinity_matrix = tf.matmul(
            passage_dense, tf.transpose(query_dense, perm=[0, 2, 1]))
        affinity_matrix = 1 / np.sqrt(self.emb_dim) * affinity_matrix

        aligned_p = self._alignment(tf.transpose(affinity_matrix, perm=[0, 2, 1]))
        aligned_q = self._alignment(affinity_matrix)

        # ``passage_aligned`` has the query's length and ``query_aligned`` the
        # passage's length, so each pairs element-wise with the opposite input.
        passage_aligned = tf.matmul(aligned_p, passage_input)
        query_aligned = tf.matmul(aligned_q, query_input)

        passage_concat = tf.concat([query_aligned, passage_input], 2)
        query_concat = tf.concat([passage_aligned, query_input], 2)
        passage_diff = tf.subtract(query_aligned, passage_input)
        query_diff = tf.subtract(passage_aligned, query_input)
        passage_mul = tf.multiply(query_aligned, passage_input)
        query_mul = tf.multiply(passage_aligned, query_input)

        feature_p = [
            self._fm_passage_concat(passage_concat),
            self._fm_passage_diff(passage_diff),
            self._fm_passage_mul(passage_mul),
        ]
        feature_q = [
            self._fm_query_concat(query_concat),
            self._fm_query_diff(query_diff),
            self._fm_query_mul(query_mul),
        ]
        features_passage = tf.concat(feature_p, 2)
        features_query = tf.concat(feature_q, 2)
        return features_passage, features_query