import logging
import math

import tensorflow as tf


def scaled_dot_product_attention(q, k, v, mask):
  """
  The implementation of scaled dot-product attention.
  Args:
    q: (batch_size, seq_len_q, hidden_size)
    k: (batch_size, seq_len_k, hidden_size)
    v: (batch_size, seq_len_v, hidden_size)
    mask: (batch_size, seq_len_q, seq_len_k)
  Returns:
    output: (batch_size, seq_len_q, hidden_size)
    attention_weights: (batch_size, seq_len_q, seq_len_k)
  """
  matmul_qk = tf.matmul(q, k, transpose_b=True)  # (batch_size, seq_len_q, seq_len_k)

  # Scale by sqrt(d_k) to keep the logits in a numerically stable range
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

  # Mask: positions marked with 1 are pushed to a large negative logit
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)

  # Normalize over the key axis
  attention_weights = tf.nn.softmax(
      scaled_attention_logits, axis=-1)  # (batch_size, seq_len_q, seq_len_k)

  # Weighted sum of the values
  output = tf.matmul(attention_weights, v)  # (batch_size, seq_len_q, hidden_size)

  return output, attention_weights
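# A minimal usage sketch (illustrative only, not part of the original module):
# feed random q/k/v of matching shapes and an all-zeros mask (1.0 would mark a
# position to suppress), then check the returned shapes against the docstring.
def _scaled_attention_example():
  batch_size, seq_len, hidden_size = 2, 5, 8
  q = tf.random_normal([batch_size, seq_len, hidden_size])
  k = tf.random_normal([batch_size, seq_len, hidden_size])
  v = tf.random_normal([batch_size, seq_len, hidden_size])
  mask = tf.zeros([batch_size, seq_len, seq_len])
  output, weights = scaled_dot_product_attention(q, k, v, mask)
  # output: (2, 5, 8), weights: (2, 5, 5)
  return output, weights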
def attention(inputs, attention_size, time_major=False, return_alphas=False):
  """Attention layer."""
  if isinstance(inputs, tuple):
    # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
    inputs = tf.concat(inputs, 2)

  if time_major:
    # (T, B, D) => (B, T, D)
    inputs = tf.transpose(inputs, [1, 0, 2])

  time_size = inputs.shape[1].value  # T value - time size of the RNN layer
  hidden_size = inputs.shape[2].value  # D value - hidden size of the RNN layer

  # Trainable parameters
  W_omega = tf.get_variable(
      name='W_omega',
      initializer=tf.random_normal([hidden_size, attention_size], stddev=0.1))
  b_omega = tf.get_variable(
      name='b_omega',
      initializer=tf.random_normal([attention_size], stddev=0.1))
  u_omega = tf.get_variable(
      name='u_omega',
      initializer=tf.random_normal([attention_size, 1], stddev=0.1))

  # Apply a fully connected layer with a non-linear activation to each of the
  # B*T timesteps; the shape of `v` is (B, T, D) * (D, A) = (B, T, A), where
  # A = attention_size. Equivalent to:
  #   v = tf.sigmoid(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
  logging.info('attention inputs: {}'.format(inputs.shape))
  inputs_reshaped = tf.reshape(inputs, [-1, hidden_size])
  dot = tf.matmul(inputs_reshaped, W_omega)
  dot = tf.reshape(dot, [-1, time_size, attention_size])
  v = tf.sigmoid(dot + b_omega)
  logging.info(f'attention vector: {v.shape}')

  # For each timestep, its vector of size A from `v` is reduced with the `u`
  # vector: (B, T, A) dot (A, 1) -> (B, T). Equivalent to:
  #   vu = tf.tensordot(v, u_omega, axes=1)
  v = tf.reshape(v, [-1, attention_size])
  vu = tf.matmul(v, u_omega)
  vu = tf.squeeze(vu, axis=-1)
  vu = tf.reshape(vu, [-1, time_size])
  logging.info(f'attention energy: {vu.shape}')
  alphas = tf.nn.softmax(vu)  # (B, T) shape also

  # Output of the (Bi-)RNN is reduced with the attention vector; the result has (B, D) shape.
  # [batch, time] -> [batch, time, 1]
  alphas = tf.expand_dims(alphas, -1)
  # [batch, time, dim] -> [batch, dim]
  output = tf.reduce_sum(inputs * alphas, 1)

  if not return_alphas:
    return output
  return output, alphas
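# Usage sketch (illustrative; the scope name and sizes are assumptions): pool the
# outputs of a Bi-RNN with this attention layer inside a TF1-style variable scope.
def _attention_pooling_example(fw_outputs, bw_outputs):
  # fw_outputs, bw_outputs: (B, T, rnn_units) forward / backward RNN outputs
  with tf.variable_scope('attention_pooling'):
    pooled, alphas = attention((fw_outputs, bw_outputs),
                               attention_size=64,
                               return_alphas=True)
  # pooled: (B, 2 * rnn_units), alphas: (B, T, 1)
  return pooled, alphas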
def linear(x, names, shapes, has_bias=True):
  """Linear layer."""
  assert len(shapes) == 2
  with tf.variable_scope(names):
    weights = tf.get_variable(
        name='weights',
        shape=shapes,
        initializer=tf.initializers.glorot_uniform())
    if has_bias:
      bias = tf.get_variable(
          name='bias',
          shape=shapes[1],
          initializer=tf.initializers.glorot_uniform())
      return tf.matmul(x, weights) + bias
    else:
      return tf.matmul(x, weights)
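# Usage sketch (illustrative; the scope name and sizes are assumptions): project
# a (B, 128) feature tensor down to 10 output units.
def _linear_example(features):
  return linear(features, names='output_projection', shapes=[128, 10])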
def call(self, inputs, training=None, mask=None):  # pylint: disable=too-many-locals
  input_left = inputs["input_x_left"]
  input_right = inputs["input_x_right"]
  embedding = self.embed
  embed_left = embedding(input_left)
  embed_right = embedding(input_right)

  encoded_left = self.lstm_left(embed_left)
  encoded_right = self.lstm_right(embed_right)
  encoded_right = tf.transpose(encoded_right, [0, 2, 1])

  # Pairwise similarity between every left timestep and every right timestep.
  left_right_sim = tf.matmul(encoded_left, encoded_right)

  # Flatten the similarity matrix into a single feature vector per example.
  shape_list = left_right_sim.get_shape()
  newdim = shape_list[1] * shape_list[2]
  sim_matrix = tf.reshape(left_right_sim, [-1, newdim], name="sim_matrix")

  dropout = self.dropout(sim_matrix)
  out = self.outlayer(dropout)
  scores = self.final_dense(out)
  return scores
def call(self, inputs, training=None, mask=None):
  batch_size = tf.shape(inputs)[0]
  W_3d = tf.tile(tf.expand_dims(self.W, axis=0),
                 tf.stack([batch_size, 1, 1]))

  # [batch_size, steps, features]
  input_projection = tf.matmul(inputs, W_3d)
  if self.use_bias:
    input_projection += self.b
  input_projection = tf.tanh(input_projection)

  # [batch_size, steps, 1]
  similaritys = tf.reduce_sum(
      tf.multiply(input_projection, self.attention_context_vector),
      axis=2,
      keepdims=True)

  # [batch_size, steps, 1]
  if mask is not None:
    attention_weights = masked_softmax(similaritys, mask, axis=1)
  else:
    attention_weights = tf.nn.softmax(similaritys, axis=1)

  # [batch_size, features]
  attention_output = tf.reduce_sum(
      tf.multiply(inputs, attention_weights), axis=1)
  return attention_output
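# `masked_softmax` above is defined elsewhere in the code base. The sketch below
# is a plausible stand-in (an assumption, not the original implementation): it
# pushes the logits of padded steps to a large negative value before the softmax.
def _masked_softmax_sketch(logits, mask, axis=1):
  # logits: (batch_size, steps, 1); mask: (batch_size, steps), 1.0 for valid steps
  mask = tf.cast(tf.expand_dims(mask, axis=-1), tf.float32)
  logits += (1.0 - mask) * -1e9
  return tf.nn.softmax(logits, axis=axis)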
def logits_layer(self, x, labels):
  ''' Logits layer to further produce softmax. '''
  if labels is None:
    # serving export mode, no need for logits
    return x

  output_num = self.taskconf['classes']['num']
  logits_type = self.netconf['logits_type']
  logits_shape = [x.shape[-1].value, output_num]

  with tf.variable_scope('logits'):
    init_type = self.netconf['logits_weight_init']['type']
    if init_type == 'truncated_normal':
      stddev = self.netconf['logits_weight_init']['stddev']
      init = tf.truncated_normal_initializer(stddev=stddev)
    elif init_type == 'xavier_uniform':
      init = tf.contrib.layers.xavier_initializer(uniform=True)
    elif init_type == 'xavier_norm':
      init = tf.contrib.layers.xavier_initializer(uniform=False)
    else:
      raise ValueError('Unsupported weight init type: %s' % (init_type))

    weights = tf.get_variable(name='weights', shape=logits_shape, initializer=init)

    if logits_type == 'linear':
      bias = tf.get_variable(
          name='bias',
          shape=logits_shape[1],
          initializer=tf.constant_initializer(0.0))
      return tf.matmul(x, weights) + bias
    elif logits_type == 'linear_no_bias':
      return tf.matmul(x, weights)
    elif logits_type == 'arcface':
      return self.arcface_layer(x, labels, output_num, weights)
    else:
      raise ValueError('Unsupported logits type: %s' % (logits_type))
def call(self, tensors):
  """Attention layer."""
  left, right = tensors
  len_left = left.shape[1]
  len_right = right.shape[1]

  # Build every (left_i, right_j) pair by expanding and tiling both inputs.
  tensor_left = tf.expand_dims(left, axis=2)
  tensor_right = tf.expand_dims(right, axis=1)
  tensor_left = tf.tile(tensor_left, [1, 1, len_right, 1])
  tensor_right = tf.tile(tensor_right, [1, len_left, 1, 1])
  tensor_merged = tf.concat([tensor_left, tensor_right], axis=-1)

  middle_output = self.middle_layer(tensor_merged)
  attn_scores = self.attn(middle_output)
  attn_scores = tf.squeeze(attn_scores, axis=3)

  # Numerically stable softmax over the last axis.
  exp_attn_scores = tf.exp(
      attn_scores - tf.reduce_max(attn_scores, axis=-1, keepdims=True))
  exp_sum = tf.reduce_sum(exp_attn_scores, axis=-1, keepdims=True)
  attention_weights = exp_attn_scores / exp_sum

  return tf.matmul(attention_weights, right)
def arcface_loss(embedding,
                 labels,
                 out_num,
                 weights=None,
                 s=64.,
                 m=0.5,
                 limit_to_pi=True):
  '''
  https://github.com/auroua/InsightFace_TF/blob/master/losses/face_losses.py
  :param embedding: the input embedding vectors
  :param labels: the input labels, the shape should be e.g. (batch_size, 1)
  :param out_num: output class num
  :param weights: a tf.variable with shape (embedding.shape[-1], out_num),
                  or None to create a new one internally. default = None
  :param s: scalar value, default is 64
  :param m: the margin value, default is 0.5
  :return: the final calculated output; this output is sent into tf.nn.softmax directly
  '''
  cos_m = math.cos(m)
  sin_m = math.sin(m)
  mm = sin_m * m  # issue 1
  threshold = math.cos(math.pi - m)
  with tf.variable_scope('arcface_loss'):
    # inputs and weights norm
    embedding_norm = tf.norm(embedding, axis=1, keepdims=True)
    embedding = tf.div(embedding, embedding_norm, name='norm_embedding')
    if weights is None:
      weights = tf.get_variable(
          name='weights',
          shape=[embedding.shape[-1].value, out_num],
          initializer=tf.initializers.glorot_uniform())
    weights_norm = tf.norm(weights, axis=0, keepdims=True)
    weights = tf.div(weights, weights_norm, name='norm_weights')

    # cos(theta + m)
    cos_t = tf.matmul(embedding, weights, name='cos_t')
    cos_t2 = tf.square(cos_t, name='cos_2')
    sin_t2 = tf.subtract(1., cos_t2, name='sin_2')
    sin_t = tf.sqrt(sin_t2, name='sin_t')
    cos_mt = s * tf.subtract(
        tf.multiply(cos_t, cos_m), tf.multiply(sin_t, sin_m), name='cos_mt')

    if limit_to_pi:
      # this condition controls that theta + m stays in range [0, pi]
      # 0 <= theta + m <= pi
      # -m <= theta <= pi - m
      cond_v = cos_t - threshold
      cond = tf.cast(tf.nn.relu(cond_v, name='if_else'), dtype=tf.bool)
      keep_val = s * (cos_t - mm)
      cos_mt_temp = tf.where(cond, cos_mt, keep_val)
    else:
      cos_mt_temp = cos_mt

    mask = tf.one_hot(labels, depth=out_num, name='one_hot_mask')
    # mask = tf.squeeze(mask, 1)
    inv_mask = tf.subtract(1., mask, name='inverse_mask')
    s_cos_t = tf.multiply(s, cos_t, name='scalar_cos_t')
    output = tf.add(
        tf.multiply(s_cos_t, inv_mask),
        tf.multiply(cos_mt_temp, mask),
        name='arcface_loss_output')
  return output
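# Usage sketch (illustrative; the cross-entropy wiring is an assumption, following
# the docstring's note that the output feeds a softmax directly). Here `labels` is
# assumed to be an int tensor of shape (batch_size,).
def _arcface_example(embeddings, labels, num_classes):
  logits = arcface_loss(embeddings, labels, out_num=num_classes, s=64., m=0.5)
  loss = tf.reduce_mean(
      tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                     logits=logits))
  return loss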
def call(self, inputs, training=None, mask=None):
  query, key, value = self._unpack(inputs)
  query_mask, key_mask, _ = self._unpack(mask)

  batch_size = tf.shape(query)[0]
  dimension_query = query.get_shape().as_list()[-1]
  seq_len = tf.shape(query)[-2]
  key_len = tf.shape(key)[-2]
  feature_dim = tf.shape(value)[-1]

  query = tf.matmul(
      query, tf.tile(tf.expand_dims(self.kernel_query, 0), [batch_size, 1, 1]))
  key = tf.matmul(
      key, tf.tile(tf.expand_dims(self.kernel_key, 0), [batch_size, 1, 1]))
  value = tf.matmul(
      value, tf.tile(tf.expand_dims(self.kernel_value, 0), [batch_size, 1, 1]))
  if self.use_bias:
    query += self.b_query
    key += self.b_key
    value += self.b_value

  def _reshape_multihead(origin_input):
    """
    reshape for multi head
      Input shape: (Batch size, steps, features)
      Output shape: (Batch size * head num, steps, features // head num)
    """
    return tf.concat(tf.split(origin_input, self.head_num, axis=2), axis=0)

  def _reshape_mask(mask):
    """
    repeat mask for multi head
      Input shape: (Batch size, steps)
      Output shape: (Batch size * head num, steps)
    """
    if mask is None:
      return None
    seq_len = tf.shape(mask)[1]
    mask = tf.expand_dims(mask, axis=1)
    mask = tf.tile(mask, [1, self.head_num, 1])
    return tf.reshape(mask, shape=(-1, seq_len))

  query_ = _reshape_multihead(query)
  key_ = _reshape_multihead(key)
  value_ = _reshape_multihead(value)

  key_mask = _reshape_mask(key_mask)

  # (Batch size * head num, query steps, key steps)
  similaritys = tf.matmul(query_, tf.transpose(key_, [0, 2, 1]))
  # scale
  similaritys /= tf.sqrt(tf.cast(dimension_query, tf.float32))

  if self.sequence_mask:
    ones = tf.ones((seq_len, key_len))
    similaritys -= (ones - tf.matrix_band_part(ones, -1, 0)) * 1e9
  if key_mask is not None:
    similaritys -= (1.0 - tf.cast(tf.expand_dims(key_mask, axis=-2),
                                  tf.float32)) * 1e9

  attention_weights = tf.keras.activations.softmax(similaritys)
  attention_outputs = tf.matmul(attention_weights, value_)
  attention_outputs = tf.reshape(
      attention_outputs,
      (-1, self.head_num, seq_len, feature_dim // self.head_num))
  attention_outputs = tf.transpose(attention_outputs, [0, 2, 1, 3])
  attention_outputs = tf.reshape(attention_outputs,
                                 (-1, seq_len, feature_dim))

  attention_outputs = tf.matmul(
      attention_outputs,
      tf.tile(tf.expand_dims(self.kernel_project, 0), [batch_size, 1, 1]))
  if self.use_bias:
    attention_outputs += self.b_project
  if self.activation is not None:
    attention_outputs = self.activation(attention_outputs)

  if query_mask is not None:
    attention_outputs *= tf.cast(tf.expand_dims(query_mask, axis=-1),
                                 tf.float32)
  return attention_outputs