Example #1
    def scaled_dot_product_attention(q, k, v, mask):
        """
    The implementation of scaled attention.
    Args:
      v: (batch_size, seq_len_v, hidden_size)
      k: (batch_size, seq_len_k, hidden_size)
      q: (batch_size, seq_len_q, hidden_size)
      mask: (batch_size, seq_len_q, seq_len_k)

    Returns:
      output: (batch_size, seq_len_q, hidden_size)
      attention_weights: (batch_size, num_heads, seq_len_q, seq_len_k)
    """

        matmul_qk = tf.matmul(
            q, k, transpose_b=True)  # (batch_size, seq_len_q, seq_len_k)

        # Scaled
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        # Masked
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)

        # Normalized
        attention_weights = tf.nn.softmax(
            scaled_attention_logits,
            axis=-1)  # (batch_size, seq_len_q, seq_len_k)

        # Weighted sum
        output = tf.matmul(attention_weights,
                           v)  # (batch_size, seq_len_q, depth_v)

        return output, attention_weights
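
A minimal usage sketch for the function above (illustrative only; assumes TensorFlow 2.x eager execution, that scaled_dot_product_attention is available at module level, and hypothetical shapes):

import tensorflow as tf

q = tf.random.normal([2, 5, 16])   # (batch_size, seq_len_q, hidden_size)
k = tf.random.normal([2, 7, 16])   # (batch_size, seq_len_k, hidden_size)
v = tf.random.normal([2, 7, 16])   # (batch_size, seq_len_v, hidden_size)
output, attention_weights = scaled_dot_product_attention(q, k, v, mask=None)
# output: (2, 5, 16); attention_weights: (2, 5, 7)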
Example #2
def attention(inputs, attention_size, time_major=False, return_alphas=False):
    """Attention layer."""
    if isinstance(inputs, tuple):
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        inputs = tf.concat(inputs, 2)

    if time_major:
        # (T,B,D) => (B,T,D)
        inputs = tf.transpose(inputs, [1, 0, 2])

    time_size = inputs.shape[1].value  # T value - time size of the RNN layer
    hidden_size = inputs.shape[
        2].value  # D value - hidden size of the RNN layer

    # Trainable parameters
    W_omega = tf.get_variable(name='W_omega',
                              initializer=tf.random_normal(
                                  [hidden_size, attention_size], stddev=0.1))
    b_omega = tf.get_variable(name='b_omega',
                              initializer=tf.random_normal([attention_size],
                                                           stddev=0.1))
    u_omega = tf.get_variable(name='u_omega',
                              initializer=tf.random_normal([attention_size, 1],
                                                           stddev=0.1))

    # Applying fully connected layer with non-linear activation to each of the B*T timestamps;
    #  the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
    #v = tf.tanh(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
    #v = tf.sigmoid(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
    # (B, T, D) dot (D, Atten)

    logging.info('attention inputs: {}'.format(inputs.shape))
    inputs_reshaped = tf.reshape(inputs, [-1, hidden_size])
    dot = tf.matmul(inputs_reshaped, W_omega)
    dot = tf.reshape(dot, [-1, time_size, attention_size])
    v = tf.sigmoid(dot + b_omega)
    logging.info(f'attention vector: {v.shape}')
    # For each of the timestamps its vector of size A from `v` is reduced with `u` vector
    # (B, T, Atten) dot (Atten)
    #vu = tf.tensordot(v, u_omega, axes=1)   # (B,T) shape
    v = tf.reshape(v, [-1, attention_size])
    vu = tf.matmul(v, u_omega)  # (B,T) shape
    vu = tf.squeeze(vu, axis=-1)
    vu = tf.reshape(vu, [-1, time_size])
    logging.info(f'attention energy: {vu.shape}')
    alphas = tf.nn.softmax(vu)  # (B,T) shape also

    # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
    # [batch, time] -> [batch, time, 1]
    alphas = tf.expand_dims(alphas, -1)
    # [batch, time, dim] -> [batch, dim]
    output = tf.reduce_sum(inputs * alphas, 1)

    if not return_alphas:
        return output

    return output, alphas
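
A TF1-style usage sketch (illustrative only; assumes TensorFlow 1.x, since the function relies on tf.get_variable and Dimension.value, and that attention is in scope; shapes are hypothetical):

import tensorflow as tf

rnn_outputs = tf.placeholder(tf.float32, [None, 20, 128])  # (B, T, D)
context, alphas = attention(rnn_outputs, attention_size=64, return_alphas=True)
# context: (B, 128) attention-weighted sum over time; alphas: (B, T, 1)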
Example #3
def linear(x, names, shapes, has_bias=True):
    """Linear Layer."""
    assert len(shapes) == 2
    with tf.variable_scope(names):
        weights = tf.get_variable(name='weights',
                                  shape=shapes,
                                  initializer=tf.initializers.glorot_uniform())
        if has_bias:
            bias = tf.get_variable(
                name='bias',
                shape=shapes[1],
                initializer=tf.initializers.glorot_uniform())
            return tf.matmul(x, weights) + bias
        else:
            return tf.matmul(x, weights)
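
A TF1-style usage sketch for the linear layer above (illustrative; assumes TensorFlow 1.x and hypothetical shapes):

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 128])
y = linear(x, 'fc1', shapes=[128, 64])                          # (batch, 64), with bias
y_no_bias = linear(x, 'fc2', shapes=[128, 64], has_bias=False)  # (batch, 64), no bias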
Example #4
    def call(self, inputs, training=None, mask=None):  # pylint: disable=too-many-locals

        input_left = inputs["input_x_left"]
        input_right = inputs["input_x_right"]

        embedding = self.embed
        embed_left = embedding(input_left)
        embed_right = embedding(input_right)

        encoded_left = self.lstm_left(embed_left)
        encoded_right = self.lstm_right(embed_right)

        encoded_right = tf.transpose(encoded_right, [0, 2, 1])
        left_right_sim = tf.matmul(encoded_left, encoded_right)
        shape_list = left_right_sim.get_shape()
        newdim = shape_list[1] * shape_list[2]
        sim_matrix = tf.reshape(left_right_sim, [-1, newdim],
                                name="sim_matrix")

        dropout = self.dropout(sim_matrix)
        out = self.outlayer(dropout)

        scores = self.final_dense(out)

        return scores
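
A standalone sketch of the interaction (similarity-matrix) step in the call above, with dummy tensors in place of the model's embedding and LSTM layers (TF 2.x eager; shapes are illustrative):

import tensorflow as tf

encoded_left = tf.random.normal([4, 10, 32])   # (batch, len_left, units)
encoded_right = tf.random.normal([4, 12, 32])  # (batch, len_right, units)
sim = tf.matmul(encoded_left, tf.transpose(encoded_right, [0, 2, 1]))  # (4, 10, 12)
sim_matrix = tf.reshape(sim, [-1, 10 * 12])    # flattened, ready for the dense layers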
Example #5
    def call(self, inputs, training=None, mask=None):
        batch_size = tf.shape(inputs)[0]
        W_3d = tf.tile(tf.expand_dims(self.W, axis=0),
                       tf.stack([batch_size, 1, 1]))
        # [batch_size, steps, features]
        input_projection = tf.matmul(inputs, W_3d)

        if self.use_bias:
            input_projection += self.b

        input_projection = tf.tanh(input_projection)

        # [batch_size, steps, 1]
        similaritys = tf.reduce_sum(tf.multiply(input_projection,
                                                self.attention_context_vector),
                                    axis=2,
                                    keep_dims=True)

        # [batch_size, steps, 1]
        if mask is not None:
            attention_weights = masked_softmax(similaritys, mask, axis=1)
        else:
            attention_weights = tf.nn.softmax(similaritys, axis=1)

        # [batch_size, features]
        attention_output = tf.reduce_sum(tf.multiply(inputs,
                                                     attention_weights),
                                         axis=1)
        return attention_output
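
A standalone sketch of the same context-vector attention with explicit tensors standing in for the layer's weights (TF 2.x eager; masked_softmax is omitted and all shapes are hypothetical):

import tensorflow as tf

inputs = tf.random.normal([4, 10, 32])            # (batch_size, steps, features)
W = tf.random.normal([32, 32])
attention_context_vector = tf.random.normal([1, 1, 32])

input_projection = tf.tanh(tf.einsum('bsf,fk->bsk', inputs, W))
similarities = tf.reduce_sum(input_projection * attention_context_vector,
                             axis=2, keepdims=True)                   # (4, 10, 1)
attention_weights = tf.nn.softmax(similarities, axis=1)
attention_output = tf.reduce_sum(inputs * attention_weights, axis=1)  # (4, 32)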
Example #6
    def logits_layer(self, x, labels):
        ''' Logits layer whose output is fed into a downstream softmax. '''
        if labels is None:
            # serving export mode, no need for logits
            return x

        output_num = self.taskconf['classes']['num']
        logits_type = self.netconf['logits_type']
        logits_shape = [x.shape[-1].value, output_num]

        with tf.variable_scope('logits'):
            init_type = self.netconf['logits_weight_init']['type']
            if init_type == 'truncated_normal':
                stddev = self.netconf['logits_weight_init']['stddev']
                init = tf.truncated_normal_initializer(stddev=stddev)
            elif init_type == 'xavier_uniform':
                init = tf.contrib.layers.xavier_initializer(uniform=True)
            elif init_type == 'xavier_norm':
                init = tf.contrib.layers.xavier_initializer(uniform=False)
            else:
                raise ValueError('Unsupported weight init type: %s' %
                                 (init_type))

            weights = tf.get_variable(name='weights',
                                      shape=logits_shape,
                                      initializer=init)

            if logits_type == 'linear':
                bias = tf.get_variable(
                    name='bias',
                    shape=logits_shape[1],
                    initializer=tf.constant_initializer(0.0))
                return tf.matmul(x, weights) + bias
            elif logits_type == 'linear_no_bias':
                return tf.matmul(x, weights)
            elif logits_type == 'arcface':
                return self.arcface_layer(x, labels, output_num, weights)
            else:
                raise ValueError('Unsupported logits type: %s' % (logits_type))
Example #7
    def call(self, tensors):
        """Attention layer."""
        left, right = tensors

        len_left = left.shape[1]
        len_right = right.shape[1]
        tensor_left = tf.expand_dims(left, axis=2)
        tensor_right = tf.expand_dims(right, axis=1)
        tensor_left = tf.tile(tensor_left, [1, 1, len_right, 1])
        tensor_right = tf.tile(tensor_right, [1, len_left, 1, 1])
        tensor_merged = tf.concat([tensor_left, tensor_right], axis=-1)
        middle_output = self.middle_layer(tensor_merged)
        attn_scores = self.attn(middle_output)
        attn_scores = tf.squeeze(attn_scores, axis=3)
        exp_attn_scores = tf.exp(
            attn_scores - tf.reduce_max(attn_scores, axis=-1, keepdims=True))
        exp_sum = tf.reduce_sum(exp_attn_scores, axis=-1, keepdims=True)
        attention_weights = exp_attn_scores / exp_sum
        return tf.matmul(attention_weights, right)
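
A standalone sketch of the pairwise attention above, with hypothetical Dense layers standing in for self.middle_layer and self.attn (TF 2.x eager; shapes are illustrative):

import tensorflow as tf

left = tf.random.normal([4, 6, 32])    # (batch, len_left, hidden)
right = tf.random.normal([4, 8, 32])   # (batch, len_right, hidden)
middle_layer = tf.keras.layers.Dense(16, activation='relu')
attn = tf.keras.layers.Dense(1)

merged = tf.concat([tf.tile(tf.expand_dims(left, 2), [1, 1, 8, 1]),
                    tf.tile(tf.expand_dims(right, 1), [1, 6, 1, 1])], axis=-1)
scores = tf.squeeze(attn(middle_layer(merged)), axis=3)  # (4, 6, 8)
weights = tf.nn.softmax(scores, axis=-1)
aligned_right = tf.matmul(weights, right)                # (4, 6, 32)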
Example #8
def arcface_loss(embedding,
                 labels,
                 out_num,
                 weights=None,
                 s=64.,
                 m=0.5,
                 limit_to_pi=True):
    '''
  https://github.com/auroua/InsightFace_TF/blob/master/losses/face_losses.py
  :param embedding: the input embedding vectors
  :param labels: the input labels; the shape should be e.g. (batch_size, 1)
  :param s: scalar value default is 64
  :param out_num: output class num
  :param weights: a tf.variable with shape (embedding.shape[-1], out_num)
                  or None to make a new one internally. default = None
  :param m: the margin value, default is 0.5
  :return: the final calculated output; this output is fed into tf.nn.softmax directly
  '''
    cos_m = math.cos(m)
    sin_m = math.sin(m)
    mm = sin_m * m  # issue 1
    threshold = math.cos(math.pi - m)
    with tf.variable_scope('arcface_loss'):
        # inputs and weights norm
        embedding_norm = tf.norm(embedding, axis=1, keep_dims=True)
        embedding = tf.div(embedding, embedding_norm, name='norm_embedding')
        if weights is None:
            weights = tf.get_variable(
                name='weights',
                shape=[embedding.shape[-1].value, out_num],
                initializer=tf.initializers.glorot_uniform())
        weights_norm = tf.norm(weights, axis=0, keep_dims=True)
        weights = tf.div(weights, weights_norm, name='norm_weights')
        # cos(theta+m)
        cos_t = tf.matmul(embedding, weights, name='cos_t')
        cos_t2 = tf.square(cos_t, name='cos_2')
        sin_t2 = tf.subtract(1., cos_t2, name='sin_2')
        sin_t = tf.sqrt(sin_t2, name='sin_t')
        cos_mt = s * tf.subtract(tf.multiply(cos_t, cos_m),
                                 tf.multiply(sin_t, sin_m),
                                 name='cos_mt')

        if limit_to_pi:
            # this condition controls the theta+m should in range [0, pi]
            #      0<=theta+m<=pi
            #     -m<=theta<=pi-m
            cond_v = cos_t - threshold
            cond = tf.cast(tf.nn.relu(cond_v, name='if_else'), dtype=tf.bool)

            keep_val = s * (cos_t - mm)
            cos_mt_temp = tf.where(cond, cos_mt, keep_val)
        else:
            cos_mt_temp = cos_mt

        mask = tf.one_hot(labels, depth=out_num, name='one_hot_mask')
        # mask = tf.squeeze(mask, 1)
        inv_mask = tf.subtract(1., mask, name='inverse_mask')

        s_cos_t = tf.multiply(s, cos_t, name='scalar_cos_t')

        output = tf.add(tf.multiply(s_cos_t, inv_mask),
                        tf.multiply(cos_mt_temp, mask),
                        name='arcface_loss_output')
    return output
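
A TF1-style usage sketch (illustrative; assumes TensorFlow 1.x and hypothetical shapes; as the docstring notes, the returned logits feed a softmax cross-entropy loss):

import tensorflow as tf

embeddings = tf.placeholder(tf.float32, [None, 512])
labels = tf.placeholder(tf.int64, [None])            # integer class ids
logits = arcface_loss(embeddings, labels, out_num=1000, s=64., m=0.5)
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))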
Example #9
    def call(self, inputs, training=None, mask=None):

        query, key, value = self._unpack(inputs)

        query_mask, key_mask, _ = self._unpack(mask)

        batch_size = tf.shape(query)[0]
        dimension_query = query.get_shape().as_list()[-1]
        seq_len = tf.shape(query)[-2]
        key_len = tf.shape(key)[-2]
        feature_dim = tf.shape(value)[-1]

        query = tf.matmul(
            query,
            tf.tile(tf.expand_dims(self.kernel_query, 0), [batch_size, 1, 1]))
        key = tf.matmul(
            key, tf.tile(tf.expand_dims(self.kernel_key, 0),
                         [batch_size, 1, 1]))
        value = tf.matmul(
            value,
            tf.tile(tf.expand_dims(self.kernel_value, 0), [batch_size, 1, 1]))
        if self.use_bias:
            query += self.b_query
            key += self.b_key
            value += self.b_value

        def _reshape_multihead(origin_input):
            """
      reshape for multi head
        Input shape: (Batch size, steps, features)
        Output shape: (Batch size * head num, steps, features // head num)
      """
            return tf.concat(tf.split(origin_input, self.head_num, axis=2),
                             axis=0)

        def _reshape_mask(mask):
            """
      repeat mask for multi head
        Input shape: (Batch size, steps)
        Output shape: (Batch size * head num, steps)
      """
            if mask is None:
                return None
            seq_len = tf.shape(mask)[1]
            mask = tf.expand_dims(mask, axis=1)
            mask = tf.tile(mask, [1, self.head_num, 1])
            return tf.reshape(mask, shape=(-1, seq_len))

        query_ = _reshape_multihead(query)
        key_ = _reshape_multihead(key)
        value_ = _reshape_multihead(value)

        key_mask = _reshape_mask(key_mask)

        # (Batch size * head num, query steps, key steps)
        similaritys = tf.matmul(query_, tf.transpose(key_, [0, 2, 1]))
        # scale
        similaritys /= tf.sqrt(tf.cast(dimension_query, tf.float32))
        if self.sequence_mask:
            ones = tf.ones((seq_len, key_len))
            similaritys -= (ones - tf.matrix_band_part(ones, -1, 0)) * 1e9
        if key_mask is not None:
            similaritys -= (1.0 - tf.cast(tf.expand_dims(key_mask, axis=-2),
                                          tf.float32)) * 1e9

        attention_weights = tf.keras.activations.softmax(similaritys)
        attention_outputs = tf.matmul(attention_weights, value_)
        attention_outputs = tf.reshape(
            attention_outputs,
            (-1, self.head_num, seq_len, feature_dim // self.head_num))
        attention_outputs = tf.transpose(attention_outputs, [0, 2, 1, 3])
        attention_outputs = tf.reshape(attention_outputs,
                                       (-1, seq_len, feature_dim))

        attention_outputs = tf.matmul(
            attention_outputs,
            tf.tile(tf.expand_dims(self.kernel_project, 0),
                    [batch_size, 1, 1]))
        if self.use_bias:
            attention_outputs += self.b_project
        if self.activation is not None:
            attention_outputs = self.activation(attention_outputs)

        if query_mask is not None:
            attention_outputs *= tf.cast(tf.expand_dims(query_mask, axis=-1),
                                         tf.float32)

        return attention_outputs