Example 1
    def call(self, inputs, training=None, mask=None):  # pylint: disable=too-many-locals

        input_left = inputs["input_x_left"]
        input_right = inputs["input_x_right"]

        embedding = self.embed
        embed_left = embedding(input_left)
        embed_right = embedding(input_right)

        encoded_left = self.lstm_left(embed_left)
        encoded_right = self.lstm_right(embed_right)

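        # Pairwise similarity: (B, T_left, D) @ (B, D, T_right) -> (B, T_left, T_right),
        # then flattened to (B, T_left * T_right) for the dense layers below.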
        encoded_right = tf.transpose(encoded_right, [0, 2, 1])
        left_right_sim = tf.matmul(encoded_left, encoded_right)
        shape_list = left_right_sim.get_shape()
        newdim = shape_list[1] * shape_list[2]
        sim_matrix = tf.reshape(left_right_sim, [-1, newdim],
                                name="sim_matrix")

        dropout = self.dropout(sim_matrix)
        out = self.outlayer(dropout)

        scores = self.final_dense(out)

        return scores
Example 2
    def call(self, audio_data, sample_rate=None):
        """
    Caculate pitch features of audio data.
    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return: A float tensor of size (1, num_frames) containing pitch features of every frame in speech.
    """

        p = self.config
        with tf.name_scope('pitch'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.float32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.float32))
            with tf.control_dependencies([assert_op]):

                pitch = py_x_ops.pitch(audio_data,
                                       sample_rate,
                                       window_length=p.window_length,
                                       frame_length=p.frame_length,
                                       thres_autoc=p.thres_autoc)

                pitch = tf.squeeze(pitch)
                pitch = tf.transpose(pitch[None, :])
                return pitch
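# Not from the original source: a minimal sketch of the assert-and-run
# pattern used above, assuming TF1-style graph mode.
import tensorflow as tf

expected_rate = tf.constant(16000.0)
given_rate = tf.constant(16000.0)
assert_op = tf.assert_equal(expected_rate, given_rate)
with tf.control_dependencies([assert_op]):
    # Any op created here runs only after the assertion passes.
    checked_rate = tf.identity(given_rate)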
Example 3
        def _dpool_index(one_length_left, one_length_right, fixed_length_left,
                         fixed_length_right):
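            """Build an index grid of shape (fixed_length_left,
            fixed_length_right, 2) that maps a variable-length (left, right)
            pair onto the fixed dynamic-pooling sizes."""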

            logging.info("fixed_length_left: {}".format(fixed_length_left))
            logging.info("fixed_length_right: {}".format(fixed_length_right))

            if one_length_left == 0:
                stride_left = fixed_length_left
            else:
                stride_left = 1.0 * fixed_length_left / tf.cast(
                    one_length_left, dtype=tf.float32)

            if one_length_right == 0:
                stride_right = fixed_length_right
            else:
                stride_right = 1.0 * fixed_length_right / tf.cast(
                    one_length_right, dtype=tf.float32)

            one_idx_left = [
                tf.cast(i / stride_left, dtype=tf.int32)
                for i in range(fixed_length_left)
            ]
            one_idx_right = [
                tf.cast(i / stride_right, dtype=tf.int32)
                for i in range(fixed_length_right)
            ]
            mesh1, mesh2 = tf.meshgrid(one_idx_left, one_idx_right)
            index_one = tf.transpose(tf.stack([mesh1, mesh2]), (2, 1, 0))
            return index_one
Example 4
def splice(feat, left_context, right_context):
    '''
    Splice frames with context.
    param: feat, tf.float32, [batch, time, feat]
    return: spliced feat, tf.float32, [batch, time, feat * (left_context + 1 + right_context)]
    reference:
      https://github.com/kaldi-asr/kaldi/src/feat/feature-functions.cc#L205:6
    '''
    def _loop_continue(time, end_time, context, unused_left_context,
                       unused_right_context, unused_output_tas):
        del unused_output_tas
        del unused_left_context
        del unused_right_context
        return time < end_time

    def _loop_body(time, end_time, context, left_context, right_context,
                   output_tas):
        shape = tf.shape(context)
        B, _, D = shape[0], shape[1], shape[2]
        N = (1 + left_context + right_context) * D

        new_feat = context[:, time:time + left_context + 1 + right_context, :]
        new_feat = tf.reshape(new_feat, [B, N])
        new_output_tas = output_tas.write(time, new_feat)
        return (time + 1, end_time, context, left_context, right_context,
                new_output_tas)

    with tf.control_dependencies([
            tf.assert_greater_equal(left_context, 0),
            tf.assert_greater_equal(right_context, 0)
    ]):
        T = tf.shape(feat)[1]
        output_tas = _new_tensor_array('splice_feat_ta', T, dtype=tf.float32)
        time = tf.constant(0, tf.int32)
        first = tf.tile(feat[:, 0:1, :], [1, left_context, 1])
        last = tf.tile(feat[:, -1:, :], [1, right_context, 1])
        context = tf.concat([first, feat], axis=1)
        context = tf.concat([context, last], axis=1)

        loop_vars = (time, T, context, left_context, right_context, output_tas)

        parallel_iterations = 10
        shape_invariants = tf.nest.map_structure(
            lambda t: tf.TensorShape(None), loop_vars)

        (time, end_time, context, left_context, right_context,
         output_tas) = tf.while_loop(_loop_continue,
                                     _loop_body,
                                     loop_vars=loop_vars,
                                     shape_invariants=shape_invariants,
                                     parallel_iterations=parallel_iterations,
                                     swap_memory=False)
        del context
        del left_context
        del right_context

        batch_spliced_feats = output_tas.stack()
        batch_spliced_feats = tf.transpose(batch_spliced_feats, [1, 0, 2])
    return batch_spliced_feats
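# Not from the original source: a usage sketch under the assumption that
# `_new_tensor_array` wraps tf.TensorArray with the given name, size and dtype
# (TF1 graph mode assumed).
import tensorflow as tf

feat = tf.random_normal([2, 10, 40])  # (batch, time, feat_dim)
spliced = splice(feat, left_context=2, right_context=2)
# spliced: (2, 10, 40 * (2 + 1 + 2)) = (2, 10, 200)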
Example 5
def attention(inputs, attention_size, time_major=False, return_alphas=False):
    """Attention layer."""
    if isinstance(inputs, tuple):
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        inputs = tf.concat(inputs, 2)

    if time_major:
        # (T,B,D) => (B,T,D)
        inputs = tf.transpose(inputs, [1, 0, 2])

    time_size = inputs.shape[1].value  # T value - time size of the RNN layer
    hidden_size = inputs.shape[2].value  # D value - hidden size of the RNN layer

    # Trainable parameters
    W_omega = tf.get_variable(name='W_omega',
                              initializer=tf.random_normal(
                                  [hidden_size, attention_size], stddev=0.1))
    b_omega = tf.get_variable(name='b_omega',
                              initializer=tf.random_normal([attention_size],
                                                           stddev=0.1))
    u_omega = tf.get_variable(name='u_omega',
                              initializer=tf.random_normal([attention_size, 1],
                                                           stddev=0.1))

    # Applying fully connected layer with non-linear activation to each of the B*T timestamps;
    #  the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
    #v = tf.tanh(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
    #v = tf.sigmoid(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
    # (B, T, D) dot (D, Atten)

    logging.info('attention inputs: {}'.format(inputs.shape))
    inputs_reshaped = tf.reshape(inputs, [-1, hidden_size])
    dot = tf.matmul(inputs_reshaped, W_omega)
    dot = tf.reshape(dot, [-1, time_size, attention_size])
    v = tf.sigmoid(dot + b_omega)
    logging.info(f'attention vector: {v.shape}')
    # For each of the timestamps its vector of size A from `v` is reduced with `u` vector
    # (B, T, Atten) dot (Atten)
    #vu = tf.tensordot(v, u_omega, axes=1)   # (B,T) shape
    v = tf.reshape(v, [-1, attention_size])
    vu = tf.matmul(v, u_omega)  # (B,T) shape
    vu = tf.squeeze(vu, axis=-1)
    vu = tf.reshape(vu, [-1, time_size])
    logging.info(f'attention energy: {vu.shape}')
    alphas = tf.nn.softmax(vu)  # (B,T) shape also

    # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
    # [batch, time] -> [batch, time, 1]
    alphas = tf.expand_dims(alphas, -1)
    # [batch, time, dim] -> [batch, dim]
    output = tf.reduce_sum(inputs * alphas, 1)

    if not return_alphas:
        return output

    return output, alphas
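# Not from the original source: a usage sketch assuming TF1 graph mode and a
# statically known (batch, time, dim) input; `attention` is the function above.
import tensorflow as tf

inputs = tf.placeholder(tf.float32, [None, 50, 128])  # (B, T, D)
with tf.variable_scope('attention_demo'):
    output, alphas = attention(inputs, attention_size=64, return_alphas=True)
# output: (B, 128); alphas: (B, 50, 1)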
Example 6
    def split_heads(self, x, batch_size):
        """
    Split hidden_size into depth(hidden_size // num_heads) for
    multi-head attention.
    Args:
      x: (batch_size, seq_len_x, hidden_size)
      batch_size

    Returns:
      split_x: (batch_size, num_heads, seq_len_x, depth)
    """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        split_x = tf.transpose(x, perm=[0, 2, 1, 3])
        return split_x
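# Not from the original source: a standalone shape walk-through with
# hypothetical sizes hidden_size=8, num_heads=2, depth=4.
import tensorflow as tf

x = tf.zeros([3, 5, 8])                 # (batch_size, seq_len, hidden_size)
x = tf.reshape(x, (3, -1, 2, 4))        # (batch_size, seq_len, num_heads, depth)
x = tf.transpose(x, perm=[0, 2, 1, 3])  # (batch_size, num_heads, seq_len, depth)
assert x.shape.as_list() == [3, 2, 5, 4]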
Example 7
    def call(self, inputs, training=None, mask=None):
        """
    The implementation of Multi-headed attention.
    Args:
      inputs = (v, k, q)
      q: (batch_size, seq_len_q, hidden_size)
      k: (batch_size, seq_len_k, hidden_size)
      v: (batch_size, seq_len_v, hidden_size)
      mask: (batch_size, seq_len_q, seq_len_k)

    Returns:
      output: (batch_size, seq_len_q, hidden_size)
      attention_weights: (batch_size, num_heads, seq_len_q, seq_len_k)
    """
        q, k, v = inputs
        batch_size = tf.shape(q)[0]

        q = self.wq(q)  # (batch_size, seq_len_q, hidden_size)
        k = self.wk(k)  # (batch_size, seq_len_k, hidden_size)
        v = self.wv(v)  # (batch_size, seq_len_v, hidden_size)

        q = self.split_heads(
            q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(
            k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(
            v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)

        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        scaled_attention, attention_weights = self.scaled_dot_product_attention(
            q, k, v, mask)

        scaled_attention = tf.transpose(
            scaled_attention,
            perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)

        concat_attention = tf.reshape(
            scaled_attention,
            (batch_size, -1,
             self.hidden_size))  # (batch_size, seq_len_q, hidden_size)

        output = self.dense(
            concat_attention)  # (batch_size, seq_len_q, hidden_size)

        return output, attention_weights
Example 8
def splice_layer(x, name, context):
  '''
  Splice a tensor along the last dimension with context.
  e.g.:
  t = [[[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]]]
  splice_layer(t, 'splice', [0, 1]) =
      [[[1, 2, 3, 4, 5, 6],
        [4, 5, 6, 7, 8, 9],
        [7, 8, 9, 7, 8, 9]]]

  Args:
    x: a tf.Tensor with shape (B, T, D) a.k.a. (N, H, W)
    name: variable scope name
    context: a list of context offsets

  Returns:
    spliced tensor with shape (..., D * len(context))
  '''
  with tf.variable_scope(name):
    input_shape = tf.shape(x)
    B, T = input_shape[0], input_shape[1]
    context_len = len(context)
    array = tf.TensorArray(x.dtype, size=context_len)
    for idx, offset in enumerate(context):
      begin = offset
      end = T + offset
      if begin < 0:
        begin = 0
        sliced = x[:, begin:end, :]
        tiled = tf.tile(x[:, 0:1, :], [1, abs(offset), 1])
        final = tf.concat((tiled, sliced), axis=1)
      else:
        end = T
        sliced = x[:, begin:end, :]
        tiled = tf.tile(x[:, -1:, :], [1, abs(offset), 1])
        final = tf.concat((sliced, tiled), axis=1)
      array = array.write(idx, final)
    spliced = array.stack()
    spliced = tf.transpose(spliced, (1, 2, 0, 3))
    spliced = tf.reshape(spliced, (B, T, -1))
  return spliced
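# Not from the original source: reproducing the docstring example
# (TF1 graph mode assumed).
import tensorflow as tf

t = tf.constant([[[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9]]], dtype=tf.float32)
spliced = splice_layer(t, 'splice_demo', [0, 1])
with tf.Session() as sess:
    print(sess.run(spliced))
    # [[[1. 2. 3. 4. 5. 6.]
    #   [4. 5. 6. 7. 8. 9.]
    #   [7. 8. 9. 7. 8. 9.]]]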
Example 9
    def pooling_layer(self, x, time_len):
        ''' pooling layer '''
        with tf.variable_scope('time_pooling'):
            if self.attention:
                x, self.alphas = common_layers.attention(
                    x, self.netconf['attention_size'], return_alphas=True)
                # alphas shape [batch, time, 1] -> [1, batch, time, 1] -> [1, time, batch, 1]
                tf.summary.image(
                    'alignment',
                    tf.transpose(tf.expand_dims(self.alphas, 0), [0, 2, 1, 3]))
            else:
                if self.netconf['use_lstm_layer']:
                    x = tf.concat(x, 2)
                # [batch, seq_len, dim, 1]
                x = tf.expand_dims(x, axis=-1)
                seq_len = time_len
                x = common_layers.max_pool(x,
                                           ksize=[seq_len, 1],
                                           strides=[seq_len, 1])
                if self.netconf['use_lstm_layer']:
                    x = tf.reshape(x, [-1, 2 * self.netconf['cell_num']])
                else:
                    x = tf.reshape(x, [-1, self.netconf['linear_num']])
            return x
Example 10
    def call(self, inputs, training=None, mask=None):

        query, key, value = self._unpack(inputs)

        query_mask, key_mask, _ = self._unpack(mask)

        batch_size = tf.shape(query)[0]
        dimension_query = query.get_shape().as_list()[-1]
        seq_len = tf.shape(query)[-2]
        key_len = tf.shape(key)[-2]
        feature_dim = tf.shape(value)[-1]

        query = tf.matmul(
            query,
            tf.tile(tf.expand_dims(self.kernel_query, 0), [batch_size, 1, 1]))
        key = tf.matmul(
            key, tf.tile(tf.expand_dims(self.kernel_key, 0),
                         [batch_size, 1, 1]))
        value = tf.matmul(
            value,
            tf.tile(tf.expand_dims(self.kernel_value, 0), [batch_size, 1, 1]))
        if self.use_bias:
            query += self.b_query
            key += self.b_key
            value += self.b_value

        def _reshape_multihead(origin_input):
            """
      reshape for multi head
        Input shape: (Batch size, steps, features)
        Output shape: (Batch size * head num, steps, features // head num)
      """
            return tf.concat(tf.split(origin_input, self.head_num, axis=2),
                             axis=0)

        def _reshape_mask(mask):
            """
      repeat mask for multi head
        Input shape: (Batch size, steps)
        Output shape: (Batch size * head num, steps)
      """
            if mask is None:
                return None
            seq_len = tf.shape(mask)[1]
            mask = tf.expand_dims(mask, axis=1)
            mask = tf.tile(mask, [1, self.head_num, 1])
            return tf.reshape(mask, shape=(-1, seq_len))

        query_ = _reshape_multihead(query)
        key_ = _reshape_multihead(key)
        value_ = _reshape_multihead(value)

        key_mask = _reshape_mask(key_mask)

        # (Batch size * head num, query steps, key steps)
        similaritys = tf.matmul(query_, tf.transpose(key_, [0, 2, 1]))
        # scale
        similaritys /= tf.sqrt(tf.cast(dimension_query, tf.float32))
        if self.sequence_mask:
            ones = tf.ones((seq_len, key_len))
            similaritys -= (ones - tf.matrix_band_part(ones, -1, 0)) * 1e9
        if key_mask is not None:
            similaritys -= (1.0 - tf.cast(tf.expand_dims(key_mask, axis=-2),
                                          tf.float32)) * 1e9

        attention_weights = tf.keras.activations.softmax(similaritys)
        attention_outputs = tf.matmul(attention_weights, value_)
        attention_outputs = tf.reshape(
            attention_outputs,
            (-1, self.head_num, seq_len, feature_dim // self.head_num))
        attention_outputs = tf.transpose(attention_outputs, [0, 2, 1, 3])
        attention_outputs = tf.reshape(attention_outputs,
                                       (-1, seq_len, feature_dim))

        attention_outputs = tf.matmul(
            attention_outputs,
            tf.tile(tf.expand_dims(self.kernel_project, 0),
                    [batch_size, 1, 1]))
        if self.use_bias:
            attention_outputs += self.b_project
        if self.activation is not None:
            attention_outputs = self.activation(attention_outputs)

        if query_mask is not None:
            attention_outputs *= tf.cast(tf.expand_dims(query_mask, axis=-1),
                                         tf.float32)

        return attention_outputs
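# Not from the original source: a small sketch of the sequence-mask step above,
# showing how tf.matrix_band_part builds the causal bias (hypothetical 4x4).
import tensorflow as tf

ones = tf.ones((4, 4))
causal_bias = (ones - tf.matrix_band_part(ones, -1, 0)) * 1e9
# causal_bias is 1e9 strictly above the diagonal and 0 elsewhere, so
# subtracting it from the similarities masks attention to future positions.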