Example #1
        def grow_finished(finished_seq, finished_scores, finished_flags,
                          curr_seq, curr_scores, curr_finished):
            """
        Given sequences and scores from finished sequence and current finished sequence
        , will gather the top k=beam size sequences to update finished seq.
      """
            # padding zero for finished seq
            finished_seq = tf.concat(
                [finished_seq,
                 tf.zeros([batch_size, beam_size, 1], tf.int32)],
                axis=2)

            # mask unfinished curr seq
            curr_scores += (1. - tf.to_float(curr_finished)) * -INF

            # concatenating the sequences and scores along beam axis
            # (batch_size, 2xbeam_size, seq_len)
            curr_finished_seq = tf.concat([finished_seq, curr_seq], axis=1)
            curr_finished_scores = tf.concat([finished_scores, curr_scores],
                                             axis=1)
            curr_finished_flags = tf.concat([finished_flags, curr_finished],
                                            axis=1)
            return utils.compute_topk_scores_and_seq(
                curr_finished_seq, curr_finished_scores, curr_finished_scores,
                curr_finished_flags, beam_size, batch_size, "grow_finished")
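A side note on the masking line above: adding -INF to unfinished candidates guarantees they lose the top-k selection. A minimal NumPy sketch of the same idea (the INF constant, scores, and flags here are made-up values, not the library's):

import numpy as np

INF = 1e7  # stand-in for the module's INF constant
curr_scores = np.array([[-1.2, -0.8, -2.0]])      # (batch=1, beam=3)
curr_finished = np.array([[True, False, True]])   # only finished beams may be kept

masked = curr_scores + (1.0 - curr_finished.astype(np.float32)) * -INF
top2 = np.argsort(masked, axis=1)[:, ::-1][:, :2]
print(top2)  # [[0 2]] -> the unfinished beam (index 1) is never selected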
Example #2
def splice(feat, left_context, right_context):
    '''Splice frames with left/right context.
    param: feat, tf.float32, [batch, time, feat]
    return: feat, tf.float32, [batch, time, feat * (left_context + 1 + right_context)]
    reference:
      https://github.com/kaldi-asr/kaldi/src/feat/feature-functions.cc#L205:6
    '''
    def _loop_continue(time, end_time, context, unused_left_context,
                       right_context, unused_output_tas):
        del unused_output_tas
        del unused_left_context
        return time < end_time

    def _loop_body(time, end_time, context, left_context, right_context,
                   output_tas):
        shape = tf.shape(context)
        B, _, D = shape[0], shape[1], shape[2]
        N = (1 + left_context + right_context) * D

        new_feat = context[:, time:time + left_context + 1 + right_context, :]
        new_feat = tf.reshape(new_feat, [B, N])
        new_output_tas = output_tas.write(time, new_feat)
        return (time + 1, end_time, context, left_context, right_context,
                new_output_tas)

    with tf.control_dependencies([
            tf.assert_greater_equal(left_context, 0),
            tf.assert_greater_equal(right_context, 0)
    ]):
        T = tf.shape(feat)[1]
        output_tas = _new_tensor_array('splice_feat_ta', T, dtype=tf.float32)
        time = tf.constant(0, tf.int32)
        first = tf.tile(feat[:, 0:1, :], [1, left_context, 1])
        last = tf.tile(feat[:, -1:, :], [1, right_context, 1])
        context = tf.concat([first, feat], axis=1)
        context = tf.concat([context, last], axis=1)

        loop_vars = (time, T, context, left_context, right_context, output_tas)

        parallel_iterations = 10
        shape_invariants = tf.nest.map_structure(
            lambda t: tf.TensorShape(None), loop_vars)

        (time, end_time, context, left_context, right_context,
         output_tas) = tf.while_loop(_loop_continue,
                                     _loop_body,
                                     loop_vars=loop_vars,
                                     shape_invariants=shape_invariants,
                                     parallel_iterations=parallel_iterations,
                                     swap_memory=False)
        del context
        del left_context
        del right_context

        batch_spliced_feats = output_tas.stack()
        batch_spliced_feats = tf.transpose(batch_spliced_feats, [1, 0, 2])
    return batch_spliced_feats
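For intuition, a rough NumPy equivalent of the splice above (edge frames replicated as padding, then each step's context window flattened); this is a sketch of the shape arithmetic, not the library function:

import numpy as np

def splice_np(feat, left, right):
    # feat: (batch, time, dim); replicate edge frames as padding
    first = np.repeat(feat[:, :1, :], left, axis=1)
    last = np.repeat(feat[:, -1:, :], right, axis=1)
    ctx = np.concatenate([first, feat, last], axis=1)
    B, T, D = feat.shape
    out = np.stack(
        [ctx[:, t:t + left + 1 + right, :].reshape(B, -1) for t in range(T)],
        axis=1)
    return out  # (batch, time, dim * (left + 1 + right))

x = np.arange(6, dtype=np.float32).reshape(1, 3, 2)
print(splice_np(x, 1, 1).shape)  # (1, 3, 6)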
Example #3
def extract_feature(waveforms, params):
    '''Extract fbank features with delta-delta and apply CMVN.
    waveforms: [batch, samples]
    '''
    p = params
    with tf.variable_scope('feature_extractor'):
        mel_fbanks = extract_logfbank_with_delta(waveforms, params)
        # shape: [1, nframes, nbins, nchannels]
        fbank_size = utils.shape_list(mel_fbanks)
        #assert fbank_size[0] == 1

        # This replaces CMVN estimation on data
        if not p.audio_global_cmvn:
            mean = tf.reduce_mean(mel_fbanks, keepdims=True, axis=1)
            variance = tf.reduce_mean(tf.square(mel_fbanks - mean),
                                      keepdims=True,
                                      axis=1)
        else:
            assert p.audio_cmvn_path, p.audio_cmvn_path
            mean, variance = utils.load_cmvn(p.audio_cmvn_path)

        var_epsilon = 1e-09
        mel_fbanks = utils.apply_cmvn(mel_fbanks, mean, variance, var_epsilon)

        # Later models like to flatten the two spatial dims. Instead, we add a
        # unit spatial dim and flatten the frequencies and channels.
        batch_size = fbank_size[0]
        feats = tf.concat([
            tf.reshape(
                mel_fbanks,
                [batch_size, fbank_size[1], fbank_size[2], fbank_size[3]]),
            tf.zeros((batch_size, p.num_zeropad_frames, fbank_size[2],
                      fbank_size[3]))
        ], 1)
    return feats  # shape [batch_size, nframes, feature_size, channels]
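`utils.apply_cmvn` is not shown here; standard CMVN normalizes each feature as (x - mean) / sqrt(variance + epsilon). A NumPy sketch under that assumption:

import numpy as np

def apply_cmvn_np(feats, mean, variance, epsilon=1e-9):
    # cepstral mean and variance normalization
    return (feats - mean) / np.sqrt(variance + epsilon)

x = np.random.randn(1, 100, 40, 3).astype(np.float32)  # (batch, frames, bins, channels)
mean = x.mean(axis=1, keepdims=True)
var = x.var(axis=1, keepdims=True)
normed = apply_cmvn_np(x, mean, var)
print(abs(normed.mean(axis=1)).max())  # ~0: per-bin means are removed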
Example #4
def split_one_doc_to_true_len_sens(doc_t, split_token, padding_token,
                                   max_doc_len, max_sen_len):
    """
  Split a document to sentences with true sentence lengths.
  doc_t: [doc_word_len]
  out_t: [max_doc_len, max_sen_len]
  """
    if len(doc_t.get_shape()) == 1:
        split_token_index = tf.squeeze(tf.where(tf.equal(doc_t, split_token)),
                                       axis=1)
        split_token_index.set_shape([None])
        split_len_part_1 = split_token_index[:1] + 1
        split_len_part_2 = split_token_index[1:] - split_token_index[:-1]
        split_lens = tf.concat([split_len_part_1, split_len_part_2], axis=0)
        split_lens = cut_or_padding(split_lens,
                                    max_doc_len,
                                    padding_token=padding_token)
        new_doc_len = tf.reduce_sum(split_lens)
        split_sentences = tf.split(doc_t[:new_doc_len], split_lens)
        split_sentences = [
            cut_or_padding(s, max_sen_len) for s in split_sentences
        ]
        out_t = tf.stack(split_sentences)
        padding_tokens = tf.multiply(tf.ones_like(out_t, dtype=tf.int32),
                                     padding_token)
        out_t = tf.where(tf.equal(out_t, split_token), padding_tokens, out_t)
        return out_t

    raise ValueError("doc_t should be a tensor with rank 1.")
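The split-length arithmetic is easiest to see on a concrete sequence. A pure-NumPy walk-through with made-up token ids (0 as the split token):

import numpy as np

doc = np.array([5, 6, 0, 7, 0, 8, 9, 0])  # three sentences, each ending in split token 0
idx = np.where(doc == 0)[0]               # [2, 4, 7]
lens = np.concatenate([idx[:1] + 1, idx[1:] - idx[:-1]])
print(lens)  # [3, 2, 3] -> sentence lengths, including the split token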
Example #5
    def pooling_layer(self, x, pooling_type=None):
        '''Add a pooling layer across the whole utterance.
        Input: [B, T, D] --> reduce along T.

        Statistics pooling output: [B, D * 2]
        Average pooling output: [B, D]
        '''
        assert_rank3 = tf.debugging.assert_rank(x, 3)
        with tf.control_dependencies([assert_rank3]):
            x = tf.identity(x)

        pooling_type = pooling_type if pooling_type else self.netconf[
            'frame_pooling_type']
        if pooling_type == 'stats':
            with tf.name_scope('stats_pooling'):
                mean, var = tf.nn.moments(x, 1)
                x = tf.concat([mean, tf.sqrt(var + 1e-6)], 1)
        elif pooling_type == 'average':
            with tf.name_scope('average_pooling'):
                mean, _ = tf.nn.moments(x, 1)
                x = mean
        else:
            raise ValueError('Unsupported frame_pooling_type: %s' %
                             (pooling_type))

        assert_rank2 = tf.debugging.assert_rank(x, 2)
        with tf.control_dependencies([assert_rank2]):
            x = tf.identity(x)

        return x
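Statistics pooling concatenates the mean and standard deviation over time, so D doubles while T is reduced away. A NumPy sketch:

import numpy as np

x = np.random.randn(4, 100, 64).astype(np.float32)  # (B, T, D)
mean = x.mean(axis=1)
var = x.var(axis=1)
stats = np.concatenate([mean, np.sqrt(var + 1e-6)], axis=1)
print(stats.shape)  # (4, 128) -> [B, D * 2]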
Example #6
    def call(self, audio_data, sample_rate=None):
        """
    Caculate fbank && pitch(concat) features of wav.
    :param audio_data: the audio signal from which to compute spectrum.
                       Should be an (1, N) tensor.
    :param sample_rate: the samplerate of the signal we working with.
    :return: A tensor with shape (num_frames, dim_features), containing
            fbank && pitch feature of every frame in speech.
    """

        p = self.config
        with tf.name_scope('fbank_pitch'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                fbank_feats = tf.squeeze(self.fbank(audio_data, sample_rate))
                pitch_feats = tf.squeeze(self.pitch(audio_data, sample_rate))
                fbank_pitch_feats = tf.concat([fbank_feats, pitch_feats], 1)

                return fbank_pitch_feats
Example #7
  def call(self, inputs, training=None, mask=None):
    input_x = inputs["input_x"]
    if self.use_dense_task:
      dense_input = inputs["input_dense"]

    # [batch_size, max_len, embed_len]
    out = self.embed(input_x)
    if self.use_pretrained_model:
      logging.info("use_pretrained_model: {}, {}".format(
          self.pretrained_model_name, self.pretrained_model_mode))
      if self.pretrained_model_name == 'elmo':
        input_px = self.get_pre_train_graph(input_x)
        input_px = tf.reshape(input_px,
                              [-1, self.max_len, self.pretrained_model_dim])
        out = tf.concat([out, input_px], axis=-1)
        out = tf.reduce_max(out, axis=1)
      if self.pretrained_model_name == 'bert':
        out = self.get_pre_train_graph(input_x)
    else:
      out = tf.reduce_max(out, axis=1)
    out = self.embed_d(out, training=training)
    if self.use_dense_input:
      dense_out = self.dense_input_linear(dense_input)
      if self.only_dense_input:
        out = dense_out
      else:
        out = tf.keras.layers.Concatenate()([out, dense_out])
    # [batch_size, class_num]
    scores = self.final_dense(out)
    return scores
Example #8
   def _reshape_multihead(origin_input):
       """
 reshape for multi head
   Input shape: (Batch size, steps, features)
   Output shape: (Batch size * head num, steps, features // head num)
 """
       return tf.concat(tf.split(origin_input, self.head_num, axis=2),
                        axis=0)
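The split-then-concat trick moves the head dimension into the batch axis without an explicit transpose. A NumPy sketch with made-up sizes:

import numpy as np

head_num = 4
x = np.random.randn(2, 10, 16)  # (batch, steps, features)
heads = np.concatenate(np.split(x, head_num, axis=2), axis=0)
print(heads.shape)  # (8, 10, 4) -> (batch * head_num, steps, features // head_num)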
Example #9
  def call(self, inputs, training=None, mask=None):  # pylint: disable=too-many-locals
    input_x = tf.identity(inputs["input_x"], name='input_x')
    if self.use_dense_task:
      dense_input = inputs["input_dense"]
    if self.use_true_length:
      # [batch_size, max_doc_len, max_sen_len]
      input_hx = self.pad_to_hier_input_true_len(
          input_x,
          self.max_doc_len,
          self.max_sen_len,
          self.split_token,
          padding_token=self.padding_token)
    else:
      # [batch_size, max_doc_len, max_sen_len]
      input_hx = self.pad_to_hier_input(
          input_x,
          self.max_doc_len,
          self.max_sen_len,
          padding_token=self.padding_token)

    # [batch_size, max_doc_len]
    sen_lens = compute_sen_lens(input_hx, padding_token=self.padding_token)
    # [batch_size]
    doc_lens = compute_doc_lens(sen_lens)
    # [batch_size, max_doc_len, max_sen_len, 1]
    sen_mask = tf.expand_dims(
        tf.sequence_mask(sen_lens, self.max_sen_len, dtype=tf.float32), axis=-1)

    # [batch_size, max_doc_len, 1]
    doc_mask = tf.expand_dims(
        tf.sequence_mask(doc_lens, self.max_doc_len, dtype=tf.float32), axis=-1)

    # [batch_size, max_doc_len, max_sen_len, embed_len]
    out = self.embed(input_hx)
    if self.use_pretrained_model:
      input_px = self.get_pre_train_graph(input_x)
      input_px = tf.reshape(
          input_px,
          [-1, self.max_doc_len, self.max_sen_len, self.pretrained_model_dim])
      out = tf.concat([out, input_px], axis=-1)
    out = self.embed_d(out, training=training)
    all_sen_encoder = tf.keras.layers.TimeDistributed(self.sen_encoder)
    # [batch_size, max_doc_len, features]
    out = all_sen_encoder(out, training=training, mask=sen_mask)
    # [batch_size, features]
    out = self.doc_encoder(out, training=training, mask=doc_mask)

    if self.use_dense_input:
      dense_out = self.dense_input_linear(dense_input)
      if self.only_dense_input:
        out = dense_out
      else:
        out = tf.keras.layers.Concatenate()([out, dense_out])

    # [batch_size, class_num]
    scores = self.final_dense(out)

    return scores
Example #10
def attention(inputs, attention_size, time_major=False, return_alphas=False):
    """Attention layer."""
    if isinstance(inputs, tuple):
        # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
        inputs = tf.concat(inputs, 2)

    if time_major:
        # (T,B,D) => (B,T,D)
        inputs = tf.transpose(inputs, [1, 0, 2])

    time_size = inputs.shape[1].value  # T: time size of the RNN layer
    hidden_size = inputs.shape[2].value  # D: hidden size of the RNN layer

    # Trainable parameters
    W_omega = tf.get_variable(name='W_omega',
                              initializer=tf.random_normal(
                                  [hidden_size, attention_size], stddev=0.1))
    b_omega = tf.get_variable(name='b_omega',
                              initializer=tf.random_normal([attention_size],
                                                           stddev=0.1))
    u_omega = tf.get_variable(name='u_omega',
                              initializer=tf.random_normal([attention_size, 1],
                                                           stddev=0.1))

    # Applying fully connected layer with non-linear activation to each of the B*T timestamps;
    #  the shape of `v` is (B,T,D)*(D,A)=(B,T,A), where A=attention_size
    #v = tf.tanh(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
    #v = tf.sigmoid(tf.tensordot(inputs, W_omega, axes=1) + b_omega)
    # (B, T, D) dot (D, Atten)

    logging.info('attention inputs: {}'.format(inputs.shape))
    inputs_reshaped = tf.reshape(inputs, [-1, hidden_size])
    dot = tf.matmul(inputs_reshaped, W_omega)
    dot = tf.reshape(dot, [-1, time_size, attention_size])
    v = tf.sigmoid(dot + b_omega)
    logging.info(f'attention vector: {v.shape}')
    # For each of the timestamps its vector of size A from `v` is reduced with `u` vector
    # (B, T, Atten) dot (Atten)
    #vu = tf.tensordot(v, u_omega, axes=1)   # (B,T) shape
    v = tf.reshape(v, [-1, attention_size])
    vu = tf.matmul(v, u_omega)  # (B,T) shape
    vu = tf.squeeze(vu, axis=-1)
    vu = tf.reshape(vu, [-1, time_size])
    logging.info(f'attention energy: {vu.shape}')
    alphas = tf.nn.softmax(vu)  # (B,T) shape also

    # Output of (Bi-)RNN is reduced with attention vector; the result has (B,D) shape
    # [batch, time] -> [batch, time, 1]
    alphas = tf.expand_dims(alphas, -1)
    # [batch, time, dim] -> [batch, dim]
    output = tf.reduce_sum(inputs * alphas, 1)

    if not return_alphas:
        return output

    return output, alphas
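End to end, the layer above computes sigmoid scores, softmax weights, and a weighted sum over time. A NumPy sketch of the same flow with made-up sizes:

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

B, T, D, A = 2, 5, 8, 4
inputs = np.random.randn(B, T, D)
W = np.random.randn(D, A) * 0.1
b = np.random.randn(A) * 0.1
u = np.random.randn(A, 1) * 0.1

v = 1.0 / (1.0 + np.exp(-(inputs @ W + b)))  # sigmoid scores, (B, T, A)
vu = (v @ u).squeeze(-1)                     # energies, (B, T)
alphas = softmax(vu)                         # attention weights, (B, T)
output = (inputs * alphas[..., None]).sum(1)  # weighted sum, (B, D)
print(output.shape)  # (2, 8)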
Example #11
def splice_layer(x, name, context):
  '''
  Splice a tensor along the last dimension with context.
  e.g.:
  t = [[[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]]]
  splice_tensor(t, [0, 1]) =
      [[[1, 2, 3, 4, 5, 6],
        [4, 5, 6, 7, 8, 9],
        [7, 8, 9, 7, 8, 9]]]

  Args:
    tensor: a tf.Tensor with shape (B, T, D) a.k.a. (N, H, W)
    context: a list of context offsets

  Returns:
    spliced tensor with shape (..., D * len(context))
  '''
  with tf.variable_scope(name):
    input_shape = tf.shape(x)
    B, T = input_shape[0], input_shape[1]
    context_len = len(context)
    array = tf.TensorArray(x.dtype, size=context_len)
    for idx, offset in enumerate(context):
      begin = offset
      end = T + offset
      if begin < 0:
        begin = 0
        sliced = x[:, begin:end, :]
        tiled = tf.tile(x[:, 0:1, :], [1, abs(offset), 1])
        final = tf.concat((tiled, sliced), axis=1)
      else:
        end = T
        sliced = x[:, begin:end, :]
        tiled = tf.tile(x[:, -1:, :], [1, abs(offset), 1])
        final = tf.concat((sliced, tiled), axis=1)
      array = array.write(idx, final)
    spliced = array.stack()
    spliced = tf.transpose(spliced, (1, 2, 0, 3))
    spliced = tf.reshape(spliced, (B, T, -1))
  return spliced
Example #12
 def text_layer(self, x, input_text):
     ''' text embedding layer '''
     with tf.variable_scope('text'):
         embedding_chars_expanded = common_layers.embedding_look_up(
             input_text, self.vocab_size, self.netconf['embedding_dim'])
         h_pool_flat = common_layers.conv_pool(
             embedding_chars_expanded,
             list(map(int, self.netconf['filter_sizes'])),
             self.netconf['embedding_dim'], self.netconf['num_filters'],
             input_text.shape[1])
         outputs = tf.concat((x, h_pool_flat), axis=1)
     return outputs
Example #13
def logits_blankid_to_last(logits, blank_index):
  ''' Move the blank-label column to the end of the logits matrix. '''
  num_class = logits.shape[2]
  transform_preprocess(blank_index=blank_index, num_class=num_class)

  if blank_index != (num_class - 1):
    logits = tf.concat([
        logits[:, :, :blank_index], logits[:, :, blank_index + 1:],
        logits[:, :, blank_index:blank_index + 1]
    ],
                       axis=2)

  return logits
Example #14
        def grow_topk(i, alive_seq, alive_log_probs, states):
            """Inner beam search loop."""

            # (batch_size * beam_size, decoded_length)
            flat_ids = tf.reshape(alive_seq, [batch_size * beam_size, -1])

            if states:
                flat_states = nest.map_structure(_merge_beam_dim, states)
                flat_logits, flat_states = symbols_to_logits_fn(
                    flat_ids, i, flat_states)
                states = nest.map_structure(
                    lambda t: _unmerge_beam_dim(t, batch_size, beam_size),
                    flat_states)
            else:
                flat_logits = symbols_to_logits_fn(flat_ids)

            logits = tf.reshape(flat_logits, [batch_size, beam_size, -1])

            candidate_log_probs = log_prob_from_logits(logits)

            log_probs = candidate_log_probs + tf.expand_dims(alive_log_probs,
                                                             axis=2)

            length_penalty = tf.pow(((5. + tf.to_float(i + 1)) / 6.), alpha)

            curr_scores = log_probs / length_penalty
            flat_curr_scores = tf.reshape(curr_scores,
                                          [-1, beam_size * vocab_size])

            topk_scores, topk_ids = tf.nn.top_k(flat_curr_scores,
                                                k=beam_size * 2)

            topk_log_probs = topk_scores * length_penalty

            topk_beam_index = topk_ids // vocab_size
            topk_ids %= vocab_size  # Unflatten the ids
            batch_pos = compute_batch_indices(batch_size, beam_size * 2)
            topk_coordinates = tf.stack([batch_pos, topk_beam_index], axis=2)

            topk_seq = tf.gather_nd(alive_seq, topk_coordinates)
            if states:
                states = nest.map_structure(
                    lambda state: tf.gather_nd(state, topk_coordinates),
                    states)
            topk_seq = tf.concat(
                [topk_seq, tf.expand_dims(topk_ids, axis=2)], axis=2)

            topk_finished = tf.equal(topk_ids, eos_id)

            return topk_seq, topk_log_probs, topk_scores, topk_finished, states
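The length penalty here is the GNMT one, lp(i) = ((5 + i + 1) / 6) ** alpha; dividing log probabilities by it keeps longer hypotheses competitive. A quick check of its values:

# GNMT length penalty: ((5 + length) / 6) ** alpha
for length, alpha in [(1, 0.6), (10, 0.6), (10, 1.0)]:
    print(length, alpha, round(((5.0 + length) / 6.0) ** alpha, 3))
# 1 0.6 1.0 ; 10 0.6 1.733 ; 10 1.0 2.5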
Example #15
    def call(self, tensors):
        """Attention layer."""
        left, right = tensors

        len_left = left.shape[1]
        len_right = right.shape[1]
        tensor_left = tf.expand_dims(left, axis=2)
        tensor_right = tf.expand_dims(right, axis=1)
        tensor_left = tf.tile(tensor_left, [1, 1, len_right, 1])
        tensor_right = tf.tile(tensor_right, [1, len_left, 1, 1])
        tensor_merged = tf.concat([tensor_left, tensor_right], axis=-1)
        middle_output = self.middle_layer(tensor_merged)
        attn_scores = self.attn(middle_output)
        attn_scores = tf.squeeze(attn_scores, axis=3)
        exp_attn_scores = tf.exp(
            attn_scores - tf.reduce_max(attn_scores, axis=-1, keepdims=True))
        exp_sum = tf.reduce_sum(exp_attn_scores, axis=-1, keepdims=True)
        attention_weights = exp_attn_scores / exp_sum
        return tf.matmul(attention_weights, right)
Example #16
 def call(self, inputs, training=None, mask=None):
     input_x = inputs["input_x"]
     # [batch_size, max_len, embed_len]
     out = self.embed(input_x)
     # [batch_size, features]
     if self.use_pretrained_model:
         logging.info("use_pretrained_model: {}, {}".format(
             self.pretrained_model_name, self.pretrained_model_mode))
         if self.pretrained_model_name == 'bert' and \
           self.pretrained_model_mode == 'fine-tune':
             out = self.get_pre_train_graph(input_x)
         else:
             input_px = self.get_pre_train_graph(input_x)
             out = tf.concat([out, input_px], axis=-1)
             out = self.embed_dropout(out, training=training)
     out = self.bilstm(out)
     out = self.dropout(out, training=training)
     output = self.dense(out)
     return output
Example #17
  def call(self, inputs: list, **kwargs) -> typing.Any:
    """
        The computation logic of DynamicPoolingLayer.
        :param inputs: two input tensors.
        """
    self._validate_dpool_size()
    x, dpool_index = inputs
    dpool_shape = tf.shape(dpool_index)
    batch_index_one = tf.expand_dims(
        tf.expand_dims(tf.range(dpool_shape[0]), axis=-1), axis=-1)
    batch_index = tf.expand_dims(
        tf.tile(batch_index_one, [1, self._msize1, self._msize2]), axis=-1)
    dpool_index_ex = tf.concat([batch_index, dpool_index], axis=3)
    x_expand = tf.gather_nd(x, dpool_index_ex)
    stride1 = self._msize1 // self._psize1
    stride2 = self._msize2 // self._psize2

    x_pool = tf.nn.max_pool(x_expand, [1, stride1, stride2, 1],
                            [1, stride1, stride2, 1], "VALID")
    return x_pool
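The strides are chosen so that VALID max pooling maps the matching matrix from (msize1, msize2) down to exactly (psize1, psize2). With made-up sizes:

msize1, msize2, psize1, psize2 = 32, 32, 4, 4  # illustrative values only
stride1, stride2 = msize1 // psize1, msize2 // psize2
print(stride1, stride2)                       # 8 8
print(msize1 // stride1, msize2 // stride2)   # 4 4 -> pooled output size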
Example #18
def conv_pool(embedded_chars_expanded, filter_sizes, embedding_size,
              num_filters, sequence_length):
    """
    text conv and max pooling to get one-dimension vector to representation of text
    :param filter_sizes:
    :return:
    """
    pooled_outputs = []
    for _, filter_size in enumerate(filter_sizes):
        with tf.variable_scope("conv-maxpool-%s" % filter_size):
            # Convolution Layer
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W = tf.get_variable(name='W',
                                initializer=tf.truncated_normal(filter_shape,
                                                                stddev=0.1))
            b = tf.get_variable(name='b',
                                initializer=tf.constant(0.1,
                                                        shape=[num_filters]))
            conv = tf.nn.conv2d(embedded_chars_expanded,
                                W,
                                strides=[1, 1, 1, 1],
                                padding="VALID",
                                name="conv")
            # Apply nonlinearity
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
            # Maxpooling over the outputs
            pooled = tf.nn.max_pool(
                h,
                ksize=[1, sequence_length - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name="pool")
            pooled_outputs.append(pooled)
    # Combine all the pooled features
    num_filters_total = num_filters * len(filter_sizes)

    h_pool = tf.concat(pooled_outputs, 3)

    h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])
    return h_pool_flat
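Each filter size contributes num_filters features after max pooling over time, so h_pool_flat has num_filters * len(filter_sizes) columns. With made-up config values:

filter_sizes = [3, 4, 5]
num_filters = 128
print(num_filters * len(filter_sizes))  # 384 -> width of h_pool_flat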
Example #19
 def pooling_layer(self, x, time_len):
     ''' pooling layer'''
     with tf.variable_scope('time_pooling'):
         if self.attention:
             x, self.alphas = common_layers.attention(
                 x, self.netconf['attention_size'], return_alphas=True)
             #alphas shape [batch, time, 1] -> [1, batch, time, 1]-> [1, time, batch, 1]
             tf.summary.image(
                 'alignment',
                 tf.transpose(tf.expand_dims(self.alphas, 0), [0, 2, 1, 3]))
         else:
             if self.netconf['use_lstm_layer']:
                 x = tf.concat(x, 2)
             # [batch, seq_len, dim, 1]
             x = tf.expand_dims(x, axis=-1)
             seq_len = time_len
             x = common_layers.max_pool(x,
                                        ksize=[seq_len, 1],
                                        strides=[seq_len, 1])
             if self.netconf['use_lstm_layer']:
                 x = tf.reshape(x, [-1, 2 * self.netconf['cell_num']])
             else:
                 x = tf.reshape(x, [-1, self.netconf['linear_num']])
         return x
Example #20
 def func(x, y):
     return tf.concat([x, y], axis=3)