Example #1
    def call(self, audio_data, sample_rate=None):
        """
    Caculate power spectrum and phase spectrum of audio data.
    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return: Two returns:
        power spectrum —— A float tensor of size (num_frames, num_frequencies) containing
            power spectrum and of every frame in speech.
        phase spectrum —— A float tensor of size (num_frames, num_frequencies) containing
            phase spectrum and of every frame in speech.
    """

        p = self.config
        with tf.name_scope('analyfiltbank'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                sample_rate = tf.cast(sample_rate, dtype=float)
                power_spectrum, phase_spectrum = py_x_ops.analyfiltbank(
                    audio_data,
                    sample_rate,
                    window_length=p.window_length,
                    frame_length=p.frame_length)

                return power_spectrum, phase_spectrum
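A minimal usage sketch for the analysis filterbank, calling py_x_ops directly as the wrapper above does. The import path and the window/frame values are assumptions, not taken from the example:

import tensorflow as tf
from delta.layers.ops import py_x_ops  # import path is an assumption

# One second of fake 16 kHz audio, shaped (1, N) as the docstring requires.
audio_data = tf.random.uniform((1, 16000), minval=-1.0, maxval=1.0)
power_spec, phase_spec = py_x_ops.analyfiltbank(
    audio_data,
    tf.cast(16000, dtype=float),
    window_length=0.030,  # assumed value, in seconds
    frame_length=0.010)   # assumed value, in seconds
# Both outputs: (num_frames, num_frequencies) float tensors.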
Example #2
  def call(self, audio_data, sample_rate=None):
    """
        Caculate power spectrum or log power spectrum of audio data.
        :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
        :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
        :return: A float tensor of size N containing add-noise audio.
        """

    p = self.config
    with tf.name_scope('add_rir_noise_aecres'):
      if sample_rate is None:
        sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

      assert_op = tf.assert_equal(
          tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
      with tf.control_dependencies([assert_op]):
        sample_rate = tf.cast(sample_rate, dtype=float)
        add_rir_noise_aecres_out = py_x_ops.add_rir_noise_aecres(
            audio_data,
            sample_rate,
            if_add_rir=p.if_add_rir,
            rir_filelist=p.rir_filelist,
            if_add_noise=p.if_add_noise,
            snr_min=p.snr_min,
            snr_max=p.snr_max,
            noise_filelist=p.noise_filelist,
            if_add_aecres=p.if_add_aecres,
            aecres_filelist=p.aecres_filelist)

        return tf.squeeze(add_rir_noise_aecres_out)
Example #3
    def call(self, audio_data, sample_rate=None):
        """
    Caculate power spectrum or log power spectrum of audio data.
    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return: A float tensor of size (num_frames, num_frequencies) containing power spectrum (output_type=1)
        or log power spectrum (output_type=2) of every frame in speech.
    """

        p = self.config
        with tf.name_scope('spectrum'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                sample_rate = tf.cast(sample_rate, dtype=float)
                spectrum = py_x_ops.spectrum(
                    audio_data,
                    sample_rate,
                    window_length=p.window_length,
                    frame_length=p.frame_length,
                    output_type=p.output_type,
                    snip_edges=p.snip_edges,
                    raw_energy=p.raw_energy,
                    preEph_coeff=p.preeph_coeff,
                    window_type=p.window_type,
                    remove_dc_offset=p.remove_dc_offset,
                    is_fbank=p.is_fbank)

                return spectrum
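Example #30 below calls the same op with only output_type, so the remaining keywords appear to be optional. A sketch of the two output modes (setup as in the sketch after Example #1):

import tensorflow as tf
from delta.layers.ops import py_x_ops  # import path is an assumption

audio_data = tf.random.uniform((1, 16000), minval=-1.0, maxval=1.0)
sr = tf.cast(16000, tf.float32)
power = py_x_ops.spectrum(audio_data, sr, output_type=1)      # power spectrum
log_power = py_x_ops.spectrum(audio_data, sr, output_type=2)  # log power spectrum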
Example #4
    def call(self, audio_data, sample_rate=None):
        """
    Caculate fbank features of audio data.
    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return: A float tensor of size (num_channels, num_frames, num_frequencies) containing
            fbank features of every frame in speech.
    """
        p = self.config
        with tf.name_scope('fbank'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=float)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=float))
            with tf.control_dependencies([assert_op]):

                spectrum = self.spect(audio_data, sample_rate)
                spectrum = tf.expand_dims(spectrum, 0)
                sample_rate = tf.cast(sample_rate, dtype=tf.int32)

                fbank = py_x_ops.fbank(
                    spectrum,
                    sample_rate,
                    upper_frequency_limit=p.upper_frequency_limit,
                    lower_frequency_limit=p.lower_frequency_limit,
                    filterbank_channel_count=p.filterbank_channel_count)

                return fbank
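A hypothetical sketch of the data flow in this wrapper: compute a spectrum, add a leading channel dimension, then apply the fbank op. The frequency limits, channel count, and is_fbank flag are assumed config values:

import tensorflow as tf
from delta.layers.ops import py_x_ops  # import path is an assumption

audio_data = tf.random.uniform((1, 16000), minval=-1.0, maxval=1.0)
spec = py_x_ops.spectrum(audio_data, tf.cast(16000, tf.float32),
                         output_type=1, is_fbank=True)  # (num_frames, num_freq)
fbank_feats = py_x_ops.fbank(
    tf.expand_dims(spec, 0),     # (1, num_frames, num_freq), as in the wrapper
    tf.cast(16000, tf.int32),
    upper_frequency_limit=8000,  # assumed config values
    lower_frequency_limit=20,
    filterbank_channel_count=40)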
Example #5
    def call(self, audio_data, sample_rate=None):
        """
    Caculate cepstrum of audio data.
    :param audio_data: the audio signal from which to compute spectrum.
                        Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with,
                        default is 16kHz.
    :return:A float tensor of size (num_frames, ceps_subband_num) containing
            normalized cepstrum (tag_ceps_mean_norm = True) or cepstrum
            (tag_ceps_mean_norm = False) of every frame in speech.
    """

        p = self.config

        with tf.name_scope('cepstrum'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                sample_rate = tf.cast(sample_rate, dtype=float)
                cepstrum = py_x_ops.cepstrum(
                    audio_data,
                    sample_rate,
                    window_length=p.window_length,
                    frame_length=p.frame_length,
                    ceps_subband_num=p.ceps_subband_num,
                    tag_ceps_mean_norm=p.tag_ceps_mean_norm)

                return cepstrum
Example #6
    def call(self, audio_data, sample_rate=None):
        """
    Caculate plp features of audio data.
    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return:A float tensor of size (num_frames, (plp_order + 1)) containing plp features of every frame in speech.
    """

        p = self.config
        with tf.name_scope('plp'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                sample_rate = tf.cast(sample_rate, dtype=float)
                plp = py_x_ops.plp(audio_data,
                                   sample_rate,
                                   window_length=p.window_length,
                                   frame_length=p.frame_length,
                                   plp_order=p.plp_order)
                return plp
Example #7
    def call(self, audio_data, sample_rate=None):
        """
    Calculate the zero-crossing rate of speech.
    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return: A tensor with shape (1, num_frames), containing zero-crossing rate of every frame in speech.
    """

        p = self.config
        with tf.name_scope('zcr'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                sample_rate = tf.cast(sample_rate, dtype=float)
                zcr = py_x_ops.zcr(audio_data,
                                   sample_rate,
                                   window_length=p.window_length,
                                   frame_length=p.frame_length)

                return zcr
Example #8
  def call(self, audio_data, sample_rate=None):
    """
    Caculate power of every frame in speech.
    :param audio_data: the audio signal from which to compute spectrum.
                       Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with,
                        default is 16kHz.
    :return:A float tensor of size (1 * num_frames) containing power of every
            frame in speech.
    """

    p = self.config
    with tf.name_scope('framepow'):

      if sample_rate is None:
        sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

      assert_op = tf.assert_equal(
          tf.constant(p.sample_rate), tf.cast(sample_rate, dtype=tf.int32))
      with tf.control_dependencies([assert_op]):

        sample_rate = tf.cast(sample_rate, dtype=float)
        framepow = py_x_ops.frame_pow(
            audio_data,
            sample_rate,
            snip_edges=p.snip_edges,
            remove_dc_offset=p.remove_dc_offset,
            window_length=p.window_length,
            frame_length=p.frame_length)

        return tf.squeeze(framepow)
Example #9
        def _dpool_index(one_length_left, one_length_right, fixed_length_left,
                         fixed_length_right):

            logging.info("fixed_length_left: {}".format(fixed_length_left))
            logging.info("fixed_length_right: {}".format(fixed_length_right))

            if one_length_left == 0:
                stride_left = fixed_length_left
            else:
                stride_left = 1.0 * fixed_length_left / tf.cast(
                    one_length_left, dtype=tf.float32)

            if one_length_right == 0:
                stride_right = fixed_length_right
            else:
                stride_right = 1.0 * fixed_length_right / tf.cast(
                    one_length_right, dtype=tf.float32)

            one_idx_left = [
                tf.cast(i / stride_left, dtype=tf.int32)
                for i in range(fixed_length_left)
            ]
            one_idx_right = [
                tf.cast(i / stride_right, dtype=tf.int32)
                for i in range(fixed_length_right)
            ]
            mesh1, mesh2 = tf.meshgrid(one_idx_left, one_idx_right)
            index_one = tf.transpose(tf.stack([mesh1, mesh2]), (2, 1, 0))
            return index_one
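A hypothetical usage for dynamic pooling: the returned index has shape (fixed_length_left, fixed_length_right, 2), so tf.gather_nd can stretch a variable-size match matrix onto a fixed grid (assumes the helper and its logging import are in scope):

import tensorflow as tf

match = tf.random.uniform((5, 7))   # actual text lengths: 5 x 7
idx = _dpool_index(5, 7, fixed_length_left=10, fixed_length_right=14)
fixed = tf.gather_nd(match, idx)    # (10, 14): source cells repeated to fit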
Example #10
    def call(self, audio_data, sample_rate=None):
        """
    Caculate mfcc features of audio data.
    :param audio_data: the audio signal from which to compute spectrum.
                       Should be an (1, N) tensor.
    :param sample_rate: the samplerate of the signal we working with.
    :return: A float tensor of size (num_channels, num_frames, num_frequencies)
            containing mfcc features of every frame in speech.
    """
        p = self.config
        with tf.name_scope('mfcc'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                fbank_feats = self.fbank(audio_data, sample_rate)
                sample_rate = tf.cast(sample_rate, dtype=tf.int32)
                shape = tf.shape(fbank_feats)
                nframe = shape[0]
                nfbank = shape[1]
                fbank_feats = tf.reshape(fbank_feats, (1, nframe, nfbank))
                framepow_feats = self.framepow(audio_data, sample_rate)
                mfcc = py_x_ops.mfcc(fbank_feats,
                                     framepow_feats,
                                     sample_rate,
                                     use_energy=p.use_energy,
                                     cepstral_lifter=p.cepstral_lifter,
                                     coefficient_count=p.coefficient_count)
                return mfcc
Example #11
  def get_learning_rate(self):
    """Get the learning rate."""
    lrconf = self.config['solver']['optimizer']['learning_rate']
    learning_rate = lrconf['rate']
    learning_type = lrconf['type']

    #pylint: disable=invalid-name
    if learning_type == 'exp_decay':
      lr = tf.train.exponential_decay(
          learning_rate,
          tf.train.get_or_create_global_step(),
          lrconf['decay_steps'],
          lrconf['decay_rate'],
          staircase=True)
    elif learning_type == 'piecewise':
      #boundaries = [15000, 30000]
      #values = [1e-3, 1e-4, 1e-5]
      boundaries = lrconf['boundaries']
      values = lrconf['values']
      assert len(values) == len(
          boundaries) + 1, 'values len must equal boundaries len plus one'
      lr = tf.train.piecewise_constant(
          tf.train.get_or_create_global_step(),
          boundaries=boundaries,
          values=values)
    elif learning_type == 'warmup':
      learning_rate = tf.constant(
          value=learning_rate, shape=[], dtype=tf.float32)
      global_step = tf.train.get_or_create_global_step()
      data_size = self.config['data']['train_data_size']
      num_epochs = self.config["data"]["task"]['epochs']
      batch_size = self.config["data"]["task"]['batch_size']
      num_batch = int(math.ceil(data_size * num_epochs / batch_size))
      learning_rate = tf.train.polynomial_decay(
          learning_rate,
          global_step,
          num_batch,
          end_learning_rate=0.0,
          power=1.0,
          cycle=False)
      global_steps_int = tf.cast(global_step, tf.int32)
      warmup_steps_int = tf.constant(lrconf['num_warmup_steps'], dtype=tf.int32)

      global_steps_float = tf.cast(global_steps_int, tf.float32)
      warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

      warmup_percent_done = global_steps_float / warmup_steps_float
      warmup_learning_rate = learning_rate * warmup_percent_done

      is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
      lr = ((1.0 - is_warmup) * learning_rate +
            is_warmup * warmup_learning_rate)
    elif learning_type == 'const':
      lr = learning_rate
    else:
      raise ValueError(
          "Not support learning rate type: {}".format(learning_type))
    tf.summary.scalar('lr', lr)
    return lr
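A hypothetical config fragment exercising the piecewise branch; the key structure is inferred from the lookups in this function only, and the numbers echo the commented-out example:

config = {
    'solver': {
        'optimizer': {
            'learning_rate': {
                'type': 'piecewise',
                'rate': 1e-3,                  # read, but unused by this branch
                'boundaries': [15000, 30000],
                'values': [1e-3, 1e-4, 1e-5],  # len(values) == len(boundaries) + 1
            }
        }
    }
}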
Example #12
def compute_doc_lens(sen_lens):
    """
  Count how many sentences are in each document.
  inputs: [..., time_steps]
  doc_lens: [...]
  """
    x_binary = tf.cast(tf.cast(sen_lens, tf.bool), tf.int32)
    doc_lens = tf.reduce_sum(x_binary, axis=-1)
    return doc_lens
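The double cast maps every non-zero sentence length to 1 and keeps zeros at 0, so the sum over the last axis counts non-empty sentences. A tiny worked example (assumes the function above is in scope):

import tensorflow as tf

sen_lens = tf.constant([[3, 5, 0, 0],
                        [2, 0, 0, 0]])
compute_doc_lens(sen_lens)  # -> [2, 1]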
Example #13
  def compute_lens(inputs, max_len):
    """count sequence length.
    input: [batch_size, max_len]
    lens: [batch_size]
    """

    x_binary = tf.cast(tf.cast(tf.reverse(inputs, axis=[1]), tf.bool), tf.int32)
    lens = max_len - tf.argmax(x_binary, axis=1, output_type=tf.int32)

    zeros = tf.zeros_like(lens, dtype=tf.int32)
    x_sum = tf.reduce_sum(inputs, axis=1)
    sen_lens = tf.where(tf.equal(x_sum, 0), zeros, lens)
    return sen_lens
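The reverse-then-argmax trick finds the distance of the last non-zero token from the end, so max_len minus that distance is the length; all-zero rows are reset to 0 by the tf.where. A worked example:

import tensorflow as tf

inputs = tf.constant([[7, 8, 9, 0, 0],
                      [1, 0, 0, 0, 0],
                      [0, 0, 0, 0, 0]])
compute_lens(inputs, max_len=5)  # -> [3, 1, 0]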
Example #14
 def call(self, wavfile):
     """
 Get audio data and sample rate from a wavfile.
 :param wavfile: filepath of wav
 :return: 2 values. The first is a Tensor of audio data. The second return value is the sample rate of the input wav
     file, which is a tensor with float dtype.
 """
     p = self.config
     contents = tf.io.read_file(wavfile)
     audio_data, sample_rate = tf.audio.decode_wav(
         contents, desired_channels=p.audio_channels)
     assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                 tf.cast(sample_rate, dtype=float))
     with tf.control_dependencies([assert_op]):
         return tf.squeeze(audio_data, axis=-1), tf.cast(sample_rate,
                                                         dtype=float)
Example #15
    def get_pos_embedding_matrix(max_len, embed_dim, use_const, name):
        """
    generate position embedding matrix, two optional types:
    constant(untrainable) and trainable.
    Args:
      max_len, embed_dim, use_const

    Return:
      pos_embed: [max_len, embed_dim]
    """
        # First part of the PE function: sin and cos argument
        if use_const:
            pos_embed = np.array([[
                pos / np.power(10000, (i - i % 2) / embed_dim)
                for i in range(embed_dim)
            ] for pos in range(max_len)])

            # Second part, apply the cosine to even columns and sin to odds.
            pos_embed[:, 0::2] = np.sin(pos_embed[:, 0::2])  # dim 2i
            pos_embed[:, 1::2] = np.cos(pos_embed[:, 1::2])  # dim 2i+1
            pos_embed = pos_embed[np.newaxis, ...]
            pos_embed = tf.cast(pos_embed, dtype=tf.float32)
        else:
            pos_embed = tf.get_variable(
                name=name,
                shape=[max_len, embed_dim],
                initializer=tf.random_uniform_initializer(-0.1, 0.1))
            pos_embed = tf.expand_dims(pos_embed, 0)

        return pos_embed
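A sketch of the constant branch (the trainable branch needs a TF1 graph because of tf.get_variable); assumes numpy and tensorflow are imported where the function is defined:

pos_embed = get_pos_embedding_matrix(
    max_len=50, embed_dim=8, use_const=True, name='pos_embed')
# pos_embed: (1, 50, 8). Even column i holds sin(pos / 10000^(i/embed_dim))
# and the next odd column the matching cos, i.e. the sinusoidal encoding.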
Example #16
    def scaled_dot_product_attention(q, k, v, mask):
        """
    The implementation of scaled attention.
    Args:
      v: (batch_size, seq_len_v, hidden_size)
      k: (batch_size, seq_len_k, hidden_size)
      q: (batch_size, seq_len_q, hidden_size)
      mask: (batch_size, seq_len_q, seq_len_k)

    Returns:
      output: (batch_size, seq_len_q, hidden_size)
      attention_weights: (batch_size, num_heads, seq_len_q, seq_len_k)
    """

        matmul_qk = tf.matmul(
            q, k, transpose_b=True)  # (batch_size, seq_len_q, seq_len_k)

        # Scaled
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        # Masked
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)

        # Normalized
        attention_weights = tf.nn.softmax(
            scaled_attention_logits,
            axis=-1)  # (batch_size, seq_len_q, seq_len_k)

        # Weighted sum
        output = tf.matmul(attention_weights,
                           v)  # (batch_size, seq_len_q, depth_v)

        return output, attention_weights
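A quick shape check matching the docstring (mask omitted; assumes the function above is in scope):

import tensorflow as tf

q = tf.random.uniform((2, 5, 16))   # (batch, seq_len_q, hidden)
k = tf.random.uniform((2, 7, 16))   # (batch, seq_len_k, hidden)
v = tf.random.uniform((2, 7, 16))   # (batch, seq_len_v, hidden)
out, weights = scaled_dot_product_attention(q, k, v, mask=None)
# out: (2, 5, 16); weights: (2, 5, 7), each row summing to 1.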
Example #17
def accuracy(logits, labels):
    '''Compute accuracy.
    params:
      logits: [B, ..., D]
      labels: [B, ...]
    return:
      accuracy tensor
    '''
    with tf.name_scope('accuracy'):
        assert_rank = tf.assert_equal(tf.rank(logits), tf.rank(labels) + 1)
        assert_shape = tf.assert_equal(tf.shape(logits)[:-1], tf.shape(labels))
        with tf.control_dependencies([assert_rank, assert_shape]):
            predictions = tf.argmax(logits, axis=-1, output_type=tf.int64)
            labels = tf.cast(labels, tf.int64)
            return tf.reduce_mean(
                tf.cast(tf.equal(predictions, labels), dtype=tf.float32))
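A worked example: three predictions, two correct:

import tensorflow as tf

logits = tf.constant([[2.0, 0.5],
                      [0.1, 3.0],
                      [1.0, 0.0]])  # argmax -> [0, 1, 0]
labels = tf.constant([0, 1, 1])
accuracy(logits, labels)            # -> 2/3 ~ 0.6667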
Example #18
    def __init__(self, config, emb_layer, vocab_size, **kwargs):
        model_config = config['model']['net']['structure']
        self.is_infer = config['model']['is_infer']
        if self.is_infer:
            self.length_penalty = model_config['length_penalty']
        self.dropout_rate = model_config['dropout_rate']
        self.num_layers = model_config['num_layers']
        self.l2_reg_lambda = model_config['l2_reg_lambda']
        self.embedding_size = model_config['embedding_size']
        self.max_enc_len = model_config['max_enc_len']
        self.max_dec_len = model_config['max_dec_len']
        self.share_embedding = model_config['share_embedding']
        self.padding_token = 0
        self.beam_size = model_config['beam_size']

        self.mask_layer = tf.keras.layers.Lambda(lambda inputs: tf.cast(
            tf.not_equal(inputs, self.padding_token), tf.int32))

        self.embed = emb_layer
        self.vocab_size = vocab_size
        self.embed_d = tf.keras.layers.Dropout(self.dropout_rate)

        self.pos_embed = PositionEmbedding(self.max_enc_len,
                                           self.embedding_size)

        self.transformer_decoders = [
            TransformerDecoderLayer(config) for _ in range(self.num_layers)
        ]

        self.final_dense = tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(self.vocab_size, name="final_dense"))

        super().__init__(**kwargs)
Example #19
    def __init__(self, config, **kwargs):
        super().__init__(config, **kwargs)
        tf.logging.info("Initialize TransformerModel...")
        model_config = config['model']['net']['structure']
        self.is_infer = config['model']['is_infer']
        if self.is_infer:
            self.length_penalty = model_config['length_penalty']
        self.dropout_rate = model_config['dropout_rate']
        self.num_layers = model_config['num_layers']
        self.l2_reg_lambda = model_config['l2_reg_lambda']
        self.max_enc_len = model_config['max_enc_len']
        self.max_dec_len = model_config['max_dec_len']
        self.share_embedding = model_config['share_embedding']
        self.padding_token = utils.PAD_IDX
        self.beam_size = model_config['beam_size']

        self.mask_layer = tf.keras.layers.Lambda(lambda inputs: tf.cast(
            tf.not_equal(inputs, self.padding_token), tf.int32))

        self.embed_d = tf.keras.layers.Dropout(self.dropout_rate)

        self.pos_embed = layers.PositionEmbedding(self.max_enc_len,
                                                  self.embedding_size)

        self.encoder = layers.TransformerEncoder(config)
        self.decoder = layers.TransformerDecoder(config, self.embed,
                                                 self.decode_vocab_size)
        logging.info("decode_vocab_size: {}".format(self.decode_vocab_size))
        logging.info("Initialize TransformerModel done.")
Example #20
    def call(self, audio_data, sample_rate=None):
        """
    Caculate pitch features of audio data.
    :param audio_data: the audio signal from which to compute spectrum. Should be an (1, N) tensor.
    :param sample_rate: [option]the samplerate of the signal we working with, default is 16kHz.
    :return: A float tensor of size (1, num_frames) containing pitch features of every frame in speech.
    """

        p = self.config
        with tf.name_scope('pitch'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=float)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=float))
            with tf.control_dependencies([assert_op]):

                pitch = py_x_ops.pitch(audio_data,
                                       sample_rate,
                                       window_length=p.window_length,
                                       frame_length=p.frame_length,
                                       thres_autoc=p.thres_autoc)

                pitch = tf.squeeze(pitch)
                pitch = tf.transpose(pitch[None, :])
                return pitch
Example #21
    def call(self, power_spectrum, phase_spectrum, sample_rate=None):
        """
    Implement frequency domain to time domain conversion.
    :param power_spectrum: a float tensor of size (num_frames, num_frequencies).
    :param phase_spectrum: a float tensor of size (num_frames, num_frequencies).
    :param sample_rate: a scalar tensor.
    :return: audio data
    """

        p = self.config
        with tf.name_scope('synthfiltbank'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                audio_data = py_x_ops.synthfiltbank(
                    power_spectrum,
                    phase_spectrum,
                    sample_rate,
                    window_length=p.window_length,
                    frame_length=p.frame_length)

                return audio_data
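A hypothetical analysis/synthesis round trip pairing this op with the analyfiltbank op from Example #1; window/frame values and dtypes are assumptions:

import tensorflow as tf
from delta.layers.ops import py_x_ops  # import path is an assumption

audio_data = tf.random.uniform((1, 16000), minval=-1.0, maxval=1.0)
sr = tf.cast(16000, dtype=float)
power, phase = py_x_ops.analyfiltbank(
    audio_data, sr, window_length=0.030, frame_length=0.010)
rebuilt = py_x_ops.synthfiltbank(
    power, phase, sr, window_length=0.030, frame_length=0.010)
# `rebuilt` approximates the original `audio_data` waveform.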
Example #22
    def call(self, audio_data, sample_rate=None):
        """
    Caculate fbank && pitch(concat) features of wav.
    :param audio_data: the audio signal from which to compute spectrum.
                       Should be an (1, N) tensor.
    :param sample_rate: the samplerate of the signal we working with.
    :return: A tensor with shape (num_frames, dim_features), containing
            fbank && pitch feature of every frame in speech.
    """

        p = self.config
        with tf.name_scope('fbank_pitch'):

            if sample_rate is None:
                sample_rate = tf.constant(p.sample_rate, dtype=tf.int32)

            assert_op = tf.assert_equal(tf.constant(p.sample_rate),
                                        tf.cast(sample_rate, dtype=tf.int32))
            with tf.control_dependencies([assert_op]):

                fbank_feats = tf.squeeze(self.fbank(audio_data, sample_rate))
                pitch_feats = tf.squeeze(self.pitch(audio_data, sample_rate))
                fbank_pitch_feats = tf.concat([fbank_feats, pitch_feats], 1)

                return fbank_pitch_feats
Example #23
 def grad_sparsity(self):
     # If the sparse minibatch gradient has 10 percent of its entries
     # non-zero, its sparsity is 0.1.
     # The norm of the dense gradient averaged over the full dataset is
     # roughly estimated as the sparse minibatch gradient norm times
     # sqrt(sparsity).
     # An extension might be to correct only the sparse blob.
     non_zero_cnt = tf.add_n([tf.count_nonzero(g) for g in self._grads])
     all_entry_cnt = tf.add_n([tf.size(g) for g in self._grads])
     self._sparsity = tf.cast(non_zero_cnt, self._grads[0].dtype) \
       / tf.cast(all_entry_cnt, self._grads[0].dtype)
     avg_op = self._moving_averager.apply([
         self._sparsity,
     ])
     with tf.control_dependencies([avg_op]):
         self._sparsity_avg = self._moving_averager.average(self._sparsity)
     return avg_op
Example #24
def get_pad_mask_from_token_idx(inputs, pad_idx):
    """
  get padding mask from the input token idx
  inputs: [batch_size, time_steps]
  mask: [batch_size, time_steps]
  """
    pad_mask = tf.cast(tf.math.greater(inputs, pad_idx), tf.int32)
    return pad_mask
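A tiny example; note the mask marks tokens strictly greater than pad_idx, so it assumes every real token id is larger than the padding id:

import tensorflow as tf

inputs = tf.constant([[5, 9, 2, 0, 0]])
get_pad_mask_from_token_idx(inputs, pad_idx=0)  # -> [[1, 1, 1, 0, 0]]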
Example #25
def get_seg_mask_from_token_idx(inputs, seg_idx):
    """
  get segment mask from the input token idx
  inputs: [batch_size, time_steps]
  mask: [batch_size, time_steps]
  """
    seg_mask = tf.cast(tf.math.equal(inputs, seg_idx), tf.int32)
    return seg_mask
Example #26
 def ctc_greedy_decode_lambda_func(args):
     y_pred, input_length = args
     input_length = tf.cast(input_length, dtype=tf.int32)
     decode_result, _ = ctc_greedy_decode(logits=y_pred,
                                          sequence_length=input_length,
                                          merge_repeated=True,
                                          blank_id=None)
     return decode_result
Example #27
File: utils.py Project: zhjou/delta
def get_expand_pad_mask(inputs, pad_idx):
    """
  get padding mask from the input token idx
  inputs: [batch_size, time_steps]
  mask: [batch_size, time_steps, 1]
  """
    pad_mask = tf.cast(tf.math.greater(inputs, pad_idx), tf.float32)
    pad_mask = tf.expand_dims(pad_mask, -1)
    return pad_mask
Example #28
def ctc_lambda_loss(logits, labels, input_length, label_length, blank_index=0):
  '''
  CTC loss function.
  param: logits, (B, T, D)
  param: input_length, (B, 1), input length of encoder
  param: labels, (B, T)
  param: label_length, (B, 1), label length for converting dense labels to sparse
  returns: batch_loss, (B,), per-utterance CTC loss
  '''
  ilen = tf.cond(
      pred=tf.equal(tf.rank(input_length), 1),
      true_fn=lambda: input_length,
      false_fn=lambda: tf.squeeze(input_length),
  )
  ilen = tf.cast(ilen, tf.int32)

  olen = tf.cond(
      pred=tf.equal(tf.rank(label_length), 1),
      true_fn=lambda: label_length,
      false_fn=lambda: tf.squeeze(label_length))
  olen = tf.cast(olen, tf.int32)

  deps = [
      tf.assert_rank(labels, 2, name='label_rank_check'),
      tf.assert_rank(logits, 3, name='logits_rank_check'),
      tf.assert_rank(ilen, 1, name='src_len_rank_check'),  # input_length
      tf.assert_rank(olen, 1, name='tgt_len_rank_check'),  # output_length
  ]

  labels, logits = ctc_data_transform(labels, logits, blank_index)

  with tf.control_dependencies(deps):
    # (B, 1)
    # blank index is consistent with Espnet, zero
    batch_loss = tf.nn.ctc_loss(
        labels=labels,
        inputs=logits,
        sequence_length=ilen,
        time_major=False,
        preprocess_collapse_repeated=False,
        ctc_merge_repeated=True,
        ignore_longer_outputs_than_inputs=False)
  return batch_loss
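A hypothetical call; ctc_data_transform is assumed to be defined elsewhere in the project, and the shapes follow the docstring:

import tensorflow as tf

logits = tf.random.uniform((2, 10, 20))   # (B, T, D)
labels = tf.constant([[3, 7, 0, 0],
                      [5, 0, 0, 0]])      # (B, T), zero-padded
input_length = tf.constant([[10], [10]])  # (B, 1); squeezed to (B,) inside
label_length = tf.constant([[2], [1]])    # (B, 1)
loss = ctc_lambda_loss(logits, labels, input_length, label_length,
                       blank_index=0)     # (B,) per-utterance loss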
Example #29
def compute_sen_lens(inputs, padding_token=0):
    """
  Count how many words are in each sentence.
  inputs: [..., time_steps]
  sen_lens: [...]
  """
    x_binary = tf.cast(tf.not_equal(inputs, padding_token), tf.int32)
    sen_lens = tf.reduce_sum(x_binary, axis=-1)
    ones = tf.ones_like(sen_lens)
    sen_lens = tf.where(tf.equal(sen_lens, utils.PAD_IDX), x=ones, y=sen_lens)
    return sen_lens
Example #30
def _freq_feat_graph(feat_name, **kwargs):
  winlen = kwargs.get('winlen')
  winstep = kwargs.get('winstep')
  feature_size = kwargs.get('feature_size')
  sr = kwargs.get('sr')  #pylint: disable=invalid-name
  nfft = kwargs.get('nfft')
  del nfft

  assert feat_name in ('fbank', 'spec')

  params = speech_ops.speech_params(
      sr=sr,
      bins=feature_size,
      add_delta_deltas=False,
      audio_frame_length=winlen,
      audio_frame_step=winstep)

  graph = None
  if feat_name == 'fbank':
    # get session
    if feat_name not in _global_sess:
      graph = tf.Graph()
      #pylint: disable=not-context-manager
      with graph.as_default():
        # fbank
        filepath = tf.placeholder(dtype=tf.string, shape=[], name='wavpath')
        waveforms, sample_rate = speech_ops.read_wav(filepath, params)
        del sample_rate
        fbank = speech_ops.extract_feature(waveforms, params)
        # shape must be [T, D, C]
        feat = tf.identity(fbank, name=feat_name)
  elif feat_name == 'spec':
    # magnitude spec
    if feat_name not in _global_sess:
      graph = tf.Graph()
      #pylint: disable=not-context-manager
      with graph.as_default():
        filepath = tf.placeholder(dtype=tf.string, shape=[], name='wavpath')
        waveforms, sample_rate = speech_ops.read_wav(filepath, params)

        spec = py_x_ops.spectrum(
            waveforms[:, 0],
            tf.cast(sample_rate, tf.dtypes.float32),
            output_type=1)  #output_type: 1, power spec; 2 log power spec
        spec = tf.sqrt(spec)
        # shape must be [T, D, C]
        spec = tf.expand_dims(spec, -1)
        feat = tf.identity(spec, name=feat_name)
  else:
    raise ValueError(f"Unsupported freq feat: {feat_name}.")

  return graph, (_get_out_tensor_name('wavpath', 0),
                 _get_out_tensor_name(feat_name, 0))
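A hypothetical TF1-style driver for the builder above, assuming the fbank graph is not already cached in _global_sess. The tensor names come from _get_out_tensor_name (presumably 'name:0'), and the keyword values and wav path are placeholders:

import tensorflow as tf

graph, (input_name, output_name) = _freq_feat_graph(
    'fbank', winlen=0.025, winstep=0.010, feature_size=40, sr=16000, nfft=512)
with tf.Session(graph=graph) as sess:
  feat = sess.run(output_name, feed_dict={input_name: '/path/to/audio.wav'})
# feat: [T, D, C], as the in-graph comment requires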