def preprocess_example(self, example, mode, hparams):
        p = hparams
        if p.audio_preproc_in_bottom:
            example["inputs"] = tf.expand_dims(
                tf.expand_dims(example["waveforms"], -1), -1)
        else:
            waveforms = tf.expand_dims(example["waveforms"], 0)
            mel_fbanks = common_audio.compute_mel_filterbank_features(
                waveforms,
                sample_rate=p.audio_sample_rate,
                dither=p.audio_dither,
                preemphasis=p.audio_preemphasis,
                frame_length=p.audio_frame_length,
                frame_step=p.audio_frame_step,
                lower_edge_hertz=p.audio_lower_edge_hertz,
                upper_edge_hertz=p.audio_upper_edge_hertz,
                num_mel_bins=p.audio_num_mel_bins,
                apply_mask=False)
            if p.audio_add_delta_deltas:
                mel_fbanks = common_audio.add_delta_deltas(mel_fbanks)
            fbank_size = common_layers.shape_list(mel_fbanks)
            assert fbank_size[0] == 1

            # This replaces CMVN estimation on data
            var_epsilon = 1e-09
            mean = tf.reduce_mean(mel_fbanks, keepdims=True, axis=1)
            variance = tf.reduce_mean(tf.square(mel_fbanks - mean),
                                      keepdims=True,
                                      axis=1)
            mel_fbanks = (mel_fbanks - mean) * tf.rsqrt(variance + var_epsilon)

            ######### specaugment added by kyubyong #########
            if mode == tf.estimator.ModeKeys.TRAIN:
                # mel_fbanks = time_warp(mel_fbanks)
                mel_fbanks = freq_mask(mel_fbanks)
                mel_fbanks = time_mask(mel_fbanks)

            ######### /specaugment added by kyubyong #########

            # Later models like to flatten the two spatial dims. Instead, we add a
            # unit spatial dim and flatten the frequencies and channels.
            example["inputs"] = tf.concat([
                tf.reshape(mel_fbanks,
                           [fbank_size[1], fbank_size[2], fbank_size[3]]),
                tf.zeros((p.num_zeropad_frames, fbank_size[2], fbank_size[3]))
            ], 0)

        if not p.audio_keep_example_waveforms:
            del example["waveforms"]
        return super(SpeechRecognitionProblem,
                     self).preprocess_example(example, mode, hparams)
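
# NOTE: `freq_mask` and `time_mask` used in the TRAIN branch above are not
# defined in this snippet; they come from a SpecAugment-style augmentation.
# A minimal, illustrative sketch is given below (not the author's
# implementation). It assumes `mel_fbanks` has shape
# [1, time, num_mel_bins, channels] and that utterances are longer than the
# maximum mask width.
def freq_mask(mel_fbanks, max_width=27, num_masks=1):
    """Zeroes out `num_masks` random bands of at most `max_width` mel bins."""
    freq_size = tf.shape(mel_fbanks)[2]
    for _ in range(num_masks):
        width = tf.random_uniform([], 0, max_width, dtype=tf.int32)
        start = tf.random_uniform([], 0, freq_size - width, dtype=tf.int32)
        mask = tf.concat([
            tf.ones([1, 1, start, 1]),
            tf.zeros([1, 1, width, 1]),
            tf.ones([1, 1, freq_size - start - width, 1]),
        ], axis=2)
        mel_fbanks *= mask
    return mel_fbanks


def time_mask(mel_fbanks, max_width=100, num_masks=1):
    """Zeroes out `num_masks` random spans of at most `max_width` frames."""
    time_size = tf.shape(mel_fbanks)[1]
    for _ in range(num_masks):
        width = tf.random_uniform([], 0, max_width, dtype=tf.int32)
        start = tf.random_uniform([], 0, time_size - width, dtype=tf.int32)
        mask = tf.concat([
            tf.ones([1, start, 1, 1]),
            tf.zeros([1, width, 1, 1]),
            tf.ones([1, time_size - start - width, 1, 1]),
        ], axis=1)
        mel_fbanks *= mask
    return mel_fbanks
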
  def preprocess_example(self, example, mode, hparams):
    p = hparams
    if p.audio_preproc_in_bottom:
      example["inputs"] = tf.expand_dims(
          tf.expand_dims(example["waveforms"], -1), -1)
    else:
      waveforms = tf.expand_dims(example["waveforms"], 0)
      mel_fbanks = common_audio.compute_mel_filterbank_features(
          waveforms,
          sample_rate=p.audio_sample_rate,
          dither=p.audio_dither,
          preemphasis=p.audio_preemphasis,
          frame_length=p.audio_frame_length,
          frame_step=p.audio_frame_step,
          lower_edge_hertz=p.audio_lower_edge_hertz,
          upper_edge_hertz=p.audio_upper_edge_hertz,
          num_mel_bins=p.audio_num_mel_bins,
          apply_mask=False)
      if p.audio_add_delta_deltas:
        mel_fbanks = common_audio.add_delta_deltas(mel_fbanks)
      fbank_size = common_layers.shape_list(mel_fbanks)
      assert fbank_size[0] == 1

      # This replaces CMVN estimation on data
      var_epsilon = 1e-09
      mean = tf.reduce_mean(mel_fbanks, keepdims=True, axis=1)
      variance = tf.reduce_mean(tf.square(mel_fbanks - mean),
                                keepdims=True, axis=1)
      mel_fbanks = (mel_fbanks - mean) * tf.rsqrt(variance + var_epsilon)

      # Later models like to flatten the two spatial dims. Instead, we add a
      # unit spatial dim and flatten the frequencies and channels.
      example["inputs"] = tf.concat([
          tf.reshape(mel_fbanks, [fbank_size[1], fbank_size[2], fbank_size[3]]),
          tf.zeros((p.num_zeropad_frames, fbank_size[2], fbank_size[3]))], 0)

    if not p.audio_keep_example_waveforms:
      del example["waveforms"]
    return super(SpeechRecognitionProblem, self
                ).preprocess_example(example, mode, hparams)
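
# NOTE: the `p.audio_*` fields read above come from the problem's hparams.
# The helper below is an illustrative sketch of that configuration; the helper
# name and the values are placeholders, not guaranteed library defaults.
def _add_example_audio_hparams(hparams):
    hparams.add_hparam("audio_preproc_in_bottom", False)
    hparams.add_hparam("audio_keep_example_waveforms", False)
    hparams.add_hparam("audio_sample_rate", 16000)
    hparams.add_hparam("audio_preemphasis", 0.97)
    hparams.add_hparam("audio_dither", 1.0 / 32768.0)  # ~1 / int16 max
    hparams.add_hparam("audio_frame_length", 25.0)      # ms
    hparams.add_hparam("audio_frame_step", 10.0)        # ms
    hparams.add_hparam("audio_lower_edge_hertz", 20.0)
    hparams.add_hparam("audio_upper_edge_hertz", 8000.0)
    hparams.add_hparam("audio_num_mel_bins", 80)
    hparams.add_hparam("audio_add_delta_deltas", True)
    hparams.add_hparam("num_zeropad_frames", 250)
    return hparams
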
    def bottom(self, x):
        """Use batchnorm instead of CMVN and shorten the stft with strided convs.

    Args:
      x: float32 tensor with shape [batch_size, len, 1, freqs * channels]

    Returns:
      float32 tensor with shape [batch_size, shorter_len, 1, hidden_size]
    """
        inputs = x
        p = self._model_hparams

        num_mel_bins = p.audio_num_mel_bins
        num_channels = 3 if p.audio_add_delta_deltas else 1

        with tf.variable_scope(self.name):
            if p.audio_preproc_in_bottom:
                # Compute filterbanks
                with tf.variable_scope("fbanks"):
                    waveforms = tf.squeeze(inputs, [2, 3])
                    mel_fbanks = common_audio.compute_mel_filterbank_features(
                        waveforms,
                        sample_rate=p.audio_sample_rate,
                        dither=p.audio_dither,
                        preemphasis=p.audio_preemphasis,
                        frame_length=p.audio_frame_length,
                        frame_step=p.audio_frame_step,
                        lower_edge_hertz=p.audio_lower_edge_hertz,
                        upper_edge_hertz=p.audio_upper_edge_hertz,
                        num_mel_bins=p.audio_num_mel_bins,
                        apply_mask=True)
                    if p.audio_add_delta_deltas:
                        mel_fbanks = common_audio.add_delta_deltas(mel_fbanks)
                    x = tf.reshape(
                        mel_fbanks,
                        common_layers.shape_list(mel_fbanks)[:2] +
                        [num_mel_bins, num_channels])

                    nonpadding_mask = 1. - common_attention.embedding_to_padding(x)
                    num_of_nonpadding_elements = tf.reduce_sum(
                        nonpadding_mask) * num_mel_bins * num_channels

                    # This replaces CMVN estimation on data
                    var_epsilon = 1e-09
                    mean = tf.reduce_sum(
                        x, axis=[1], keepdims=True) / num_of_nonpadding_elements
                    variance = (
                        num_of_nonpadding_elements * mean**2. -
                        2. * mean * tf.reduce_sum(x, axis=[1], keepdims=True) +
                        tf.reduce_sum(x**2, axis=[1], keepdims=True)
                    ) / num_of_nonpadding_elements
                    x = (x - mean) * tf.rsqrt(variance + var_epsilon) * tf.expand_dims(
                        nonpadding_mask, -1)
            else:
                x = inputs

            # The convention is that the models are flattened along the spatial
            # dimensions, thus the speech preprocessor treats frequencies and
            # channels as image colors (the last axis).
            x.set_shape([None, None, num_mel_bins, num_channels])

            # TODO(chorowski): how to specify bottom's hparams and avoid hardcoding?
            x = tf.pad(x, [[0, 0], [0, 8], [0, 0], [0, 0]])
            for _ in range(2):
                x = tf.layers.conv2d(x, 128, (3, 3), (2, 2), use_bias=False)
                x = common_layers.layer_norm(x)
                x = tf.nn.relu(x)

            xshape = common_layers.shape_list(x)
            # Apply a conv that removes all remaining frequencies and, at the
            # same time, projects the output into the desired hidden_size.
            x = tf.pad(x, [[0, 0], [0, 2], [0, 0], [0, 0]])
            x = tf.layers.conv2d(x,
                                 p.hidden_size, (3, xshape[2]),
                                 use_bias=False)

            assert common_layers.shape_list(x)[2] == 1
            x = common_layers.layer_norm(x)
            x = tf.nn.relu(x)
        return x
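
# A worked shape example for the conv stack in `bottom` above, assuming 80 mel
# bins, no delta-deltas (1 channel) and an input of 100 frames. With VALID
# padding, out = ceil((in - k + 1) / stride):
#   time: 100 -> pad 8 -> 108 -> conv 3x3/2 -> 53 -> conv 3x3/2 -> 26
#                -> pad 2 -> 28 -> conv (3, freq)/1 -> 26
#   freq:  80 -> conv 3x3/2 -> 39 -> conv 3x3/2 -> 19 -> conv (3, 19)/1 -> 1
# The result is [batch, 26, 1, hidden_size]: a roughly 4x shorter sequence
# projected to the model's hidden size.
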
import librosa
import numpy as np
import tensorflow as tf

from tensor2tensor.layers import common_audio


def process_audio(audio_path,
                  sess,
                  prepro_batch=128,
                  sample_rate=22050,
                  frame_step=10,
                  frame_length=25,
                  feat_dim=40,
                  feat_type='fbank'):
    """GPU-accelerated audio feature extraction in TensorFlow.

    Args:
        audio_path: List of paths to audio files.
        sess: TF session used to run the feature-extraction graph.
        prepro_batch: Batch size for preprocessing audio features.
        sample_rate: Audio sample rate in Hz.
        frame_step: Frame step (hop) size in ms.
        frame_length: Frame length in ms.
        feat_dim: Feature dimension (number of mel bins).
        feat_type: Type of features to extract; only 'fbank' is supported.

    Returns:
        feats: Array of N features with variable length, where element i has
               shape (L_i, feat_dim) and N is the number of samples.
        featlen: Array of feature lengths.
    """

    # build the feature-extraction graph
    input_audio = tf.placeholder(dtype=tf.float32, shape=[None, None])
    if feat_type == 'fbank':
        mel_fbanks = common_audio.compute_mel_filterbank_features(
            input_audio,
            sample_rate=sample_rate,
            frame_step=frame_step,
            frame_length=frame_length,
            num_mel_bins=feat_dim,
            apply_mask=True)
        # drop the trailing channel axis:
        # [batch, frames, num_mel_bins, 1] -> [batch, frames, num_mel_bins]
        mel_fbanks = tf.reduce_sum(mel_fbanks, -1)

    def extract_feat(audio_batch, len_batch, fs):
        max_len = max(len_batch)
        audio_padded = np.zeros([prepro_batch, max_len], dtype=np.float32)
        for i in range(len(audio_batch)):
            audio_padded[i][:len(audio_batch[i])] = audio_batch[i]
        feat = sess.run(mel_fbanks, feed_dict={input_audio: audio_padded})
        # compute the feature length:
        feat_len = np.array(len_batch) // int(fs * frame_step / 1e3) + 1
        feat_len = feat_len.astype(np.int32)
        return feat, feat_len

    audio_batch = []
    len_batch = []
    feats = []
    featlen = []

    # extract audio features batch by batch:
    for p in audio_path:
        audio, fs = librosa.load(p)
        audio_batch.append(audio)
        len_batch.append(len(audio))
        if len(audio_batch) == prepro_batch:
            feat, feat_len = extract_feat(audio_batch, len_batch, fs)
            # remove the padding added to the audio batch:
            for index, l in enumerate(feat_len):
                feats.append(feat[index][:l])
            featlen = np.concatenate([featlen, feat_len])
            audio_batch = []
            len_batch = []
            print("Processed samples: {}/{}".format(len(feats),
                                                    len(audio_path)))

    # process any remaining clips in a final partial batch:
    if audio_batch:
        feat, feat_len = extract_feat(audio_batch, len_batch, fs)
        # remove the padding:
        for index, l in enumerate(feat_len):
            feats.append(feat[index][:l])
        featlen = np.concatenate([featlen, feat_len])
        print("Processed samples: {}/{}".format(len(feats), len(audio_path)))

    return np.array(feats), featlen.astype(np.int32)
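
# Illustrative usage of `process_audio` (assumes a TF1-style session; the glob
# pattern is a placeholder):
#
#   import glob
#   wav_files = sorted(glob.glob("wavs/*.wav"))
#   with tf.Session() as sess:
#       feats, featlen = process_audio(wav_files, sess, prepro_batch=128,
#                                      sample_rate=22050, feat_dim=40)
#   # feats[i] has shape (featlen[i], feat_dim)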