Ejemplo n.º 1
0
    def preprocess_example(self, example, mode, hparams):
        """Build example["inputs"] from example["waveforms"].

        When hparams.audio_preproc_in_bottom is set, the waveform only gets two
        trailing unit axes here (presumably so that a later stage can do the
        feature extraction -- confirm against the modality in use). Otherwise
        mel filterbank features are computed, normalized to zero mean and unit
        variance along axis 1, masked with freq_mask/time_mask when training,
        and followed by hparams.num_zeropad_frames all-zero frames.

        Args:
          example: dict of features; must contain the key "waveforms".
          mode: compared against tf.estimator.ModeKeys.TRAIN to decide whether
            the masking is applied.
          hparams: hyperparameters providing the audio_* settings and
            num_zeropad_frames read below.

        Returns:
          The result of the parent class's preprocess_example on the updated
          example.
        """
        hp = hparams
        if not hp.audio_preproc_in_bottom:
            # Add a leading axis of size 1 (checked by the assert below).
            batched_wave = tf.expand_dims(example["waveforms"], 0)
            fbank_options = dict(
                sample_rate=hp.audio_sample_rate,
                dither=hp.audio_dither,
                preemphasis=hp.audio_preemphasis,
                frame_length=hp.audio_frame_length,
                frame_step=hp.audio_frame_step,
                lower_edge_hertz=hp.audio_lower_edge_hertz,
                upper_edge_hertz=hp.audio_upper_edge_hertz,
                num_mel_bins=hp.audio_num_mel_bins,
                apply_mask=False)
            feats = common_audio.compute_mel_filterbank_features(
                batched_wave, **fbank_options)
            if hp.audio_add_delta_deltas:
                feats = common_audio.add_delta_deltas(feats)
            feat_shape = common_layers.shape_list(feats)
            assert feat_shape[0] == 1

            # This replaces CMVN estimation on data: the statistics come from
            # the example itself, reduced over axis 1.
            eps = 1e-09
            mu = tf.reduce_mean(feats, keepdims=True, axis=1)
            squared_dev = tf.square(feats - mu)
            sigma_sq = tf.reduce_mean(squared_dev, keepdims=True, axis=1)
            feats = (feats - mu) * tf.rsqrt(sigma_sq + eps)

            # SpecAugment masking, training only: frequency mask first, then
            # time mask. Time warping is left disabled.
            if mode == tf.estimator.ModeKeys.TRAIN:
                feats = time_mask(freq_mask(feats))

            # Drop the leading size-1 axis, then append the all-zero frames
            # along axis 0.
            freq_dim, chan_dim = feat_shape[2], feat_shape[3]
            frames = tf.reshape(feats, [feat_shape[1], freq_dim, chan_dim])
            zero_frames = tf.zeros((hp.num_zeropad_frames, freq_dim, chan_dim))
            example["inputs"] = tf.concat([frames, zero_frames], 0)
        else:
            with_unit_axis = tf.expand_dims(example["waveforms"], -1)
            example["inputs"] = tf.expand_dims(with_unit_axis, -1)

        if not hp.audio_keep_example_waveforms:
            del example["waveforms"]
        parent = super(SpeechRecognitionProblem, self)
        return parent.preprocess_example(example, mode, hparams)
Ejemplo n.º 2
0
  def preprocess_example(self, example, mode, hparams):
    """Derive the "inputs" feature of `example` from its "waveforms" feature.

    With hparams.audio_preproc_in_bottom set, the waveform is passed through
    with two trailing unit axes appended. Otherwise mel filterbank features
    are computed here, standardized along axis 1 and extended with
    hparams.num_zeropad_frames all-zero frames.

    Args:
      example: dict of features; must contain the key "waveforms".
      mode: forwarded unchanged to the parent implementation.
      hparams: hyperparameters providing the audio_* settings and
        num_zeropad_frames read below.

    Returns:
      The result of the parent class's preprocess_example on the updated
      example.
    """
    settings = hparams
    raw_wave = example["waveforms"]
    if settings.audio_preproc_in_bottom:
      trailing_axis_added = tf.expand_dims(raw_wave, -1)
      example["inputs"] = tf.expand_dims(trailing_axis_added, -1)
    else:
      # Add a leading axis of size 1 (checked by the assert below).
      wave_batch = tf.expand_dims(raw_wave, 0)
      features = common_audio.compute_mel_filterbank_features(
          wave_batch,
          sample_rate=settings.audio_sample_rate,
          dither=settings.audio_dither,
          preemphasis=settings.audio_preemphasis,
          frame_length=settings.audio_frame_length,
          frame_step=settings.audio_frame_step,
          lower_edge_hertz=settings.audio_lower_edge_hertz,
          upper_edge_hertz=settings.audio_upper_edge_hertz,
          num_mel_bins=settings.audio_num_mel_bins,
          apply_mask=False)
      if settings.audio_add_delta_deltas:
        features = common_audio.add_delta_deltas(features)
      dims = common_layers.shape_list(features)
      assert dims[0] == 1

      # This replaces CMVN estimation on data: mean and variance are taken
      # from the example itself, reduced over axis 1.
      stability_eps = 1e-09
      feature_mean = tf.reduce_mean(features, keepdims=True, axis=1)
      squared_offsets = tf.square(features - feature_mean)
      feature_var = tf.reduce_mean(squared_offsets, keepdims=True, axis=1)
      inv_std = tf.rsqrt(feature_var + stability_eps)
      features = (features - feature_mean) * inv_std

      # Drop the leading size-1 axis, then append the all-zero frames along
      # axis 0.
      unbatched = tf.reshape(features, [dims[1], dims[2], dims[3]])
      zero_tail = tf.zeros((settings.num_zeropad_frames, dims[2], dims[3]))
      example["inputs"] = tf.concat([unbatched, zero_tail], 0)

    if not settings.audio_keep_example_waveforms:
      del example["waveforms"]
    base = super(SpeechRecognitionProblem, self)
    return base.preprocess_example(example, mode, hparams)
Ejemplo n.º 3
0
    def bottom(self, x):
        """Normalize speech features and shorten them with strided convs.

        When the model hparams set audio_preproc_in_bottom, `x` is treated as
        raw waveforms: mel filterbank features are computed here and normalized
        with mean/variance statistics (this replaces CMVN estimation on data).
        Otherwise `x` is used as already-computed features. Either way the
        features then go through two stride-2 3x3 convolutions (each followed
        by layer norm and ReLU) and a final convolution whose kernel spans all
        of axis 2 and projects to hidden_size.

        Args:
          x: input tensor. With audio_preproc_in_bottom set, axes 2 and 3 are
            squeezed away, so they must have size 1. Otherwise it must be
            compatible with [None, None, num_mel_bins, num_channels].
            NOTE(review): the earlier docstring gave the shape as
            [batch_size, len, 1, freqs * channels], which does not match the
            set_shape call below -- confirm which one callers rely on.

        Returns:
          Tensor whose axis 2 has size 1 (asserted) and whose last axis has
          size hidden_size; axis 1 is presumably shorter than the input's
          because of the two stride-2 convolutions.
        """
        inputs = x
        p = self._model_hparams

        num_mel_bins = p.audio_num_mel_bins
        # Three channels when delta and delta-delta features are added.
        num_channels = 3 if p.audio_add_delta_deltas else 1

        with tf.variable_scope(self.name):
            if p.audio_preproc_in_bottom:
                # Compute filterbanks
                with tf.variable_scope("fbanks"):
                    # Remove the two trailing unit axes of the input.
                    waveforms = tf.squeeze(inputs, [2, 3])
                    mel_fbanks = common_audio.compute_mel_filterbank_features(
                        waveforms,
                        sample_rate=p.audio_sample_rate,
                        dither=p.audio_dither,
                        preemphasis=p.audio_preemphasis,
                        frame_length=p.audio_frame_length,
                        frame_step=p.audio_frame_step,
                        lower_edge_hertz=p.audio_lower_edge_hertz,
                        upper_edge_hertz=p.audio_upper_edge_hertz,
                        num_mel_bins=p.audio_num_mel_bins,
                        apply_mask=True)
                    if p.audio_add_delta_deltas:
                        mel_fbanks = common_audio.add_delta_deltas(mel_fbanks)
                    # Keep the first two axes and split the rest into
                    # [num_mel_bins, num_channels].
                    x = tf.reshape(
                        mel_fbanks,
                        common_layers.shape_list(mel_fbanks)[:2] +
                        [num_mel_bins, num_channels])

                    # 1.0 for positions not flagged as padding; presumably
                    # embedding_to_padding flags all-zero frames -- confirm.
                    nonpadding_mask = 1. - common_attention.embedding_to_padding(
                        x)
                    # NOTE(review): this count is reduced over the whole mask
                    # (no axis given), while the sums below reduce over axis 1
                    # only -- confirm this is intended.
                    num_of_nonpadding_elements = tf.reduce_sum(
                        nonpadding_mask) * num_mel_bins * num_channels

                    # This replaces CMVN estimation on data
                    var_epsilon = 1e-09
                    mean = tf.reduce_sum(x, axis=[
                        1
                    ], keepdims=True) / num_of_nonpadding_elements
                    # Expanded form of sum((x - mean)^2):
                    # N * mean^2 - 2 * mean * sum(x) + sum(x^2), divided by N.
                    variance = (
                        num_of_nonpadding_elements * mean**2. -
                        2. * mean * tf.reduce_sum(x, axis=[1], keepdims=True) +
                        tf.reduce_sum(x**2, axis=[1], keepdims=True)
                    ) / num_of_nonpadding_elements
                    # Normalize, then multiply by the mask so positions flagged
                    # as padding become zero again.
                    x = (x - mean) * tf.rsqrt(variance +
                                              var_epsilon) * tf.expand_dims(
                                                  nonpadding_mask, -1)
            else:
                x = inputs

            # The convention is that the models are flattened along the spatial,
            # dimensions, thus the speech preprocessor treats frequencies and
            # channels as image colors (last axis)
            x.set_shape([None, None, num_mel_bins, num_channels])

            # TODO(chorowski): how to specify bottom's hparams and avoid hardcoding?
            # Append 8 zero entries at the end of axis 1 before the strided
            # convs.
            x = tf.pad(x, [[0, 0], [0, 8], [0, 0], [0, 0]])
            # Two conv blocks: 128 filters, 3x3 kernel, stride 2 on axes 1 and 2.
            for _ in range(2):
                x = tf.layers.conv2d(x, 128, (3, 3), (2, 2), use_bias=False)
                x = common_layers.layer_norm(x)
                x = tf.nn.relu(x)

            xshape = common_layers.shape_list(x)
            # apply a conv that will remove all frequencies and at the same time
            # project the output into desired hidden_size
            # Two extra zero entries on axis 1 feed the size-3 kernel below.
            x = tf.pad(x, [[0, 0], [0, 2], [0, 0], [0, 0]])
            x = tf.layers.conv2d(x,
                                 p.hidden_size, (3, xshape[2]),
                                 use_bias=False)

            # The kernel covered all of axis 2, so only one position remains.
            assert common_layers.shape_list(x)[2] == 1
            x = common_layers.layer_norm(x)
            x = tf.nn.relu(x)
        return x