Code Example #1
# Imports assumed for this snippet (not shown in the original). The code
# targets TF 1.14/1.15, where tf.placeholder still coexists with the tf.io,
# tf.audio and tf.signal namespaces; under TF 2.x it needs tf.compat.v1 in
# graph mode. tf_roll is a project-local helper, defined elsewhere, that
# time-shifts the waveform and zero-pads the gap.
import tensorflow as tf
 def prepare_processing_graph(self, model_settings):
     """Builds a TensorFlow graph to apply the input distortions."""
     desired_samples = model_settings['desired_samples']
     self.wav_filename_placeholder_ = tf.placeholder(tf.string, [],
                                                     name='filename')
     wav_loader = tf.io.read_file(self.wav_filename_placeholder_)
     wav_decoder = tf.audio.decode_wav(wav_loader,
                                       desired_channels=1,
                                       desired_samples=desired_samples)
     # Allow the audio sample's volume to be adjusted.
     self.foreground_volume_placeholder_ = tf.placeholder(
         tf.float32, [], name='foreground_volume')
     scaled_foreground = tf.multiply(wav_decoder.audio,
                                     self.foreground_volume_placeholder_)
     # Shift the sample's start position, and pad any gaps with zeros.
     self.time_shift_placeholder_ = tf.placeholder(tf.int32,
                                                   name='timeshift')
     shifted_foreground = tf_roll(scaled_foreground,
                                  self.time_shift_placeholder_)
     # Mix in background noise.
     self.background_data_placeholder_ = tf.placeholder(
         tf.float32, [desired_samples, 1], name='background_data')
     self.background_volume_placeholder_ = tf.placeholder(
         tf.float32, [], name='background_volume')
     background_mul = tf.multiply(self.background_data_placeholder_,
                                  self.background_volume_placeholder_)
     background_add = tf.add(background_mul, shifted_foreground)
     # removed clipping: tf.clip_by_value(background_add, -1.0, 1.0)
     self.background_clamp_ = background_add
     self.background_clamp_ = tf.reshape(
         self.background_clamp_, (1, model_settings['desired_samples']))
     # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
     stfts = tf.signal.stft(
         self.background_clamp_,
         frame_length=model_settings['window_size_samples'],
         frame_step=model_settings['window_stride_samples'],
         fft_length=None)
     self.spectrogram_ = tf.abs(stfts)
     print("self", self.spectrogram_.shape)
     num_spectrogram_bins = self.spectrogram_.shape[-1]
     lower_edge_hertz, upper_edge_hertz = 80.0, 7600.0
     linear_to_mel_weight_matrix = \
         tf.signal.linear_to_mel_weight_matrix(
             model_settings['dct_coefficient_count'],
             num_spectrogram_bins, model_settings['sample_rate'],
             lower_edge_hertz, upper_edge_hertz)
     mel_spectrograms = tf.tensordot(self.spectrogram_,
                                     linear_to_mel_weight_matrix, 1)
     mel_spectrograms.set_shape(self.spectrogram_.shape[:-1].concatenate(
         linear_to_mel_weight_matrix.shape[-1:]))
     log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)
     self.mfcc_ = tf.signal.mfccs_from_log_mel_spectrograms(
         log_mel_spectrograms
     )[:, :, :model_settings['num_log_mel_features']]  # :13
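A minimal usage sketch, assumed rather than taken from the original project: once prepare_processing_graph has built the graph, each placeholder is fed through feed_dict and mfcc_ is evaluated in a TF1 session. The `processor` instance, the WAV path, and the 16000-sample clip length are all illustrative.

import numpy as np
import tensorflow as tf

with tf.Session() as sess:
    # Full foreground volume, no time shift, silent background.
    mfcc = sess.run(
        processor.mfcc_,  # `processor` is the object whose graph was built above
        feed_dict={
            processor.wav_filename_placeholder_: 'speech.wav',
            processor.foreground_volume_placeholder_: 1.0,
            processor.time_shift_placeholder_: 0,
            processor.background_data_placeholder_:
                np.zeros((16000, 1), dtype=np.float32),
            processor.background_volume_placeholder_: 0.0,
        })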
Code Example #2
File: generator.py Project: indranig/examples
# Imports assumed for this snippet (not shown in the original). io_ops and
# contrib_audio are TF 1.x-only (tf.contrib was removed in TF 2.0); tf_roll
# is a project-local helper defined elsewhere in generator.py.
import tensorflow as tf
from tensorflow.python.ops import io_ops
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
 def prepare_processing_graph(self, model_settings):
   """Builds a TensorFlow graph to apply the input distortions"""
   desired_samples = model_settings['desired_samples']
   self.wav_filename_placeholder_ = tf.placeholder(
       tf.string, [], name='filename')
   wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
   wav_decoder = contrib_audio.decode_wav(
       wav_loader, desired_channels=1, desired_samples=desired_samples)
   # Allow the audio sample's volume to be adjusted.
   self.foreground_volume_placeholder_ = tf.placeholder(
       tf.float32, [], name='foreground_volume')
   scaled_foreground = tf.multiply(wav_decoder.audio,
                                   self.foreground_volume_placeholder_)
   # Shift the sample's start position, and pad any gaps with zeros.
   self.time_shift_placeholder_ = tf.placeholder(tf.int32, name='timeshift')
   shifted_foreground = tf_roll(scaled_foreground,
                                self.time_shift_placeholder_)
   # Mix in background noise.
   self.background_data_placeholder_ = tf.placeholder(
       tf.float32, [desired_samples, 1], name='background_data')
   self.background_volume_placeholder_ = tf.placeholder(
       tf.float32, [], name='background_volume')
   background_mul = tf.multiply(self.background_data_placeholder_,
                                self.background_volume_placeholder_)
   background_add = tf.add(background_mul, shifted_foreground)
   # removed clipping: tf.clip_by_value(background_add, -1.0, 1.0)
   self.background_clamp_ = background_add
   self.background_clamp_ = tf.reshape(self.background_clamp_,
                                       (1, model_settings['desired_samples']))
   # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
   stfts = tf.contrib.signal.stft(
       self.background_clamp_,
       frame_length=model_settings['window_size_samples'],
       frame_step=model_settings['window_stride_samples'],
       fft_length=None)
   self.spectrogram_ = tf.abs(stfts)
   num_spectrogram_bins = self.spectrogram_.shape[-1].value
   lower_edge_hertz, upper_edge_hertz = 80.0, 7600.0
   linear_to_mel_weight_matrix = \
       tf.contrib.signal.linear_to_mel_weight_matrix(
           model_settings['dct_coefficient_count'],
           num_spectrogram_bins, model_settings['sample_rate'],
           lower_edge_hertz, upper_edge_hertz)
   mel_spectrograms = tf.tensordot(self.spectrogram_,
                                   linear_to_mel_weight_matrix, 1)
   mel_spectrograms.set_shape(self.spectrogram_.shape[:-1].concatenate(
       linear_to_mel_weight_matrix.shape[-1:]))
   log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)
   self.mfcc_ = tf.contrib.signal.mfccs_from_log_mel_spectrograms(
       log_mel_spectrograms)[:, :, :
                             model_settings['num_log_mel_features']]  # :13
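The output shapes can be worked out ahead of time. A small sketch, assuming the standard Speech Commands settings (16 kHz audio, one-second clips, 30 ms windows, 10 ms strides); with fft_length=None the STFT op picks the smallest power of two that covers the frame length:

# All numeric settings below are assumptions, not values from the snippet.
desired_samples = 16000        # one second at 16 kHz
window_size_samples = 480      # 30 ms frame
window_stride_samples = 160    # 10 ms hop

num_frames = 1 + (desired_samples - window_size_samples) // window_stride_samples
fft_length = 512               # smallest power of two >= 480
num_spectrogram_bins = fft_length // 2 + 1

print(num_frames, num_spectrogram_bins)  # 98 257, so spectrogram_ is [1, 98, 257]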
Code Example #3
# Same assumed TF 1.x imports as in Code Example #2; tf_roll is again the
# project-local time-shift helper defined elsewhere in the project.
import tensorflow as tf
from tensorflow.python.ops import io_ops
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
    def prepare_processing_graph(self, model_settings):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_placeholder_: How much the clip is shifted.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
        desired_samples = model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.placeholder(tf.string, [],
                                                        name='filename')
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(wav_loader,
                                               desired_channels=1,
                                               desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='foreground_volume')
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_placeholder_ = tf.placeholder(tf.int32,
                                                      name='timeshift')
        # TODO(see--): Write test with np.roll
        shifted_foreground = tf_roll(scaled_foreground,
                                     self.time_shift_placeholder_)
        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [desired_samples, 1], name='background_data')
        self.background_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='background_volume')
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, shifted_foreground)
        # removed clipping: tf.clip_by_value(background_add, -1.0, 1.0)
        self.background_clamp_ = background_add
        self.background_clamp_ = tf.reshape(
            self.background_clamp_, (1, model_settings['desired_samples']))
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        stfts = tf.contrib.signal.stft(
            self.background_clamp_,
            frame_length=model_settings['window_size_samples'],
            frame_step=model_settings['window_stride_samples'],
            fft_length=None)
        self.spectrogram_ = tf.abs(stfts)
        num_spectrogram_bins = self.spectrogram_.shape[-1].value
        lower_edge_hertz, upper_edge_hertz = 80.0, 7600.0
        linear_to_mel_weight_matrix = \
            tf.contrib.signal.linear_to_mel_weight_matrix(
                model_settings['dct_coefficient_count'],
                num_spectrogram_bins, model_settings['sample_rate'],
                lower_edge_hertz, upper_edge_hertz)
        mel_spectrograms = tf.tensordot(self.spectrogram_,
                                        linear_to_mel_weight_matrix, 1)
        mel_spectrograms.set_shape(self.spectrogram_.shape[:-1].concatenate(
            linear_to_mel_weight_matrix.shape[-1:]))
        log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)
        self.mfcc_ = tf.contrib.signal.mfccs_from_log_mel_spectrograms(
            log_mel_spectrograms
        )[:, :, :model_settings['num_log_mel_features']]  # :13
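All three examples call a tf_roll helper that is defined elsewhere in each project. Based on the surrounding comment ("pad any gaps with zeros") and the np.roll TODO, here is one plausible reconstruction; the signature, the sign convention (positive shifts move audio later in time), and the [total_len, 1] input shape are all assumptions:

import tensorflow as tf

def tf_roll(x, shift, total_len=16000):
    # Hypothetical reconstruction: shift a [total_len, 1] waveform along
    # axis 0, filling vacated samples with zeros (np.roll, by contrast,
    # wraps samples around).
    shift = tf.clip_by_value(shift, -total_len, total_len)
    padded = tf.pad(x, [[total_len, total_len], [0, 0]])
    # Sample j of x sits at padded index total_len + j, so slicing from
    # total_len - shift gives output[i] = x[i - shift], zeros elsewhere.
    return tf.slice(padded, [total_len - shift, 0], [total_len, 1])

A parity test along the lines of the TODO could compare this against np.roll with the wrapped-around region zeroed out.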