# Requires TF 1.x with the module-level imports used by
# tensorflow/examples/speech_commands/input_data.py:
#   from tensorflow.python.ops import io_ops
#   from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
def prepare_processing_graph(self, model_settings):
  """Builds a TensorFlow graph to apply the input distortions.

  Creates a graph that loads a WAVE file, decodes it, scales the volume,
  shifts it in time, adds in background noise, calculates a spectrogram,
  and then builds an MFCC fingerprint from that.

  This must be called with an active TensorFlow session running, and it
  creates multiple placeholder inputs, and one output:

    - wav_filename_placeholder_: Filename of the WAV to load.
    - foreground_volume_placeholder_: How loud the main clip should be.
    - time_shift_placeholder_: How much the clip is shifted.
    - background_data_placeholder_: PCM sample data for background noise.
    - background_volume_placeholder_: Loudness of mixed-in background.
    - mfcc_: Output 2D fingerprint of processed audio.

  Args:
    model_settings: Information about the current model being trained.
  """
  desired_samples = model_settings['desired_samples']
  self.wav_filename_placeholder_ = tf.placeholder(
      tf.string, [], name='filename')
  wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
  wav_decoder = contrib_audio.decode_wav(
      wav_loader, desired_channels=1, desired_samples=desired_samples)
  # Allow the audio sample's volume to be adjusted.
  self.foreground_volume_placeholder_ = tf.placeholder(
      tf.float32, [], name='foreground_volume')
  scaled_foreground = tf.multiply(wav_decoder.audio,
                                  self.foreground_volume_placeholder_)
  # Shift the sample's start position, and pad any gaps with zeros.
  self.time_shift_placeholder_ = tf.placeholder(tf.int32, name='timeshift')
  # TODO(see--): Write test with np.roll
  shifted_foreground = tf_roll(scaled_foreground,
                               self.time_shift_placeholder_)
  # Mix in background noise.
  self.background_data_placeholder_ = tf.placeholder(
      tf.float32, [desired_samples, 1], name='background_data')
  self.background_volume_placeholder_ = tf.placeholder(
      tf.float32, [], name='background_volume')
  background_mul = tf.multiply(self.background_data_placeholder_,
                               self.background_volume_placeholder_)
  background_add = tf.add(background_mul, shifted_foreground)
  # Removed clipping: tf.clip_by_value(background_add, -1.0, 1.0)
  self.background_clamp_ = background_add
  self.background_clamp_ = tf.reshape(
      self.background_clamp_, (1, model_settings['desired_samples']))
  # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
  stfts = tf.contrib.signal.stft(
      self.background_clamp_,
      frame_length=model_settings['window_size_samples'],
      frame_step=model_settings['window_stride_samples'],
      fft_length=None)
  self.spectrogram_ = tf.abs(stfts)
  num_spectrogram_bins = self.spectrogram_.shape[-1].value
  lower_edge_hertz, upper_edge_hertz = 80.0, 7600.0
  linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
      model_settings['dct_coefficient_count'], num_spectrogram_bins,
      model_settings['sample_rate'], lower_edge_hertz, upper_edge_hertz)
  mel_spectrograms = tf.tensordot(
      self.spectrogram_, linear_to_mel_weight_matrix, 1)
  mel_spectrograms.set_shape(self.spectrogram_.shape[:-1].concatenate(
      linear_to_mel_weight_matrix.shape[-1:]))
  log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)
  self.mfcc_ = tf.contrib.signal.mfccs_from_log_mel_spectrograms(
      log_mel_spectrograms)[:, :, :model_settings['num_log_mel_features']]  # :13
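# `tf_roll` is called above but not defined in this excerpt. Below is a
# minimal sketch, assuming np.roll-style circular-shift semantics as implied
# by the TODO above; the signature and wrap-around behavior are assumptions.
# (The call-site comment mentions zero padding, but a roll wraps the
# shifted-out samples around instead.)
def tf_roll(a, shift, a_len=16000):
  """Circularly shifts an [a_len, 1] waveform along axis 0 (assumed helper)."""
  shift = shift % a_len  # map any (possibly negative) shift into [0, a_len)
  # np.roll(x, s) moves elements right by s: concatenate the tail before the head.
  return tf.concat([a[a_len - shift:, :], a[:a_len - shift, :]], axis=0)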
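# A minimal usage sketch (TF 1.x graph mode). `AudioProcessor`, the example
# model_settings values, and 'sample.wav' are hypothetical stand-ins; only
# the placeholder attributes and settings keys come from the method above.
import numpy as np

model_settings = {
    'desired_samples': 16000,        # 1 s of audio at 16 kHz (example values)
    'sample_rate': 16000,
    'window_size_samples': 480,      # 30 ms frames
    'window_stride_samples': 160,    # 10 ms hop
    'dct_coefficient_count': 40,     # mel bins fed to the DCT
    'num_log_mel_features': 13,      # MFCCs kept per frame
}
processor = AudioProcessor()  # hypothetical owner of the method above
processor.prepare_processing_graph(model_settings)
with tf.Session() as sess:
  mfcc = sess.run(
      processor.mfcc_,
      feed_dict={
          processor.wav_filename_placeholder_: 'sample.wav',
          processor.foreground_volume_placeholder_: 1.0,
          processor.time_shift_placeholder_: 160,  # shift by 10 ms
          processor.background_data_placeholder_:
              np.zeros((model_settings['desired_samples'], 1), np.float32),
          processor.background_volume_placeholder_: 0.0,  # no background noise
      })
  # mfcc has shape [1, num_frames, num_log_mel_features]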