def prepare_processing_graph(self, model_settings):
    """Constructs the TensorFlow ops that distort and featurize one clip.

    The ops created here read a WAVE file, decode it to a single channel,
    scale its volume, pad and slice it to shift it in time, mix in
    background noise, clip the mix to [-1.0, 1.0], compute a spectrogram,
    and derive an MFCC fingerprint from that spectrogram.

    This must be called with an active TensorFlow session running. It
    stores several placeholder inputs and one output tensor on `self`:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
        The keys read here are 'desired_samples', 'window_size_samples',
        'window_stride_samples' and 'dct_coefficient_count'.
    """
    clip_length = model_settings['desired_samples']

    # Read the file from disk and decode it as fixed-length mono audio.
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    file_contents = io_ops.read_file(self.wav_filename_placeholder_)
    decoded = contrib_audio.decode_wav(
        file_contents, desired_channels=1, desired_samples=clip_length)

    # A scalar gain applied to the decoded clip.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    foreground = tf.multiply(
        decoded.audio, self.foreground_volume_placeholder_)

    # Time shift: zero-pad according to the fed padding spec, then slice
    # back down to `clip_length` rows starting at the fed offset.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded = tf.pad(
        foreground, self.time_shift_padding_placeholder_, mode='CONSTANT')
    shifted = tf.slice(
        padded, self.time_shift_offset_placeholder_, [clip_length, -1])

    # Scale the background noise, add it to the shifted clip, and clip the
    # sum so it stays within [-1.0, 1.0].
    self.background_data_placeholder_ = tf.placeholder(
        tf.float32, [clip_length, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_noise = tf.multiply(
        self.background_data_placeholder_,
        self.background_volume_placeholder_)
    mixed = tf.add(scaled_noise, shifted)
    clipped = tf.clip_by_value(mixed, -1.0, 1.0)

    # Spectrogram followed by MFCC gives the 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        clipped,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        decoded.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
def get_unprocessed_data(self, how_many, model_settings, mode):
    """Retrieves sample data for the given partition, with no transformations.

    Each selected file is decoded to mono audio of a fixed length. The only
    processing applied is a volume multiplier: 0 for samples labelled
    SILENCE_LABEL and 1 for everything else.

    Args:
      how_many: Desired number of samples to return. -1 means the entire
        contents of this partition, in index order; any other value draws
        that many samples at random (with replacement) from the partition.
      model_settings: Information about the current model being trained.
        Only 'desired_samples' is read here.
      mode: Which partition to use, must be 'training', 'validation', or
        'testing'.

    Returns:
      A tuple `(data, labels)`. `data` is a numpy array of shape
      `(sample_count, desired_samples)` holding one flattened clip per row.
      `labels` is a list with, for each row, the entry of `self.words_list`
      selected by that sample's label index. Note that these are NOT
      one-hot vectors.

    Raises:
      ValueError: If a positive number of samples is requested from an
        empty partition.
    """
    candidates = self.data_index[mode]
    sample_count = len(candidates) if how_many == -1 else how_many
    # Random sampling from an empty partition would otherwise fail deep
    # inside np.random.randint with an unhelpful message.
    if sample_count > 0 and len(candidates) == 0:
        raise ValueError(
            'Cannot draw %d samples from empty partition %r' %
            (sample_count, mode))
    desired_samples = model_settings['desired_samples']
    words_list = self.words_list
    data = np.zeros((sample_count, desired_samples))
    labels = []
    # A private graph keeps these ops separate from the default graph.
    with tf.Session(graph=tf.Graph()) as sess:
        wav_filename_placeholder = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(wav_filename_placeholder)
        wav_decoder = contrib_audio.decode_wav(
            wav_loader, desired_channels=1, desired_samples=desired_samples)
        foreground_volume_placeholder = tf.placeholder(tf.float32, [])
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        foreground_volume_placeholder)
        for i in range(sample_count):
            # Walk the partition in order when everything was requested,
            # otherwise pick a random entry.
            if how_many == -1:
                sample_index = i
            else:
                sample_index = np.random.randint(len(candidates))
            sample = candidates[sample_index]
            # Silence samples are muted by multiplying them by zero.
            volume = 0 if sample['label'] == SILENCE_LABEL else 1
            input_dict = {
                wav_filename_placeholder: sample['file'],
                foreground_volume_placeholder: volume,
            }
            data[i, :] = sess.run(
                scaled_foreground, feed_dict=input_dict).flatten()
            label_index = self.word_to_index[sample['label']]
            labels.append(words_list[label_index])
    return data, labels
def load_wav_file(filename):
    """Reads a .wav file and returns its samples as a flat float array.

    A throwaway graph and session are created for the single decode, so
    nothing is added to the default graph.

    Args:
      filename: Path to the .wav file to load.

    Returns:
      Numpy array holding the sample data as floats between -1.0 and 1.0.
    """
    with tf.Session(graph=tf.Graph()) as sess:
        path_input = tf.placeholder(tf.string, [])
        file_contents = io_ops.read_file(path_input)
        # Decode to a single channel; the clip length is left unspecified.
        decode_op = contrib_audio.decode_wav(file_contents, desired_channels=1)
        decoded = sess.run(decode_op, feed_dict={path_input: filename})
        return decoded.audio.flatten()
def prepare_background_data(self):
    """Searches a folder for background noise audio, and loads it into memory.

    It's expected that the background audio samples will be in a
    subdirectory named '_background_noise_' inside the 'data_dir' folder, as
    .wavs that match the sample rate of the training data, but can be much
    longer in duration.

    If the '_background_noise_' folder doesn't exist at all, this isn't an
    error, it's just taken to mean that no background noise augmentation
    should be used. If the folder does exist, but it's empty, that's treated
    as an error.

    The loaded clips are stored in `self.background_data`, which is reset to
    an empty list at the start of every call.

    Returns:
      List of raw PCM-encoded audio samples of background noise (the same
      list object as `self.background_data`). The list is empty when the
      background folder does not exist.

    Raises:
      Exception: If files aren't found in the folder.
    """
    self.background_data = []
    background_dir = os.path.join(self.data_dir, BACKGROUND_NOISE_DIR_NAME)
    if not os.path.exists(background_dir):
        return self.background_data
    search_path = os.path.join(background_dir, '*.wav')
    # A private graph keeps the decode ops out of the default graph.
    with tf.Session(graph=tf.Graph()) as sess:
        wav_filename_placeholder = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(wav_filename_placeholder)
        wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
        for wav_path in gfile.Glob(search_path):
            wav_data = sess.run(
                wav_decoder,
                feed_dict={wav_filename_placeholder: wav_path},
            ).audio.flatten()
            self.background_data.append(wav_data)
    if not self.background_data:
        raise Exception('No background wav files were found in ' + search_path)
    # Return the list on the success path too, as the docstring promises;
    # previously only the early-exit path returned it.
    return self.background_data