def get_unprocessed_data(self, how_many, model_settings, mode):
    """Loads raw, undistorted audio clips together with their word labels.

    Args:
        how_many: Number of clips to return. -1 returns every clip of the
            partition in index order; any other value draws that many clips
            at random (with replacement) via np.random.randint.
        model_settings: Mapping; only 'desired_samples' is read here.
        mode: Key into self.data_index selecting the partition.

    Returns:
        Tuple (data, labels): a float array of shape
        (sample_count, desired_samples) and a list holding one entry of
        self.words_list per clip.
    """
    pool = self.data_index[mode]
    take_all = how_many == -1
    sample_count = len(pool) if take_all else how_many
    clip_length = model_settings['desired_samples']
    vocabulary = self.words_list
    data = np.zeros((sample_count, clip_length))
    labels = []
    with tf.Session(graph=tf.Graph()) as sess:
        filename_input = tf.placeholder(tf.string, [], name='filename')
        decoded = contrib_audio.decode_wav(
            io_ops.read_file(filename_input),
            desired_channels=1,
            desired_samples=clip_length)
        volume_input = tf.placeholder(
            tf.float32, [], name='foreground_volume')
        scaled_audio = tf.multiply(decoded.audio, volume_input)
        for row in range(sample_count):
            pick = row if take_all else np.random.randint(len(pool))
            entry = pool[pick]
            # Silence clips are zeroed out by scaling them with a volume of 0.
            volume = 0 if entry['label'] == SILENCE_LABEL else 1
            feed = {filename_input: entry['file'], volume_input: volume}
            data[row, :] = sess.run(scaled_audio, feed_dict=feed).flatten()
            labels.append(vocabulary[self.word_to_index[entry['label']]])
    return data, labels
def prepare_background_data(self):
    """Searches a folder for background noise audio, and loads it into memory.

    The clips are looked up as '*.wav' files inside the
    BACKGROUND_NOISE_DIR_NAME subdirectory of self.data_dir. A missing
    subdirectory is not an error: it simply means no background noise is
    available. An existing subdirectory without any .wav file is an error.

    The decoded clips are stored on self.background_data and also returned,
    so both exit paths now hand back the same list (previously the success
    path returned None).

    Returns:
        List with one flattened sample array per background .wav file; empty
        when the background folder does not exist.

    Raises:
        Exception: If the folder exists but no .wav files are found in it.
    """
    self.background_data = []
    background_dir = os.path.join(self.data_dir, BACKGROUND_NOISE_DIR_NAME)
    if not os.path.exists(background_dir):
        return self.background_data
    search_path = os.path.join(background_dir, '*.wav')
    with tf.Session(graph=tf.Graph()) as sess:
        wav_filename_placeholder = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(wav_filename_placeholder)
        # No desired_samples here: background clips keep their full length.
        wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
        for wav_path in gfile.Glob(search_path):
            wav_data = sess.run(
                wav_decoder,
                feed_dict={wav_filename_placeholder: wav_path}).audio.flatten()
            self.background_data.append(wav_data)
    if not self.background_data:
        raise Exception('No background wav files were found in ' + search_path)
    return self.background_data
def prepare_processing_graph(self, model_settings):
    """Builds the TensorFlow ops that distort a clip and compute its MFCC.

    The graph reads a WAV file, decodes it, scales its volume, pads and
    slices it to shift it in time, mixes in background noise, clips the sum
    to [-1.0, 1.0], and turns the result into a spectrogram and then an MFCC.

    Attributes set on self:
        wav_filename_placeholder_: Filename of the WAV to load.
        foreground_volume_placeholder_: Scale factor for the decoded clip.
        time_shift_padding_placeholder_: [2, 2] paddings passed to tf.pad.
        time_shift_offset_placeholder_: [2] begin offsets passed to tf.slice.
        background_data_placeholder_: [desired_samples, 1] background audio.
        background_volume_placeholder_: Scale factor for the background.
        mfcc_: Output of contrib_audio.mfcc for the processed audio.

    Args:
        model_settings: Mapping; reads 'desired_samples',
            'window_size_samples', 'window_stride_samples' and
            'dct_coefficient_count'.
    """
    # NOTE: ops are created in the same order as before so that
    # auto-generated node names stay identical.
    clip_length = model_settings['desired_samples']

    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    decoded = contrib_audio.decode_wav(
        io_ops.read_file(self.wav_filename_placeholder_),
        desired_channels=1,
        desired_samples=clip_length)

    # Foreground volume control.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    foreground = tf.multiply(decoded.audio,
                             self.foreground_volume_placeholder_)

    # Time shift: pad with constants, then cut a clip_length window out of
    # the padded signal.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded = tf.pad(
        foreground, self.time_shift_padding_placeholder_, mode='CONSTANT')
    shifted = tf.slice(
        padded, self.time_shift_offset_placeholder_, [clip_length, -1])

    # Background noise mix, limited to the [-1.0, 1.0] range.
    self.background_data_placeholder_ = tf.placeholder(
        tf.float32, [clip_length, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    noise = tf.multiply(self.background_data_placeholder_,
                        self.background_volume_placeholder_)
    mixed = tf.clip_by_value(tf.add(noise, shifted), -1.0, 1.0)

    # Spectrogram followed by MFCC gives the 2D 'fingerprint'.
    spectrogram = contrib_audio.audio_spectrogram(
        mixed,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        decoded.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
def load_wav_file(filename):
    """Decodes a WAV file and returns its samples as a flat array.

    Args:
        filename: Path of the WAV file, fed to io_ops.read_file.

    Returns:
        The decoded single-channel audio, flattened to one dimension.
    """
    with tf.Session(graph=tf.Graph()) as sess:
        path_input = tf.placeholder(tf.string, [])
        file_contents = io_ops.read_file(path_input)
        decoded = contrib_audio.decode_wav(file_contents, desired_channels=1)
        result = sess.run(decoded, feed_dict={path_input: filename})
        return result.audio.flatten()
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
    """Builds the decode -> spectrogram -> MFCC -> model graph for inference.

    The graph input is a string placeholder named 'wav_data' holding encoded
    WAV contents; the output is a softmax node named 'labels_softmax'.
    Nothing is returned: callers look the nodes up by name.

    Args:
        wanted_words: Comma-separated list of the words to recognize.
        sample_rate: Samples per second of the input audio.
        clip_duration_ms: Forwarded to models.prepare_model_settings.
        clip_stride_ms: Passed to the model as runtime setting
            'clip_stride_ms'.
        window_size_ms: Forwarded to models.prepare_model_settings.
        window_stride_ms: Forwarded to models.prepare_model_settings.
        dct_coefficient_count: Number of MFCC coefficients per time slice.
        model_architecture: Name of the kind of model to generate.
    """
    labels = input_data.prepare_words_list(wanted_words.split(','))
    settings = models.prepare_model_settings(
        len(labels), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)
    runtime = {'clip_stride_ms': clip_stride_ms}

    # Input: encoded WAV bytes, decoded to a fixed-length mono clip.
    wav_input = tf.placeholder(tf.string, [], name='wav_data')
    decoded = contrib_audio.decode_wav(
        wav_input,
        desired_channels=1,
        desired_samples=settings['desired_samples'],
        name='decoded_sample_data')

    # Feature extraction.
    spectrogram = contrib_audio.audio_spectrogram(
        decoded.audio,
        window_size=settings['window_size_samples'],
        stride=settings['window_stride_samples'],
        magnitude_squared=True)
    mfcc = contrib_audio.mfcc(
        spectrogram,
        decoded.sample_rate,
        dct_coefficient_count=dct_coefficient_count)

    # Flatten the (time, frequency) fingerprint for the model.
    frequency_bins = settings['dct_coefficient_count']
    time_steps = settings['spectrogram_length']
    flat_input = tf.reshape(mfcc, [-1, time_steps * frequency_bins])

    logits = models.create_model(
        flat_input,
        settings,
        model_architecture,
        is_training=False,
        runtime_settings=runtime)

    # Named output node used by inference clients.
    tf.nn.softmax(logits, name='labels_softmax')
def prepare_processing_graph(self, model_settings):
    """Builds the ops that distort a clip and compute its MFCC features.

    The clip is decoded, scaled, shifted with tf_roll, mixed with background
    noise (no clipping is applied), reshaped to a batch of one, and converted
    with tf.contrib.signal into log-mel MFCCs.

    Attributes set on self: wav_filename_placeholder_,
    foreground_volume_placeholder_, time_shift_placeholder_,
    background_data_placeholder_, background_volume_placeholder_,
    background_clamp_, spectrogram_ and mfcc_.

    Args:
        model_settings: Mapping; reads 'desired_samples',
            'window_size_samples', 'window_stride_samples',
            'dct_coefficient_count', 'sample_rate' and
            'num_log_mel_features'.
    """
    clip_length = model_settings['desired_samples']

    self.wav_filename_placeholder_ = tf.placeholder(
        tf.string, [], name='filename')
    decoded = contrib_audio.decode_wav(
        io_ops.read_file(self.wav_filename_placeholder_),
        desired_channels=1,
        desired_samples=clip_length)

    # Foreground volume control.
    self.foreground_volume_placeholder_ = tf.placeholder(
        tf.float32, [], name='foreground_volme')
    foreground = tf.multiply(decoded.audio,
                             self.foreground_volume_placeholder_)

    # Time shift. NOTE(review): tf_roll is defined elsewhere; presumably a
    # circular shift rather than zero padding -- confirm.
    self.time_shift_placeholder_ = tf.placeholder(tf.int32, name='timeshift')
    shifted = tf_roll(foreground, self.time_shift_placeholder_)

    # Background noise mix; the sum is deliberately left unclipped.
    self.background_data_placeholder_ = tf.placeholder(
        tf.float32, [clip_length, 1], name='background_data')
    self.background_volume_placeholder_ = tf.placeholder(
        tf.float32, [], name='background_volume')
    noise = tf.multiply(self.background_data_placeholder_,
                        self.background_volume_placeholder_)
    mixed = tf.add(noise, shifted)
    # Keeps the historical attribute name although no clamping happens.
    self.background_clamp_ = tf.reshape(
        mixed, (1, model_settings['desired_samples']))

    # Short-time Fourier transform -> magnitude spectrogram.
    stfts = tf.contrib.signal.stft(
        self.background_clamp_,
        frame_length=model_settings['window_size_samples'],
        frame_step=model_settings['window_stride_samples'],
        fft_length=None)
    self.spectrogram_ = tf.abs(stfts)

    # Warp the linear-frequency bins onto the mel scale.
    bin_count = self.spectrogram_.shape[-1].value
    low_hz, high_hz = 80.0, 7600.0
    mel_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
        model_settings['dct_coefficient_count'],
        bin_count,
        model_settings['sample_rate'],
        low_hz,
        high_hz)
    mel = tf.tensordot(self.spectrogram_, mel_matrix, 1)
    # tensordot loses the static shape, so restore it explicitly.
    mel.set_shape(
        self.spectrogram_.shape[:-1].concatenate(mel_matrix.shape[-1:]))
    log_mel = tf.log(mel + 1e-6)

    # Keep only the leading 'num_log_mel_features' coefficients.
    keep = model_settings['num_log_mel_features']
    all_mfccs = tf.contrib.signal.mfccs_from_log_mel_spectrograms(log_mel)
    self.mfcc_ = all_mfccs[:, :, :keep]
def load_wav_file(filename):
    """Reads a WAV file from disk and returns its decoded samples.

    Args:
        filename: Path to the .wav file to load.

    Returns:
        Flattened array of the single-channel audio produced by
        contrib_audio.decode_wav.
    """
    with tf.Session(graph=tf.Graph()) as sess:
        wav_path = tf.placeholder(tf.string, [])
        decode_op = contrib_audio.decode_wav(
            io_ops.read_file(wav_path), desired_channels=1)
        decoded = sess.run(decode_op, feed_dict={wav_path: filename})
        samples = decoded.audio
        return samples.flatten()
def get_unprocessed_data(self, how_many, model_settings, mode):
    """Retrieves sample data for the given partition, with no distortions.

    Args:
        how_many: Desired number of samples to return. -1 means the entire
            contents of this partition, in order; otherwise samples are
            drawn at random and may repeat.
        model_settings: Mapping; only 'desired_samples' is read here.
        mode: Key of self.data_index naming the partition to use.

    Returns:
        Tuple (data, labels). data is an array of shape
        (sample_count, desired_samples); labels is a list of the matching
        entries of self.words_list (word strings, not one-hot vectors).
    """
    candidates = self.data_index[mode]
    use_everything = how_many == -1
    sample_count = len(candidates) if use_everything else how_many
    samples_per_clip = model_settings['desired_samples']
    data = np.zeros((sample_count, samples_per_clip))
    labels = []

    def choose(position):
        # Sequential walk for the full partition, random draw otherwise.
        if use_everything:
            return position
        return np.random.randint(len(candidates))

    with tf.Session(graph=tf.Graph()) as sess:
        path_ph = tf.placeholder(tf.string, [])
        decoder = contrib_audio.decode_wav(
            io_ops.read_file(path_ph),
            desired_channels=1,
            desired_samples=samples_per_clip)
        volume_ph = tf.placeholder(tf.float32, [])
        output = tf.multiply(decoder.audio, volume_ph)
        for position in range(sample_count):
            sample = candidates[choose(position)]
            word = sample['label']
            feed = {path_ph: sample['file']}
            # A volume of 0 turns silence samples into all-zero audio.
            feed[volume_ph] = 0 if word == SILENCE_LABEL else 1
            data[position, :] = sess.run(output, feed_dict=feed).flatten()
            labels.append(self.words_list[self.word_to_index[word]])
    return data, labels
def clip_to_waveform(clip, clip_dir=None, expected_sample_rate=None):
    """Decodes a WAV clip into a waveform tensor.

    Args:
        clip: File name of the clip (string or string tensor).
        clip_dir: Optional directory that `clip` is relative to. When None,
            `clip` is used as the full path. Previously the default None was
            handed to tf.string_join, which cannot convert it.
        expected_sample_rate: Optional sample rate the clip must have. When
            given, a graph assertion checks the decoded rate against it. The
            former check compared the decoded rate with itself and so could
            never fail; without an expected value it is omitted.

    Returns:
        The decoded waveform with size-1 dimensions squeezed out. Evaluating
        it also runs the mono-channel assertion and, if requested, the
        sample-rate assertion.
    """
    if clip_dir is None:
        clip_path = clip
    else:
        clip_path = tf.string_join([clip_dir, clip], separator=os.sep)
    clip_data = tf.read_file(clip_path)
    waveform, sr = tf_audio.decode_wav(clip_data)

    checks = []
    if expected_sample_rate is not None:
        # Assert that the clip has the expected sample rate.
        checks.append(tf.assert_equal(sr, expected_sample_rate))
    # Assert that the clip is mono (second dimension of size 1).
    checks.append(tf.assert_equal(tf.shape(waveform)[1], 1))

    with tf.control_dependencies([tf.group(*checks)]):
        return tf.squeeze(waveform)
def prepare_background_data(self):
    """Searches a folder for background noise audio, and loads it into memory.

    The clips are looked up as '*.wav' files inside the
    BACKGROUND_NOISE_DIR_NAME subdirectory of the first entry of
    self.data_dirs. A missing subdirectory is not an error; an existing one
    without any .wav file is.

    The decoded clips are stored on self.background_data and also returned,
    so both exit paths now hand back the same list (previously the success
    path returned None).

    Returns:
        List with one flattened sample array per background .wav file; empty
        when the background folder does not exist.

    Raises:
        Exception: If the folder exists but no .wav files are found in it.
    """
    self.background_data = []
    background_dir = os.path.join(self.data_dirs[0],
                                  BACKGROUND_NOISE_DIR_NAME)
    if not os.path.exists(background_dir):
        return self.background_data
    search_path = os.path.join(background_dir, '*.wav')
    with tf.Session(graph=tf.Graph()) as sess:
        wav_filename_placeholder = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(wav_filename_placeholder)
        # No desired_samples here: background clips keep their full length.
        wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
        for wav_path in gfile.Glob(search_path):
            wav_data = sess.run(
                wav_decoder,
                feed_dict={wav_filename_placeholder: wav_path}).audio.flatten()
            self.background_data.append(wav_data)
    if not self.background_data:
        raise Exception('No background wav files were found in ' + search_path)
    return self.background_data
def process_wav():
    """Decodes stereo WAV files and appends their FFT chunks to `x`.

    Up to `numbertotrain` files matching 'audio_wav/*.wav' are decoded with
    the module-level session `sess`. Files whose decoded length is exactly
    5294592 samples are transformed per channel with `rfft` and cut into
    consecutive chunks of `inputSize` values, which are appended to the
    module-level list `x` in the order
    [left1, right1, left2, right2, ...]. Files of any other length still
    count towards the limit but contribute no chunks.

    Changes from the previous version: the decode ops are built once and fed
    through a placeholder instead of being added to the graph again for
    every file, and the directory scan stops as soon as the file limit is
    reached.
    """
    # Build the read/decode ops a single time; creating them inside the loop
    # would add new nodes to the graph on every iteration.
    wav_path_input = tf.placeholder(tf.string, [])
    wav_decoder = audio_ops.decode_wav(
        tf.read_file(wav_path_input), desired_channels=2)

    file_idx = 0
    for wav_path in iglob('audio_wav' + '/*.wav'):
        # Cap the number of files so preprocessing doesn't take too long.
        if file_idx >= numbertotrain:
            break
        audio = sess.run(
            wav_decoder.audio, feed_dict={wav_path_input: wav_path})
        # Only use sounds of the same length; this length matches most files.
        if len(audio) == 5294592:
            # Map each channel from the time domain to the frequency domain.
            left_audio = rfft(audio[:, 0])
            right_audio = rfft(audio[:, 1])
            # Non-overlapping windows of inputSize values.
            # NOTE(review): as before, a chunk ending exactly at the end of
            # the array is not emitted (strict upper bound) -- confirm this
            # is intended.
            for upper in range(inputSize, len(left_audio), inputSize):
                lower = upper - inputSize
                x.append(left_audio[lower:upper])
                x.append(right_audio[lower:upper])
        print("preprocessed file: " + str(wav_path) + ", Number: "
              + str(file_idx))
        file_idx += 1
def get_unprocessed_data(self, how_many, model_settings, mode):
    """Returns raw clips of one partition plus the word for each clip.

    Args:
        how_many: -1 to return the whole partition in order, otherwise the
            number of clips to draw at random (repeats possible).
        model_settings: Mapping; only 'desired_samples' is read here.
        mode: Key into self.data_index selecting the partition.

    Returns:
        Tuple (data, labels): an array with one row of 'desired_samples'
        values per clip, and a list of entries of self.words_list.
    """
    partition = self.data_index[mode]
    whole_partition = how_many == -1
    if whole_partition:
        sample_count = len(partition)
    else:
        sample_count = how_many
    num_samples = model_settings['desired_samples']
    known_words = self.words_list
    data = np.zeros((sample_count, num_samples))
    labels = []
    with tf.Session(graph=tf.Graph()) as sess:
        wav_name = tf.placeholder(tf.string, [])
        wav_bytes = io_ops.read_file(wav_name)
        wav = contrib_audio.decode_wav(
            wav_bytes, desired_channels=1, desired_samples=num_samples)
        gain = tf.placeholder(tf.float32, [])
        scaled = tf.multiply(wav.audio, gain)
        for idx in range(sample_count):
            chosen = idx if whole_partition else np.random.randint(
                len(partition))
            item = partition[chosen]
            is_silence = item['label'] == SILENCE_LABEL
            # Silence is produced by muting whatever file the entry points at.
            feed_values = {wav_name: item['file'],
                           gain: 0 if is_silence else 1}
            clip = sess.run(scaled, feed_dict=feed_values)
            data[idx, :] = clip.flatten()
            word_position = self.word_to_index[item['label']]
            labels.append(known_words[word_position])
    return data, labels
def prepare_background_data(self):
    """Searches a folder for background noise audio, and loads it into memory.

    The clips are looked up as '*.wav' files inside the
    BACKGROUND_NOISE_DIR_NAME subdirectory of self.data_dir. A missing
    subdirectory is not an error: it simply means no background noise is
    available. An existing subdirectory without any .wav file is an error.
    Each file path is printed before it is decoded.

    The decoded clips are stored on self.background_data and also returned,
    so both exit paths now hand back the same list (previously the success
    path returned None despite the documented return value).

    Returns:
        List with one flattened sample array per background .wav file; empty
        when the background folder does not exist.

    Raises:
        Exception: If the folder exists but no .wav files are found in it.
    """
    self.background_data = []
    background_dir = os.path.join(self.data_dir, BACKGROUND_NOISE_DIR_NAME)
    if not os.path.exists(background_dir):
        return self.background_data
    search_path = os.path.join(background_dir, '*.wav')
    with tf.Session(graph=tf.Graph()) as sess:
        wav_filename_placeholder = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(wav_filename_placeholder)
        # No desired_samples here: background clips keep their full length.
        wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
        for wav_path in gfile.Glob(search_path):
            print(wav_path)
            wav_data = sess.run(
                wav_decoder,
                feed_dict={wav_filename_placeholder: wav_path}).audio.flatten()
            self.background_data.append(wav_data)
    if not self.background_data:
        raise Exception('No background wav files were found in ' + search_path)
    return self.background_data
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
    """Builds an inference graph and returns its input and prediction nodes.

    Args:
        wanted_words: Comma-separated list of the words to recognize.
        sample_rate: Samples per second of the input audio.
        clip_duration_ms: Forwarded to models.prepare_model_settings.
        clip_stride_ms: Passed to the model as runtime setting
            'clip_stride_ms'.
        window_size_ms: Forwarded to models.prepare_model_settings.
        window_stride_ms: Forwarded to models.prepare_model_settings.
        dct_coefficient_count: Number of MFCC coefficients per time slice.
        model_architecture: Name of the kind of model to generate.

    Returns:
        Tuple (wav_data_placeholder, prediction_node): the string placeholder
        named 'wav_data' that takes encoded WAV contents, and the argmax of
        the model logits over the last axis.
    """
    vocabulary = input_data.prepare_words_list(wanted_words.split(','))
    settings = models.prepare_model_settings(
        len(vocabulary), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)
    runtime = {'clip_stride_ms': clip_stride_ms}

    wav_input = tf.placeholder(tf.string, [], name='wav_data')
    decoded = contrib_audio.decode_wav(
        wav_input,
        desired_channels=1,
        desired_samples=settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded.audio,
        window_size=settings['window_size_samples'],
        stride=settings['window_stride_samples'],
        magnitude_squared=True)
    features = contrib_audio.mfcc(
        spectrogram,
        decoded.sample_rate,
        dct_coefficient_count=dct_coefficient_count)

    # One flat row of time_steps * frequency_bins values per clip.
    frequency_bins = settings['dct_coefficient_count']
    time_steps = settings['spectrogram_length']
    flat_features = tf.reshape(features, [-1, time_steps * frequency_bins])

    logits = models.create_model(
        flat_features,
        settings,
        model_architecture,
        is_training=False,
        runtime_settings=runtime)
    predicted_class = tf.argmax(logits, axis=-1)
    return wav_input, predicted_class
def __init__(self, FLAGS):
    """Builds the WAV-to-MFCC preprocessing ops from command-line flags.

    Sets self.wav_data_placeholder (string placeholder named 'wav' holding a
    file path) and self.reshaped_input (the MFCC reshaped to
    [spectrogram_length, dct_coefficient_count, 1]).

    Args:
        FLAGS: Object exposing num_classes, sample_rate, clip_duration_ms,
            window_size_ms, window_stride_ms, dct_coefficient_count and
            clip_stride_ms.
    """
    settings = prepare_settings(
        FLAGS.num_classes,
        FLAGS.sample_rate,
        FLAGS.clip_duration_ms,
        FLAGS.window_size_ms,
        FLAGS.window_stride_ms,
        FLAGS.dct_coefficient_count,
    )
    # Not consumed in this part of the constructor; kept as in the original.
    runtime_settings = {'clip_stride_ms': FLAGS.clip_stride_ms}

    # Preprocessing: file path -> bytes -> fixed-length mono audio.
    self.wav_data_placeholder = tf.placeholder(tf.string, [], name='wav')
    file_bytes = io_ops.read_file(self.wav_data_placeholder)
    decoded = contrib_audio.decode_wav(
        file_bytes,
        desired_channels=1,
        desired_samples=settings['desired_samples'],
        name='decoded_sample_data',
    )
    spectrogram = contrib_audio.audio_spectrogram(
        decoded.audio,
        window_size=settings['window_size_samples'],
        stride=settings['window_stride_samples'],
        magnitude_squared=True,
    )
    mfcc = contrib_audio.mfcc(
        spectrogram,
        decoded.sample_rate,
        dct_coefficient_count=FLAGS.dct_coefficient_count,
    )

    # Trailing dimension of size 1 acts as the channel axis.
    frequency_bins = settings['dct_coefficient_count']
    time_steps = settings['spectrogram_length']
    target_shape = [time_steps, frequency_bins, 1]
    self.reshaped_input = tf.reshape(mfcc, target_shape)
def build_preproc_graph_for_cnn():
    """Builds the file -> MFCC preprocessing ops for the CNN model.

    The settings come from init_cnn_model_settings(). The graph input is a
    string placeholder named 'wav_data' that holds a file path.

    Returns:
        Tuple (mfcc_tensor_flatten, mfcc_tensor): the MFCC reshaped to
        [-1, spectrogram_length * dct_coefficient_count], and the MFCC
        tensor as produced by contrib_audio.mfcc.
    """
    settings = init_cnn_model_settings()

    wav_path = tf.placeholder(tf.string, [], name='wav_data')
    file_bytes = tf.read_file(wav_path)
    decoded = contrib_audio.decode_wav(
        file_bytes,
        desired_channels=1,
        desired_samples=settings['desired_samples'],
        name='decoded_sample_datal')

    spectrogram = contrib_audio.audio_spectrogram(
        decoded.audio,
        window_size=settings['window_size_samples'],
        stride=settings['window_stride_samples'],
        magnitude_squared=True)
    mfcc = contrib_audio.mfcc(
        spectrogram,
        decoded.sample_rate,
        dct_coefficient_count=settings['dct_coefficient_count'])

    frequency_bins = settings['dct_coefficient_count']
    time_steps = settings['spectrogram_length']
    flat_mfcc = tf.reshape(mfcc, [-1, time_steps * frequency_bins])
    return flat_mfcc, mfcc
def prepare_processing_graph(file, window_size_samples, window_stride_samples,
                             dct_coefficient_count, desired_samples=16000):
    """Builds the ops that turn a WAV file into an MFCC fingerprint.

    The file is read and decoded to single-channel audio of
    `desired_samples` samples, clipped to [-1.0, 1.0], converted to a
    squared-magnitude spectrogram and finally to MFCCs. No volume scaling,
    time shift or background mixing is applied.

    Args:
        file: Path of the WAV file, passed to io_ops.read_file (the name
            shadows a builtin but is kept for keyword-argument callers).
        window_size_samples: Spectrogram window size, in samples.
        window_stride_samples: Spectrogram stride, in samples.
        dct_coefficient_count: Number of MFCC coefficients to keep.
        desired_samples: Number of samples to decode the clip to. Defaults
            to 16000, the value that used to be hard-coded.

    Returns:
        The tensor produced by contrib_audio.mfcc.
    """
    wav_loader = io_ops.read_file(file)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    wave_input = tf.clip_by_value(wav_decoder.audio, -1.0, 1.0)

    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        wave_input,
        window_size=window_size_samples,
        stride=window_stride_samples,
        magnitude_squared=True)
    return contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=dct_coefficient_count)
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture,
                           input_type, model_size_info):
    """Creates an audio model with the nodes needed for inference.

    Builds decode -> spectrogram -> features -> model ops. The graph input is
    a string placeholder named 'wav_data' holding encoded WAV contents; the
    output is a softmax node named 'labels_softmax'. Nothing is returned.

    Args:
        wanted_words: Comma-separated list of the words to recognize.
        sample_rate: How many samples per second are in the input audio.
        clip_duration_ms: Forwarded to models.prepare_model_settings.
        clip_stride_ms: Passed to the model as runtime setting
            'clip_stride_ms'.
        window_size_ms: Forwarded to models.prepare_model_settings.
        window_stride_ms: Forwarded to models.prepare_model_settings.
        dct_coefficient_count: Number of feature coefficients per time slice
            (also used as the number of mel bins for 'log-mel').
        model_architecture: Name of the kind of model to generate.
        input_type: Feature type, either 'log-mel' or 'MFCC'.
        model_size_info: Passed through to models.create_model.

    Raises:
        ValueError: If input_type is neither 'log-mel' nor 'MFCC'.
            Previously this surfaced later as an unbound-variable error
            after part of the graph had already been built.
    """
    # Validate before creating any ops so a bad value leaves no partial graph.
    if input_type not in ('log-mel', 'MFCC'):
        raise ValueError(
            "Unknown input_type %r: expected 'log-mel' or 'MFCC'"
            % (input_type,))

    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count, 100)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)

    if input_type == 'log-mel':
        print("log-mel energies")
        # Warp the linear-scale spectrogram onto the mel scale.
        num_spectrogram_bins = spectrogram.shape[-1].value
        lower_edge_hertz, upper_edge_hertz = 20.0, 4000.0
        num_mel_bins = model_settings['dct_coefficient_count']
        linear_to_mel_weight_matrix = (
            tf.contrib.signal.linear_to_mel_weight_matrix(
                num_mel_bins, num_spectrogram_bins,
                model_settings['sample_rate'], lower_edge_hertz,
                upper_edge_hertz))
        mel_spectrograms = tf.tensordot(spectrogram,
                                        linear_to_mel_weight_matrix, 1)
        # Shape inference for tf.tensordot does not handle this case, so the
        # static shape is restored by hand.
        mel_spectrograms.set_shape(spectrogram.shape[:-1].concatenate(
            linear_to_mel_weight_matrix.shape[-1:]))
        # Small offset keeps the logarithm finite for empty bins.
        log_offset = 1e-6
        fingerprint_input = tf.log(mel_spectrograms + log_offset)
    else:
        print('MFCC-features')
        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            decoded_sample_data.sample_rate,
            dct_coefficient_count=model_settings['dct_coefficient_count'])

    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size * fingerprint_frequency_size])

    # NOTE(review): the model is created with is_training=True and its
    # second return value is discarded -- confirm this is intended for an
    # inference graph. Left unchanged because the return arity of
    # create_model may depend on the flag.
    logits, _ = models.create_model(
        reshaped_input,
        model_settings,
        model_architecture,
        model_size_info,
        is_training=True,
        runtime_settings=runtime_settings)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
def create_inference_graph_batched(wanted_words, sample_rate,
                                   clip_duration_ms, window_size_ms,
                                   window_stride_ms, dct_coefficient_count,
                                   model_architecture, model_size_info=None):
    """Creates an inference graph whose model takes batched fingerprints.

    Two independent pieces are added to the graph: a feature extractor from
    the 'wav_data' string placeholder to an MFCC node named 'mfcc', and the
    model itself, fed from a float placeholder named 'fingerprint_input' of
    shape [None, fingerprint_size] and ending in a softmax node named
    'labels_softmax'. Nothing is returned; nodes are looked up by name.

    Args:
        wanted_words: Comma-separated list of the words to recognize.
        sample_rate: How many samples per second are in the input audio.
        clip_duration_ms: Forwarded to models.prepare_model_settings.
        window_size_ms: Forwarded to models.prepare_model_settings.
        window_stride_ms: Forwarded to models.prepare_model_settings.
        dct_coefficient_count: Number of MFCC coefficients per time slice.
        model_architecture: Name of the kind of model to generate; 'dnc'
            additionally sets model_settings['batch_size'] to 1000.
        model_size_info: Passed through to models.create_model.
    """
    vocabulary = input_data.prepare_words_list(wanted_words.split(','))
    settings = models.prepare_model_settings(
        len(vocabulary), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)
    if model_architecture == 'dnc':
        settings['batch_size'] = 1000
    flat_feature_count = settings['fingerprint_size']

    # Feature extraction branch: encoded WAV bytes -> 'mfcc' node.
    wav_input = tf.placeholder(tf.string, [], name='wav_data')
    decoded = contrib_audio.decode_wav(
        wav_input,
        desired_channels=1,
        desired_samples=settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded.audio,
        window_size=settings['window_size_samples'],
        stride=settings['window_stride_samples'],
        magnitude_squared=True)
    # Only referenced by its node name; the Python handle is not needed.
    contrib_audio.mfcc(
        spectrogram,
        decoded.sample_rate,
        dct_coefficient_count=dct_coefficient_count,
        name='mfcc')

    # Model branch: batched, already-flattened fingerprints.
    batched_input = tf.placeholder(
        tf.float32, [None, flat_feature_count], name='fingerprint_input')
    frequency_bins = settings['dct_coefficient_count']
    time_steps = settings['spectrogram_length']
    model_input = tf.reshape(batched_input,
                             [-1, time_steps * frequency_bins])
    logits = models.create_model(
        model_input,
        settings,
        model_architecture,
        model_size_info=model_size_info,
        is_training=False)

    # Named output node used by inference clients.
    tf.nn.softmax(logits, name='labels_softmax')
def _ds_cnn_layer_specs(num_ds_blocks=6):
    """Returns the variable names of every batch-normalised DS-CNN conv layer.

    The list is ordered CONV1, then DW_CONVk / PW_CONVk for k = 1..num_ds_blocks.
    """

    def bn_layer(conv_scope, weights_name, bn_scope, label, in_frac_bits):
        return {
            'weights': conv_scope + '/' + weights_name,
            'biases': conv_scope + '/biases',
            'moving_mean': bn_scope + '/moving_mean',
            'moving_variance': bn_scope + '/moving_variance',
            'beta': bn_scope + '/beta',
            # Input/output fractional bits are kept from the original layer
            # table for reference; this script does not consume them.
            'in_frac_bits': in_frac_bits,
            'out_frac_bits': 5,
            'label': label,
        }

    specs = [
        bn_layer('DS-CNN/conv_1', 'weights', 'DS-CNN/conv_1/batch_norm',
                 'CONV1', 2)
    ]
    for k in range(1, num_ds_blocks + 1):
        block = 'DS-CNN/conv_ds_%d' % k
        specs.append(
            bn_layer(block + '/depthwise_conv', 'depthwise_weights',
                     block + '/dw_batch_norm', 'DW_CONV%d' % k, 5))
        specs.append(
            bn_layer(block + '/pointwise_conv', 'weights',
                     block + '/pw_batch_norm', 'PW_CONV%d' % k, 5))
    return specs


def _require_variable(variables_by_name, name):
    """Looks up the variable called `name`, failing loudly when it is absent."""
    try:
        return variables_by_name[name + ':0']
    except KeyError:
        raise ValueError('Variable "%s" was not found in the graph' % name)


def _fold_batch_norm(weights, bias, beta, mean, var, eps):
    """Folds per-channel batch-norm statistics into weights and biases.

    `weights` must have the output channel as its last axis.
    """
    folded_weights = np.zeros(weights.shape)
    folded_bias = np.zeros(len(bias))
    for ch in range(len(bias)):
        std = math.sqrt(var[ch] + eps)
        folded_weights[..., ch] = weights[..., ch] / std
        folded_bias[ch] = beta[ch] + (bias[ch] - mean[ch]) / std
    return folded_weights, folded_bias


def _quantize_fixed_point(values, bit_width, max_frac_bits):
    """Rounds `values` to a signed fixed-point grid of `bit_width` bits.

    One bit is reserved for the sign; the integer bits are chosen from the
    largest magnitude in `values` and the remainder (capped at
    `max_frac_bits`) become fractional bits.
    """
    largest = max(abs(values.min()), abs(values.max()))
    # log2(0) is -inf and cannot be converted to int; an all-zero tensor
    # needs no integer bits.
    int_bits = int(np.ceil(np.log2(largest))) if largest > 0 else 0
    frac_bits = min((bit_width - 1) - int_bits, max_frac_bits)
    scale = 2**frac_bits
    return np.round(values * scale) / scale


def main(_):
    """Folds batch norm into the DS-CNN weights, quantizes them and saves.

    The model described by FLAGS is rebuilt, optionally restored from
    `FLAGS.start_checkpoint`, and then for every batch-normalised convolution
    the moving statistics and beta are folded into the weights and biases.
    Folded convolution tensors and the final fully connected layer are rounded
    to fixed point using the bit widths in `FLAGS.bit_widths`, written back
    into the graph variables, and a checkpoint is saved under
    `<FLAGS.train_dir>/quant/`.

    Args:
      _: Unused positional argument.

    Raises:
      ValueError: If `FLAGS.input_type` is neither 'log-mel' nor 'MFCC', or if
        an expected model variable is missing from the graph.
    """
    print(FLAGS.model_size_info)
    reg_conv_bits = FLAGS.bit_widths[0]
    dw_conv_bits = FLAGS.bit_widths[1]
    pw_conv_bits = FLAGS.bit_widths[2]
    fc_bits = FLAGS.bit_widths[3]
    activations_bits = FLAGS.bit_widths[4]
    print("Regular Conv-weights bit width: " + str(reg_conv_bits))
    print("Depthwise Conv-weights bit width: " + str(dw_conv_bits))
    print("Pointwise Conv-weights bit width: " + str(pw_conv_bits))
    print("FC-weights bit width: " + str(fc_bits))
    print("Activations bit width: " + str(activations_bits))

    # We want to see all the logging messages for this tutorial.
    tf.logging.set_verbosity(tf.logging.INFO)

    # Start a new TensorFlow session.
    sess = tf.InteractiveSession()

    words_list = input_data.prepare_words_list(FLAGS.wanted_words.split(','))
    # NOTE(review): the meaning of the trailing positional 100 is defined by
    # models.prepare_model_settings, which is not visible here.
    model_settings = models.prepare_model_settings(
        len(words_list), FLAGS.sample_rate, FLAGS.clip_duration_ms,
        FLAGS.window_size_ms, FLAGS.window_stride_ms,
        FLAGS.dct_coefficient_count, 100)
    runtime_settings = {'clip_stride_ms': 260}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)

    if FLAGS.input_type == 'log-mel':
        print("log-mel energies")
        # Warp the linear-scale, magnitude spectrograms into the mel-scale.
        num_spectrogram_bins = spectrogram.shape[-1].value
        lower_edge_hertz, upper_edge_hertz = 20.0, 4000.0
        num_mel_bins = model_settings['dct_coefficient_count']
        linear_to_mel_weight_matrix = (
            tf.contrib.signal.linear_to_mel_weight_matrix(
                num_mel_bins, num_spectrogram_bins,
                model_settings['sample_rate'], lower_edge_hertz,
                upper_edge_hertz))
        mel_spectrograms = tf.tensordot(spectrogram,
                                        linear_to_mel_weight_matrix, 1)
        # Note: Shape inference for `tf.tensordot` does not currently handle
        # this case.
        mel_spectrograms.set_shape(spectrogram.shape[:-1].concatenate(
            linear_to_mel_weight_matrix.shape[-1:]))
        log_offset = 1e-6
        fingerprint_input = tf.log(mel_spectrograms + log_offset)
    elif FLAGS.input_type == 'MFCC':
        print('MFCC-features')
        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            decoded_sample_data.sample_rate,
            dct_coefficient_count=model_settings['dct_coefficient_count'])
    else:
        # Without this the function would fail later with a NameError on
        # `fingerprint_input`.
        raise ValueError('Unknown input type "%s" (should be "log-mel" or'
                         ' "MFCC")' % FLAGS.input_type)

    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size * fingerprint_frequency_size])

    # Not referenced below, but the original script adds this node to the
    # graph, so it is kept.
    tf.placeholder(tf.bool, name='training')

    logits, _ = models.create_model(
        reshaped_input,
        model_settings,
        FLAGS.model_architecture,
        FLAGS.model_size_info,
        is_training=True,
        runtime_settings=runtime_settings)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')

    saver = tf.train.Saver(tf.global_variables())
    tf.global_variables_initializer().run()

    if FLAGS.start_checkpoint:
        models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)

    for v in tf.trainable_variables():
        print(v.name)

    # Index the variables once so that a missing name is reported instead of
    # silently reusing the tensor matched for the previous layer.
    trainable_by_name = {v.name: v for v in tf.trainable_variables()}
    global_by_name = {v.name: v for v in tf.global_variables()}

    eps = 0.001

    for layer in _ds_cnn_layer_specs():
        label = layer['label']
        if label.startswith('DW'):
            bit_width = dw_conv_bits
        elif label.startswith('PW'):
            bit_width = pw_conv_bits
        else:
            bit_width = reg_conv_bits
        print("Name of node - " + label)

        v_weights = _require_variable(trainable_by_name, layer['weights'])
        v_bias = _require_variable(trainable_by_name, layer['biases'])
        v_beta = _require_variable(trainable_by_name, layer['beta'])
        v_mean = _require_variable(global_by_name, layer['moving_mean'])
        v_var = _require_variable(global_by_name, layer['moving_variance'])

        weights = sess.run(v_weights)
        bias = sess.run(v_bias)
        beta = sess.run(v_beta)
        mean = sess.run(v_mean)
        var = sess.run(v_var)

        # Squeezing drops the singleton axes so the output channel becomes
        # the last axis; the original shape is restored before assignment.
        folded_weights, folded_bias = _fold_batch_norm(
            weights.squeeze(), bias, beta, mean, var, eps)

        weights_quant = _quantize_fixed_point(folded_weights, bit_width, 111)
        bias_quant = _quantize_fixed_point(folded_bias, bit_width, 10000)

        # update the weights in tensorflow graph for quantizing the activations
        sess.run(tf.assign(v_weights, weights_quant.reshape(weights.shape)))
        sess.run(tf.assign(v_bias, bias_quant.reshape(bias.shape)))

    # The fully connected layer has no batch norm to fold; quantize directly.
    v_fc_weights = _require_variable(trainable_by_name, 'DS-CNN/fc1/weights')
    v_fc_bias = _require_variable(trainable_by_name, 'DS-CNN/fc1/biases')
    fc_weights = sess.run(v_fc_weights)
    fc_bias = sess.run(v_fc_bias)
    sess.run(
        tf.assign(v_fc_weights,
                  _quantize_fixed_point(fc_weights, fc_bits, 111)))
    sess.run(
        tf.assign(v_fc_bias, _quantize_fixed_point(fc_bias, fc_bits, 10000)))

    training_step = 30000
    checkpoint_path = os.path.join(FLAGS.train_dir, 'quant',
                                   FLAGS.model_architecture + '.ckpt')
    tf.logging.info('Saving best model to "%s-%d"', checkpoint_path,
                    training_step)
    saver.save(sess, checkpoint_path, global_step=training_step)
def audiofile_to_features(wav_filename):
    """Reads a WAV file and converts its audio into features.

    The file is decoded as single-channel audio and the samples, together
    with the decoded sample rate, are handed to `samples_to_mfccs`.

    Args:
      wav_filename: Path (or string tensor) naming the WAV file to read.

    Returns:
      The `(features, features_len)` pair produced by `samples_to_mfccs`.
    """
    wav_bytes = tf.io.read_file(wav_filename)
    wav = contrib_audio.decode_wav(wav_bytes, desired_channels=1)
    mfccs, mfccs_len = samples_to_mfccs(wav.audio, wav.sample_rate)
    return mfccs, mfccs_len
def prepare_processing_graph(self, model_settings, summaries_dir):
    """Creates the graph that loads, distorts and fingerprints one WAV clip.

    All ops are added to the default graph under the 'data' name scope. The
    clip is decoded, scaled, padded and sliced to shift it in time, mixed
    with background noise, clipped to [-1, 1] and converted to a spectrogram,
    which is then reduced either by average pooling or by MFCC.

    The following attributes are set on `self`:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.
      - merged_summaries_: Merged summary op for the 'data' scope.
      - summary_writer_: FileWriter writing to `summaries_dir + '/data'`.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
    """
    with tf.get_default_graph().name_scope('data'):
        clip_samples = model_settings['desired_samples']

        # Load and decode the clip named by the filename placeholder.
        self.wav_filename_placeholder_ = tf.placeholder(
            tf.string, [], name='wav_filename')
        wav_contents = io_ops.read_file(self.wav_filename_placeholder_)
        decoded_wav = contrib_audio.decode_wav(
            wav_contents, desired_channels=1, desired_samples=clip_samples)

        # Scale the foreground volume.
        self.foreground_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='foreground_volume')
        foreground = tf.multiply(decoded_wav.audio,
                                 self.foreground_volume_placeholder_)

        # Time shift: zero-pad, then slice back to the clip length.
        self.time_shift_padding_placeholder_ = tf.placeholder(
            tf.int32, [2, 2], name='time_shift_padding')
        self.time_shift_offset_placeholder_ = tf.placeholder(
            tf.int32, [2], name='time_shift_offset')
        foreground = tf.pad(
            foreground, self.time_shift_padding_placeholder_, mode='CONSTANT')
        foreground = tf.slice(foreground, self.time_shift_offset_placeholder_,
                              [clip_samples, -1])

        # Add the scaled background noise and keep samples within [-1, 1].
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [clip_samples, 1], name='background_data')
        self.background_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='background_volume')
        background = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
        mixed = tf.clip_by_value(tf.add(background, foreground), -1.0, 1.0)

        # Spectrogram of the mixed clip.
        spectrogram = contrib_audio.audio_spectrogram(
            mixed,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        tf.summary.image(
            'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)

        # Each spectrogram row can hold more frequency buckets than the
        # classifier needs, so it is shrunk: either by averaging adjacent
        # buckets or, more elaborately, with the MFCC algorithm.
        preprocess_mode = model_settings['preprocess']
        if preprocess_mode == 'average':
            pool_width = model_settings['average_window_width']
            self.output_ = tf.nn.pool(
                tf.expand_dims(spectrogram, -1),
                window_shape=[1, pool_width],
                strides=[1, pool_width],
                pooling_type='AVG',
                padding='SAME')
            tf.summary.image('shrunk_spectrogram', self.output_, max_outputs=1)
        elif preprocess_mode == 'mfcc':
            self.output_ = contrib_audio.mfcc(
                spectrogram,
                decoded_wav.sample_rate,
                dct_coefficient_count=model_settings['fingerprint_width'])
            tf.summary.image(
                'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
        else:
            raise ValueError('Unknown preprocess mode "%s" (should be "mfcc" or'
                             ' "average")' % (preprocess_mode))

        # Merge the summaries created in this scope and set up their writer.
        self.merged_summaries_ = tf.summary.merge_all(scope='data')
        self.summary_writer_ = tf.summary.FileWriter(summaries_dir + '/data',
                                                     tf.get_default_graph())
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
    """Adds a WAV-in, softmax-out inference model to the default graph.

    The graph starts at the `wav_data` string placeholder, decodes it
    (`decoded_sample_data`), exposes the decoded samples through the `dao`
    identity node, computes a spectrogram, reduces it according to
    `preprocess`, runs the model and ends at the `labels_softmax` node.

    Args:
      wanted_words: Comma-separated list of the words we're trying to
        recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: Length of the analysed audio clip, in milliseconds.
      clip_stride_ms: How often to run recognition. Useful for models with
        cache.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      feature_bin_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      preprocess: How the spectrogram is processed to produce features, for
        example 'mfcc' or 'average'.

    Raises:
      Exception: If the preprocessing mode isn't recognized.
    """
    label_names = input_data.prepare_words_list(wanted_words.split(','))
    settings = models.prepare_model_settings(
        len(label_names), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, feature_bin_count, preprocess)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_input = tf.placeholder(tf.string, [], name='wav_data')
    decoded_wav = contrib_audio.decode_wav(
        wav_input,
        desired_channels=1,
        desired_samples=settings['desired_samples'],
        name='decoded_sample_data')
    # Named pass-through of the decoded samples; reachable by node name only.
    tf.identity(decoded_wav.audio, name='dao')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_wav.audio,
        window_size=settings['window_size_samples'],
        stride=settings['window_stride_samples'],
        magnitude_squared=True)

    if preprocess == 'mfcc':
        features = contrib_audio.mfcc(
            spectrogram,
            sample_rate,
            dct_coefficient_count=settings['fingerprint_width'])
    elif preprocess == 'average':
        pool_width = settings['average_window_width']
        features = tf.nn.pool(
            tf.expand_dims(spectrogram, -1),
            window_shape=[1, pool_width],
            strides=[1, pool_width],
            pooling_type='AVG',
            padding='SAME')
    else:
        raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or'
                        ' "average")' % (preprocess))

    model_input = tf.reshape(features, [-1, settings['fingerprint_size']])

    logits = models.create_model(
        model_input,
        settings,
        model_architecture,
        is_training=False,
        runtime_settings=runtime_settings)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
    """Adds a raw-audio inference model to the default graph.

    The graph starts at the `wav_data` string placeholder, decodes it
    (`decoded_sample_data`), flattens the decoded samples to
    [-1, desired_samples] and feeds them straight to the model; predictions
    are exposed by the `labels_softmax` node. No spectrogram or MFCC ops are
    created: the MFCC front end that used to sit behind an `if False:` guard
    could never run and has been removed.

    Besides its arguments, the function reads `FLAGS.norm_binw`,
    `FLAGS.downsample`, `FLAGS.prefilter_bias` and `FLAGS.use_down_avgfilt`
    and forwards them to `models.create_model`.

    Args:
      wanted_words: Comma-separated list of the words we're trying to
        recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: Length of the analysed audio clip, in milliseconds.
      clip_stride_ms: How often to run recognition. Useful for models with
        cache.
      window_size_ms: Time slice duration; only passed on to
        `models.prepare_model_settings`.
      window_stride_ms: Time slice spacing; only passed on to
        `models.prepare_model_settings`.
      dct_coefficient_count: Number of frequency bands; only passed on to
        `models.prepare_model_settings`.
      model_architecture: Name of the kind of model to generate.
    """
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')

    # The model consumes the decoded samples directly, one clip per row.
    audio_size = model_settings['desired_samples']
    reshaped_input = tf.reshape(decoded_sample_data.audio, [-1, audio_size])

    logits = models.create_model(
        reshaped_input,
        model_settings,
        model_architecture,
        is_training=False,
        norm_binw=FLAGS.norm_binw,
        downsample=FLAGS.downsample,
        add_prefilter_bias=FLAGS.prefilter_bias,
        use_down_avgfilt=FLAGS.use_down_avgfilt,
        runtime_settings=runtime_settings)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
def prepare_processing_graph(self, model_settings):
    """Creates the graph that loads, distorts and fingerprints one WAV clip.

    The clip is decoded, scaled, padded and sliced to shift it in time, mixed
    with background noise, clipped to [-1, 1], converted to a spectrogram and
    finally to MFCCs.

    The following attributes are set on `self`:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
    clip_samples = model_settings['desired_samples']

    # Load and decode the clip named by the filename placeholder.
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_contents = io_ops.read_file(self.wav_filename_placeholder_)
    decoded_wav = contrib_audio.decode_wav(
        wav_contents, desired_channels=1, desired_samples=clip_samples)

    # Scale the foreground volume.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    foreground = tf.multiply(decoded_wav.audio,
                             self.foreground_volume_placeholder_)

    # Time shift: zero-pad, then slice back to the clip length.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    foreground = tf.pad(
        foreground, self.time_shift_padding_placeholder_, mode='CONSTANT')
    foreground = tf.slice(foreground, self.time_shift_offset_placeholder_,
                          [clip_samples, -1])

    # Add the scaled background noise and keep samples within [-1, 1].
    self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                       [clip_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background = tf.multiply(self.background_data_placeholder_,
                             self.background_volume_placeholder_)
    mixed = tf.clip_by_value(tf.add(background, foreground), -1.0, 1.0)

    # Spectrogram followed by MFCC gives the 2D fingerprint.
    # NOTE: an experimental STFT + mel-filterbank + log front end (no DCT) was
    # previously kept here as commented-out code; it was never executed.
    spectrogram = contrib_audio.audio_spectrogram(
        mixed,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        decoded_wav.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
import numpy as np if len(sys.argv) < 3: raise ValueError("give me a path to model and to a file float32 .dat") else: model_path = sys.argv[1] file_path = sys.argv[2] FRAME_SIZE = 640 FRAME_STRIDE = 320 SAMPLE_RATE = DESIRED_SAMPLES = 16000 NUM_CEP = 10 wav_loader = io_ops.read_file(file_path) wav_decoder = audio_ops.decode_wav(wav_loader, desired_channels=1, desired_samples=DESIRED_SAMPLES) # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio. spectrograms_power = audio_ops.audio_spectrogram(wav_decoder.audio, window_size=FRAME_SIZE, stride=FRAME_STRIDE, magnitude_squared=True) USE_POWER = True if USE_POWER: # Warp the linear scale spectrograms into the mel-scale. num_spectrogram_bins = spectrograms_power.shape[-1].value lower_edge_hertz, upper_edge_hertz, num_mel_bins = 20.0, 4000.0, 40 linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix( num_mel_bins, num_spectrogram_bins, 16000.0, lower_edge_hertz, upper_edge_hertz) mel_spectrograms = tf.tensordot(spectrograms_power,
cond_fp = tf.placeholder(tf.string, []) cond_dataset = tf.data.TextLineDataset( [cond_fp]) # Multiple conditional texts per audio file cond_texts_iter = cond_dataset.make_initializable_iterator() cond_text = cond_texts_iter.get_next() # Conditional text embedding embed = hub.Module("https://tfhub.dev/google/elmo/2", trainable=False, name='embed') cond_texts_batch = tf.placeholder(tf.string, [None]) cond_text_embeds = embed(cond_texts_batch) audio_fp = tf.placeholder(tf.string, []) audio_bin = tf.read_file(audio_fp) samps = contrib_audio.decode_wav(audio_bin, 1).audio[:, 0] if slice_len_samps is not None: if args.first_only: pad_end = True else: pad_end = False slices = tf.contrib.signal.frame(samps, slice_len_samps, slice_len_samps, axis=0, pad_end=pad_end) if args.nrg_top_k: nsecs = tf.cast(tf.shape(samps)[0], tf.float32) / args.fs
def prepare_processing_graph(self, model_settings, summaries_dir):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram,
    and then builds a feature fingerprint from that. All ops are added to the
    default graph under the 'data' name scope.

    It creates multiple placeholder inputs and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
    """
    with tf.get_default_graph().name_scope('data'):
        clip_samples = model_settings['desired_samples']

        # Load and decode the clip named by the filename placeholder.
        self.wav_filename_placeholder_ = tf.placeholder(
            tf.string, [], name='wav_filename')
        wav_contents = io_ops.read_file(self.wav_filename_placeholder_)
        decoded_wav = contrib_audio.decode_wav(
            wav_contents, desired_channels=1, desired_samples=clip_samples)

        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='foreground_volume')
        foreground = tf.multiply(decoded_wav.audio,
                                 self.foreground_volume_placeholder_)

        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(
            tf.int32, [2, 2], name='time_shift_padding')
        self.time_shift_offset_placeholder_ = tf.placeholder(
            tf.int32, [2], name='time_shift_offset')
        foreground = tf.pad(
            foreground, self.time_shift_padding_placeholder_, mode='CONSTANT')
        foreground = tf.slice(foreground, self.time_shift_offset_placeholder_,
                              [clip_samples, -1])

        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [clip_samples, 1], name='background_data')
        self.background_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='background_volume')
        background = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
        mixed = tf.clip_by_value(tf.add(background, foreground), -1.0, 1.0)

        # Run the spectrogram op to start building the 2D features.
        spectrogram = contrib_audio.audio_spectrogram(
            mixed,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        tf.summary.image(
            'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)

        # The number of buckets in each FFT row of the spectrogram depends on
        # how many input samples there are in each window. That level of
        # detail isn't needed for classification, so the rows are shrunk. One
        # method is to average adjacent buckets; a more sophisticated one is
        # to apply the MFCC algorithm.
        preprocess_mode = model_settings['preprocess']
        if preprocess_mode == 'average':
            pool_width = model_settings['average_window_width']
            self.output_ = tf.nn.pool(
                tf.expand_dims(spectrogram, -1),
                window_shape=[1, pool_width],
                strides=[1, pool_width],
                pooling_type='AVG',
                padding='SAME')
            tf.summary.image('shrunk_spectrogram', self.output_, max_outputs=1)
        elif preprocess_mode == 'mfcc':
            self.output_ = contrib_audio.mfcc(
                spectrogram,
                decoded_wav.sample_rate,
                dct_coefficient_count=model_settings['fingerprint_width'])
            tf.summary.image(
                'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
        else:
            raise ValueError(
                'Unknown preprocess mode "%s" (should be "mfcc" or'
                ' "average")' % (preprocess_mode))

        # Merge all the summaries of this scope and set up their writer.
        self.merged_summaries_ = tf.summary.merge_all(scope='data')
        self.summary_writer_ = tf.summary.FileWriter(
            summaries_dir + '/data', tf.get_default_graph())
def prepare_processing_graph(self, model_settings):
  """Builds a TensorFlow graph to apply the input distortions.

  Creates a graph that loads a WAVE file, decodes it, scales the volume,
  shifts it in time, adds in background noise, calculates a spectrogram, and
  then builds an MFCC fingerprint from that.

  This must be called with an active TensorFlow session running, and it
  creates multiple placeholder inputs, and one output:

    - wav_filename_placeholder_: Filename of the WAV to load.
    - foreground_volume_placeholder_: How loud the main clip should be.
    - time_shift_padding_placeholder_: Where to pad the clip.
    - time_shift_offset_placeholder_: How much to move the clip in time.
    - background_data_placeholder_: PCM sample data for background noise.
    - background_volume_placeholder_: Loudness of mixed-in background.
    - mfcc_: Output 2D fingerprint of processed audio.

  The module-level flag USE_POWER selects between MFCCs computed through
  tf.signal from a log-mel spectrogram and the contrib_audio.mfcc op.

  Args:
    model_settings: Information about the current model being trained.
  """
  desired_samples = model_settings['desired_samples']
  self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
  wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
  wav_decoder = contrib_audio.decode_wav(
      wav_loader, desired_channels=1, desired_samples=desired_samples)
  # Allow the audio sample's volume to be adjusted.
  self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
  scaled_foreground = tf.multiply(wav_decoder.audio,
                                  self.foreground_volume_placeholder_)
  # Shift the sample's start position, and pad any gaps with zeros.
  self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
  self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
  padded_foreground = tf.pad(
      scaled_foreground,
      self.time_shift_padding_placeholder_,
      mode='CONSTANT')
  sliced_foreground = tf.slice(padded_foreground,
                               self.time_shift_offset_placeholder_,
                               [desired_samples, -1])
  # Mix in background noise.
  self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                     [desired_samples, 1])
  self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
  background_mul = tf.multiply(self.background_data_placeholder_,
                               self.background_volume_placeholder_)
  background_add = tf.add(background_mul, sliced_foreground)
  background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
  # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
  spectrograms_power = contrib_audio.audio_spectrogram(
      background_clamp,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)
  if USE_POWER:
    # Warp the linear scale spectrograms into the mel-scale.
    num_spectrogram_bins = spectrograms_power.shape[-1].value
    lower_edge_hertz, upper_edge_hertz, num_mel_bins = 20.0, 4000.0, 40
    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins, num_spectrogram_bins, 16000.0, lower_edge_hertz,
        upper_edge_hertz)
    mel_spectrograms = tf.tensordot(
        spectrograms_power, linear_to_mel_weight_matrix, 1)
    mel_spectrograms.set_shape(spectrograms_power.shape[:-1].concatenate(
        linear_to_mel_weight_matrix.shape[-1:]))
    # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
    log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)
    # Compute MFCCs from log_mel_spectrograms and take the first NDCT.
    mfccs = tf.signal.mfccs_from_log_mel_spectrograms(
        log_mel_spectrograms)[..., :model_settings['dct_coefficient_count']]
    self.mfcc_ = tf.expand_dims(mfccs, axis=0)
  else:
    # The spectrogram tensor in this function is `spectrograms_power`; the
    # previous reference to `spectrogram` was an undefined name.
    self.mfcc_ = contrib_audio.mfcc(
        spectrograms_power,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])


def set_size(self, mode):
  """Calculates the number of samples in the dataset partition.

  Args:
    mode: Which partition, must be 'training', 'validation', or 'testing'.

  Returns:
    Number of samples in the partition.
  """
  return len(self.data_index[mode])


def get_data(self, how_many, offset, model_settings, background_frequency,
             background_volume_range, time_shift, mode, sess):
  """Gather samples from the data set, applying transformations as needed.

  When the mode is 'training', a random selection of samples will be
  returned, otherwise the first N clips in the partition will be used. This
  ensures that validation always uses the same samples, reducing noise in the
  metrics.

  Args:
    how_many: Desired number of samples to return. -1 means the entire
      contents of this partition.
    offset: Where to start when fetching deterministically.
    model_settings: Information about the current model being trained.
    background_frequency: How many clips will have background noise, 0.0 to
      1.0.
    background_volume_range: How loud the background noise will be.
    time_shift: How much to randomly shift the clips by in time.
    mode: Which partition to use, must be 'training', 'validation', or
      'testing'.
    sess: TensorFlow session that was active when processor was created.

  Returns:
    List of sample data for the transformed samples, and list of labels in
    one-hot form.

  Raises:
    ValueError: If a chosen background clip is not longer than
      model_settings['desired_samples'].
  """
  # Pick one of the partitions to choose samples from.
  candidates = self.data_index[mode]
  if how_many == -1:
    sample_count = len(candidates)
  else:
    sample_count = max(0, min(how_many, len(candidates) - offset))
  # Data and labels will be populated and returned.
  data = np.zeros((sample_count, model_settings['fingerprint_size']))
  labels = np.zeros((sample_count, model_settings['label_count']))
  desired_samples = model_settings['desired_samples']
  use_background = self.background_data and (mode == 'training')
  pick_deterministically = (mode != 'training')
  # Use the processing graph created earlier to repeatedly generate the
  # final output sample data used in training.
  for i in range(offset, offset + sample_count):
    # Pick which audio sample to use.
    if how_many == -1 or pick_deterministically:
      sample_index = i
    else:
      sample_index = np.random.randint(len(candidates))
    sample = candidates[sample_index]
    # If we're time shifting, set up the offset for this sample.
    if time_shift > 0:
      time_shift_amount = np.random.randint(-time_shift, time_shift)
    else:
      time_shift_amount = 0
    if time_shift_amount > 0:
      time_shift_padding = [[time_shift_amount, 0], [0, 0]]
      time_shift_offset = [0, 0]
    else:
      time_shift_padding = [[0, -time_shift_amount], [0, 0]]
      time_shift_offset = [-time_shift_amount, 0]
    input_dict = {
        self.wav_filename_placeholder_: sample['file'],
        self.time_shift_padding_placeholder_: time_shift_padding,
        self.time_shift_offset_placeholder_: time_shift_offset,
    }
    # Choose a section of background noise to mix in.
    if use_background:
      background_index = np.random.randint(len(self.background_data))
      background_samples = self.background_data[background_index]
      # np.random.randint needs a strictly positive range, so a clip that is
      # not longer than the window is reported with an explicit message.
      if len(background_samples) <= desired_samples:
        raise ValueError(
            'Background sample is too short! Need more than %d'
            ' samples but only %d were found' %
            (desired_samples, len(background_samples)))
      background_offset = np.random.randint(
          0, len(background_samples) - desired_samples)
      background_clipped = background_samples[background_offset:(
          background_offset + desired_samples)]
      background_reshaped = background_clipped.reshape([desired_samples, 1])
      if np.random.uniform(0, 1) < background_frequency:
        background_volume = np.random.uniform(0, background_volume_range)
      else:
        background_volume = 0
    else:
      background_reshaped = np.zeros([desired_samples, 1])
      background_volume = 0
    input_dict[self.background_data_placeholder_] = background_reshaped
    input_dict[self.background_volume_placeholder_] = background_volume
    # If we want silence, mute out the main sample but leave the background.
    if sample['label'] == SILENCE_LABEL:
      input_dict[self.foreground_volume_placeholder_] = 0
    else:
      input_dict[self.foreground_volume_placeholder_] = 1
    # Run the graph to produce the output audio.
    data[i - offset, :] = sess.run(
        self.mfcc_, feed_dict=input_dict).flatten()
    label_index = self.word_to_index[sample['label']]
    labels[i - offset, label_index] = 1
  return data, labels


def get_wav_files(self, how_many, offset, model_settings, mode):
  """Returns wav file names and one-hot labels from a partition.

  Samples are picked at random in 'training' mode (unless how_many is -1)
  and in index order starting at `offset` otherwise. Samples labelled
  SILENCE_LABEL are reported with the file name 'silence.wav'.

  Args:
    how_many: Desired number of samples to return. -1 means the entire
      contents of this partition.
    offset: Where to start when fetching deterministically.
    model_settings: Information about the current model; 'label_count' is
      read from it.
    mode: Which partition to use, must be 'training', 'validation', or
      'testing'.

  Returns:
    List of file names, and array of labels in one-hot form.
  """
  # Pick one of the partitions to choose samples from.
  candidates = self.data_index[mode]
  if how_many == -1:
    sample_count = len(candidates)
  else:
    sample_count = max(0, min(how_many, len(candidates) - offset))
  pick_deterministically = (mode != 'training')
  wav_files = []
  labels = np.zeros((sample_count, model_settings['label_count']))
  for i in range(offset, offset + sample_count):
    # Pick which audio sample to use.
    if how_many == -1 or pick_deterministically:
      sample_index = i
    else:
      sample_index = np.random.randint(len(candidates))
    sample = candidates[sample_index]
    if sample['label'] == SILENCE_LABEL:
      wav_files.append('silence.wav')
    else:
      wav_files.append(sample['file'])
    label_index = self.word_to_index[sample['label']]
    labels[i - offset, label_index] = 1
  return wav_files, labels


def get_unprocessed_data(self, how_many, model_settings, mode):
  """Retrieve sample data for the given partition, with no transformations.

  Args:
    how_many: Desired number of samples to return. -1 means the entire
      contents of this partition.
    model_settings: Information about the current model being trained.
    mode: Which partition to use, must be 'training', 'validation', or
      'testing'.

  Returns:
    Array of raw sample data with one row per sample, and a list with the
    label word (an entry of self.words_list) for each sample.
  """
  candidates = self.data_index[mode]
  if how_many == -1:
    sample_count = len(candidates)
  else:
    sample_count = how_many
  desired_samples = model_settings['desired_samples']
  words_list = self.words_list
  data = np.zeros((sample_count, desired_samples))
  labels = []
  # A private graph and session are used so the decoding nodes do not end up
  # in the default graph.
  with tf.Session(graph=tf.Graph()) as sess:
    wav_filename_placeholder = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    foreground_volume_placeholder = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    foreground_volume_placeholder)
    for i in range(sample_count):
      if how_many == -1:
        sample_index = i
      else:
        sample_index = np.random.randint(len(candidates))
      sample = candidates[sample_index]
      input_dict = {wav_filename_placeholder: sample['file']}
      # Silence samples are muted by scaling the decoded audio with zero.
      if sample['label'] == SILENCE_LABEL:
        input_dict[foreground_volume_placeholder] = 0
      else:
        input_dict[foreground_volume_placeholder] = 1
      data[i, :] = sess.run(scaled_foreground, feed_dict=input_dict).flatten()
      label_index = self.word_to_index[sample['label']]
      labels.append(words_list[label_index])
  return data, labels
def prepare_processing_graph(self, model_settings, summaries_dir):
  """Constructs the graph nodes that load and distort a single audio clip.

  The nodes read a WAVE file, decode it, scale its volume, shift it in time,
  mix in background noise, compute a spectrogram and finally reduce that
  spectrogram to features by average pooling or by MFCC.

  All nodes are created in the default graph under the 'data' name scope.
  The following attributes are set on the instance:

    - wav_filename_placeholder_: Filename of the WAV to load.
    - foreground_volume_placeholder_: How loud the main clip should be.
    - time_shift_padding_placeholder_: Where to pad the clip.
    - time_shift_offset_placeholder_: How much to move the clip in time.
    - background_data_placeholder_: PCM sample data for background noise.
    - background_volume_placeholder_: Loudness of mixed-in background.
    - output_: Output 2D fingerprint of processed audio.
    - merged_summaries_: Merged summaries of the 'data' scope.
    - summary_writer_: FileWriter, only set when summaries_dir is truthy.

  Args:
    model_settings: Information about the current model being trained.
    summaries_dir: Path to save training summary information to.

  Raises:
    ValueError: If the preprocessing mode isn't recognized.
  """
  with tf.get_default_graph().name_scope('data'):
    clip_samples = model_settings['desired_samples']

    # Load and decode the clip named by the filename placeholder.
    self.wav_filename_placeholder_ = tf.placeholder(
        tf.string, [], name='wav_filename')
    decoded = contrib_audio.decode_wav(
        io_ops.read_file(self.wav_filename_placeholder_),
        desired_channels=1,
        desired_samples=clip_samples)

    # Foreground gain.
    self.foreground_volume_placeholder_ = tf.placeholder(
        tf.float32, [], name='foreground_volume')
    foreground = tf.multiply(
        decoded.audio, self.foreground_volume_placeholder_)

    # Time shift: zero-pad first, then slice back to the clip length.
    self.time_shift_padding_placeholder_ = tf.placeholder(
        tf.int32, [2, 2], name='time_shift_padding')
    self.time_shift_offset_placeholder_ = tf.placeholder(
        tf.int32, [2], name='time_shift_offset')
    shifted = tf.slice(
        tf.pad(
            foreground,
            self.time_shift_padding_placeholder_,
            mode='CONSTANT'),
        self.time_shift_offset_placeholder_,
        [clip_samples, -1])

    # Background noise, scaled and added, with the sum clamped to [-1, 1].
    self.background_data_placeholder_ = tf.placeholder(
        tf.float32, [clip_samples, 1], name='background_data')
    self.background_volume_placeholder_ = tf.placeholder(
        tf.float32, [], name='background_volume')
    noise = tf.multiply(self.background_data_placeholder_,
                        self.background_volume_placeholder_)
    mixed = tf.clip_by_value(tf.add(noise, shifted), -1.0, 1.0)

    # Spectrogram of the mixed signal.
    spectrogram = contrib_audio.audio_spectrogram(
        mixed,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    tf.summary.image(
        'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)

    # Each FFT row of the spectrogram can hold many buckets, far more detail
    # than classification needs, so the rows are shrunk here: either by
    # averaging adjacent buckets or by applying the MFCC algorithm.
    mode = model_settings['preprocess']
    if mode == 'average':
      pool_width = model_settings['average_window_width']
      self.output_ = tf.nn.pool(
          tf.expand_dims(spectrogram, -1),
          window_shape=[1, pool_width],
          strides=[1, pool_width],
          pooling_type='AVG',
          padding='SAME')
      tf.summary.image('shrunk_spectrogram', self.output_, max_outputs=1)
    elif mode == 'mfcc':
      self.output_ = contrib_audio.mfcc(
          spectrogram,
          decoded.sample_rate,
          dct_coefficient_count=model_settings['fingerprint_width'])
      tf.summary.image(
          'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
    else:
      raise ValueError(
          'Unknown preprocess mode "%s" (should be "mfcc" or'
          ' "average")' % (mode))

    # Merge the 'data' summaries; the writer exists only if a directory was
    # supplied.
    self.merged_summaries_ = tf.summary.merge_all(scope='data')
    if summaries_dir:
      self.summary_writer_ = tf.summary.FileWriter(
          summaries_dir + '/data', tf.get_default_graph())
def main():
  """Freezes a Keras hdf5 checkpoint into a serialized TensorFlow graph.

  Parses the command-line flags, loads the Keras model with its custom
  objects, puts a WAV-reading and decoding input in front of it, reshapes the
  model output into a tensor named by --final_tensor, converts the variables
  to constants and writes the resulting GraphDef to --frozen_path.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--data_tensor', type=str, default='decoded_sample_data', help="""\
      Input data tensor name. Leave as is for the competition.\
      """)
  parser.add_argument('--final_tensor', type=str, default='labels_softmax', help="""\
      Name of the softmax output tensor. Leave as is for the competition.\
      """)
  parser.add_argument('--frozen_path', type=str, default='tf_files/frozen.pb', help="""\
      The frozen graph's filename.\
      """)
  parser.add_argument('--checkpoint_path', type=str, default='checkpoints_106/ep-062-vl-0.1815.hdf5', help="""\
      Path to the hdf5 checkpoint that you want to freeze.\
      """)
  # Unknown flags are collected in `unparsed` instead of causing an error.
  args, unparsed = parser.parse_known_args()
  # Names that load_model must be able to resolve while deserializing the
  # checkpoint; all of them come from outside this function.
  custom_objects = {
      'relu6': relu6,
      'DepthwiseConv2D': DepthwiseConv2D,
      'overlapping_time_slice_stack': overlapping_time_slice_stack,
      'softmax': softmax,
      '<lambda>': smooth_categorical_crossentropy
  }
  model = load_model(args.checkpoint_path, custom_objects=custom_objects)
  # rename placeholders for special prize:
  # https://www.kaggle.com/c/tensorflow-speech-recognition-challenge#Prizes
  # decoded_sample_data:0, taking a [16000, 1] float tensor as input,
  # representing the audio PCM-encoded data.
  # `decode_wav` will produce two outputs. tf names them: 'name:0', 'name:1'.
  wav_filename_placeholder_ = tf.placeholder(tf.string, [], name='wav_fn')
  wav_loader = io_ops.read_file(wav_filename_placeholder_)
  wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1, desired_samples=16000, name=args.data_tensor)
  # add batch dimension and remove last one
  # keras model wants (None, 16000)
  data_reshaped = tf.reshape(wav_decoder.audio, (1, -1))
  # call keras model
  softmax_probs = model(data_reshaped)
  # remove batch dimension
  softmax_probs = tf.reshape(softmax_probs, (-1, ), name=args.final_tensor)
  # NOTE(review): `sess` is not assigned anywhere in this function;
  # presumably it is a module-level session - confirm it is defined before
  # main() is called.
  frozen_graph_def = graph_util.convert_variables_to_constants(
      sess, sess.graph.as_graph_def(), [args.final_tensor])
  with gfile.FastGFile(args.frozen_path, 'wb') as f:
    f.write(frozen_graph_def.SerializeToString())
  print("Wrote frozen graph to: %s" % args.frozen_path)
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
  """Builds the audio model together with its inference input/output nodes.

  The graph takes encoded WAV bytes through the 'wav_data' placeholder,
  decodes them ('decoded_sample_data'), computes features, runs the model and
  exposes the class probabilities as 'labels_softmax'.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with
      cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc' or 'average'.

  Raises:
    Exception: If the preprocessing mode isn't recognized.
  """
  labels = input_data.prepare_words_list(wanted_words.split(','))
  settings = models.prepare_model_settings(
      len(labels), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, feature_bin_count, preprocess)
  runtime_settings = {'clip_stride_ms': clip_stride_ms}

  # Input: encoded WAV bytes, decoded to a fixed number of mono samples.
  wav_bytes = tf.placeholder(tf.string, [], name='wav_data')
  decoded = contrib_audio.decode_wav(
      wav_bytes,
      desired_channels=1,
      desired_samples=settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = contrib_audio.audio_spectrogram(
      decoded.audio,
      window_size=settings['window_size_samples'],
      stride=settings['window_stride_samples'],
      magnitude_squared=True)

  # Reduce the spectrogram to the features the model consumes.
  if preprocess == 'average':
    pool_width = settings['average_window_width']
    features = tf.nn.pool(
        tf.expand_dims(spectrogram, -1),
        window_shape=[1, pool_width],
        strides=[1, pool_width],
        pooling_type='AVG',
        padding='SAME')
  elif preprocess == 'mfcc':
    features = contrib_audio.mfcc(
        spectrogram,
        sample_rate,
        dct_coefficient_count=settings['fingerprint_width'])
  else:
    raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or'
                    ' "average")' % (preprocess))

  flat_features = tf.reshape(features, [-1, settings['fingerprint_size']])
  logits = models.create_model(
      flat_features,
      settings,
      model_architecture,
      is_training=False,
      runtime_settings=runtime_settings)

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
# Taken from Mauri de Souza Nunes at https://mauri870.github.io/blog/posts/audio-spectrograms-in-tensorflow/ import tensorflow as tf from tensorflow.contrib.framework.python.ops import audio_ops # Wav file name wav_file = tf.placeholder(tf.string) # Read the wav file audio_binary = tf.read_file(wav_file) # Decode the wav mono into a 2D tensor with time in dimension 0 # and channel along dimension 1 waveform = audio_ops.decode_wav(audio_binary, file_format='wav', desired_channels=1) # Compute the spectrogram spectrogram = audio_ops.audio_spectrogram(waveform.audio, window_size=1024, stride=64) # Custom brightness brightness = tf.placeholder(tf.float32, shape=[]) mul = tf.multiply(spectrogram, brightness) # Normalize pixels min_const = tf.constant(255.) minimum = tf.minimum(mul, min_const) # Expand dims so we get the proper shape
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
  """Builds the audio model together with its inference input/output nodes.

  The graph takes encoded WAV bytes through the 'wav_data' placeholder,
  decodes them ('decoded_sample_data'), computes features, runs the model and
  exposes the class probabilities as 'labels_softmax'.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with
      cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc', 'average', or 'micro'.

  Raises:
    Exception: If the preprocessing mode isn't recognized, or if 'micro' is
      requested while the micro frontend op is unavailable.
  """
  labels = input_data.prepare_words_list(wanted_words.split(','))
  settings = models.prepare_model_settings(
      len(labels), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, feature_bin_count, preprocess)
  runtime_settings = {'clip_stride_ms': clip_stride_ms}

  # Input: encoded WAV bytes, decoded to a fixed number of mono samples.
  wav_bytes = tf.placeholder(tf.string, [], name='wav_data')
  decoded = contrib_audio.decode_wav(
      wav_bytes,
      desired_channels=1,
      desired_samples=settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = contrib_audio.audio_spectrogram(
      decoded.audio,
      window_size=settings['window_size_samples'],
      stride=settings['window_stride_samples'],
      magnitude_squared=True)

  # Reduce the audio to the features the model consumes.
  if preprocess == 'average':
    pool_width = settings['average_window_width']
    features = tf.nn.pool(
        tf.expand_dims(spectrogram, -1),
        window_shape=[1, pool_width],
        strides=[1, pool_width],
        pooling_type='AVG',
        padding='SAME')
  elif preprocess == 'mfcc':
    features = contrib_audio.mfcc(
        spectrogram,
        sample_rate,
        dct_coefficient_count=settings['fingerprint_width'])
  elif preprocess == 'micro':
    if not frontend_op:
      raise Exception(
          'Micro frontend op is currently not available when running TensorFlow'
          ' directly from Python, you need to build and run through Bazel, for'
          ' example'
          ' `bazel run tensorflow/examples/speech_commands:freeze_graph`'
      )
    # Window size and step are derived from the sample counts stored in the
    # model settings and converted to milliseconds.
    rate = settings['sample_rate']
    frame_ms = (settings['window_size_samples'] * 1000) / rate
    step_ms = (settings['window_stride_samples'] * 1000) / rate
    # The decoded float audio is scaled by 32767 and cast to int16 before it
    # is handed to the frontend op.
    pcm16 = tf.cast(tf.multiply(decoded.audio, 32767), tf.int16)
    frontend_out = frontend_op.audio_microfrontend(
        pcm16,
        sample_rate=rate,
        window_size=frame_ms,
        window_step=step_ms,
        num_channels=settings['fingerprint_width'],
        out_scale=1,
        out_type=tf.float32)
    features = tf.multiply(frontend_out, (10.0 / 256.0))
  else:
    raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
                    ' "average", or "micro")' % (preprocess))

  flat_features = tf.reshape(features, [-1, settings['fingerprint_size']])
  logits = models.create_model(
      flat_features,
      settings,
      model_architecture,
      is_training=False,
      runtime_settings=runtime_settings)

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
import tensorflow as tf
import numpy as np
from tensorflow.python.ops import io_ops
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
from scipy.io import wavfile

# Scratch script: decodes the same WAV file once with TensorFlow's decode_wav
# and once with scipy, printing both results for comparison.
# NOTE(review): the path is hard-coded to one developer's machine.
filename = '/Users/tsingh1/Developer/kaggle/speech/data/train/audio/bed/d78858d9_nohash_1.wav'
filenameTensor = tf.constant(filename)

with tf.Session() as sess:
  wav_loader = io_ops.read_file(filenameTensor)
  # Only 5 samples of a single channel are requested from the decoder.
  wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1, desired_samples=5)
  x = wav_decoder.audio.eval().flatten()
  print('x1', x)
  print('x1', x.shape)

_, wav = wavfile.read(filename)
# Scale by the int16 maximum (32767); presumably the file holds 16-bit PCM
# samples - TODO confirm.
wav1 = wav.astype(np.float32) / np.iinfo(np.int16).max
print('w', wav)
print('w1', wav1)
print('w1', wav1.shape)
def parse_files_function(example):
  """Reads the file named by `example` and decodes it as WAV audio.

  Args:
    example: Filename handed to tf.read_file (presumably a scalar string
      tensor - confirm against the caller).

  Returns:
    The result of audio_ops.decode_wav applied to the file contents.
  """
  from tensorflow.contrib.framework.python.ops import audio_ops
  return audio_ops.decode_wav(tf.read_file(example))
# List every .wav file whose path starts with the configured prefix.
print(f"wav_test_data_path_start: {params.wav_test_data_path_start}")
print("+--------------+")
for f in iglob(params.wav_test_data_path_start + "*.wav"):
    print(f"File: {f}")
print("+--------------+")

sess = tf.Session()
file_number = 0
section_size = params.section_size
# NOTE(review): `file_arr` is defined outside this fragment.
for f in file_arr:
    ch1_song = np.array([]).astype(float)
    ch2_song = np.array([]).astype(float)
    # Read and decode the file as two-channel audio; these calls add new
    # nodes to the graph on every iteration.
    audio_binary = tf.read_file(f)
    wav_decoder = decode_wav(audio_binary, desired_channels=2)
    sample_rate, audio = sess.run([wav_decoder.sample_rate, wav_decoder.audio])
    audio = np.array(audio)
    print(len(audio[:, 0]))
    print(audio.shape)
    # Split the two channels (columns 0 and 1) and normalize each one.
    a0 = audio[:, 0]
    a1 = audio[:, 1]
    a0 = normalize(a0)
    a1 = normalize(a1)
    if params.overlap_sections:
        # Cut each channel into sections using the configured overlap and
        # section sizes.
        s_a0 = segment(a0, params.overlap_section_size, section_size)
        s_a1 = segment(a1, params.overlap_section_size, section_size)
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture,
                           model_size_info, use_mfcc):
  """Builds the audio model together with its inference input/output nodes.

  The graph takes encoded WAV bytes through the 'wav_data' placeholder,
  decodes them ('decoded_sample_data'), computes either MFCC or mel-weighted
  spectrogram features, runs the model and exposes the class probabilities as
  'labels_softmax'.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with
      cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    model_size_info: Passed through unchanged to models.create_model.
    use_mfcc: Passed to models.prepare_model_settings; the resulting
      model_settings['use_mfcc'] selects MFCC features when it equals True
      and mel-weighted spectrogram features otherwise.
  """
  labels = input_data.prepare_words_list(wanted_words.split(','))
  settings = models.prepare_model_settings(
      len(labels), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, dct_coefficient_count, use_mfcc)
  runtime_settings = {'clip_stride_ms': clip_stride_ms}

  # Input: encoded WAV bytes, decoded to a fixed number of mono samples.
  wav_bytes = tf.placeholder(tf.string, [], name='wav_data')
  decoded = contrib_audio.decode_wav(
      wav_bytes,
      desired_channels=1,
      desired_samples=settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = contrib_audio.audio_spectrogram(
      decoded.audio,
      window_size=settings['window_size_samples'],
      stride=settings['window_stride_samples'],
      magnitude_squared=True)

  if settings['use_mfcc'] == True:
    features = contrib_audio.mfcc(
        spectrogram,
        decoded.sample_rate,
        dct_coefficient_count=dct_coefficient_count)
  else:
    # Project the spectrogram bins onto mel bins.
    mel_weights = tf.contrib.signal.linear_to_mel_weight_matrix(
        num_mel_bins=settings['dct_coefficient_count'],
        num_spectrogram_bins=spectrogram.shape[-1].value,
        sample_rate=settings['sample_rate'],
        upper_edge_hertz=7600.0,
        lower_edge_hertz=80.0)
    features = tf.tensordot(spectrogram, mel_weights, 1)
    # The static shape is set by hand from the two operand shapes.
    features.set_shape(
        spectrogram.shape[:-1].concatenate(mel_weights.shape[-1:]))

  frequency_size = settings['dct_coefficient_count']
  time_size = settings['spectrogram_length']
  flat_features = tf.reshape(features, [-1, time_size * frequency_size])
  logits = models.create_model(
      flat_features,
      settings,
      model_architecture,
      model_size_info,
      is_training=False,
      runtime_settings=runtime_settings)

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
def prepare_processing_graph(self, model_settings, input_type, volume_scale):
  """Builds a TensorFlow graph to apply the input distortions.

  Creates a graph that loads a WAVE file, decodes it, scales the volume,
  shifts it in time, adds in background noise, applies an overall volume
  scale, calculates a spectrogram, and then builds either log-mel energies or
  an MFCC fingerprint from that.

  This must be called with an active TensorFlow session running, and it
  creates multiple placeholder inputs, and one output:

    - wav_filename_placeholder_: Filename of the WAV to load.
    - foreground_volume_placeholder_: How loud the main clip should be.
    - time_shift_padding_placeholder_: Where to pad the clip.
    - time_shift_offset_placeholder_: How much to move the clip in time.
    - background_data_placeholder_: PCM sample data for background noise.
    - background_volume_placeholder_: Loudness of mixed-in background.
    - mfcc_: Output 2D fingerprint of processed audio.

  Args:
    model_settings: Information about the current model being trained.
    input_type: Which features to produce, 'log-mel' or 'MFCC'.
    volume_scale: Factor the mixed signal is multiplied by before it is
      clamped to [-1, 1] again.

  Raises:
    ValueError: If input_type is neither 'log-mel' nor 'MFCC'.
  """
  # Fail before any graph nodes are created; otherwise an unknown input_type
  # would silently leave self.mfcc_ unset.
  if input_type not in ('log-mel', 'MFCC'):
    raise ValueError(
        'Unknown input_type "%s" (should be "log-mel" or "MFCC")' %
        (input_type,))

  desired_samples = model_settings['desired_samples']
  self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
  wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
  wav_decoder = contrib_audio.decode_wav(
      wav_loader, desired_channels=1, desired_samples=desired_samples)
  # Allow the audio sample's volume to be adjusted.
  self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
  scaled_foreground = tf.multiply(wav_decoder.audio,
                                  self.foreground_volume_placeholder_)
  # Shift the sample's start position, and pad any gaps with zeros.
  self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
  self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
  padded_foreground = tf.pad(
      scaled_foreground,
      self.time_shift_padding_placeholder_,
      mode='CONSTANT')
  sliced_foreground = tf.slice(padded_foreground,
                               self.time_shift_offset_placeholder_,
                               [desired_samples, -1])
  # Mix in background noise.
  self.background_data_placeholder_ = tf.placeholder(
      tf.float32, [desired_samples, 1])
  self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
  background_mul = tf.multiply(self.background_data_placeholder_,
                               self.background_volume_placeholder_)
  # Noise is added to clean speech signal.
  background_add = tf.add(background_mul, sliced_foreground)
  background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
  # Apply the overall volume scale and clamp once more.
  background_clamp = tf.multiply(background_clamp, volume_scale)
  background_clamp = tf.clip_by_value(background_clamp, -1.0, 1.0)
  # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
  spectrogram = contrib_audio.audio_spectrogram(
      background_clamp,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)
  if input_type == 'log-mel':
    print("log-mel energies")
    # Warp the linear-scale, magnitude spectrograms into the mel-scale.
    num_spectrogram_bins = spectrogram.shape[-1].value
    lower_edge_hertz, upper_edge_hertz = 20.0, 4000.0
    num_mel_bins = model_settings['dct_coefficient_count']
    linear_to_mel_weight_matrix = (
        tf.contrib.signal.linear_to_mel_weight_matrix(
            num_mel_bins, num_spectrogram_bins,
            model_settings['sample_rate'], lower_edge_hertz,
            upper_edge_hertz))
    mel_spectrograms = tf.tensordot(spectrogram,
                                    linear_to_mel_weight_matrix, 1)
    # Note: Shape inference for `tf.tensordot` does not currently handle
    # this case.
    mel_spectrograms.set_shape(spectrogram.shape[:-1].concatenate(
        linear_to_mel_weight_matrix.shape[-1:]))
    # A small offset keeps the logarithm finite for zero energies.
    log_offset = 1e-6
    log_mel_spectrograms = tf.log(mel_spectrograms + log_offset)
    self.mfcc_ = log_mel_spectrograms
  else:
    # input_type == 'MFCC' (validated above).
    print('MFCC-features')
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
def prepare_processing_graph(self, model_settings, summaries_dir):
    """Builds the audio distortion and feature graph inside the 'data' scope.

    The graph reads and decodes a WAV file, scales its volume, shifts it in
    time, mixes in background noise, clips the result to [-1.0, 1.0] and
    computes a spectrogram. Depending on model_settings['preprocess'] the
    spectrogram is then either average-pooled ('average') or converted to
    MFCCs ('mfcc'). Image summaries are attached along the way.

    Attributes set on `self`:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: Multiplier for the decoded clip.
      - time_shift_padding_placeholder_: Padding fed to `tf.pad`.
      - time_shift_offset_placeholder_: Start offset fed to `tf.slice`.
      - background_data_placeholder_: Background samples to mix in.
      - background_volume_placeholder_: Multiplier for the background.
      - output_: The pooled spectrogram or the MFCC tensor.
      - merged_summaries_: Merge of all summaries in scope 'data'.
      - summary_writer_: FileWriter targeting `summaries_dir + '/data'`.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Directory under which the 'data' summaries are written.

    Raises:
      ValueError: If model_settings['preprocess'] is not 'mfcc' or 'average'.
    """
    with tf.get_default_graph().name_scope('data'):
        clip_length = model_settings['desired_samples']

        # Load and decode the clip as mono audio of a fixed length.
        filename_input = tf.placeholder(tf.string, [], name='wav_filename')
        self.wav_filename_placeholder_ = filename_input
        decoded_wav = contrib_audio.decode_wav(
            io_ops.read_file(filename_input),
            desired_channels=1,
            desired_samples=clip_length)

        # Foreground volume control.
        volume_input = tf.placeholder(
            tf.float32, [], name='foreground_volume')
        self.foreground_volume_placeholder_ = volume_input
        foreground = tf.multiply(decoded_wav.audio, volume_input)

        # Time shift: pad first, then cut a clip-length window back out.
        padding_input = tf.placeholder(
            tf.int32, [2, 2], name='time_shift_padding')
        self.time_shift_padding_placeholder_ = padding_input
        offset_input = tf.placeholder(
            tf.int32, [2], name='time_shift_offset')
        self.time_shift_offset_placeholder_ = offset_input
        shifted = tf.slice(
            tf.pad(foreground, padding_input, mode='CONSTANT'),
            offset_input,
            [clip_length, -1])

        # Background noise mix, clipped to the valid sample range.
        noise_input = tf.placeholder(
            tf.float32, [clip_length, 1], name='background_data')
        self.background_data_placeholder_ = noise_input
        noise_volume_input = tf.placeholder(
            tf.float32, [], name='background_volume')
        self.background_volume_placeholder_ = noise_volume_input
        mixed = tf.add(tf.multiply(noise_input, noise_volume_input), shifted)
        clipped = tf.clip_by_value(mixed, -1.0, 1.0)

        spectrogram = contrib_audio.audio_spectrogram(
            clipped,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        tf.summary.image(
            'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)

        # Pick the final feature representation.
        mode = model_settings['preprocess']
        if mode not in ('average', 'mfcc'):
            raise ValueError(
                'Unknown preprocess mode "%s" (should be "mfcc" or "average")' %
                (mode))
        if mode == 'average':
            spectrogram_image = tf.expand_dims(spectrogram, -1)
            pool_width = model_settings['average_window_width']
            self.output_ = tf.nn.pool(
                spectrogram_image,
                window_shape=[1, pool_width],
                strides=[1, pool_width],
                pooling_type='AVG',
                padding='SAME')
            tf.summary.image(
                'shrunk_spectrogram', self.output_, max_outputs=1)
        else:
            self.output_ = contrib_audio.mfcc(
                spectrogram,
                decoded_wav.sample_rate,
                dct_coefficient_count=model_settings['fingerprint_width'])
            tf.summary.image(
                'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)

        # Collect every summary created under 'data' and set up its writer.
        self.merged_summaries_ = tf.summary.merge_all(scope='data')
        self.summary_writer_ = tf.summary.FileWriter(
            summaries_dir + '/data', tf.get_default_graph())
def prepare_processing_graph(self, model_settings):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, and then derives an MFCC
    fingerprint through an explicit STFT -> mel filterbank -> log -> MFCC
    chain built from `tf.contrib.signal` ops.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output fingerprint of processed audio.

    Several intermediate values are also stored on `self`:
    `background_clamp`, `spectrograms`, `num_spectrogram_bins` and
    `linear_to_mel_weight_matrix`.

    Args:
      model_settings: Information about the current model being trained.
        The keys read here are 'desired_samples', 'window_size_samples',
        'window_stride_samples', 'dct_coefficient_count' and, optionally,
        'sample_rate' (16000 is assumed when it is absent).
    """
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)

    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)

    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(
        scaled_foreground,
        self.time_shift_padding_placeholder_,
        mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])

    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(
        tf.float32, [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

    # Run the STFT and MFCC ops to get a 2D 'fingerprint' of the audio.
    # The signal is reshaped to a single row ([1, samples]) before the STFT.
    background_clamp = tf.reshape(background_clamp, [1, -1])
    self.background_clamp = background_clamp
    # window_fn=None: no window function is applied to the frames.
    stfts = tf.contrib.signal.stft(
        background_clamp,
        frame_length=model_settings['window_size_samples'],
        frame_step=model_settings['window_stride_samples'],
        fft_length=512,
        window_fn=None)
    spectrograms = tf.abs(stfts)

    # Warp the linear scale spectrograms into the mel-scale.
    num_spectrogram_bins = stfts.shape[-1].value
    lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 3800.0, 26
    # Use the configured sample rate so the mel filterbank matches the audio;
    # fall back to 16000 Hz when the settings do not provide one.
    sample_rate = model_settings.get('sample_rate', 16000)
    linear_to_mel_weight_matrix = (
        tf.contrib.signal.linear_to_mel_weight_matrix(
            num_mel_bins, num_spectrogram_bins, sample_rate,
            lower_edge_hertz, upper_edge_hertz))
    self.linear_to_mel_weight_matrix = linear_to_mel_weight_matrix
    self.spectrograms = spectrograms
    self.num_spectrogram_bins = num_spectrogram_bins
    mel_spectrograms = tf.tensordot(spectrograms,
                                    linear_to_mel_weight_matrix, 1)
    # tensordot loses the static shape here, so restore it by hand.
    mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate(
        linear_to_mel_weight_matrix.shape[-1:]))

    # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
    log_mel_spectrograms = tf.log(mel_spectrograms + 1e-6)

    # Compute MFCCs from log_mel_spectrograms and keep the first
    # 'dct_coefficient_count' coefficients.
    self.mfcc_ = tf.contrib.signal.mfccs_from_log_mel_spectrograms(
        log_mel_spectrograms)[..., :model_settings['dct_coefficient_count']]
def prepare_processing_graph(self, model_settings):
    """Constructs the graph that distorts a WAV clip and emits its MFCCs.

    The graph reads and decodes a WAVE file, applies a volume multiplier,
    shifts the clip in time (padding, then slicing back to the desired
    length), mixes in background noise, clips the result to [-1.0, 1.0],
    computes a spectrogram and finally an MFCC fingerprint.

    This must be called with an active TensorFlow session running. It stores
    these graph endpoints on `self`:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
        The keys read here are 'desired_samples', 'window_size_samples',
        'window_stride_samples' and 'dct_coefficient_count'.
    """
    clip_length = model_settings['desired_samples']

    # Read the file and decode it as mono audio of a fixed length.
    filename_input = tf.placeholder(tf.string, [])
    self.wav_filename_placeholder_ = filename_input
    decoded_wav = contrib_audio.decode_wav(
        io_ops.read_file(filename_input),
        desired_channels=1,
        desired_samples=clip_length)

    # Volume control for the foreground clip.
    volume_input = tf.placeholder(tf.float32, [])
    self.foreground_volume_placeholder_ = volume_input
    foreground = tf.multiply(decoded_wav.audio, volume_input)

    # Time shift: pad with zeros, then slice a clip-length window back out.
    padding_input = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_padding_placeholder_ = padding_input
    offset_input = tf.placeholder(tf.int32, [2])
    self.time_shift_offset_placeholder_ = offset_input
    shifted = tf.slice(
        tf.pad(foreground, padding_input, mode='CONSTANT'),
        offset_input,
        [clip_length, -1])

    # Add the scaled background noise and keep samples within [-1, 1].
    noise_input = tf.placeholder(tf.float32, [clip_length, 1])
    self.background_data_placeholder_ = noise_input
    noise_volume_input = tf.placeholder(tf.float32, [])
    self.background_volume_placeholder_ = noise_volume_input
    mixed = tf.add(tf.multiply(noise_input, noise_volume_input), shifted)
    clipped = tf.clip_by_value(mixed, -1.0, 1.0)

    # Spectrogram followed by MFCC gives the 2D fingerprint.
    spectrogram = contrib_audio.audio_spectrogram(
        clipped,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        decoded_wav.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
def audiofile_to_features(wav_filename):
    """Reads a WAV file and turns it into features via `samples_to_mfccs`.

    Args:
      wav_filename: Filename of the WAV to load; passed to `tf.read_file`.

    Returns:
      A `(features, features_len)` tuple as produced by `samples_to_mfccs`
      for the decoded mono audio and its sample rate.
    """
    wav_bytes = tf.read_file(wav_filename)
    # Decode as single-channel audio.
    wav = contrib_audio.decode_wav(wav_bytes, desired_channels=1)
    mfccs, mfccs_len = samples_to_mfccs(wav.audio, wav.sample_rate)
    return mfccs, mfccs_len