def prepare_input(filename):
    from tensorflow.contrib.framework.python.ops import audio_ops
    from tensorflow.python.ops import io_ops
    with tf.Session(graph=tf.Graph()) as sess:
        wav_filename_placeholder = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(wav_filename_placeholder)
        wav_decoder = tf.audio.decode_wav(wav_loader,
                                          desired_channels=1,
                                          desired_samples=16000,
                                          name='decoded_sample_data')
        spectrum = audio_ops.audio_spectrogram(input=wav_decoder[0],
                                               window_size=640,
                                               stride=320,
                                               magnitude_squared=True,
                                               name='AudioSpectrogram')
        final = audio_ops.mfcc(spectrogram=spectrum,
                               sample_rate=wav_decoder[1],
                               upper_frequency_limit=4000.0,
                               lower_frequency_limit=20.0,
                               filterbank_channel_count=40,
                               dct_coefficient_count=10,
                               name='Mfcc')
        data = sess.run(final, feed_dict={wav_filename_placeholder: filename})
        print(f'Data shape: {data.shape}')
        return data
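A minimal sketch of calling prepare_input above; the WAV path is hypothetical and `import tensorflow as tf` is assumed at module level.

# Hedged usage sketch; 'speech_sample.wav' is a hypothetical 16 kHz mono clip.
features = prepare_input('speech_sample.wav')
# With 16000 samples, a 640-sample window and a 320-sample stride, the MFCC
# tensor has shape [1, 49, 10]: 1 channel, 49 frames, 10 DCT coefficients.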
def get_test_data(self, how_many, offset, model_settings, sess, features='mfcc'):
    candidates = self.data_index
    if how_many == -1:
        sample_count = len(candidates)
    else:
        sample_count = max(0, min(how_many, len(candidates) - offset))
    desired_samples = model_settings['desired_samples']
    data = np.zeros((sample_count, model_settings['fingerprint_size']))
    wav_filename_placeholder = tf.placeholder(tf.string, [],
                                              name='wav_file_names')
    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    spectrogram = contrib_audio.audio_spectrogram(
        wav_decoder.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    mfcc = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
    for i in range(offset, offset + sample_count):
        input_dict = {wav_filename_placeholder: candidates[i]}
        if features == "spectrogram":
            data[i - offset, :] = sess.run(spectrogram,
                                           feed_dict=input_dict).flatten()
        elif features == "raw":
            data[i - offset, :] = sess.run(wav_decoder.audio,
                                           feed_dict=input_dict).flatten()
        else:
            data[i - offset, :] = sess.run(mfcc, feed_dict=input_dict).flatten()
    return data
def _build_processing_graph(self):
    """Builds a TensorFlow graph that computes MFCC fingerprints.

    Creates a graph that loads a WAVE file, decodes it, clamps the samples to
    [-1.0, 1.0], calculates a spectrogram, and then builds an MFCC fingerprint
    from that.

    This must be called with an active TensorFlow session running, and it
    creates one placeholder input and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - mfcc_: Output 2D fingerprint of processed audio.
    """
    with tf.name_scope('audio_processing'):
        desired_samples = self._model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(
            wav_loader, desired_channels=1, desired_samples=desired_samples)
        background_clamp = tf.clip_by_value(wav_decoder.audio, -1.0, 1.0)
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        spectrogram = contrib_audio.audio_spectrogram(
            background_clamp,
            window_size=self._model_settings['window_size_samples'],
            stride=self._model_settings['window_stride_samples'],
            magnitude_squared=True)
        self.mfcc_ = contrib_audio.mfcc(
            spectrogram,
            wav_decoder.sample_rate,
            dct_coefficient_count=self._model_settings['dct_coefficient_count'])
def mfcc_tensorflow(wavfile, _sr, frame_size, frame_shift, order=13):
    sess = tf.InteractiveSession()
    wav_filename_placeholder = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
    wav_data = wav_decoder.audio
    # Fetch just the sample-rate tensor rather than running the whole decode
    # and discarding the audio.
    wav_sample_rate = sess.run(wav_decoder.sample_rate,
                               feed_dict={wav_filename_placeholder: wavfile})
    check_sample_rate(wavfile, _sr, wav_sample_rate)
    spectrogram = contrib_audio.audio_spectrogram(wav_data,
                                                  window_size=frame_size,
                                                  stride=frame_shift,
                                                  magnitude_squared=True)
    mfcc_ = contrib_audio.mfcc(spectrogram,
                               wav_decoder.sample_rate,
                               dct_coefficient_count=order)
    mfcc_data = sess.run(mfcc_, feed_dict={wav_filename_placeholder: wavfile})
    sess.close()
    return mfcc_data
def load_mfcc_file(sess, filename):
    filename_ph = tf.placeholder(tf.string)
    loader = io_ops.read_file(filename_ph)
    decoder = contrib_audio.decode_wav(loader,
                                       desired_channels=1,
                                       desired_samples=16000)
    spectrogram = contrib_audio.audio_spectrogram(
        decoder.audio, window_size=480, stride=160, magnitude_squared=True)
    mfcc = contrib_audio.mfcc(
        spectrogram, decoder.sample_rate, dct_coefficient_count=40)
    return sess.run(mfcc, feed_dict={filename_ph: filename})
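A minimal usage sketch for load_mfcc_file, assuming TensorFlow 1.x and a 16 kHz mono WAV; the filename is hypothetical.

with tf.Session() as sess:
    feats = load_mfcc_file(sess, 'yes_0001.wav')  # hypothetical path
    # 16000 samples with window 480 and stride 160 give 98 frames, so the
    # result has shape [1, 98, 40].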
def prepare_processing_graph(self, model_settings):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(
        # wav_loader, desired_channels=1, desired_samples=desired_samples)
        wav_loader, desired_channels=1, desired_samples=16000)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(scaled_foreground,
                               self.time_shift_padding_placeholder_,
                               mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                       [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    ###################### M F C C #################################
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
def wav_to_features(filenames_dataset, hparams, feature_count):
    dataset = filenames_dataset.map(lambda filename: io_ops.read_file(filename))
    dataset = dataset.map(
        lambda wav_loader: contrib_audio.decode_wav(wav_loader,
                                                    desired_channels=1))
    dataset = dataset.map(lambda wav_decoder: (
        contrib_audio.audio_spectrogram(
            wav_decoder.audio,
            window_size=int(hparams.sample_rate * hparams.window_size_ms / 1000),
            stride=int(hparams.sample_rate * hparams.window_stride_ms / 1000),
            magnitude_squared=True),
        wav_decoder.sample_rate))
    dataset = dataset.map(lambda spectrogram, sample_rate: contrib_audio.mfcc(
        spectrogram, sample_rate, dct_coefficient_count=feature_count))
    # Per-example normalization. Note that tf.nn.moments returns
    # (mean, variance), so the division below normalizes by the variance, not
    # the standard deviation.
    dataset = dataset.map(lambda inputs: (
        inputs, tf.nn.moments(inputs, axes=[1])))
    dataset = dataset.map(lambda inputs, moments: (
        tf.divide(tf.subtract(inputs, moments[0]), moments[1]),
        tf.shape(inputs)[1]))
    # Drop the leading channel dimension added by audio_spectrogram/mfcc.
    dataset = dataset.map(lambda inputs, seq_len: (inputs[0], seq_len))
    return dataset
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output nodes that are needed to use the graph for inference.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      clip_stride_ms: How often to run recognition. Useful for models with cache.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
    """
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}
    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        decoded_sample_data.sample_rate,
        dct_coefficient_count=dct_coefficient_count)
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size * fingerprint_frequency_size])
    logits = models.create_model(reshaped_input,
                                 model_settings,
                                 model_architecture,
                                 is_training=False,
                                 runtime_settings=runtime_settings)
    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
def prepare_processing_graph(self, model_settings):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(scaled_foreground,
                               self.time_shift_padding_placeholder_,
                               mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                       [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
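A hedged sketch of how the distortion placeholders built above are typically fed; `audio_processor` is a hypothetical instance of the surrounding class, `sess` an active session, and every value is illustrative.

# All names below other than the placeholders are assumptions for illustration.
import numpy as np
desired = 16000
feed = {
    audio_processor.wav_filename_placeholder_: 'on_0003.wav',  # hypothetical
    audio_processor.foreground_volume_placeholder_: 1.0,       # no scaling
    audio_processor.time_shift_padding_placeholder_: [[100, 0], [0, 0]],
    audio_processor.time_shift_offset_placeholder_: [0, 0],    # shift right 100
    audio_processor.background_data_placeholder_: np.zeros((desired, 1),
                                                           dtype=np.float32),
    audio_processor.background_volume_placeholder_: 0.0,       # mute background
}
fingerprint = sess.run(audio_processor.mfcc_, feed_dict=feed)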
def samples_to_mfccs(samples, sample_rate):
    spectrogram = contrib_audio.audio_spectrogram(
        samples,
        window_size=Config.audio_window_samples,
        stride=Config.audio_step_samples,
        magnitude_squared=True)
    mfccs = contrib_audio.mfcc(spectrogram, sample_rate,
                               dct_coefficient_count=Config.n_input)
    mfccs = tf.reshape(mfccs, [-1, Config.n_input])
    return mfccs, tf.shape(mfccs)[0]
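A small sketch of driving samples_to_mfccs above; the Config fields and the 16 kHz rate are assumptions taken from the surrounding code.

# Hedged usage sketch; Config.* values are assumed to be defined elsewhere.
samples = tf.placeholder(tf.float32, [None, 1])  # mono waveform
mfccs, seq_len = samples_to_mfccs(samples, sample_rate=16000)
# mfccs: [num_frames, Config.n_input]; seq_len: the number of frames.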
def _load_sample(wav_filename, model_settings):
    """Loads a WAV file and converts it into the model's input fingerprint.

    Decodes the audio to mono samples, then either reshapes the raw samples
    directly (input_format == 'raw') or computes a spectrogram followed by an
    MFCC fingerprint, depending on model_settings['input_format'].

    Args:
      wav_filename: Path of the WAV file to load.
      model_settings: Information about the model the sample will feed.

    Returns:
      A 2D tensor containing the flattened input fingerprint.
    """
    wav_loader = io_ops.read_file(wav_filename)
    decoded_sample_data = contrib_audio.decode_wav(
        wav_loader,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    if model_settings['input_format'] == 'raw':
        print(decoded_sample_data.audio.shape)
        reshaped_input = tf.reshape(decoded_sample_data.audio,
                                    [-1, model_settings['desired_samples']])
        print(reshaped_input.shape)
    else:
        spectrogram = contrib_audio.audio_spectrogram(
            decoded_sample_data.audio,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            decoded_sample_data.sample_rate,
            lower_frequency_limit=model_settings['lower_frequency_limit'],
            upper_frequency_limit=model_settings['upper_frequency_limit'],
            filterbank_channel_count=model_settings['filterbank_channel_count'],
            dct_coefficient_count=model_settings['dct_coefficient_count'])
        fingerprint_frequency_size = model_settings['dct_coefficient_count']
        fingerprint_time_size = model_settings['spectrogram_length']
        reshaped_input = tf.reshape(
            fingerprint_input,
            [-1, fingerprint_time_size * fingerprint_frequency_size])
    return reshaped_input
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output nodes that are needed to use the graph for inference.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      clip_stride_ms: How often to run recognition. Useful for models with cache.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
    """
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}
    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        decoded_sample_data.sample_rate,
        dct_coefficient_count=dct_coefficient_count)
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size * fingerprint_frequency_size])
    logits = models.create_model(
        reshaped_input,
        model_settings,
        model_architecture,
        is_training=False,
        runtime_settings=runtime_settings)
    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
def AudioToMfcc(sample_rate, audio, window_size_ms, window_stride_ms,
                num_coefficients):
    window_size_samples = sample_rate * window_size_ms // 1000
    window_stride_samples = sample_rate * window_stride_ms // 1000
    spectrogram = contrib_audio.audio_spectrogram(
        audio,
        window_size=window_size_samples,
        stride=window_stride_samples,
        magnitude_squared=True)
    mfcc = contrib_audio.mfcc(spectrogram, sample_rate,
                              dct_coefficient_count=num_coefficients)
    return mfcc
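Worked through with common Speech Commands settings (illustrative values only), the integer arithmetic in AudioToMfcc comes out as:

# sample_rate=16000, window_size_ms=30, window_stride_ms=10 (illustrative):
#   window_size_samples   = 16000 * 30 // 1000 = 480
#   window_stride_samples = 16000 * 10 // 1000 = 160
# A 1-second clip then produces 1 + (16000 - 480) // 160 = 98 spectrogram
# frames, each reduced to num_coefficients MFCC values.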
def __init__(self, sample_rate: int, dct_coef_count: int = -1):
    '''Suppose the channel number is 1.'''
    assert sample_rate == 16_000
    if dct_coef_count == -1:
        dct_coef_count = DataGraphMFCC.max_mfcc_num
    else:
        assert dct_coef_count <= DataGraphMFCC.max_mfcc_num
    self._sample_rate = sample_rate
    # Samples per millisecond; window/stride durations are given in ms.
    samples_per_ms = sample_rate / 1000
    window = int(DataGraphMFCC.window_duration * samples_per_ms)
    stride = int(DataGraphMFCC.stride_duration * samples_per_ms)
    self._graph = tf.Graph()
    with self._graph.as_default():
        self._in_wav_file = tf.placeholder(tf.string, [], name='wav_filename')
        self._in_frame_num = tf.placeholder(tf.int32, [])
        wav_loader = io_ops.read_file(self._in_wav_file)
        wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
        self._out_audio = tf.squeeze(wav_decoder.audio)
        self._out_sample_rate = wav_decoder.sample_rate
        self._in_audio = tf.placeholder(tf.float32, [None])
        in_audio = tf.expand_dims(self._in_audio, -1)
        audio_clamp = tf.clip_by_value(in_audio, -1.0, 1.0)
        spectrogram = contrib_audio.audio_spectrogram(
            audio_clamp, window_size=window, stride=stride,
            magnitude_squared=True)
        self._out_spectrogram = spectrogram
        feat_ts = contrib_audio.mfcc(
            spectrogram=spectrogram,
            sample_rate=sample_rate,
            dct_coefficient_count=dct_coef_count,
        )
        self._out_mfcc = feat_ts[0]
        self._out_real_mfcc_len = tf.shape(self._out_mfcc)[0]
        # Zero-pad (or truncate) the MFCC matrix to exactly _in_frame_num frames.
        diff = tf.maximum(0, self._in_frame_num - self._out_real_mfcc_len)
        self._out_expanded_mfcc = tf.pad(
            self._out_mfcc,
            [[0, diff], [0, 0]],
        )[:self._in_frame_num]
    self._sess = tf.Session(graph=self._graph)
    print("DataGraphMFCC graph is created!")
def build_graph(self):
    """Graph to extract the MFCC fingerprint for a given WAV file.

    Here we add the necessary input & output tensors to decode the WAV,
    serialize the MFCC fingerprint, restore from checkpoint, etc.

    Returns:
      input_wav_filename: A tensor containing the WAV filename as the input
        layer.
      mfcc_fingerprint: The MFCC fingerprint tensor, that will be materialized
        later.
    """
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=self.desired_samples)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(scaled_foreground,
                               self.time_shift_padding_placeholder_,
                               mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [self.desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(
        tf.float32, [self.desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=self.window_size_samples,
        stride=self.window_stride_samples,
        magnitude_squared=True)
    self.mfcc_fingerprint_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=self.opt.dct_coefficient_count)
def _single_mfcc(self, audio, window_size_samples, window_stride_samples,
                 magnitude_squared, **kwargs):
    spectrogram = self._single_spectrogram(audio, window_size_samples,
                                           window_stride_samples,
                                           magnitude_squared)
    mfcc = contrib_audio.mfcc(
        spectrogram,
        kwargs["sample_rate_const"],
        upper_frequency_limit=kwargs["upper_edge_hertz"],
        lower_frequency_limit=kwargs["lower_edge_hertz"],
        filterbank_channel_count=kwargs["num_mel_bins"],
        dct_coefficient_count=kwargs["num_mfccs"],
    )
    return mfcc
def encode_data(audio_data, sample_rate):
    # Note: window_size_samples, window_stride_samples and
    # dct_coefficient_count come from the enclosing scope; magnitude_squared
    # is left at its default (False) here.
    spectrogram = contrib_audio.audio_spectrogram(
        audio_data, window_size_samples, window_stride_samples)
    print(spectrogram.shape)
    mfcc = contrib_audio.mfcc(
        spectrogram,
        sample_rate=sample_rate,
        dct_coefficient_count=dct_coefficient_count)
    return mfcc
def wav_to_mfcc(self, raw_data):
    spectrogram = audio_ops.audio_spectrogram(
        raw_data,
        window_size=self.parameters['spectogram_window_size'],
        stride=self.parameters['spectogram_stride'],
        magnitude_squared=True)
    mfcc = audio_ops.mfcc(
        spectrogram,
        self.parameters['audio_sample_rate'],
        dct_coefficient_count=self.parameters['dtc_coefficient_count'])
    mfcc = tf.expand_dims(mfcc, -1)
    self.input_dimensions = (
        self.input_dimensions[0] / self.parameters['spectogram_stride'] - 2,
        self.parameters['dtc_coefficient_count'],
        1)
    mfcc = tf.squeeze(mfcc, 0)
    return mfcc
def prepare_processing_graph(self, model_settings):
    """Builds a TensorFlow feature-extraction graph.

    Creates a graph that loads a wave file, decodes it, scales the volume,
    shifts it in time, calculates a spectrogram and builds an MFCC fingerprint
    from that.

    Args:
      model_settings: info about the model being trained
    """
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(scaled_foreground,
                               self.time_shift_padding_placeholder_,
                               mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    self.background_data_placeholder_ = tf.placeholder(
        np.float32, [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(np.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
def create_inference_graph_and_load_variables(sess, FLAGS):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output nodes into the trained model graph.
    """
    model_settings = data_utils.prepare_settings(
        FLAGS.num_classes, FLAGS.sample_rate, FLAGS.clip_duration_ms,
        FLAGS.window_size_ms, FLAGS.window_stride_ms,
        FLAGS.dct_coefficient_count)
    runtime_settings = {'clip_stride_ms': FLAGS.clip_stride_ms}
    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        decoded_sample_data.sample_rate,
        dct_coefficient_count=FLAGS.dct_coefficient_count)
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size, fingerprint_frequency_size, 1],
        name="model_input")
    # Init model and load variables
    model = models.create_model(FLAGS)
    fw = framework.Framework(sess, model, None, FLAGS,
                             input_tensor=reshaped_input)
    # Create an output to use for inference
    logits = tf.nn.softmax(model.get_raw_scores(), name='labels_softmax')
def save_my_test_file(self, model_settings):
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(
        # wav_loader, desired_channels=1, desired_samples=desired_samples)
        wav_loader, desired_channels=1, desired_samples=16000)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(scaled_foreground,
                               self.time_shift_padding_placeholder_,
                               mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                       [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    self.filename_ = tf.placeholder(tf.string)
    # save_wav_file(self.filename_, np.array(background_clamp), 16000)
    wav_encoder = contrib_audio.encode_wav(background_clamp, 16000)
    self.wav_saver = io_ops.write_file(self.filename_, wav_encoder)
    # with tf.Session(graph=tf.Graph()) as sess:
    #     sess.run(wav_saver)
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    self.test_mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
def get_mfcc_graph(model_settings):
    g = tf.Graph()
    with g.as_default():
        input_file_placeholder = tf.compat.v1.placeholder(
            tf.string, [], name='wav_filename')
        wav_loader = io_ops.read_file(input_file_placeholder)
        wav_decoder = audio_ops.decode_wav(
            wav_loader,
            desired_channels=1,
            desired_samples=model_settings['desired_samples'])
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        spectrograms_power = audio_ops.audio_spectrogram(
            wav_decoder.audio,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        USE_POWER = True
        if USE_POWER:
            # Warp the linear scale spectrograms into the mel-scale.
            num_spectrogram_bins = spectrograms_power.shape[-1].value
            lower_edge_hertz, upper_edge_hertz, num_mel_bins = 20.0, 4000.0, 40
            linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
                num_mel_bins, num_spectrogram_bins, 16000.0, lower_edge_hertz,
                upper_edge_hertz)
            mel_spectrograms = tf.tensordot(spectrograms_power,
                                            linear_to_mel_weight_matrix, 1)
            mel_spectrograms.set_shape(
                spectrograms_power.shape[:-1].concatenate(
                    linear_to_mel_weight_matrix.shape[-1:]))
            # Compute a stabilized log to get log-magnitude mel-scale
            # spectrograms.
            log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)
            # Compute MFCCs from log_mel_spectrograms and take the first NDCT.
            mfccs = tf.signal.mfccs_from_log_mel_spectrograms(
                log_mel_spectrograms)[
                    ..., :model_settings['dct_coefficient_count']]
            # output = tf.expand_dims(mfccs, axis=0)
            output = mfccs
        else:
            output = audio_ops.mfcc(
                spectrograms_power,
                wav_decoder.sample_rate,
                dct_coefficient_count=model_settings['dct_coefficient_count'])
    return g, input_file_placeholder, output, wav_decoder.audio
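A minimal sketch of running the graph returned by get_mfcc_graph above; the WAV path is hypothetical and model_settings is assumed to be populated as elsewhere in this section.

graph, wav_ph, mfcc_out, audio_out = get_mfcc_graph(model_settings)
with tf.compat.v1.Session(graph=graph) as sess:
    feats, audio = sess.run([mfcc_out, audio_out],
                            feed_dict={wav_ph: 'left_0007.wav'})  # hypothetical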
def prepare_processing_graph(self, model_settings):
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(scaled_foreground,
                               self.time_shift_padding_placeholder_,
                               mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                       [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    print('window_size_samples', model_settings['window_size_samples'])
    print('window_stride_samples', model_settings['window_stride_samples'])
    print('background_clamp', background_clamp)
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    print('spectrogram', spectrogram)
    print('dct_coefficient_count', model_settings['dct_coefficient_count'])
    print('wav_decoder.sample_rate', wav_decoder.sample_rate)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
    print('self.mfcc_', self.mfcc_)
def __init__(self, desired_samples=16000, window_size_samples=480,
             window_stride_samples=160):
    self.wav_filename_placeholder = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder)
    # decode_wav already pads/crops the audio to desired_samples.
    wav_decoder = contrib_audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
    spectrogram = contrib_audio.audio_spectrogram(
        wav_decoder.audio,
        window_size=window_size_samples,
        stride=window_stride_samples,
        magnitude_squared=True)
    self.mfcc = contrib_audio.mfcc(spectrogram, wav_decoder.sample_rate,
                                   dct_coefficient_count=40)
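Assuming the class that owns the __init__ above is called, say, MfccExtractor (a hypothetical name), usage might look like:

extractor = MfccExtractor()  # class name is an assumption
with tf.Session() as sess:
    feats = sess.run(extractor.mfcc,
                     feed_dict={extractor.wav_filename_placeholder: 'go_01.wav'})
    # Defaults (16000 samples, window 480, stride 160, 40 coefficients) give
    # a [1, 98, 40] fingerprint.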
def build_data_generator():
    # Build data generator pipeline
    desired_samples = model_settings['desired_samples']
    wav_filename_placeholder_ = tf.placeholder(
        tf.string, [], name="wav_filename_placeholder_")
    wav_loader = io_ops.read_file(wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    foreground_volume_placeholder_ = tf.placeholder(
        tf.float32, [], name="foreground_volume_placeholder_")
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    time_shift_padding_placeholder_ = tf.placeholder(
        tf.int32, [2, 2], name="time_shift_padding_placeholder_")
    time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(scaled_foreground,
                               time_shift_padding_placeholder_,
                               mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    background_data_placeholder_ = tf.placeholder(tf.float32,
                                                  [desired_samples, 1])
    background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(background_data_placeholder_,
                                 background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
    return (wav_filename_placeholder_, foreground_volume_placeholder_,
            time_shift_padding_placeholder_, time_shift_offset_placeholder_,
            background_data_placeholder_, background_volume_placeholder_,
            mfcc_)
def decode_audio(audio_str):
    wav_decoder = contrib_audio.decode_wav(
        audio_str, desired_channels=1, desired_samples=self.desired_samples)
    spectrogram = contrib_audio.audio_spectrogram(
        wav_decoder.audio,
        window_size=self.window_size_samples,
        stride=self.window_stride_samples,
        magnitude_squared=True)
    mfcc_fingerprint = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=self.dct_coefficient_count)
    return mfcc_fingerprint
def prepare_processing_graph(self):
    # NOTE: desired_samples, window_size_samples, window_stride_samples and
    # dct_coefficient_count are assumed to be defined in the enclosing scope.
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
    spectrogram = contrib_audio.audio_spectrogram(
        wav_decoder.audio,
        window_size=window_size_samples,
        stride=window_stride_samples,
        magnitude_squared=True)
    print('spectrogram', spectrogram)
    print('dct_coefficient_count', dct_coefficient_count)
    print('wav_decoder.sample_rate', wav_decoder.sample_rate)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=dct_coefficient_count)
    print('self.mfcc_', self.mfcc_)
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
    graph = tf.Graph()
    with graph.as_default():
        words_list = input_data.prepare_words_list(wanted_words.split(','))
        model_settings = models.prepare_model_settings(
            len(words_list), sample_rate, clip_duration_ms, window_size_ms,
            window_stride_ms, dct_coefficient_count)
        runtime_settings = {'clip_stride_ms': clip_stride_ms}
        wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
        decoded_sample_data = contrib_audio.decode_wav(
            wav_data_placeholder,
            desired_channels=1,
            desired_samples=model_settings['desired_samples'],
            name='decoded_sample_data')
        spectrogram = contrib_audio.audio_spectrogram(
            decoded_sample_data.audio,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            decoded_sample_data.sample_rate,
            dct_coefficient_count=dct_coefficient_count)
        fingerprint_frequency_size = model_settings['dct_coefficient_count']
        fingerprint_time_size = model_settings['spectrogram_length']
        reshaped_input = tf.reshape(
            fingerprint_input,
            [-1, fingerprint_time_size * fingerprint_frequency_size])
        logits = models.create_model(reshaped_input,
                                     model_settings,
                                     model_architecture,
                                     is_training=False,
                                     runtime_settings=runtime_settings)
        # Create an output to use for inference.
        tf.nn.softmax(logits, name='labels_softmax')
    return graph
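A hedged end-to-end sketch of using the graph returned above; the argument values and filenames are illustrative, and initializing variables here merely stands in for restoring a trained checkpoint.

graph = create_inference_graph('yes,no', 16000, 1000, 30, 30.0, 10.0, 40, 'conv')
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())  # stand-in for checkpoint restore
    with open('test.wav', 'rb') as f:  # hypothetical file
        wav_data = f.read()
    probs = sess.run('labels_softmax:0', feed_dict={'wav_data:0': wav_data})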
def create_decoder_graph():  # may need to pass in session
    """Creates the input of the CNN model based off of this paper
    https://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf

    Returns:
      input node and output node
    """
    words_list = prepare_words_list(FLAGS.wanted_words.split(','))
    model_settings = prepare_model_settings(
        len(words_list), FLAGS.sample_rate, FLAGS.clip_duration_ms,
        FLAGS.window_size_ms, FLAGS.window_stride_ms,
        FLAGS.dct_coefficient_count)
    runtime_settings = {'clip_stride_ms': FLAGS.clip_stride_ms}
    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        decoded_sample_data.sample_rate,
        dct_coefficient_count=FLAGS.dct_coefficient_count)
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size * fingerprint_frequency_size])
    input_frequency_size = model_settings['dct_coefficient_count']
    input_time_size = model_settings['spectrogram_length']
    fingerprint_4d = tf.reshape(reshaped_input,
                                [-1, input_time_size, input_frequency_size, 1])
    return wav_data_placeholder, fingerprint_4d
def process_audio(audio_data):
    with tf.Session() as sess:
        data_input = tf.compat.v1.placeholder(dtype=tf.float32,
                                              shape=(16000, 1))
        spectrum = audio_ops.audio_spectrogram(input=data_input,
                                               window_size=640,
                                               stride=320,
                                               magnitude_squared=True,
                                               name='AudioSpectrogram')
        final = audio_ops.mfcc(spectrogram=spectrum,
                               sample_rate=RATE,
                               upper_frequency_limit=4000.0,
                               lower_frequency_limit=20.0,
                               filterbank_channel_count=40,
                               dct_coefficient_count=10,
                               name='Mfcc')
        data_out = sess.run(final, feed_dict={data_input: audio_data})
    return data_out
def Features(conf):
    sound = tf.placeholder(tf.float32, [None, None], name='wav')
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        sound,
        window_size=conf['window_size_samples'],
        stride=conf['window_stride_samples'],
        magnitude_squared=True,
        name='spectogram')
    mfcc = contrib_audio.mfcc(
        spectrogram,
        conf['sample_rate'],
        dct_coefficient_count=conf['dct_coefficient_count'],
        name='mfcc')
    spect_norm = spectrogram / tf.reduce_sum(spectrogram, [1, 2])
    mfcc_norm = mfcc / tf.reduce_sum(mfcc, [1, 2])
    return sound, spectrogram, mfcc, spect_norm, mfcc_norm
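A sketch of feeding the Features graph above; audio_spectrogram treats its input as [samples, channels], so a 1-second mono clip would be fed with shape (16000, 1). The conf values are illustrative assumptions.

import numpy as np
conf = {'window_size_samples': 480, 'window_stride_samples': 160,
        'sample_rate': 16000, 'dct_coefficient_count': 40}  # illustrative
sound, spec, mfcc, spec_norm, mfcc_norm = Features(conf)
with tf.Session() as sess:
    wav = np.zeros((16000, 1), dtype=np.float32)  # silent mono clip
    out = sess.run(mfcc, feed_dict={sound: wav})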
def wav2Input(filename, model_settings):
    # print('wav2Input ', filename)
    desired_samples = model_settings['desired_samples']
    foreground_volume = 1
    time_shift_padding = [[0, 0], [0, 0]]
    time_shift_offset = [0, 0]
    # background_data = 0
    # background_volume = 0
    wav_loader = io_ops.read_file(filename)
    wav_decoder = contrib_audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
    # scaled_foreground = tf.multiply(wav_decoder.audio, foreground_volume)
    # padded_foreground = tf.pad(scaled_foreground, time_shift_padding,
    #                            mode='CONSTANT')
    # sliced_foreground = tf.slice(padded_foreground, time_shift_offset,
    #                              [desired_samples, -1])
    # background_mul = tf.multiply(background_data, background_volume)
    # background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(wav_decoder.audio, -1.0, 1.0)
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    mfcc = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
    mfcc = tf.clip_by_value(mfcc, -10.0, 127.0)
    with tf.Session() as sess:
        mfcc = sess.run(mfcc)
    return mfcc
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output nodes that are needed to use the graph for inference.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      clip_stride_ms: How often to run recognition. Useful for models with cache.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      feature_bin_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      preprocess: How the spectrogram is processed to produce features, for
        example 'mfcc' or 'average'.

    Raises:
      Exception: If the preprocessing mode isn't recognized.
    """
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, feature_bin_count, preprocess)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}
    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    if preprocess == 'average':
        fingerprint_input = tf.nn.pool(
            tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
    elif preprocess == 'mfcc':
        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            sample_rate,
            dct_coefficient_count=model_settings['fingerprint_width'])
    else:
        raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or'
                        ' "average")' % (preprocess))
    fingerprint_size = model_settings['fingerprint_size']
    reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])
    logits = models.create_model(
        reshaped_input,
        model_settings,
        model_architecture,
        is_training=False,
        runtime_settings=runtime_settings)
    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output nodes that are needed to use the graph for inference.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      clip_stride_ms: How often to run recognition. Useful for models with cache.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      feature_bin_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      preprocess: How the spectrogram is processed to produce features, for
        example 'mfcc', 'average', or 'micro'.

    Raises:
      Exception: If the preprocessing mode isn't recognized.
    """
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, feature_bin_count, preprocess)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}
    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    if preprocess == 'average':
        fingerprint_input = tf.nn.pool(
            tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
    elif preprocess == 'mfcc':
        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            sample_rate,
            dct_coefficient_count=model_settings['fingerprint_width'])
    elif preprocess == 'micro':
        if not frontend_op:
            raise Exception(
                'Micro frontend op is currently not available when running'
                ' TensorFlow directly from Python, you need to build and run'
                ' through Bazel, for example'
                ' `bazel run tensorflow/examples/speech_commands:freeze_graph`')
        sample_rate = model_settings['sample_rate']
        window_size_ms = (model_settings['window_size_samples'] *
                          1000) / sample_rate
        window_step_ms = (model_settings['window_stride_samples'] *
                          1000) / sample_rate
        int16_input = tf.cast(
            tf.multiply(decoded_sample_data.audio, 32767), tf.int16)
        micro_frontend = frontend_op.audio_microfrontend(
            int16_input,
            sample_rate=sample_rate,
            window_size=window_size_ms,
            window_step=window_step_ms,
            num_channels=model_settings['fingerprint_width'],
            out_scale=1,
            out_type=tf.float32)
        fingerprint_input = tf.multiply(micro_frontend, (10.0 / 256.0))
    else:
        raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
                        ' "average", or "micro")' % (preprocess))
    fingerprint_size = model_settings['fingerprint_size']
    reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])
    logits = models.create_model(
        reshaped_input,
        model_settings,
        model_architecture,
        is_training=False,
        runtime_settings=runtime_settings)
    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
def prepare_processing_graph(self, model_settings, summaries_dir):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
    """
    with tf.get_default_graph().name_scope('data'):
        desired_samples = model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.placeholder(
            tf.string, [], name='wav_filename')
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(
            wav_loader, desired_channels=1, desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='foreground_volume')
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(
            tf.int32, [2, 2], name='time_shift_padding')
        self.time_shift_offset_placeholder_ = tf.placeholder(
            tf.int32, [2], name='time_shift_offset')
        padded_foreground = tf.pad(
            scaled_foreground,
            self.time_shift_padding_placeholder_,
            mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])
        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [desired_samples, 1], name='background_data')
        self.background_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='background_volume')
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        spectrogram = contrib_audio.audio_spectrogram(
            background_clamp,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        tf.summary.image(
            'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)
        # The number of buckets in each FFT row in the spectrogram will depend
        # on how many input samples there are in each window. This can be quite
        # large, with a 160 sample window producing 127 buckets for example. We
        # don't need this level of detail for classification, so we often want
        # to shrink them down to produce a smaller result. That's what this
        # section implements. One method is to use average pooling to merge
        # adjacent buckets, but a more sophisticated approach is to apply the
        # MFCC algorithm to shrink the representation.
        if model_settings['preprocess'] == 'average':
            self.output_ = tf.nn.pool(
                tf.expand_dims(spectrogram, -1),
                window_shape=[1, model_settings['average_window_width']],
                strides=[1, model_settings['average_window_width']],
                pooling_type='AVG',
                padding='SAME')
            tf.summary.image('shrunk_spectrogram', self.output_, max_outputs=1)
        elif model_settings['preprocess'] == 'mfcc':
            self.output_ = contrib_audio.mfcc(
                spectrogram,
                wav_decoder.sample_rate,
                dct_coefficient_count=model_settings['fingerprint_width'])
            tf.summary.image(
                'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
        else:
            raise ValueError('Unknown preprocess mode "%s" (should be "mfcc"'
                             ' or "average")' % (model_settings['preprocess']))
        # Merge all the summaries and write them out to /tmp/retrain_logs (by
        # default).
        self.merged_summaries_ = tf.summary.merge_all(scope='data')
        self.summary_writer_ = tf.summary.FileWriter(summaries_dir + '/data',
                                                     tf.get_default_graph())