def testStaticShapeInference_NegativeChannelCountInvalid(self):
  with self.test_session():
    with six.assertRaisesRegex(self, Exception,
                               r'channel_count must be positive'):
      ffmpeg.decode_audio(b'~~~ wave ~~~', file_format='wav',
                          samples_per_second=44100, channel_count=-2)
def testInvalidFile(self):
  with self.test_session():
    contents = 'invalid file'
    audio_op = ffmpeg.decode_audio(contents, file_format='wav',
                                   samples_per_second=10000, channel_count=2)
    audio = audio_op.eval()
    self.assertEqual(audio.shape, (0, 0))
def testStaticShapeInference_ConstantChannelCount(self):
  with self.test_session():
    audio_op = ffmpeg.decode_audio(b'~~~ wave ~~~', file_format='wav',
                                   samples_per_second=44100, channel_count=2)
    self.assertEqual([None, 2], audio_op.shape.as_list())
def _loadFileAndTest(self, filename, file_format, duration_sec,
                     samples_per_second, channel_count):
  """Loads an audio file and validates the output tensor.

  Args:
    filename: The filename of the input file.
    file_format: The format of the input file.
    duration_sec: The duration of the audio contained in the file in seconds.
    samples_per_second: The desired sample rate in the output tensor.
    channel_count: The desired channel count in the output tensor.
  """
  with self.test_session():
    path = os.path.join(resource_loader.get_data_files_path(), 'testdata',
                        filename)
    with open(path, 'rb') as f:
      contents = f.read()

    audio_op = ffmpeg.decode_audio(
        contents,
        file_format=file_format,
        samples_per_second=samples_per_second,
        channel_count=channel_count)
    audio = audio_op.eval()
    self.assertEqual(len(audio.shape), 2)
    self.assertNear(
        duration_sec * samples_per_second,
        audio.shape[0],
        # Duration should be specified within 10%:
        0.1 * audio.shape[0])
    self.assertEqual(audio.shape[1], channel_count)
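# Hypothetical caller sketch (the duration value below is an assumption): a
# concrete test exercises the helper above like this, using the mono_10khz.wav
# fixture referenced by the round-trip tests later in this section.
def testDecodeMonoWav(self):
  self._loadFileAndTest('mono_10khz.wav', 'wav', duration_sec=0.57,
                        samples_per_second=10000, channel_count=1)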
def testStaticShapeInference_NonConstantChannelCount(self):
  with self.test_session():
    channel_count = array_ops.placeholder(dtypes.int32)
    audio_op = ffmpeg.decode_audio(b'~~~ wave ~~~', file_format='wav',
                                   samples_per_second=44100,
                                   channel_count=channel_count)
    self.assertEqual([None, None], audio_op.shape.as_list())
def testRoundTripWithPlaceholderSampleRate(self):
  with self.test_session():
    placeholder = array_ops.placeholder(dtypes.int32)
    audio_op = ffmpeg.decode_audio(self._contents, file_format='wav',
                                   samples_per_second=placeholder,
                                   channel_count=1)
    encode_op = ffmpeg.encode_audio(audio_op, file_format='wav',
                                    samples_per_second=placeholder)
    encoded_contents = encode_op.eval(feed_dict={placeholder: 10000})
    self._compareWavFiles(self._contents, encoded_contents)
def testRoundTrip(self):
  """Reads a wav file, writes it, and compares them."""
  with self.test_session():
    audio_op = ffmpeg.decode_audio(self._contents, file_format='wav',
                                   samples_per_second=10000, channel_count=1)
    encode_op = ffmpeg.encode_audio(audio_op, file_format='wav',
                                    samples_per_second=10000)
    encoded_contents = encode_op.eval()
    self._compareWavFiles(self._contents, encoded_contents)
def testRoundTrip(self):
  """Reads a wav file, writes it, and compares them."""
  with self.cached_session():
    audio_op = ffmpeg.decode_audio(
        self._contents, file_format='wav', samples_per_second=10000,
        channel_count=1)
    encode_op = ffmpeg.encode_audio(
        audio_op, file_format='wav', samples_per_second=10000)
    encoded_contents = encode_op.eval()
    self._compareWavFiles(self._contents, encoded_contents)
def testRoundTripWithPlaceholderSampleRate(self):
  with self.cached_session():
    placeholder = array_ops.placeholder(dtypes.int32)
    audio_op = ffmpeg.decode_audio(
        self._contents, file_format='wav', samples_per_second=placeholder,
        channel_count=1)
    encode_op = ffmpeg.encode_audio(
        audio_op, file_format='wav', samples_per_second=placeholder)
    encoded_contents = encode_op.eval(feed_dict={placeholder: 10000})
    self._compareWavFiles(self._contents, encoded_contents)
def mp3_tensors_from_directory(directory, batch_size, channels=2,
                               format='mp3', seconds=30, bitrate=16384):
  # Note: despite its name, `bitrate` is used as the sample rate
  # (samples_per_second) throughout this function.
  filenames = glob.glob(directory + "/**/*." + format)
  labels, total_labels = build_labels(sorted(glob.glob(directory + "/*")))
  num_examples_per_epoch = 10000

  # Create a queue that produces the filenames to read.
  classes = [labels[f.split('/')[-2]] for f in filenames]
  print("Found files", len(filenames))

  filenames = tf.convert_to_tensor(filenames, dtype=tf.string)
  classes = tf.convert_to_tensor(classes, dtype=tf.int32)
  print("[0]", filenames[0], classes[0])

  input_queue = tf.train.slice_input_producer([filenames, classes])

  # Read examples from files in the filename queue.
  print("INPUT_QUEUE", input_queue[0])
  value = tf.read_file(input_queue[0])
  print("Preloaded data", value)
  label = input_queue[1]

  min_fraction_of_examples_in_queue = 0.4
  min_queue_examples = int(num_examples_per_epoch *
                           min_fraction_of_examples_in_queue)

  data = ffmpeg.decode_audio(value, file_format=format,
                             samples_per_second=bitrate,
                             channel_count=channels)
  data = shared.resize_audio_patch.resize_audio_with_crop_or_pad(
      data, seconds * bitrate * channels, 0, True)
  tf.Tensor.set_shape(data, [seconds * bitrate, channels])
  # Normalize to [-1, 1] by the peak absolute amplitude.
  data = data / tf.reduce_max(tf.reshape(tf.abs(data), [-1]))
  print("DATA IS", data)

  x, y = _get_data(data, label, min_queue_examples, batch_size)
  return x, y, total_labels, num_examples_per_epoch
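# Hypothetical usage sketch (the directory name is an assumption; the layout
# is one subdirectory per class label, each holding mp3 files). The queue-based
# pipeline above needs queue runners started before batches can be fetched.
x, y, total_labels, num_examples = mp3_tensors_from_directory('dataset', 64)
with tf.Session() as sess:
  coord = tf.train.Coordinator()
  threads = tf.train.start_queue_runners(sess=sess, coord=coord)
  batch_x, batch_y = sess.run([x, y])
  coord.request_stop()
  coord.join(threads)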
def _loadFileAndTest(self, filename, file_format, duration_sec,
                     samples_per_second, channel_count,
                     samples_per_second_tensor=None, feed_dict=None,
                     stream=None):
  """Loads an audio file and validates the output tensor.

  Args:
    filename: The filename of the input file.
    file_format: The format of the input file.
    duration_sec: The duration of the audio contained in the file in seconds.
    samples_per_second: The desired sample rate in the output tensor.
    channel_count: The desired channel count in the output tensor.
    samples_per_second_tensor: The value to pass to the corresponding
      parameter in the instantiated `decode_audio` op. If not provided, will
      default to a constant value of `samples_per_second`. Useful for
      providing a placeholder.
    feed_dict: Used when evaluating the `decode_audio` op. If not provided,
      will be empty. Useful when providing a placeholder for
      `samples_per_second_tensor`.
    stream: A string specifying which stream from the content file should be
      decoded. The default value is '' which leaves the decision to ffmpeg.
  """
  if samples_per_second_tensor is None:
    samples_per_second_tensor = samples_per_second
  with self.test_session():
    path = os.path.join(resource_loader.get_data_files_path(), 'testdata',
                        filename)
    with open(path, 'rb') as f:
      contents = f.read()

    audio_op = ffmpeg.decode_audio(
        contents,
        file_format=file_format,
        samples_per_second=samples_per_second_tensor,
        channel_count=channel_count,
        stream=stream)
    audio = audio_op.eval(feed_dict=feed_dict or {})
    self.assertEqual(len(audio.shape), 2)
    self.assertNear(
        duration_sec * samples_per_second,
        audio.shape[0],
        # Duration should be specified within 10%:
        0.1 * audio.shape[0])
    self.assertEqual(audio.shape[1], channel_count)
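# Hypothetical sketch (the file name, duration, and stream index are
# assumptions): the `stream` argument of the helper above selects which audio
# stream ffmpeg decodes from a multi-stream container.
def testDecodeSecondStream(self):
  self._loadFileAndTest('two_streams.wav', 'wav', duration_sec=0.5,
                        samples_per_second=10000, channel_count=1, stream='1')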
def testRoundTrip(self):
  """Reads a wav file, writes it, and compares them."""
  with self.test_session():
    path = os.path.join(resource_loader.get_data_files_path(),
                        'testdata/mono_10khz.wav')
    with open(path, 'rb') as f:
      original_contents = f.read()

    audio_op = ffmpeg.decode_audio(
        original_contents, file_format='wav', samples_per_second=10000,
        channel_count=1)
    encode_op = ffmpeg.encode_audio(
        audio_op, file_format='wav', samples_per_second=10000)
    encoded_contents = encode_op.eval()
    self._compareWavFiles(original_contents, encoded_contents)
def testRoundTrip(self):
  """Reads a wav file, re-encodes it, and compares the two byte strings."""
  with self.test_session():
    path = os.path.join(resource_loader.get_data_files_path(),
                        'testdata/mono_10khz.wav')
    # The wav file is binary data, so it must be opened in binary mode.
    with open(path, 'rb') as f:
      original_contents = f.read()

    audio_op = ffmpeg.decode_audio(
        original_contents, file_format='wav', samples_per_second=10000,
        channel_count=1)
    encode_op = ffmpeg.encode_audio(
        audio_op, file_format='wav', samples_per_second=10000)
    encoded_contents = encode_op.eval()
    self.assertEqual(original_contents, encoded_contents)
def get_songs(folder, sample_rate):
  """Gather up all the singy-songs you want to train the net on.

  :param sample_rate: An integer representing the samples per second
  :param folder: String of the path to the folder containing the data,
      put a / at the end
  :return: returns a TensorArray with the decoded audio
  """
  files = listdir(folder)
  songs = tf.TensorArray(tf.float32, size=len(files))
  for i, file_name in enumerate(files):
    file_contents = tf.read_file(folder + file_name)
    # I set the channel count to 1 because I am unsure how to make more work.
    waveform = decode_audio(file_contents, 'wav', sample_rate,
                            channel_count=1)
    songs = songs.write(i, waveform)
  return songs
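# Hypothetical usage (the folder and sample rate are assumptions): materialize
# the first decoded waveform from the returned TensorArray in a session.
songs = get_songs('training_wavs/', 16000)
with tf.Session() as sess:
  first_song = sess.run(songs.read(0))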
def loadfiles(fname):
  binary = tf.read_file(fname)
  print("binary is: ", binary)
  return ffmpeg.decode_audio(binary, file_format='wav',
                             samples_per_second=48000, channel_count=2)
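# Hypothetical pipeline sketch (the file names are assumptions): `loadfiles`
# takes a scalar string tensor, so it can be mapped over a dataset of paths.
filenames = tf.constant(['a.wav', 'b.wav'])
dataset = tf.data.Dataset.from_tensor_slices(filenames).map(loadfiles)
next_waveform = dataset.make_one_shot_iterator().get_next()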
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture,
                           model_size_info=None):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    model_size_info: Optional architecture-specific size parameters.
  """
  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, dct_coefficient_count)
  if model_architecture == 'dnc':
    model_settings['batch_size'] = 1

  wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
  audio_binary = tf.read_file(wav_data_placeholder)
  decoded_sample_data = ffmpeg.decode_audio(
      audio_binary,
      file_format='wav',
      samples_per_second=model_settings['desired_samples'],
      channel_count=1)
  # `ffmpeg.decode_audio` returns a [samples, channels] float tensor; the
  # reshape keeps rank 2 because audio_spectrogram expects [length, channels].
  decoded_sample_data = tf.reshape(
      decoded_sample_data, [model_settings['desired_samples'], 1])
  spectrogram = contrib_audio.audio_spectrogram(
      decoded_sample_data,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)
  # Unlike `decode_wav`, the ffmpeg op does not attach a `sample_rate`
  # attribute to its output, so the rate is taken from the function argument.
  fingerprint_input = contrib_audio.mfcc(
      spectrogram, sample_rate, dct_coefficient_count=dct_coefficient_count)
  fingerprint_frequency_size = model_settings['dct_coefficient_count']
  fingerprint_time_size = model_settings['spectrogram_length']
  reshaped_input = tf.reshape(
      fingerprint_input,
      [-1, fingerprint_time_size * fingerprint_frequency_size])

  logits = models.create_model(
      reshaped_input, model_settings, model_architecture,
      model_size_info=model_size_info, is_training=False)

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
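# Hypothetical usage sketch (the word list, checkpoint path, architecture
# name, and wav path are all assumptions): build the graph, restore trained
# weights, then feed a wav file path through the 'wav_data' placeholder and
# fetch the softmax output by name.
create_inference_graph('yes,no', 16000, 1000, 30.0, 10.0, 40, 'conv')
with tf.Session() as sess:
  tf.train.Saver().restore(sess, 'model.ckpt')
  predictions = sess.run('labels_softmax:0',
                         feed_dict={'wav_data:0': 'sample.wav'})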
def parse_csv_line(line, vocabulary, config):
  # tf.decode_csv converts CSV records to tensors. It does not read CSV
  # files! The standard procedure to read any file is tf.data.TextLineDataset.
  # After reading the file into a tensor (NUM_LINES x 1), we interpret the
  # tensor as being in CSV format. Each line in that tensor is a scalar
  # string, which means we assume every row of the tensor (corresponding to
  # every line in the file) has multiple columns delimited by the specified
  # delimiter. The output we get is a tensor (NUM_LINES, NUM_COLUMNS).
  fields = tf.decode_csv(line, config['data']['csv_column_defaults'])

  # Note that INPUT_CSV_COLUMNS is (1 x NUM_COLUMNS) while fields is
  # (NUM_LINES, NUM_COLUMNS), so zipping gives NUM_COLUMNS tuples
  # (COLUMN_NAME, (NUM_LINES x 1)), from which we create a dict.
  features = dict(zip(config['data']['csv_columns'], fields))

  # Split the string into characters.
  # IMPORTANT NOTE: tf.string_split returns a rank-2 SparseTensor holding the
  # strings split according to the delimiter. Read more about how
  # SparseTensors are represented.
  text = tf.string_split([features[config['data']['csv_columns'][0]]],
                         delimiter="")

  # Once we have character SparseTensors, we need to encode the characters as
  # numbers. The traditional way is one-hot encoding, or one-hot encoding
  # plus an embedding matrix. With one-hot + embedding you are basically
  # choosing a row of the embedding matrix, so to make it faster, TensorFlow
  # expects the input to an embedding layer to be the row index instead of a
  # one-hot vector to be multiplied with the embedding matrix. We therefore
  # maintain a vocabulary where every character we care about maps 1-to-1 to
  # a number. This is a map operation, for which TensorFlow has tf.map_fn.
  # Note that SparseTensors do not support all the usual Tensor operations:
  # to use tf.map_fn on a SparseTensor we have to create a new SparseTensor
  # as below. Also note that the embedding layer expects indexes of dtype
  # tf.int64, and the vocabulary dict stores values as int64.
  text_idx = tf.SparseTensor(
      text.indices,
      tf.map_fn(vocabulary.text2idx, text.values, dtype=tf.int64),
      text.dense_shape)

  # Convert the SparseTensor back to dense to support later operations.
  text_idx = tf.sparse_tensor_to_dense(text_idx)  # Shape - (1, T)
  text_idx = tf.squeeze(text_idx)  # Shape - (T,)

  # We also require the length of every input sequence as an input to the
  # model. This is because we will create batches of variable-length inputs
  # where all sequences are forced to the same length by padding the end with
  # 0s. The batch is passed to a dynamic RNN, which uses the sequence lengths
  # to mask the outputs appropriately; the RNN is still unrolled to the
  # common length. This method enables mini-batch SGD on variable-length
  # inputs.
  input_sequence_lengths = tf.size(text_idx)  # Scalar

  # We are done processing the text (the input to Tacotron). Now for the
  # audio, which will be our targets. This is standard code for obtaining
  # MFCCs from audio, as given in the TF documentation; read up on Fourier
  # transforms, spectrograms and MFCCs to get an idea.
  audio_binary = tf.read_file(features[config['data']['csv_columns'][1]])
  # The sample rate used in the paper is 16000, and the channel count should
  # be 1 for Tacotron 2. STFT configuration values are as specified in the
  # paper.
  waveform = ffmpeg.decode_audio(
      audio_binary,
      file_format='wav',
      samples_per_second=config['data']['wav_sample_rate'],
      channel_count=1)
  stfts = tf.contrib.signal.stft(
      tf.transpose(waveform),
      frame_length=config['data']['frame_length'],
      frame_step=config['data']['frame_step'],
      fft_length=config['data']['fft_length'])
  magnitude_spectrograms = tf.abs(stfts)
  num_spectrogram_bins = magnitude_spectrograms.shape[-1].value

  # These are to be set according to human speech. Values specified in the
  # paper.
  lower_edge_hertz = config['data']['lower_edge_hertz']
  upper_edge_hertz = config['data']['upper_edge_hertz']
  num_mel_bins = config['data']['num_mel_bins']
  linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(
      num_mel_bins, num_spectrogram_bins, config['data']['wav_sample_rate'],
      lower_edge_hertz, upper_edge_hertz)
  mel_spectrograms = tf.tensordot(magnitude_spectrograms,
                                  linear_to_mel_weight_matrix, 1)
  mel_spectrograms = tf.squeeze(mel_spectrograms)  # Drop all size-1 dims.

  # This finishes the processing of the audio. Now we build the targets and
  # the inputs to the decoder. We append a frame of 0s at the end of the
  # targets to signal the end of the target.
  end_tensor = tf.tile([[0.0]],
                       multiples=[1, tf.shape(mel_spectrograms)[-1]])
  targets = tf.concat([mel_spectrograms, end_tensor], axis=0)
  # We prepend a frame of 0s at the start of decoder_inputs to set the input
  # at t=1.
  start_tensor = tf.tile([[0.0]],
                         multiples=[1, tf.shape(mel_spectrograms)[-1]])
  target_inputs = tf.concat([start_tensor, mel_spectrograms], axis=0)

  # Again, we require the length of every target sequence as an input to the
  # model, for the same variable-length batching and masking reasons as for
  # the inputs.
  target_sequence_lengths = tf.shape(targets)[0]

  # Return the values our model requires as a dict (just like the old
  # feed_dict structure).
  return {
      'inputs': text_idx,
      'targets': targets,
      'input_sequence_lengths': input_sequence_lengths,
      'target_sequence_lengths': target_sequence_lengths,
      'target_inputs': target_inputs,
      'debug_data': waveform
  }
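# Hypothetical pipeline sketch (the csv path, vocabulary, and config objects
# are assumptions): parse_csv_line is written to be mapped over a
# tf.data.TextLineDataset, yielding one example dict per CSV line.
dataset = tf.data.TextLineDataset('metadata.csv')
dataset = dataset.map(lambda line: parse_csv_line(line, vocabulary, config))
features = dataset.make_one_shot_iterator().get_next()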
# https://www.tensorflow.org/api_guides/python/contrib.ffmpeg
import tensorflow as tf
from tensorflow.contrib import ffmpeg

audio_binary = tf.read_file('shibuya.mp3')
waveform = ffmpeg.decode_audio(audio_binary, file_format='mp3',
                               samples_per_second=44100, channel_count=2)
uncompressed_binary = ffmpeg.encode_audio(waveform, file_format='wav',
                                          samples_per_second=44100)
print(waveform)
print(uncompressed_binary)
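# A minimal follow-on sketch (the output file name is an assumption): the ops
# above only build the graph, so the encoded wav bytes must be materialized in
# a session before they can be written to disk.
with tf.Session() as sess:
  wav_bytes = sess.run(uncompressed_binary)
with open('shibuya.wav', 'wb') as f:
  f.write(wav_bytes)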