def __init__(self, coordinator, in_dir, logger):
    '''
    Set up the feeder: load and shuffle the training metadata, then build
    the placeholders and the FIFO queue that batches are read from.

    Param:
        coordinator: stored on the instance; not otherwise used here
        in_dir: directory that contains 'train.txt'
        logger: handed to load_metadata and stored on the instance
    '''
    super(DataFeeder, self).__init__()
    self._coordinator = coordinator
    self._in_dir = in_dir
    self._logger = logger

    # Load the sample list and randomize its order once, up front.
    metadata_path = os.path.join(in_dir, 'train.txt')
    self._metadata = load_metadata(metadata_path, self._logger)
    random.shuffle(self._metadata)
    self._num_samples = len(self._metadata)
    self._cursor = 0  # index of the next sample

    # Keep the hyper parameters plus the few values exposed as attributes.
    self._hparams = hparams
    self.batch_size = hparams.get('batch_size')
    self.superbatch_size = hparams.get('superbatch_size')
    self.outputs_per_step = hparams.get('outputs_per_step')

    # Placeholders for inputs and targets, created in queue-component order.
    inputs = tf.placeholder(tf.int32, [None, None], 'inputs')
    input_lengths = tf.placeholder(tf.int32, [None], 'input_lengths')
    mel_targets = tf.placeholder(
        tf.float32, [None, None, hparams.get('num_mels')], 'mel_targets')
    linear_targets = tf.placeholder(
        tf.float32, [None, None, hparams.get('num_freq')], 'linear_targets')
    self._placeholders = [inputs, input_lengths, mel_targets, linear_targets]

    # FIFO queue of capacity 8 that buffers the fed data; one dtype per
    # placeholder, in the same order.
    component_dtypes = [tf.int32, tf.int32, tf.float32, tf.float32]
    input_queue = tf.FIFOQueue(8, component_dtypes, name='input_queue')
    self._enqueue_operation = input_queue.enqueue(self._placeholders)

    # The current batch wraps the dequeue op; its shapes are set from the
    # placeholders.
    self.current_batch = Batch(input_queue.dequeue(), prep=False)
    self.current_batch.set_shapes(self._placeholders)
def _build_mel_basis():
    '''
    Creates a filterbank matrix to combine FFT bins into mel-frequency bins.

    Output:
        The result of librosa.filters.mel for the configured sample rate,
        FFT size and number of mel bands.
    '''
    # sr and n_fft are passed by keyword: recent librosa releases made them
    # keyword-only, while older releases accept the keyword form as well.
    return librosa.filters.mel(
        sr=hparams.get('sample_rate'),
        n_fft=hparams.get('n_fft'),
        n_mels=hparams.get('num_mels'))
def spectrogram_tensorflow_inv(spect):
    '''
    Builds the TensorFlow graph that turns a spectrogram into a waveform.

    Unlike spectrogram_inv, this does NOT invert the preemphasis. The caller
    should call inv_preemphasis on the output after running the graph.
    '''
    # Undo the normalization and the reference-level shift, then leave dB.
    level_db = _denormalize_tensorflow(spect) + hparams.get('ref_level_db')
    magnitudes = _db_to_amp_tensorflow(level_db)
    # Raise the magnitudes to the configured power before Griffin-Lim.
    sharpened = tf.pow(magnitudes, hparams.get('power'))
    return _griffin_lim_tensorflow(sharpened)
def _normalize_inv(S):
    '''
    Input
        S: A normalized spectrogram
    Output
        S clipped to [0, 1] and mapped back onto the [min_level_db, 0] range,
        i.e. the inverse of _normalize. This is used in synthesizing.
    '''
    min_level_db = hparams.get('min_level_db')
    clipped = np.clip(S, 0, 1)
    return (clipped * -float(min_level_db)) + min_level_db
def _normalize(S):
    '''
    Input
        S: Spectrogram
    Output
        The spectrogram rescaled by min_level_db and clipped to [0, 1]:
        a value of min_level_db maps to 0 and a value of 0 maps to 1.
    '''
    min_level_db = hparams.get('min_level_db')
    rescaled = (S - min_level_db) / -float(min_level_db)
    return np.clip(rescaled, 0, 1)
def spectrogram_inv(spect):
    '''
    Input
        spect: A linear spectrogram
    Output
        A waveform estimated from the spectrogram with the Griffin-Lim
        algorithm. This is used in synthesizing.
    '''
    # Unwind normalization and dB-scaling.
    level_db = _normalize_inv(spect) + hparams.get('ref_level_db')
    magnitudes = _db_to_amp(level_db)
    # Apply the Griffin-Lim algorithm, then unwind the pre-emphasis.
    waveform = _griffin_lim(magnitudes ** hparams.get('power'))
    return pre_emphasis_inv(waveform)
def _prepare_batch(self, outputs_per_step=hparams.get('outputs_per_step')):
    '''
    Runs input preparation followed by target preparation.

    Param:
        outputs_per_step: not read by this method's body; its default is
            looked up in hparams once, when the function is defined.
    '''
    # Inputs are prepared before targets.
    self._prepare_inputs()
    self._prepare_targets()
def get_embedds(self):
    '''
    Looks up the embedding vectors for self._inputs.

    Output:
        The rows of the 'embedding' variable, which has one row per entry
        of chars and embedded_depth columns, selected by self._inputs.
    '''
    table_shape = [len(chars), hparams.get('embedded_depth')]
    table_init = tf.truncated_normal_initializer(stddev=0.5)
    embedding_table = tf.get_variable(
        'embedding', table_shape, dtype=tf.float32, initializer=table_init)
    return tf.nn.embedding_lookup(embedding_table, self._inputs)
def _stft_params():
    '''
    Output
        Given the hyper parameters, return the parameters needed by the
        librosa STFT methods as a tuple (n_fft, hop_length, win_length):

        n_fft: The FFT window size
        hop_length: The number of audio samples between STFT columns
        win_length: Each frame of audio is windowed with a window of this
            length and then zero-padded to match up with n_fft
    '''
    def _ms_to_samples(duration_ms):
        # Convert a duration in milliseconds to a whole number of samples.
        return int(duration_ms / 1000 * hparams.get('sample_rate'))

    n_fft = hparams.get('n_fft')
    hop_length = _ms_to_samples(hparams.get('frame_shift_ms'))
    win_length = _ms_to_samples(hparams.get('frame_length_ms'))
    return n_fft, hop_length, win_length
def round_up(x):
    '''
    Round x up to the nearest multiple of the outputs_per_step
    hyper parameter.

    Param:
        x: an integer
    Output:
        x unchanged when it is already a multiple of outputs_per_step,
        otherwise the next multiple above it
    '''
    step = hparams.get('outputs_per_step')
    remainder = x % step
    return x if remainder == 0 else x + step - remainder
def find_endpoint(wav, threshold_db=-40, min_silence_sec=0.8):
    '''
    Scan the waveform for the first window that stays below a threshold.

    Param:
        wav: the waveform samples
        threshold_db: level, in dB, converted to an amplitude threshold
        min_silence_sec: window duration in seconds
    Output:
        start + hop_length for the first window whose maximum sample is
        below the threshold, or len(wav) when no such window exists
    '''
    window_length = int(hparams.get('sample_rate') * min_silence_sec)
    hop_length = int(window_length / 4)
    threshold = _db_to_amp(threshold_db)

    # Windows start at hop_length and advance a quarter window at a time.
    for start in range(hop_length, len(wav) - window_length, hop_length):
        window = wav[start:start + window_length]
        # NOTE(review): compares the raw maximum, not the absolute value.
        if np.max(window) < threshold:
            return start + hop_length
    return len(wav)
def load_metadata(path, logger):
    '''
    Loads the metadata generated by the prep functions at the given path.

    Param:
        path: the '|'-separated metadata file to read
        logger: object whose log method receives a one-line summary
    Output:
        A list with one entry per line; each entry is the list of the
        line's '|'-separated fields
    '''
    with open(path, encoding='utf-8') as f:
        metadata = [line.strip().split('|') for line in f]

    # The third field of every entry is a frame count.
    total_frames = sum(int(fields[2]) for fields in metadata)
    hours = total_frames * hparams.get('frame_shift_ms') / (3600 * 1000)

    summary = 'Loaded metadata for %d examples (%.2f hours)' % (len(metadata), hours)
    logger.log(summary)
    return metadata
def _griffin_lim_tensorflow(S):
    '''
    TensorFlow implementation of Griffin-Lim.

    Based on https://github.com/Kyubyong/tensorflow-exercises/blob/master/Audio_Processing.ipynb
    '''
    with tf.variable_scope('griffinlim'):
        # TensorFlow's stft and istft operate on a batch of spectrograms;
        # create a batch of size 1.
        batched = tf.expand_dims(S, 0)
        magnitudes = tf.identity(tf.cast(batched, dtype=tf.complex64))
        waveform = _istft_tensorflow(magnitudes)
        for _ in range(hparams.get('griffin_lim_iters')):
            estimate = _stft_tensorflow(waveform)
            # Floor the magnitude at 1e-8 so the division never hits zero.
            floored = tf.maximum(1e-8, tf.abs(estimate))
            phase = estimate / tf.cast(floored, tf.complex64)
            waveform = _istft_tensorflow(magnitudes * phase)
        # Drop the batch axis that was added above.
        return tf.squeeze(waveform, 0)
def _griffin_lim(spect):
    '''
    Input
        spect: A spectrogram
    Output
        Apply the Griffin-Lim Algorithm (GLA) on the spectrogram to estimate
        the signal that has been STFTed. The iteration starts from random
        phases, so the result differs between calls unless numpy's random
        state is fixed.
    '''
    # Start from uniformly random phases.
    angles = np.exp(2j * np.pi * np.random.rand(*spect.shape))
    # Builtin complex replaces the np.complex alias, which newer NumPy
    # releases no longer provide; both name the same type.
    S_complex = np.abs(spect).astype(complex)
    y = _stft_inv(S_complex * angles)
    for _ in range(hparams.get('griffin_lim_iters')):
        # Keep the target magnitudes; take the phase of the current estimate.
        angles = np.exp(1j * np.angle(_stft(y)))
        y = _stft_inv(S_complex * angles)
    return y
def pad_input(x, length):
    '''
    Given a one-dimensional sequence, x, and an int, length, add
    length - len(x) pad values to the back of the sequence and return
    a numpy vector.

    Param:
        x: a list or 1-D numpy array
        length: an integer, the length of the padded output
    Output:
        A padded numpy vector
    '''
    # len() accepts both lists and arrays; x.shape only exists on arrays.
    missing = length - len(x)
    return np.pad(x, (0, missing), mode='constant',
                  constant_values=hparams.get('pad_value'))
def mel_spectrogram(y):
    '''
    Input
        y: a numpy array representing a sound signal
    Output
        A normalized mel-scaled spectrogram of the pre-emphasized signal,
        in dB relative to ref_level_db.

    TODO
        Thresholding at ref_level_db is never discussed in the tacotron paper
    '''
    # STFT of the pre-emphasized signal.
    stft_result = _stft(pre_emphasis(y))
    # Magnitudes -> mel bands -> dB, shifted by the reference level.
    mel_magnitudes = _linspect_to_melspect(np.abs(stft_result))
    level_db = _amp_to_db(mel_magnitudes) - hparams.get('ref_level_db')
    return _normalize(level_db)
def write_metadata(metadata, output_dir):
    '''
    Writes dataset metadata to train.txt in the given output directory,
    one '|'-joined line per entry, holding for every file:
    "{lin spec file name} | {mel spec file name} | {num frames} | {text}"

    Afterwards prints the utterance count, total frames and hours, and the
    maximum input and output lengths.
    '''
    target_path = os.path.join(output_dir, 'train.txt')
    with open(target_path, 'w', encoding='utf-8') as f:
        f.writelines('|'.join(map(str, entry)) + '\n' for entry in metadata)

    # The third field of each entry is its frame count.
    frames = sum(entry[2] for entry in metadata)
    hours = frames * hparams.get('frame_shift_ms') / (3600 * 1000)
    print('Wrote %d utterances, %d frames (%.2f hours)' % (len(metadata), frames, hours))
    print('Max input length: %d' % max(len(entry[3]) for entry in metadata))
    print('Max output length: %d' % max(entry[2] for entry in metadata))
def pad_target(t, length):
    '''
    Pad a 2d target spectrogram along its first axis.

    Per the original description the first axis is time and the second is
    frequency; only the end of the first axis is padded, the second axis
    is left untouched.

    Param:
        t: a numpy 2d array
        length: an integer, the size of the first axis after padding
    Output:
        A padded 2d numpy array
    '''
    missing = length - t.shape[0]
    pad_widths = [(0, missing), (0, 0)]
    return np.pad(t, pad_widths, mode='constant',
                  constant_values=hparams.get('pad_value'))
def pre_emphasis(x):
    '''
    Input
        x: a numpy array representing a sound signal
    Output
        Applies a pre-emphasis filter on the signal to amplify the high
        frequencies. Given an input signal x, the emphasized signal y is
        y(t) = x(t) - a*x(t-1), where a is the pre emphasis coefficient.

    This is done with lfilter, where lfilter(b, a, x) implements
        a[0]*y[n] = b[0]*x[n] + b[1]*x[n-1] + ... + b[M]*x[n-M]
                              - a[1]*y[n-1] - ... - a[N]*y[n-N]
    '''
    coefficient = float(hparams.get('preemphasis'))
    numerator = [1, -coefficient]
    denominator = [1]
    return signal.lfilter(numerator, denominator, x)
def spectrogram(y):
    '''
    Input
        y: a numpy array representing a sound signal
    Output
        A normalized linear-scale spectrogram of the pre-emphasized signal,
        in dB relative to ref_level_db.

    TODO
        Thresholding at ref_level_db is never discussed in the tacotron paper
    '''
    # Short-time Fourier transform of the pre-emphasized input signal.
    stft_result = _stft(pre_emphasis(y))
    # Convert the magnitudes to dB and shift by the reference level.
    level_db = _amp_to_db(np.abs(stft_result)) - hparams.get('ref_level_db')
    # Finally normalize the output.
    return _normalize(level_db)
def load_wav(path):
    '''
    Loads a single waveform file from disk at the given path, at the
    configured sample rate, and returns only the samples.
    '''
    # librosa returns (samples, sample_rate); the rate is discarded.
    samples, _ = librosa.core.load(path, sr=hparams.get('sample_rate'))
    return samples
def _denormalize_tensorflow(S):
    '''
    TensorFlow counterpart of _normalize_inv: clips S to [0, 1] and maps
    it back onto the [min_level_db, 0] range.
    '''
    min_level_db = hparams.get('min_level_db')
    clipped = tf.clip_by_value(S, 0, 1)
    return (clipped * -float(min_level_db)) + min_level_db
def pre_emphasis_inv(x):
    '''
    Rewinds the pre emphasis filter by running lfilter with the
    numerator and denominator of pre_emphasis swapped.
    This is used in synthesizing.
    '''
    coefficient = float(hparams.get('preemphasis'))
    numerator = [1]
    denominator = [1, -coefficient]
    return signal.lfilter(numerator, denominator, x)
def save_wav(wav, path):
    '''
    Scale a waveform to the 16-bit range and write it to disk as a wav
    file at the configured sample rate.

    Param:
        wav: a numpy array of samples; it is NOT modified by this call
        path: destination of the wav file
    '''
    # Imported locally so this function carries its own dependency;
    # librosa.output.write_wav no longer exists in librosa 0.8 and later.
    from scipy.io import wavfile

    # Floor the peak at 0.01 so a silent clip is not amplified without bound.
    peak = max(0.01, np.max(np.abs(wav)))
    # Scale into a new array rather than in place, leaving the caller's
    # buffer untouched.
    scaled = wav * (32767 / peak)
    wavfile.write(path, hparams.get('sample_rate'), scaled.astype(np.int16))