def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'ljspeech-audio-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    # np.save(os.path.join(out_dir, audio_filename),
    #         out.astype(out_dtype), allow_pickle=False)
    # np.save(os.path.join(out_dir, mel_filename),
    #         mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
def adjust_wav_lws(labels, wav_length, fft_size=1024, hop_size=256, sample_rate=22050):
    # Shift label times to account for the left/right padding that lws adds
    # before the stft. Label times are assumed to be in units of 100 ns (1e-7 s),
    # as in HTS-style alignment files.
    l, r = audio.lws_pad_lr(wav_length, fft_size, hop_size)
    pad_l = int(l / sample_rate * 10000000)
    labels.start_times = [x + pad_l for x in labels.start_times]
    labels.end_times = [x + pad_l for x in labels.end_times]
    labels.start_times[0] = 0
    labels.end_times[-1] = int((wav_length + l + r) / sample_rate * 10000000)
def _extract_mel(wav_path):
    # Load the audio to a numpy array. Resampled if needed.
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0
    assert len(out) // N == audio.get_hop_size()

    timesteps = len(out)

    return out, mel_spectrogram, timesteps, out_dtype
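A minimal usage sketch for _extract_mel, persisting its return values with the same np.save pattern the _process_utterance variants below use; the wav path, output filenames, and out_dir are placeholders for illustration, not names taken from the original code:

# Hypothetical usage sketch; "some_utterance.wav", the .npy names and out_dir
# are placeholders, not part of the original script.
out, mel_spectrogram, timesteps, out_dtype = _extract_mel("some_utterance.wav")
np.save(os.path.join(out_dir, "some_utterance-audio.npy"),
        out.astype(out_dtype), allow_pickle=False)
np.save(os.path.join(out_dir, "some_utterance-mel.npy"),
        mel_spectrogram.astype(np.float32), allow_pickle=False)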
def _process_utterance(out_dir, index, wav_path, text, silence_threshold, fft_size):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Mu-law quantize
    quantized = P.mulaw_quantize(wav)

    # Trim silences
    start, end = audio.start_and_end_indices(quantized, silence_threshold)
    quantized = quantized[start:end]
    wav = wav[start:end]

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    quantized = np.pad(quantized, (l, r), mode="constant",
                       constant_values=P.mulaw_quantize(0))
    N = mel_spectrogram.shape[0]
    assert len(quantized) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    quantized = quantized[:N * audio.get_hop_size()]
    assert len(quantized) % audio.get_hop_size() == 0

    timesteps = len(quantized)
    wav_id = wav_path.split('/')[-1].split('.')[0]

    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    np.save(os.path.join(out_dir, audio_filename),
            quantized.astype(np.int16), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
def wavenet_data(wav):
    # WAVENET TRANSFORMATIONS

    # Mu-law quantize
    out = P.mulaw_quantize(wav, hparams.quantize_channels)
    out8 = P.mulaw_quantize(wav, 256)

    # Trim silences
    start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
    wav = wav[start:end]
    out = out[start:end]
    constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
    out_dtype = np.int16

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Plot the mel spectrogram, the padded quantized signal, and the trimmed wav
    import matplotlib.pyplot as plt
    from librosa.display import specshow

    plt.subplot(3, 1, 1)
    specshow(mel_spectrogram.T, sr=20000, hop_length=hparams.hop_size)
    plt.subplot(3, 1, 2)
    plt.plot(out)
    plt.xlim(0, len(out))
    plt.subplot(3, 1, 3)
    plt.plot(wav)
    plt.xlim(0, len(wav))
    plt.show()

    # not in-place: the quantized signal is integer-typed
    out = out / out.max()
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate

    # Load the audio to a numpy array. Resampled if needed
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    # TODO
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'cmu_arctic-audio-%05d.npy' % index
    mel_filename = 'cmu_arctic-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text, speaker_id)
def _process_utterance(out_dir, index, audio_filepath, text):
    # Load the audio to a numpy array:
    wav_whole = audio.load_wav(audio_filepath)

    if hparams.rescaling:
        wav_whole = wav_whole / np.abs(wav_whole).max() * hparams.rescaling_max

    # This is a librivox source, so the audio files are going to be v. long
    # compared to a typical 'utterance': so split the wav into chunks
    tup_results = []

    n_samples = int(8.0 * hparams.sample_rate)  # All 8 second utterances
    n_chunks = wav_whole.shape[0] // n_samples

    for chunk_idx in range(n_chunks):
        chunk_start = chunk_idx * n_samples
        chunk_end = (chunk_idx + 1) * n_samples
        if chunk_idx == n_chunks - 1:
            # This is the last chunk - allow it
            # to extend to the end of the file
            chunk_end = None
        wav = wav_whole[chunk_start:chunk_end]

        # Mu-law quantize
        if is_mulaw_quantize(hparams.input_type):
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels)

            # Trim silences
            start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
            wav = wav[start:end]
            out = out[start:end]
            constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
            out_dtype = np.int16
        elif is_mulaw(hparams.input_type):
            # [-1, 1]
            out = P.mulaw(wav, hparams.quantize_channels)
            constant_values = P.mulaw(0.0, hparams.quantize_channels)
            out_dtype = np.float32
        else:
            # [-1, 1]
            out = wav
            constant_values = 0.0
            out_dtype = np.float32

        # Compute a mel-scale spectrogram from the trimmed wav:
        # (N, D)
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

        # lws pads zeros internally before performing stft
        # this is needed to adjust time resolution
        # between audio and mel-spectrogram
        l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

        # zero pad for quantized signal
        out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
        N = mel_spectrogram.shape[0]
        assert len(out) >= N * audio.get_hop_size()

        # time resolution adjustment
        # ensure length of raw audio is multiple of hop_size so that we can use
        # transposed convolution to upsample
        out = out[:N * audio.get_hop_size()]
        assert len(out) % audio.get_hop_size() == 0

        timesteps = len(out)

        # Write the spectrograms to disk:
        audio_filename = 'librivox-audio-%04d-%05d.npy' % (index, chunk_idx)
        mel_filename = 'librivox-mel-%04d-%05d.npy' % (index, chunk_idx)
        text_idx = '%s - %05d' % (text, chunk_idx)
        np.save(os.path.join(out_dir, audio_filename),
                out.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename),
                mel_spectrogram.astype(np.float32), allow_pickle=False)

        # Add results tuple describing this training example:
        tup_results.append((audio_filename, mel_filename, timesteps, text_idx))

    # Return all the audio results tuples (unpack in caller)
    return tup_results
def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a np array:
    wav = audio.load_wav(wav_path)
    fs = hparams.sample_rate

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    out = wav
    constant_values = 0.0
    out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    p_vt = 5
    p_gl = 5
    d = 0.99
    hpfilt = 2
    preflt = p_vt + 1

    # High-pass filter speech in order to remove possible low frequency
    # fluctuations (Linear-phase FIR, Fc = 70 Hz)
    Fstop = 40  # Stopband Frequency
    Fpass = 70  # Passband Frequency
    Nfir = int(np.round(300 / 16000 * fs))  # FIR numerator order
    if Nfir % 2 == 0:
        Nfir = Nfir + 1
    '''
    It is very expensive to calculate the firls filter! However, as long as
    fs does not change, the firls filter does not change. Therefore, the
    computed filter is returned and can be passed to this function later on
    to avoid recalculating the (same) filter.
    '''
    # band edges normalized to the Nyquist frequency
    B = dsp.firls(Nfir, [0, Fstop / (fs / 2), Fpass / (fs / 2), 1],
                  [0, 0, 1, 1], [1, 1])

    '''
    Estimate the combined effect of the glottal flow and the lip radiation
    (Hg1) and cancel it out through inverse filtering. Note that before
    filtering, a mean-normalized pre-frame ramp is appended in order to
    diminish ripple in the beginning of the frame. The ramp is removed after
    filtering.
    '''
    le = int(len(out) / hparams.hop_size)
    glot = np.zeros([le, 254])
    vtfilter = np.zeros([le, 5])

    for j in range(le):
        w = out[hparams.hop_size * j:hparams.hop_size * (j + 1)]
        for i in range(hpfilt):
            w = dsp.lfilter(B, 1, np.concatenate((w, np.zeros(int(len(B) / 2) - 1))))
            w = w[int(len(B) / 2):]

        win = np.hanning(len(w))
        a = np.array([1])
        b = np.array([1, -d])
        signal = np.concatenate((np.linspace(-w[0], w[0], preflt), w))
        windowed = w * win
        t = dsp.correlate(windowed, windowed)
        l = len(win)
        try:
            Hg1 = lpcsolve(t[l:l + 1], t[l + 1:l + 2])
        except np.linalg.LinAlgError:
            Hg1 = np.zeros(1)
        y = dsp.lfilter(Hg1, 1, signal)
        y = y[preflt:]

        '''
        Estimate the effect of the vocal tract (Hvt1) and cancel it out through
        inverse filtering. The effect of the lip radiation is canceled through
        integration. Signal g1 is the first estimate of the glottal flow.
        '''
        windowedy = y * win
        r = dsp.correlate(windowedy, windowedy)
        try:
            Hvt1 = lpcsolve(r[l:l + p_vt], r[l + 1:l + p_vt + 1])
        except np.linalg.LinAlgError:
            Hvt1 = np.zeros(p_vt)
        g1 = dsp.lfilter(Hvt1, 1, signal)
        g1 = dsp.lfilter(a, b, g1)
        g1 = g1[preflt:]

        '''
        Re-estimate the effect of the glottal flow (Hg2). Cancel the contribution
        of the glottis and the lip radiation through inverse filtering and
        integration, respectively.
        '''
        windowedg1 = w * g1
        u = dsp.correlate(windowedg1, windowedg1)
        try:
            Hg2 = lpcsolve(u[l:l + p_gl], u[l + 1:l + p_gl + 1])
        except np.linalg.LinAlgError:
            Hg2 = np.zeros(p_gl)
        y = dsp.lfilter(Hg2, 1, signal)
        y = dsp.lfilter(a, b, y)
        y = y[preflt:]

        '''
        Estimate the model for the vocal tract (Hvt2) and cancel it out through
        inverse filtering. The final estimate of the glottal flow is obtained
        through canceling the effect of the lip radiation.
        '''
        windowedynew = y * win
        t = dsp.correlate(windowedynew, windowedynew)
        try:
            Hvt2 = lpcsolve(t[l:l + p_vt], t[l + 1:l + p_vt + 1])
        except np.linalg.LinAlgError:
            Hvt2 = np.zeros(p_vt)
        dg = dsp.lfilter(Hvt2, 1, signal)
        g = dsp.lfilter(a, b, dg)
        g = g[preflt:]
        dg = dg[preflt:]

        # Set vocal tract model to 'a' and glottal source spectral model to 'ag'
        a = Hvt2
        ag = Hg2

        glot[j - 1] = g.T
        vtfilter[j - 1] = a.T

    glot = np.reshape(glot, [254 * le])

    # Write the spectrograms to disk:
    audio_filename = 'ljspeech-audio-%05d.npy' % index
    glot_filename = 'ljspeech-glot-%05d.npy' % index
    vt_filename = 'ljspeech-vt-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, glot_filename),
            glot.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, vt_filename),
            vtfilter.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, glot_filename, vt_filename, timesteps, text)
def _process_utterance_single(out_dir, text, wav_path, hparams=hparams):
    # modified version of LJSpeech _process_utterance
    audio.set_hparams(hparams)

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    sr = hparams.sample_rate

    # Added from the multispeaker version
    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(lab_path):
        lab_path = os.path.splitext(wav_path)[0] + '.lab'

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        wav = clean_by_phoneme(labels, wav, sr)
        wav, _ = librosa.effects.trim(wav, top_db=25)
    else:
        if hparams.process_only_htk_aligned:
            return None
        wav, _ = librosa.effects.trim(wav, top_db=15)
    # End added from the multispeaker version

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if hparams.max_audio_length != 0 and librosa.core.get_duration(y=wav, sr=sr) > hparams.max_audio_length:
        return None
    if hparams.min_audio_length != 0 and librosa.core.get_duration(y=wav, sr=sr) < hparams.min_audio_length:
        return None

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    # Get filename from wav_path
    wav_name = os.path.basename(wav_path)
    wav_name = os.path.splitext(wav_name)[0]
    out_filename = 'audio-{}.npy'.format(wav_name)
    mel_filename = 'mel-{}.npy'.format(wav_name)
    np.save(os.path.join(out_dir, out_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (out_filename, mel_filename, timesteps, text)
def _process_utterance(wav_path, out_dir):
    fname = wav_path.split(os.sep)[-1].split(".")[0]
    audio_filename = '{}_resolved.npy'.format(fname)
    mel_filename = '{}_mel.npy'.format(fname)
    apth = os.path.join(out_dir, audio_filename)
    mpth = os.path.join(out_dir, mel_filename)
    if os.path.exists(apth) and os.path.exists(mpth):
        print("File {} already processed".format(wav_path))
        return

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    np.save(apth, out.astype(out_dtype), allow_pickle=False)
    np.save(mpth, mel_spectrogram.astype(np.float32), allow_pickle=False)
def _process_utterance(out_dir, index, speaker_id, wav_path, text,
                       silence_threshold, fft_size):
    sr = hparams.sample_rate

    # Load the audio to a numpy array. Resampled if needed
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    # TODO
    if exists(lab_path) and False:
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=20)
    else:
        wav, _ = librosa.effects.trim(wav, top_db=20)

    # Mu-law quantize
    quantized = P.mulaw_quantize(wav)

    # Trim silences
    start, end = audio.start_and_end_indices(quantized, silence_threshold)
    quantized = quantized[start:end]
    wav = wav[start:end]

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    quantized = np.pad(quantized, (l, r), mode="constant",
                       constant_values=P.mulaw_quantize(0))
    N = mel_spectrogram.shape[0]
    assert len(quantized) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    quantized = quantized[:N * audio.get_hop_size()]
    assert len(quantized) % audio.get_hop_size() == 0

    timesteps = len(quantized)
    wav_id = wav_path.split('/')[-1].split('.')[0]

    # Write the spectrograms to disk:
    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    np.save(os.path.join(out_dir, audio_filename),
            quantized.astype(np.int16), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text, speaker_id)
def _process_utterance(out_dir, index, wav_path, text, sample_rate, fft_size,
                       hop_size, n_mels, redis_connection):
    # Load the audio to a numpy array:
    wav = load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    # this really gets called if input_type in hparams
    # is changed from raw to mulaw
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    # mel_spectrogram = audio.melspectrogram(wav, 22050, 1024, 40).astype(np.float32).T
    # change this line to adjust hyperparams
    mel_spectrogram = melspectrogram(wav, sample_rate, fft_size,
                                     hop_size, n_mels).astype(np.float32).T

    # lws pads zeros internally before performing stft
    # this is needed to adjust time resolution
    # between audio and mel-spectrogram
    l, r = lws_pad_lr(wav, fft_size, hop_size)

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    # assert len(out) >= N * audio.get_hop_size()
    assert len(out) >= N * hop_size

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    # out = out[:N * audio.get_hop_size()]
    # assert len(out) % audio.get_hop_size() == 0
    out = out[:N * hop_size]
    assert len(out) % hop_size == 0

    timesteps = len(out)

    # compute example reconstruction
    # change this line to adjust hparams
    # signal = audio.inv_mel_spectrogram(mel_spectrogram,
    #                                    sample_rate, fft_size, n_mels)
    # mel_spectrogram = mel_spectrogram.T

    # Write the spectrograms to disk:
    audio_filename = 'ljspeech-audio-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    # recon_audio_filename = 'ljspeech-audio-%05d.wav' % index

    data = out.tobytes()
    target = np.asarray(text).tobytes()
    redis_connection.set(index, data + target)
    # np.save(os.path.join(out_dir, audio_filename),
    #         out.astype(out_dtype), allow_pickle=False)
    # np.save(os.path.join(out_dir, mel_filename),
    #         mel_spectrogram.astype(np.float32), allow_pickle=False)
    # audio.save_wav(signal, os.path.join(out_dir, recon_audio_filename))

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
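For context, a minimal sketch of how a per-utterance function like the first _process_utterance above is typically driven from a metadata file. The metadata filename, its pipe-separated layout, the wavs/ directory, and the ProcessPoolExecutor fan-out are illustrative assumptions, not anything defined by the functions in this section:

# Hypothetical driver sketch; metadata.csv layout ("<id>|<text>"), the wavs/
# directory and the executor-based parallelism are assumptions for illustration.
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import os


def build_from_path(in_dir, out_dir, num_workers=1):
    os.makedirs(out_dir, exist_ok=True)
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for index, line in enumerate(f, start=1):
            wav_name, text = line.strip().split('|')[:2]
            wav_path = os.path.join(in_dir, 'wavs', wav_name + '.wav')
            futures.append(executor.submit(
                partial(_process_utterance, out_dir, index, wav_path, text)))
    # Each result is the (audio_filename, mel_filename, timesteps, text) tuple
    # returned by _process_utterance; collect them for a train.txt-style index.
    return [future.result() for future in futures]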