def _process_utterance(out_dir, index, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    sr = hparams.sample_rate

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Trim silence from hts labels if available
    lab_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")
    if exists(lab_path):
        labels = hts.load(lab_path)
        assert "sil" in labels[0][-1]
        assert "sil" in labels[-1][-1]
        b = int(labels[0][1] * 1e-7 * sr)
        e = int(labels[-1][0] * 1e-7 * sr)
        wav = wav[b:e]
    else:
        wav, _ = librosa.effects.trim(wav, top_db=30)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution
    # between audio and mel-spectrogram
    l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    assert len(out) >= N * audio.get_hop_size()

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    out = out[:N * audio.get_hop_size()]
    assert len(out) % audio.get_hop_size() == 0

    timesteps = len(out)

    # Write the spectrograms to disk:
    audio_filename = 'jsut-audio-%05d.npy' % index
    mel_filename = 'jsut-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
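# The function above returns one metadata tuple per utterance. A minimal driver
# sketch follows, showing how such a preprocessor is commonly dispatched over a
# list of (wav_path, text) pairs with a process pool and how the returned
# (audio_filename, mel_filename, timesteps, text) tuples are collected.
# This caller is an illustration only: build_from_path, the items format, and
# the num_workers default are assumptions, not part of the original source.
from concurrent.futures import ProcessPoolExecutor
from functools import partial


def build_from_path(items, out_dir, num_workers=4):
    """items: iterable of (wav_path, text) pairs for the corpus."""
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    for index, (wav_path, text) in enumerate(items, start=1):
        futures.append(executor.submit(
            partial(_process_utterance, out_dir, index, wav_path, text)))
    # Each future resolves to (audio_filename, mel_filename, timesteps, text).
    return [future.result() for future in futures]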
def _process_utterance(out_dir, index, audio_filepath, text):
    # Load the audio to a numpy array:
    wav_whole = audio.load_wav(audio_filepath)

    if hparams.rescaling:
        wav_whole = wav_whole / np.abs(wav_whole).max() * hparams.rescaling_max

    # This is a LibriVox source, so the audio files are much longer than a
    # typical 'utterance': split the wav into fixed-length chunks.
    tup_results = []

    n_samples = int(8.0 * hparams.sample_rate)  # All 8 second utterances
    n_chunks = wav_whole.shape[0] // n_samples

    for chunk_idx in range(n_chunks):
        chunk_start, chunk_end = chunk_idx * n_samples, (chunk_idx + 1) * n_samples
        if chunk_idx == n_chunks - 1:
            # This is the last chunk - allow it to extend to the end of the file
            chunk_end = None
        wav = wav_whole[chunk_start:chunk_end]

        # Mu-law quantize
        if is_mulaw_quantize(hparams.input_type):
            # [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels)

            # Trim silences
            start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
            wav = wav[start:end]
            out = out[start:end]
            constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
            out_dtype = np.int16
        elif is_mulaw(hparams.input_type):
            # [-1, 1]
            out = P.mulaw(wav, hparams.quantize_channels)
            constant_values = P.mulaw(0.0, hparams.quantize_channels)
            out_dtype = np.float32
        else:
            # [-1, 1]
            out = wav
            constant_values = 0.0
            out_dtype = np.float32

        # Compute a mel-scale spectrogram from the trimmed wav:
        # (N, D)
        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

        # lws pads zeros internally before performing stft;
        # this is needed to adjust time resolution
        # between audio and mel-spectrogram
        l, r = audio.lws_pad_lr(wav, hparams.fft_size, audio.get_hop_size())

        # zero pad for quantized signal
        out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
        N = mel_spectrogram.shape[0]
        assert len(out) >= N * audio.get_hop_size()

        # time resolution adjustment
        # ensure length of raw audio is multiple of hop_size so that we can use
        # transposed convolution to upsample
        out = out[:N * audio.get_hop_size()]
        assert len(out) % audio.get_hop_size() == 0

        timesteps = len(out)

        # Write the spectrograms to disk:
        audio_filename = 'librivox-audio-%04d-%05d.npy' % (index, chunk_idx)
        mel_filename = 'librivox-mel-%04d-%05d.npy' % (index, chunk_idx)
        text_idx = '%s - %05d' % (text, chunk_idx)
        np.save(os.path.join(out_dir, audio_filename),
                out.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename),
                mel_spectrogram.astype(np.float32), allow_pickle=False)

        # Add a results tuple describing this training example:
        tup_results.append((audio_filename, mel_filename, timesteps, text_idx))

    # Return all the audio results tuples (unpack in caller)
    return tup_results
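# Unlike the single-utterance variant above, the LibriVox variant returns a
# list of tuples (one per 8-second chunk), so the caller has to flatten the
# per-file lists before writing any metadata. A minimal caller-side sketch,
# with a hypothetical helper name:
def flatten_chunk_results(per_file_results):
    """per_file_results: iterable of the lists returned by _process_utterance."""
    metadata = []
    for tup_results in per_file_results:
        # each entry is (audio_filename, mel_filename, timesteps, text_idx)
        metadata.extend(tup_results)
    return metadata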
def _process_utterance(out_dir, index, wav_path, text,
                       sample_rate, fft_size, hop_size, n_mels,
                       redis_connection):
    # Load the audio to a numpy array:
    wav = load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Mu-law quantize
    # This branch is only taken if input_type in hparams
    # is changed from raw to mulaw
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        constant_values = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.0
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the trimmed wav:
    # (N, D)
    # mel_spectrogram = \
    #     audio.melspectrogram(wav, 22050, 1024, 40).astype(np.float32).T
    # change this line to adjust hyperparams
    mel_spectrogram = melspectrogram(wav, sample_rate, fft_size,
                                     hop_size, n_mels).astype(np.float32).T

    # lws pads zeros internally before performing stft;
    # this is needed to adjust time resolution
    # between audio and mel-spectrogram
    l, r = lws_pad_lr(wav, fft_size, hop_size)

    # zero pad for quantized signal
    out = np.pad(out, (l, r), mode="constant", constant_values=constant_values)
    N = mel_spectrogram.shape[0]
    # assert len(out) >= N * audio.get_hop_size()
    assert len(out) >= N * hop_size

    # time resolution adjustment
    # ensure length of raw audio is multiple of hop_size so that we can use
    # transposed convolution to upsample
    # out = out[:N * audio.get_hop_size()]
    # assert len(out) % audio.get_hop_size() == 0
    out = out[:N * hop_size]
    assert len(out) % hop_size == 0

    timesteps = len(out)

    # compute example reconstruction
    # change this line to adjust hparams
    # signal = audio.inv_mel_spectrogram(mel_spectrogram,
    #                                    sample_rate, fft_size, n_mels)
    # mel_spectrogram = mel_spectrogram.T

    # Write the spectrograms to disk:
    audio_filename = 'ljspeech-audio-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    # recon_audio_filename = 'ljspeech-audio-%05d.wav' % index

    # Store the quantized audio and the target text together in redis
    # (keyed by the utterance index) instead of writing .npy files to disk:
    data = out.tobytes()
    target = np.asarray(text).tobytes()
    redis_connection.set(index, data + target)
    # np.save(os.path.join(out_dir, audio_filename),
    #         out.astype(out_dtype), allow_pickle=False)
    # np.save(os.path.join(out_dir, mel_filename),
    #         mel_spectrogram.astype(np.float32), allow_pickle=False)
    # audio.save_wav(signal, os.path.join(out_dir, recon_audio_filename))

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)
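# The third variant takes the signal-processing parameters and a redis-py
# connection explicitly instead of reading everything from hparams, and stores
# the packed (audio bytes + text bytes) record in redis rather than in .npy
# files. A minimal invocation sketch; the host/port/db, file path, transcript,
# and parameter values below are placeholders, not taken from the source.
import redis

if __name__ == "__main__":
    redis_connection = redis.StrictRedis(host="localhost", port=6379, db=0)
    _process_utterance(
        out_dir="training_data",
        index=0,
        wav_path="wavs/LJ001-0001.wav",
        text="Printing, in the only sense with which we are at present concerned",
        sample_rate=22050,
        fft_size=1024,
        hop_size=256,
        n_mels=80,
        redis_connection=redis_connection,
    )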