def _process_utterance(out_dir, index, wav_path, text):
    """Prepare the audio target and mel-spectrogram for one LJSpeech clip.

    NOTE: both ``np.save`` calls are disabled in this variant, so nothing is
    written to ``out_dir``; the function only reports the file names the
    features would be stored under.

    Args:
        out_dir: Output directory (not used while saving is disabled).
        index: Integer formatted into the output file names.
        wav_path: Path handed to ``audio.load_wav``.
        text: Transcript, returned unchanged.

    Returns:
        ``(audio_filename, mel_filename, timesteps, text)`` where
        ``timesteps`` is the length of the hop-aligned audio target.
    """
    signal = audio.load_wav(wav_path)

    if hparams.rescaling:
        signal = signal / np.abs(signal).max() * hparams.rescaling_max

    # Build the waveform target for the configured input type.
    if is_mulaw_quantize(hparams.input_type):
        # Quantized values lie in [0, quantize_channels)
        target = P.mulaw_quantize(signal, hparams.quantize_channels)
        # Silence is trimmed in the quantized domain; the raw signal is cut
        # with the same indices so both stay aligned.
        first, last = audio.start_and_end_indices(
            target, hparams.silence_threshold)
        signal = signal[first:last]
        target = target[first:last]
        pad_value = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # Values in [-1, 1]
        target = P.mulaw(signal, hparams.quantize_channels)
        pad_value = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # Raw signal, values in [-1, 1]
        target = signal
        pad_value = 0.0
        out_dtype = np.float32  # only needed by the disabled save step

    # Mel-scale spectrogram of the (possibly trimmed) signal, transposed so
    # frames come first: (N, D)
    mel_spectrogram = audio.melspectrogram(signal).astype(np.float32).T

    # lws pads zeros internally before the STFT, so the target is padded the
    # same way to keep audio and mel time resolutions in step.
    hop = audio.get_hop_size()
    pad_left, pad_right = audio.lws_pad_lr(signal, hparams.fft_size, hop)
    target = np.pad(target, (pad_left, pad_right), mode="constant",
                    constant_values=pad_value)

    n_frames = mel_spectrogram.shape[0]
    assert len(target) >= n_frames * hop

    # Keep exactly hop_size samples per frame so transposed convolution can
    # upsample the conditioning features.
    target = target[:n_frames * hop]
    assert len(target) % hop == 0

    timesteps = len(target)

    audio_filename = 'ljspeech-audio-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index

    return (audio_filename, mel_filename, timesteps, text)
def _process_utterance(out_dir, wav_path):
    """Return the float32 mel-spectrogram of a single wav file.

    The waveform is optionally rescaled and, for the mu-law-quantize input
    type only, trimmed of leading/trailing silence before the spectrogram is
    computed. The other input types never modified ``wav`` in the original
    code, so their unused target/padding computations have been dropped.

    Args:
        out_dir: Unused; kept so existing callers keep working.
        wav_path: Path handed to ``audio.load_wav``.

    Returns:
        ``audio.melspectrogram(wav)`` cast to float32 and transposed (the
        original comment describes the layout as (N, D)).
    """
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if is_mulaw_quantize(hparams.input_type):
        # Silence detection runs on the quantized signal, values in
        # [0, quantize_channels); only the resulting indices are needed.
        quantized = P.mulaw_quantize(wav, hparams.quantize_channels)
        start, end = audio.start_and_end_indices(
            quantized, hparams.silence_threshold)
        wav = wav[start:end]

    # A single float32 cast is enough; the original cast twice.
    return audio.melspectrogram(wav).astype(np.float32).T
def _extract_mel(wav_path):
    """Compute the aligned audio target and mel-spectrogram for a wav file.

    Args:
        wav_path: Path handed to ``audio.load_wav``.

    Returns:
        ``(out, mel_spectrogram, timesteps, out_dtype)``: the padded and
        hop-aligned audio target, the float32 mel-spectrogram (frames
        first), ``len(out)``, and the NumPy dtype the target should be
        stored as.
    """
    signal = audio.load_wav(wav_path)

    if hparams.rescaling:
        signal = signal / np.abs(signal).max() * hparams.rescaling_max

    if is_mulaw_quantize(hparams.input_type):
        # Quantized values lie in [0, quantize_channels)
        target = P.mulaw_quantize(signal, hparams.quantize_channels)
        # Trim silences, cutting signal and target with the same indices.
        first, last = audio.start_and_end_indices(
            target, hparams.silence_threshold)
        signal = signal[first:last]
        target = target[first:last]
        pad_value = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # Values in [-1, 1]
        target = P.mulaw(signal, hparams.quantize_channels)
        pad_value = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # Raw signal, values in [-1, 1]
        target = signal
        pad_value = 0.0
        out_dtype = np.float32

    # Mel-scale spectrogram of the trimmed signal: (N, D)
    mel_spectrogram = audio.melspectrogram(signal).astype(np.float32).T

    # lws pads zeros internally before performing the STFT; padding the
    # target the same way adjusts the time resolution between audio and
    # mel-spectrogram.
    hop = audio.get_hop_size()
    pad_left, pad_right = audio.lws_pad_lr(signal, hparams.fft_size, hop)
    target = np.pad(target, (pad_left, pad_right), mode="constant",
                    constant_values=pad_value)

    n_frames = mel_spectrogram.shape[0]
    assert len(target) >= n_frames * hop

    # Time resolution adjustment: make the audio length an exact multiple of
    # hop_size so transposed convolution can be used to upsample.
    target = target[:n_frames * hop]
    assert len(target) % hop == 0
    assert len(target) // n_frames == hop

    timesteps = len(target)
    return target, mel_spectrogram, timesteps, out_dtype
def _process_utterance(out_dir, index, wav_path, text, silence_threshold, fft_size):
    """Write the mu-law quantized audio and mel-spectrogram of one utterance.

    Args:
        out_dir: Directory the two ``.npy`` files are written to.
        index: Unused in this variant (file names derive from ``wav_path``).
        wav_path: Path handed to ``audio.load_wav``; its last ``/``-separated
            component up to the first ``.`` becomes the output file prefix.
        text: Transcript, returned unchanged.
        silence_threshold: Threshold passed to
            ``audio.start_and_end_indices``.
        fft_size: FFT size passed to ``audio.lws_pad_lr``.

    Returns:
        ``(audio_filename, mel_filename, timesteps, text)``.
    """
    signal = audio.load_wav(wav_path)

    # Mu-law quantize, then trim silences in the quantized domain.
    codes = P.mulaw_quantize(signal)
    first, last = audio.start_and_end_indices(codes, silence_threshold)
    codes = codes[first:last]
    signal = signal[first:last]

    # Mel-scale spectrogram of the trimmed signal: (N, D)
    mel_spectrogram = audio.melspectrogram(signal).astype(np.float32).T

    # lws pads zeros internally before performing the STFT; pad the quantized
    # signal the same way to adjust the time resolution between audio and
    # mel-spectrogram.
    hop = audio.get_hop_size()
    pad_left, pad_right = audio.lws_pad_lr(signal, fft_size, hop)
    codes = np.pad(codes, (pad_left, pad_right), mode="constant",
                   constant_values=P.mulaw_quantize(0))

    n_frames = mel_spectrogram.shape[0]
    assert len(codes) >= n_frames * hop

    # Time resolution adjustment: keep the audio length a multiple of
    # hop_size so transposed convolution can be used to upsample.
    codes = codes[:n_frames * hop]
    assert len(codes) % hop == 0

    timesteps = len(codes)

    # File-name stem: last path component, cut at its first dot.
    wav_id = wav_path.rsplit('/', 1)[-1].partition('.')[0]

    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    audio_path = os.path.join(out_dir, audio_filename)
    mel_path = os.path.join(out_dir, mel_filename)
    np.save(audio_path, codes.astype(np.int16), allow_pickle=False)
    np.save(mel_path, mel_spectrogram.astype(np.float32), allow_pickle=False)

    return (audio_filename, mel_filename, timesteps, text)
def wavenet_data():
    """Debug helper: build WaveNet targets from a waveform and plot them.

    NOTE(review): the waveform is read from a module-level ``wav`` that is
    not visible in this part of the file -- presumably defined elsewhere;
    confirm. The original rebound ``wav`` inside the function, which made
    the name local and caused an ``UnboundLocalError`` on the first read;
    the trimmed signal now lives in its own local so the outer ``wav`` is
    only read.

    Returns:
        The padded, hop-aligned quantized signal divided by its maximum.
        (The original attempted this in place with ``/=``, which NumPy
        rejects for integer arrays, and returned nothing.)
    """
    import matplotlib.pyplot as plt

    # Mu-law quantize, then trim silences in the quantized domain.
    quantized = P.mulaw_quantize(wav, hparams.quantize_channels)
    start, end = audio.start_and_end_indices(
        quantized, hparams.silence_threshold)
    trimmed = wav[start:end]
    quantized = quantized[start:end]
    pad_value = P.mulaw_quantize(0, hparams.quantize_channels)

    # Mel-scale spectrogram of the trimmed signal: (N, D)
    mel_spectrogram = audio.melspectrogram(trimmed).astype(np.float32).T

    # lws pads zeros internally before performing the STFT; pad the quantized
    # signal the same way to adjust the time resolution between audio and
    # mel-spectrogram.
    hop = audio.get_hop_size()
    pad_left, pad_right = audio.lws_pad_lr(trimmed, hparams.fft_size, hop)
    quantized = np.pad(quantized, (pad_left, pad_right), mode="constant",
                       constant_values=pad_value)

    n_frames = mel_spectrogram.shape[0]
    assert len(quantized) >= n_frames * hop

    # Keep the audio length a multiple of hop_size so transposed convolution
    # can be used to upsample.
    quantized = quantized[:n_frames * hop]
    assert len(quantized) % hop == 0

    # Three stacked panels: mel-spectrogram, quantized target, trimmed wav.
    plt.subplot(3, 1, 1)
    specshow(mel_spectrogram.T, sr=20000, hop_length=hparams.hop_size)
    plt.subplot(3, 1, 2)
    plt.plot(quantized)
    plt.xlim(0, len(quantized))
    plt.subplot(3, 1, 3)
    plt.plot(trimmed)
    plt.xlim(0, len(trimmed))
    plt.show()

    # Out-of-place division so an integer array is promoted to float.
    return quantized / quantized.max()
def _process_utterance(out_dir, index, wav_path, text, trim_silence=False):
    """Preprocess one utterance and store its waveform target and features.

    Args:
        out_dir: Directory the ``-wave.npy`` / ``-feats.npy`` files go to.
        index: Unused in this variant (file names derive from ``wav_path``).
        wav_path: Path handed to ``audio.load_wav``.
        text: Transcript, returned unchanged.
        trim_silence: When true, leading/trailing silence is removed with
            ``librosa.effects.trim`` before any other processing.

    Returns:
        ``(audio_filename, mel_filename, N, text)`` with ``N`` the number of
        spectrogram frames, or ``("dummy", "dummy", -1, "dummy")`` when the
        processed waveform's absolute peak exceeds 1.0 (sample is skipped).
    """
    signal = audio.load_wav(wav_path)

    # Trim begin/end silences.
    # NOTE: the threshold was chosen for clean signals.
    # TODO: Remove, get this out of here.
    if trim_silence:
        signal, _ = librosa.effects.trim(
            signal, top_db=60, frame_length=2048, hop_length=512)

    if hparams.highpass_cutoff > 0.0:
        signal = audio.low_cut_filter(
            signal, hparams.sample_rate, hparams.highpass_cutoff)

    # Padding value and storage dtype depend on the input type.
    if is_mulaw_quantize(hparams.input_type):
        # Trimming in the mu-law quantized domain is switched off by the
        # hard-coded zero threshold below.
        silence_threshold = 0
        if silence_threshold > 0:
            # Quantized values lie in [0, quantize_channels)
            quantized = P.mulaw_quantize(signal, hparams.quantize_channels - 1)
            first, last = audio.start_and_end_indices(
                quantized, silence_threshold)
            signal = signal[first:last]
        pad_value = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # Values in [-1, 1]
        pad_value = P.mulaw(0.0, hparams.quantize_channels - 1)
        out_dtype = np.float32
    else:
        # Values in [-1, 1]
        pad_value = 0.0
        out_dtype = np.float32

    # Log-mel spectrogram of the trimmed signal: (N, D)
    mel_spectrogram = audio.logmelspectrogram(signal).astype(np.float32).T

    if hparams.global_gain_scale > 0:
        signal *= hparams.global_gain_scale

    # Optional time-domain preprocessing, looked up by name on ``audio``.
    preprocess_name = hparams.preprocess
    if preprocess_name is not None and preprocess_name not in ("", "none"):
        transform = getattr(audio, preprocess_name)
        signal = transform(signal)

    # Samples whose peak leaves [-1, 1] are reported and ignored.
    peak = np.abs(signal).max()
    if peak > 1.0:
        print("""Warning: abs max value exceeds 1.0: {}""".format(peak))
        return ("dummy", "dummy", -1, "dummy")

    signal = np.clip(signal, -1.0, 1.0)

    # Waveform target
    if is_mulaw_quantize(hparams.input_type):
        target = P.mulaw_quantize(signal, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        target = P.mulaw(signal, hparams.quantize_channels - 1)
    else:
        target = signal

    # Zero pad: needed to adjust the time resolution between audio and
    # mel-spectrogram.
    hop = audio.get_hop_size()
    pad_left, pad_right = audio.pad_lr(target, hparams.fft_size, hop)
    if pad_left > 0 or pad_right > 0:
        target = np.pad(target, (pad_left, pad_right), mode="constant",
                        constant_values=pad_value)

    N = mel_spectrogram.shape[0]
    assert len(target) >= N * hop

    # Keep the audio length a multiple of hop_size so transposed convolution
    # can be used to upsample.
    target = target[:N * hop]
    assert len(target) % hop == 0

    assert_ready_for_upsampling(target, mel_spectrogram, cin_pad=0, debug=True)

    # Write both arrays to disk, named after the wav file's stem.
    stem = splitext(basename(wav_path))[0]
    audio_filename = "%s-wave.npy" % (stem)
    mel_filename = "%s-feats.npy" % (stem)
    np.save(os.path.join(out_dir, audio_filename),
            target.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    return (audio_filename, mel_filename, N, text)
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    """Preprocess one CMU ARCTIC utterance and save audio + mel features.

    Args:
        out_dir: Directory the two ``.npy`` files are written to.
        index: Integer formatted into the output file names.
        speaker_id: Speaker identifier, returned unchanged.
        wav_path: Path handed to ``audio.load_wav``.
        text: Transcript, returned unchanged.

    Returns:
        ``(audio_filename, mel_filename, timesteps, text, speaker_id)``.
    """
    sample_rate = hparams.sample_rate

    # Load the audio (the original notes it is resampled if needed).
    signal = audio.load_wav(wav_path)

    label_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trimming from HTS labels is currently disabled by the trailing
    # ``and False`` (TODO in the original); the branch is kept for when it
    # is switched back on.
    use_labels = exists(label_path) and False
    if use_labels:
        labels = hts.load(label_path)
        begin = int(start_at(labels) * 1e-7 * sample_rate)
        finish = int(end_at(labels) * 1e-7 * sample_rate)
        signal = signal[begin:finish]
    # Both paths finish with the same energy-based trim.
    signal, _ = librosa.effects.trim(signal, top_db=20)

    if hparams.rescaling:
        signal = signal / np.abs(signal).max() * hparams.rescaling_max

    if is_mulaw_quantize(hparams.input_type):
        # Quantized values lie in [0, quantize_channels)
        target = P.mulaw_quantize(signal, hparams.quantize_channels)
        # Trim silences, cutting signal and target with the same indices.
        first, last = audio.start_and_end_indices(
            target, hparams.silence_threshold)
        signal = signal[first:last]
        target = target[first:last]
        pad_value = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # Values in [-1, 1]
        target = P.mulaw(signal, hparams.quantize_channels)
        pad_value = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # Raw signal, values in [-1, 1]
        target = signal
        pad_value = 0.0
        out_dtype = np.float32

    # Mel-scale spectrogram of the trimmed signal: (N, D)
    mel_spectrogram = audio.melspectrogram(signal).astype(np.float32).T

    # lws pads zeros internally before performing the STFT; pad the target
    # the same way to adjust the time resolution between audio and
    # mel-spectrogram.
    hop = audio.get_hop_size()
    pad_left, pad_right = audio.lws_pad_lr(signal, hparams.fft_size, hop)
    target = np.pad(target, (pad_left, pad_right), mode="constant",
                    constant_values=pad_value)

    n_frames = mel_spectrogram.shape[0]
    assert len(target) >= n_frames * hop

    # Keep the audio length a multiple of hop_size so transposed convolution
    # can be used to upsample.
    target = target[:n_frames * hop]
    assert len(target) % hop == 0

    timesteps = len(target)

    audio_filename = 'cmu_arctic-audio-%05d.npy' % index
    mel_filename = 'cmu_arctic-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            target.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    return (audio_filename, mel_filename, timesteps, text, speaker_id)
def _process_utterance(out_dir, index, audio_filepath, text):
    """Split one long LibriVox recording into chunks and preprocess each.

    The recording is cut into ``8.0 * hparams.sample_rate``-sample chunks;
    the final chunk also takes whatever remains of the file. A recording
    shorter than one chunk yields no results.

    Args:
        out_dir: Directory the per-chunk ``.npy`` files are written to.
        index: Integer formatted into the output file names.
        audio_filepath: Path handed to ``audio.load_wav``.
        text: Label; the chunk number is appended for each result.

    Returns:
        List of ``(audio_filename, mel_filename, timesteps, text_idx)``
        tuples, one per chunk (the caller unpacks them).
    """
    wav_whole = audio.load_wav(audio_filepath)

    if hparams.rescaling:
        wav_whole = wav_whole / np.abs(wav_whole).max() * hparams.rescaling_max

    # This is a LibriVox source, so files are much longer than a typical
    # 'utterance': work in fixed-length chunks instead.
    results = []
    chunk_len = int(8.0 * hparams.sample_rate)  # all 8 second utterances
    n_chunks = wav_whole.shape[0] // chunk_len

    for chunk_idx in range(n_chunks):
        chunk_start = chunk_idx * chunk_len
        if chunk_idx == n_chunks - 1:
            # Last chunk: let it extend to the end of the file.
            chunk_end = None
        else:
            chunk_end = (chunk_idx + 1) * chunk_len

        signal = wav_whole[chunk_start:chunk_end]

        if is_mulaw_quantize(hparams.input_type):
            # Quantized values lie in [0, quantize_channels)
            target = P.mulaw_quantize(signal, hparams.quantize_channels)
            # Trim silences, cutting signal and target identically.
            first, last = audio.start_and_end_indices(
                target, hparams.silence_threshold)
            signal = signal[first:last]
            target = target[first:last]
            pad_value = P.mulaw_quantize(0, hparams.quantize_channels)
            out_dtype = np.int16
        elif is_mulaw(hparams.input_type):
            # Values in [-1, 1]
            target = P.mulaw(signal, hparams.quantize_channels)
            pad_value = P.mulaw(0.0, hparams.quantize_channels)
            out_dtype = np.float32
        else:
            # Raw signal, values in [-1, 1]
            target = signal
            pad_value = 0.0
            out_dtype = np.float32

        # Mel-scale spectrogram of the trimmed chunk: (N, D)
        mel_spectrogram = audio.melspectrogram(signal).astype(np.float32).T

        # lws pads zeros internally before performing the STFT; pad the
        # target the same way to adjust the time resolution between audio
        # and mel-spectrogram.
        hop = audio.get_hop_size()
        pad_left, pad_right = audio.lws_pad_lr(signal, hparams.fft_size, hop)
        target = np.pad(target, (pad_left, pad_right), mode="constant",
                        constant_values=pad_value)

        n_frames = mel_spectrogram.shape[0]
        assert len(target) >= n_frames * hop

        # Keep the audio length a multiple of hop_size so transposed
        # convolution can be used to upsample.
        target = target[:n_frames * hop]
        assert len(target) % hop == 0

        timesteps = len(target)

        name_fields = (index, chunk_idx)
        audio_filename = 'librivox-audio-%04d-%05d.npy' % name_fields
        mel_filename = 'librivox-mel-%04d-%05d.npy' % name_fields
        text_idx = '%s - %05d' % (text, chunk_idx)

        np.save(os.path.join(out_dir, audio_filename),
                target.astype(out_dtype), allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename),
                mel_spectrogram.astype(np.float32), allow_pickle=False)

        results.append((audio_filename, mel_filename, timesteps, text_idx))

    return results
def _process_utterance_single(out_dir, text, wav_path, hparams=hparams):
    """Preprocess one utterance with an explicit hyper-parameter set.

    Modified version of the LJSpeech ``_process_utterance``; the label-based
    cleaning was taken from the multi-speaker version.

    Args:
        out_dir: Directory the two ``.npy`` files are written to.
        text: Transcript, returned unchanged.
        wav_path: Path handed to ``audio.load_wav``.
        hparams: Hyper-parameters; also installed via ``audio.set_hparams``.

    Returns:
        ``(out_filename, mel_filename, timesteps, text)``, or ``None`` when
        the utterance is skipped: no label file while
        ``hparams.process_only_htk_aligned`` is set, or a duration outside
        the non-zero ``min_audio_length`` / ``max_audio_length`` limits.
    """
    audio.set_hparams(hparams)

    signal = audio.load_wav(wav_path)
    sample_rate = hparams.sample_rate

    # Look for a label file first in the parallel "lab/" tree, then next to
    # the wav file itself.
    label_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")
    if not exists(label_path):
        label_path = os.path.splitext(wav_path)[0]+'.lab'

    # Trim silence from HTS labels if available.
    if exists(label_path):
        labels = hts.load(label_path)
        signal = clean_by_phoneme(labels, signal, sample_rate)
        signal, _ = librosa.effects.trim(signal, top_db=25)
    elif hparams.process_only_htk_aligned:
        return None
    else:
        signal, _ = librosa.effects.trim(signal, top_db=15)

    if hparams.rescaling:
        signal = signal / np.abs(signal).max() * hparams.rescaling_max

    # Length limits; a limit of 0 disables the corresponding check.
    too_long = (
        hparams.max_audio_length != 0
        and librosa.core.get_duration(y=signal, sr=sample_rate)
        > hparams.max_audio_length
    )
    if too_long:
        return None
    too_short = (
        hparams.min_audio_length != 0
        and librosa.core.get_duration(y=signal, sr=sample_rate)
        < hparams.min_audio_length
    )
    if too_short:
        return None

    if is_mulaw_quantize(hparams.input_type):
        # Quantized values lie in [0, quantize_channels)
        target = P.mulaw_quantize(signal, hparams.quantize_channels)
        # Trim silences, cutting signal and target with the same indices.
        first, last = audio.start_and_end_indices(
            target, hparams.silence_threshold)
        signal = signal[first:last]
        target = target[first:last]
        pad_value = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # Values in [-1, 1]
        target = P.mulaw(signal, hparams.quantize_channels)
        pad_value = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # Raw signal, values in [-1, 1]
        target = signal
        pad_value = 0.0
        out_dtype = np.float32

    # Mel-scale spectrogram of the trimmed signal: (N, D)
    mel_spectrogram = audio.melspectrogram(signal).astype(np.float32).T

    # lws pads zeros internally before performing the STFT; pad the target
    # the same way to adjust the time resolution between audio and
    # mel-spectrogram.
    hop = audio.get_hop_size()
    pad_left, pad_right = audio.lws_pad_lr(signal, hparams.fft_size, hop)
    target = np.pad(target, (pad_left, pad_right), mode="constant",
                    constant_values=pad_value)

    n_frames = mel_spectrogram.shape[0]
    assert len(target) >= n_frames * hop

    # Keep the audio length a multiple of hop_size so transposed convolution
    # can be used to upsample.
    target = target[:n_frames * hop]
    assert len(target) % hop == 0

    timesteps = len(target)

    # Output names are built from the wav file's stem.
    stem = os.path.splitext(os.path.basename(wav_path))[0]
    out_filename = 'audio-{}.npy'.format(stem)
    mel_filename = 'mel-{}.npy'.format(stem)
    np.save(os.path.join(out_dir, out_filename),
            target.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    return (out_filename, mel_filename, timesteps, text)
def _process_utterance(wav_path, out_dir):
    """Preprocess one wav file unless its outputs already exist.

    Writes ``<name>_resolved.npy`` (audio target) and ``<name>_mel.npy``
    (mel-spectrogram) into ``out_dir``, where ``<name>`` is the last
    ``os.sep``-separated component of ``wav_path`` up to its first dot.

    Args:
        wav_path: Path handed to ``audio.load_wav``.
        out_dir: Directory the two ``.npy`` files are written to.

    Returns:
        None. When both output files are already present a message is
        printed and nothing is recomputed.
    """
    stem = wav_path.rsplit(os.sep, 1)[-1].partition(".")[0]
    audio_out = os.path.join(out_dir, '{}_resolved.npy'.format(stem))
    mel_out = os.path.join(out_dir, '{}_mel.npy'.format(stem))

    # Skip work that has already been done.
    if os.path.exists(audio_out) and os.path.exists(mel_out):
        print("File {} already processed".format(wav_path))
        return

    signal = audio.load_wav(wav_path)

    if hparams.rescaling:
        signal = signal / np.abs(signal).max() * hparams.rescaling_max

    if is_mulaw_quantize(hparams.input_type):
        # Quantized values lie in [0, quantize_channels)
        target = P.mulaw_quantize(signal, hparams.quantize_channels)
        # Trim silences, cutting signal and target with the same indices.
        first, last = audio.start_and_end_indices(
            target, hparams.silence_threshold)
        signal = signal[first:last]
        target = target[first:last]
        pad_value = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # Values in [-1, 1]
        target = P.mulaw(signal, hparams.quantize_channels)
        pad_value = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # Raw signal, values in [-1, 1]
        target = signal
        pad_value = 0.0
        out_dtype = np.float32

    # Mel-scale spectrogram of the trimmed signal: (N, D)
    mel_spectrogram = audio.melspectrogram(signal).astype(np.float32).T

    # lws pads zeros internally before performing the STFT; pad the target
    # the same way to adjust the time resolution between audio and
    # mel-spectrogram.
    hop = audio.get_hop_size()
    pad_left, pad_right = audio.lws_pad_lr(signal, hparams.fft_size, hop)
    target = np.pad(target, (pad_left, pad_right), mode="constant",
                    constant_values=pad_value)

    n_frames = mel_spectrogram.shape[0]
    assert len(target) >= n_frames * hop

    # Keep the audio length a multiple of hop_size so transposed convolution
    # can be used to upsample.
    target = target[:n_frames * hop]
    assert len(target) % hop == 0

    timesteps = len(target)

    np.save(audio_out, target.astype(out_dtype), allow_pickle=False)
    np.save(mel_out, mel_spectrogram.astype(np.float32), allow_pickle=False)
def _process_utterance(
    out_dir,
    index,
    speaker_id,
    wav_path,
    text,
    silence_threshold,
    fft_size,
):
    """Write the mu-law quantized audio and mel features of one utterance.

    Args:
        out_dir: Directory the two ``.npy`` files are written to.
        index: Unused in this variant (file names derive from ``wav_path``).
        speaker_id: Speaker identifier, returned unchanged.
        wav_path: Path handed to ``audio.load_wav``; its last
            ``/``-separated component up to the first ``.`` becomes the
            output file prefix.
        text: Transcript, returned unchanged.
        silence_threshold: Threshold passed to
            ``audio.start_and_end_indices``.
        fft_size: FFT size passed to ``audio.lws_pad_lr``.

    Returns:
        ``(audio_filename, mel_filename, timesteps, text, speaker_id)``.
    """
    sample_rate = hparams.sample_rate

    # Load the audio (the original notes it is resampled if needed).
    signal = audio.load_wav(wav_path)

    label_path = wav_path.replace("wav/", "lab/").replace(".wav", ".lab")

    # Trimming from HTS labels is currently disabled by the trailing
    # ``and False`` (TODO in the original); the branch is kept for when it
    # is switched back on.
    use_labels = exists(label_path) and False
    if use_labels:
        labels = hts.load(label_path)
        begin = int(start_at(labels) * 1e-7 * sample_rate)
        finish = int(end_at(labels) * 1e-7 * sample_rate)
        signal = signal[begin:finish]
    # Both paths finish with the same energy-based trim.
    signal, _ = librosa.effects.trim(signal, top_db=20)

    # Mu-law quantize, then trim silences in the quantized domain.
    codes = P.mulaw_quantize(signal)
    first, last = audio.start_and_end_indices(codes, silence_threshold)
    codes = codes[first:last]
    signal = signal[first:last]

    # Mel-scale spectrogram of the trimmed signal: (N, D)
    mel_spectrogram = audio.melspectrogram(signal).astype(np.float32).T

    # lws pads zeros internally before performing the STFT; pad the quantized
    # signal the same way to adjust the time resolution between audio and
    # mel-spectrogram.
    hop = audio.get_hop_size()
    pad_left, pad_right = audio.lws_pad_lr(signal, fft_size, hop)
    codes = np.pad(codes, (pad_left, pad_right), mode="constant",
                   constant_values=P.mulaw_quantize(0))

    n_frames = mel_spectrogram.shape[0]
    assert len(codes) >= n_frames * hop

    # Time resolution adjustment: keep the audio length a multiple of
    # hop_size so transposed convolution can be used to upsample.
    codes = codes[:n_frames * hop]
    assert len(codes) % hop == 0

    timesteps = len(codes)

    # File-name stem: last path component, cut at its first dot.
    wav_id = wav_path.rsplit('/', 1)[-1].partition('.')[0]

    audio_filename = '{}-audio.npy'.format(wav_id)
    mel_filename = '{}-mel.npy'.format(wav_id)
    np.save(os.path.join(out_dir, audio_filename),
            codes.astype(np.int16), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    return (audio_filename, mel_filename, timesteps, text, speaker_id)
def _process_utterance(out_dir, wav_path, sp2ind_dir, text):
    """Preprocess one utterance into waveform, log-mel and MFCC arrays.

    Args:
        out_dir: String prefix of the output files. It is concatenated
            directly with ``wave.npy`` / ``mel.npy`` / ``mfcc.npy`` -- no
            path separator is inserted.
        wav_path: Path handed to ``audio.load_wav``. The speaker key is the
            file name's stem up to its first ``_``; paths containing
            ``test`` are not silence-trimmed.
        sp2ind_dir: Path of the JSON file mapping speaker keys to indices.
        text: Transcript, returned unchanged.

    Returns:
        ``(out_dir, N, sp_ind, text)`` with ``N`` the number of spectrogram
        frames and ``sp_ind`` the speaker index (``-1`` when the speaker is
        not in the mapping).
    """
    # Context manager so the mapping file is closed (the original leaked it).
    with open(sp2ind_dir, 'r') as sp_f:
        sp2ind = json.load(sp_f)

    sp = wav_path.split('/')[-1].split('.')[0].split('_')[0]
    sp_ind = sp2ind[sp] if sp in sp2ind else -1

    wav = audio.load_wav(wav_path)
    if 'test' not in wav_path:
        wav, _ = librosa.effects.trim(
            wav, top_db=60, frame_length=2048, hop_length=512)

    if hparams.highpass_cutoff > 0.0:
        wav = audio.low_cut_filter(
            wav, hparams.sample_rate, hparams.highpass_cutoff)

    # Padding value and storage dtype depend on the input type.
    if is_mulaw_quantize(hparams.input_type):
        # Trimming in the mu-law quantized domain is switched off by the
        # hard-coded zero threshold below.
        silence_threshold = 0
        if silence_threshold > 0:
            # Quantized values lie in [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
            start, end = audio.start_and_end_indices(out, silence_threshold)
            wav = wav[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # Values in [-1, 1]
        constant_values = P.mulaw(0.0, hparams.quantize_channels - 1)
        out_dtype = np.float32
    else:
        # Values in [-1, 1]
        constant_values = 0.0
        out_dtype = np.float32

    # Features of the trimmed wav, frames first: (N, D)
    mel_spectrogram = audio.logmelspectrogram(wav).astype(np.float32).T
    mfcc = audio.mfcc(wav).astype(np.float32).T

    if hparams.global_gain_scale > 0:
        wav *= hparams.global_gain_scale

    # Optional time-domain preprocessing, looked up by name on ``audio``.
    if hparams.preprocess is not None and hparams.preprocess not in ["", "none"]:
        f = getattr(audio, hparams.preprocess)
        wav = f(wav)

    # Unlike sibling variants, an out-of-range sample is only warned about
    # and then clipped rather than skipped.
    peak = np.abs(wav).max()
    if peak > 1.0:
        print("""Warning: abs max value exceeds 1.0: {}""".format(peak))
    wav = np.clip(wav, -1.0, 1.0)

    # Waveform target
    if is_mulaw_quantize(hparams.input_type):
        out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        out = P.mulaw(wav, hparams.quantize_channels - 1)
    else:
        out = wav

    # Zero pad: needed to adjust the time resolution between audio and
    # mel-spectrogram.
    hop = audio.get_hop_size()
    l, r = audio.pad_lr(out, hparams.fft_size, hop)
    if l > 0 or r > 0:
        out = np.pad(out, (l, r), mode="constant",
                     constant_values=constant_values)

    N = mel_spectrogram.shape[0]
    assert len(out) >= N * hop

    # Keep the audio length a multiple of hop_size so transposed convolution
    # can be used to upsample.
    out = out[:N * hop]
    assert len(out) % hop == 0

    audio_filename = f'{out_dir}wave.npy'
    mel_filename = f'{out_dir}mel.npy'
    mfcc_filename = f'{out_dir}mfcc.npy'
    # Both feature streams must have one row per frame.
    assert mfcc.shape[0] == N
    np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
    np.save(mel_filename, mel_spectrogram.astype(np.float32),
            allow_pickle=False)
    np.save(mfcc_filename, mfcc.astype(np.float32), allow_pickle=False)

    return (out_dir, N, sp_ind, text)
def _process_song(out_dir, index, wav_path, text):
    """Preprocess one song into a waveform target plus chord features.

    Instead of a mel-spectrogram, the conditioning features are read from
    the CSV that ``./extract_chromagram.sh`` writes into ``chord_dir`` for
    the trimmed audio.

    Side effects: creates ``./pwavs`` and ``chord_dir`` in the working
    directory, writes the trimmed wav there, runs the external script, and
    saves two ``.npy`` files into ``out_dir``.

    Args:
        out_dir: Directory the ``-wave.npy`` / ``-feats.npy`` files go to.
        index: Unused in this variant (file names derive from ``wav_path``).
        wav_path: Path handed to ``audio.load_wav``.
        text: Label, returned unchanged.

    Returns:
        ``(audio_filename, chords_filename, N, text)`` with ``N`` the number
        of feature rows.
    """
    import shlex

    wav = audio.load_wav(wav_path)

    # Trim begin/end silences.
    # NOTE: the threshold was chosen for clean signals.
    wav, _ = librosa.effects.trim(
        wav, top_db=60, frame_length=2048, hop_length=512)

    if hparams.highpass_cutoff > 0.0:
        wav = audio.low_cut_filter(
            wav, hparams.sample_rate, hparams.highpass_cutoff)

    # Padding value and storage dtype depend on the input type.
    if is_mulaw_quantize(hparams.input_type):
        # Trimming in the mu-law quantized domain is switched off by the
        # hard-coded zero threshold below.
        silence_threshold = 0
        if silence_threshold > 0:
            # Quantized values lie in [0, quantize_channels)
            out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
            start, end = audio.start_and_end_indices(out, silence_threshold)
            wav = wav[start:end]
        constant_values = P.mulaw_quantize(0, hparams.quantize_channels - 1)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # Values in [-1, 1]
        constant_values = P.mulaw(0.0, hparams.quantize_channels - 1)
        out_dtype = np.float32
    else:
        # Values in [-1, 1]
        constant_values = 0.0
        out_dtype = np.float32

    # Store the processed wav where the extraction script can read it.
    wav_name = os.path.splitext(os.path.basename(wav_path))[0]
    os.makedirs('./pwavs', exist_ok=True)
    pwav_path = './pwavs/{0}.wav'.format(wav_name)
    scipy.io.wavfile.write(pwav_path, 16000, wav)

    # Make the chord directory if it does not exist.
    chord_dir = "chord_dir"
    os.makedirs(chord_dir, exist_ok=True)

    # Arguments are shell-quoted so file names containing spaces or shell
    # metacharacters cannot break (or inject into) the command line.
    os.system('./extract_chromagram.sh {0} {1} > /dev/null 2>&1'.format(
        shlex.quote(pwav_path), shlex.quote(chord_dir)))
    note_filename = '{0}/{1}.csv'.format(chord_dir, wav_name)

    # One column per 2048-sample window of the trimmed wav, 24 rows each.
    note_samples = int(len(wav) / 2048)
    chords_time_series = np.zeros((24, note_samples))

    with open(note_filename, newline='\n') as csvfile:
        rows = csvfile.readlines()

    for idx, row in enumerate(rows):
        fields = row.split(",")
        # The first field of every row is dropped; the rest are the values.
        # ``np.float`` was removed in NumPy 1.24, so the explicit
        # ``np.float64`` (the type it aliased) is used.
        chromogram_samples = np.array(fields).astype(np.float64)[1:]
        chords_time_series[:, idx] = chromogram_samples

    chords_time_series = chords_time_series.T

    # Optional time-domain preprocessing, looked up by name on ``audio``.
    if hparams.preprocess is not None and hparams.preprocess not in [
            "", "none"
    ]:
        f = getattr(audio, hparams.preprocess)
        wav = f(wav)

    # Waveform target
    if is_mulaw_quantize(hparams.input_type):
        out = P.mulaw_quantize(wav, hparams.quantize_channels - 1)
    elif is_mulaw(hparams.input_type):
        out = P.mulaw(wav, hparams.quantize_channels - 1)
    else:
        out = wav

    # Zero pad: needed to adjust the time resolution between audio and the
    # conditioning features.
    hop = audio.get_hop_size()
    l, r = audio.pad_lr(out, hparams.fft_size, hop)
    if l > 0 or r > 0:
        out = np.pad(out, (l, r), mode="constant",
                     constant_values=constant_values)

    N = chords_time_series.shape[0]
    assert len(out) >= N * hop

    # Keep the audio length a multiple of hop_size so transposed convolution
    # can be used to upsample.
    out = out[:N * hop]
    assert len(out) % hop == 0

    audio_filename = '%s-wave.npy' % (wav_name)
    chords_filename = '%s-feats.npy' % (wav_name)
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    # NOTE(review): the features are cast with the *audio* dtype; for the
    # mu-law-quantize input type that is int16, which truncates fractional
    # values -- confirm this is intended.
    np.save(os.path.join(out_dir, chords_filename),
            chords_time_series.astype(out_dtype), allow_pickle=False)

    return (audio_filename, chords_filename, N, text)
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    """Pair one utterance's waveform with precomputed ``.mcep`` features.

    The conditioning features are not computed from the audio; they are
    loaded with ``np.loadtxt`` from
    ``ccoeffs_feats_path/<name>.mcep``, where ``<name>`` is the part of
    ``wav_path`` after ``/wav/`` and before ``.wav``. Audio and features are
    then aligned by ``ensure_frameperiod``. Progress lines are appended to
    ``logfile`` in the working directory.

    Args:
        out_dir: Directory the two ``.npy`` files are written to.
        index: Integer formatted into the output file names.
        speaker_id: Speaker identifier, returned unchanged.
        wav_path: Path handed to ``audio.load_wav``.
        text: Transcript, returned unchanged.

    Returns:
        ``(audio_filename, mel_filename, timesteps, text, speaker_id)``.
    """
    # Load the audio (the original notes it is resampled if needed).
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    if is_mulaw_quantize(hparams.input_type):
        # Quantized values lie in [0, quantize_channels)
        out = P.mulaw_quantize(wav, hparams.quantize_channels)
        # Trim silences
        start, end = audio.start_and_end_indices(
            out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # Values in [-1, 1]
        out = P.mulaw(wav, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # Raw signal, values in [-1, 1]
        out = wav
        out_dtype = np.float32

    fname = wav_path.split('/wav/')[-1].split('.wav')[0]
    mcep_path = ccoeffs_feats_path + '/' + fname + '.mcep'
    mel_spectrogram = np.loadtxt(mcep_path)

    # Frame count as loaded, i.e. before ensure_frameperiod may adjust it;
    # this is the value written to the log below.
    N = mel_spectrogram.shape[0]

    # Context managers guarantee the log is closed even if a write fails.
    with open('logfile', 'a') as g:
        g.write("Processing " + fname + '\n')

    out, mel_spectrogram = ensure_frameperiod(out, mel_spectrogram)
    timesteps = len(out)

    with open('logfile', 'a') as g:
        g.write(fname + ' ' + str(len(out)) + ' ' + str(N) + ' '
                + str(len(out) % N) + '\n')
        g.write('\n')

    audio_filename = fname + '-audio-%05d.npy' % index
    mel_filename = fname + '-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, audio_filename),
            out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.astype(np.float32), allow_pickle=False)

    return (audio_filename, mel_filename, timesteps, text, speaker_id)
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams, step_factor=1):
    """
    Preprocesses a single utterance wav/text pair

    this writes the encoded audio, the mel scale spectrogram and the linear
    scale spectrogram to disk and returns a tuple to write to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the index to use in the output filenames
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters
        - step_factor: the audio is loaded at hparams.sample_rate * step_factor
            and, when step_factor > 1, every step_factor-th sample is kept

    Returns:
        - A tuple: (wav_path, audio_filename, mel_filename, linear_filename,
            time_steps, mel_frames, audio_time, text, len(text)), where
            audio_time is len(wav) / hparams.sample_rate measured before any
            silence trimming
        - None when the utterance is skipped: the wav file is missing, the
            signal is empty or silent while rescaling is enabled, or the
            utterance has more than hparams.max_mel_frames frames while
            hparams.clip_mels_length is set

    Raises:
        - RuntimeError: if a rescaled signal has samples outside [-1, 1]
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate * step_factor)
    except FileNotFoundError:
        # The metadata lists a wav that is missing on disk: skip it
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    if step_factor > 1:
        wav = wav[::step_factor]
    audio_time = len(wav) / hparams.sample_rate

    # Trim lead/trail silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

    # Rescale wav
    if hparams.rescale:
        # Dividing by a zero peak would fill the signal with NaNs, which the
        # range check below cannot detect (every comparison with NaN is False),
        # and an empty signal makes max() raise. Skip such utterances instead.
        wav_peak = np.abs(wav).max() if len(wav) else 0.
        preem_peak = np.abs(preem_wav).max() if len(preem_wav) else 0.
        if wav_peak == 0 or preem_peak == 0:
            print('file {} is empty or silent and cannot be rescaled. skipping!'.format(wav_path))
            return None

        wav = wav / wav_peak * hparams.rescaling_max
        preem_wav = preem_wav / preem_peak * hparams.rescaling_max

        # Assert all audio is in [-1, 1]
        # NOTE(review): this check only runs when rescaling is enabled -
        # confirm that this matches the intended nesting.
        if (wav > 1.).any() or (wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))
        if (preem_wav > 1.).any() or (preem_wav < -1.).any():
            raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        preem_wav = preem_wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the pre-emphasized wav
    # (frames are on axis 1; the array is transposed when saved)
    mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the pre-emphasized wav
    linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # Sanity check
    assert linear_frames == mel_frames

    hop_size = audio.get_hop_size(hparams)

    # Ensure time resolution adjustment between audio and mel-spectrogram by
    # padding the encoded audio with the encoded zero level.
    if hparams.use_lws:
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, hop_size)
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, hop_size, hparams.wavenet_pad_sides)
        out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)

    assert len(out) >= mel_frames * hop_size

    # Time resolution adjustment:
    # ensure length of raw audio is multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * hop_size]
    assert len(out) % hop_size == 0
    time_steps = len(out)

    # Write the spectrograms and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (wav_path, audio_filename, mel_filename, linear_filename, time_steps, mel_frames, audio_time, text, len(text))
def _process_utterance(out_dir, index, wav_path, text, sample_rate, fft_size,
                       hop_size, n_mels, redis_connection):
    """Encode one utterance and store it, together with its text, in redis.

    The waveform is loaded, optionally rescaled, encoded according to
    ``hparams.input_type``, padded and cut to ``N * hop_size`` samples, where
    ``N`` is the number of frames of the mel spectrogram computed from the
    same waveform. The bytes of the encoded audio followed by the bytes of
    ``np.asarray(text)`` are stored under the key ``index``. Nothing is
    written to ``out_dir``; the returned filenames are only names.

    Args:
        out_dir: Not used by this function.
        index: Redis key; also formatted as ``%05d`` into the filenames.
        wav_path: Path of the input wav file.
        text: Transcript; stored after the audio bytes and returned.
        sample_rate, fft_size, hop_size, n_mels: Passed to ``melspectrogram``;
            ``fft_size`` and ``hop_size`` also drive the padding.
        redis_connection: Object whose ``set(key, value)`` stores the payload.

    Returns:
        ``(audio_filename, mel_filename, timesteps, text)`` where
        ``timesteps`` is the length of the stored audio array.
    """
    # Load the audio to a numpy array:
    signal = load_wav(wav_path)

    if hparams.rescaling:
        signal = signal / np.abs(signal).max() * hparams.rescaling_max

    # Pick the encoding from hparams.input_type; the two mu-law branches are
    # only taken when input_type is changed away from raw.
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        encoded = P.mulaw_quantize(signal, hparams.quantize_channels)

        # Trim silences
        first, last = start_and_end_indices(encoded, hparams.silence_threshold)
        signal = signal[first:last]
        encoded = encoded[first:last]
        pad_value = P.mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        encoded = P.mulaw(signal, hparams.quantize_channels)
        pad_value = P.mulaw(0.0, hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        encoded = signal
        pad_value = 0.0
        out_dtype = np.float32
    # NOTE(review): out_dtype is never applied below; the payload is
    # serialized with whatever dtype the encoded array already has - confirm
    # that the reader of the redis entries expects this.

    # Mel-scale spectrogram of the (trimmed) waveform, transposed to (N, D).
    # Only its frame count is used below.
    mel = melspectrogram(signal, sample_rate, fft_size, hop_size, n_mels)
    mel = mel.astype(np.float32).T

    # lws pads zeros internally before performing stft, so the encoded audio
    # is padded the same way to keep audio and spectrogram time-aligned.
    pad_left, pad_right = lws_pad_lr(signal, fft_size, hop_size)
    encoded = np.pad(encoded, (pad_left, pad_right), mode="constant",
                     constant_values=pad_value)

    # Cut the audio to a whole number of hops so that transposed convolution
    # can be used to upsample.
    n_frames = mel.shape[0]
    assert len(encoded) >= n_frames * hop_size
    encoded = encoded[:n_frames * hop_size]
    assert len(encoded) % hop_size == 0

    timesteps = len(encoded)

    audio_filename = 'ljspeech-audio-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index

    # Audio bytes first, text bytes appended directly after them.
    payload = encoded.tobytes() + np.asarray(text).tobytes()
    redis_connection.set(index, payload)

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, timesteps, text)