def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wavs into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wavs folder. skipping!'.format(
            wav_path))
        return None

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start: end]
        out = out[start: end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text):
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path)
    except FileNotFoundError:
        print('file {} present in csv not in folder'.format(wav_path))
        return None

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # trim leading/trailing silence
    if hparams.trim_silence:
        wav = audio.trim_silence(wav)

    # Mu-law quantize to [0, quantize_channels)
    out = mulaw_quantize(wav, hparams.quantize_channels)

    # Trim silences
    start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
    wav = wav[start:end]
    out = out[start:end]

    constant_values = mulaw_quantize(0, hparams.quantize_channels)
    out_dtype = np.int16

    # Compute the mel and linear scale spectrograms from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    linear_spectrogram = audio.linearspectrogram(wav).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    l, r = audio.pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # Zero pad the quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    time_steps = len(out)
    assert time_steps >= mel_frames * audio.get_hop_size()

    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size()]
    assert time_steps % audio.get_hop_size() == 0

    # Write the spectrogram and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)

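# The mulaw_quantize / is_mulaw_quantize helpers used above come from the
# wavenet_vocoder utilities imported elsewhere in this module. For reference,
# a self-contained sketch of standard mu-law companding and quantization;
# the imported helper may differ in details such as the exact mu convention,
# so treat this as illustrative rather than the repo's implementation.
import numpy as np

def _mulaw_sketch(x, mu):
    # Companding: maps [-1, 1] -> [-1, 1] with finer resolution near zero.
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)

def _mulaw_quantize_sketch(x, channels=256):
    # Quantize the companded signal to integer bins in [0, channels).
    mu = channels - 1
    y = _mulaw_sketch(x, mu)
    return ((y + 1) / 2 * mu + 0.5).astype(np.int64)
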
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:
        # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
            wav_path))
        return None

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

    # Zero pad the quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)

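# For context, a minimal sketch of how a variant like the one above is typically
# driven from a metadata file. The ProcessPoolExecutor pattern, the directory
# layout and the pipe-separated "basename|text" metadata format are assumptions
# for illustration, not part of the function itself.
import os
from concurrent.futures import ProcessPoolExecutor
from functools import partial

def build_from_path_sketch(hparams, input_dir, mel_dir, linear_dir, wav_dir, n_jobs=4):
    futures = []
    with ProcessPoolExecutor(max_workers=n_jobs) as executor:
        with open(os.path.join(input_dir, 'metadata.csv'), encoding='utf-8') as f:
            for index, line in enumerate(f):
                basename, text = line.strip().split('|')[:2]
                wav_path = os.path.join(input_dir, 'wavs', '{}.wav'.format(basename))
                futures.append(executor.submit(partial(
                    _process_utterance, mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams)))
        results = [future.result() for future in futures]
    # Drop utterances that were skipped (missing files, overlong mels).
    return [r for r in results if r is not None]
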
def _process_utterance(mel_dir, wav_dir, index, wav_path, hparams, speaker_id):
    """
    Preprocesses a single utterance wav file

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index (or basename) to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - hparams: hyper parameters
        - speaker_id: id of the speaker, used for global conditioning

    Returns:
        - A tuple: (audio_filename, mel_filename, '_', speaker_id, time_steps, mel_frames)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:
        # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
            wav_path))
        return None

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))

        # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = os.path.join(wav_dir, 'audio-{}.npy'.format(index))
    mel_filename = os.path.join(mel_dir, 'mel-{}.npy'.format(index))
    np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
    np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)

    # global condition features
    if hparams.gin_channels > 0:
        # raise RuntimeError('When activating global conditions, please set your speaker_id rules in line 129 of datasets/wavenet_preprocessor.py to use them during training')
        speaker_id = speaker_id  # put the rule to determine how to assign speaker ids here
        # (using file names maybe? file basenames are available in the "index" variable)
    else:
        speaker_id = speaker_id

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, '_', speaker_id, time_steps, mel_frames)

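# The branch above intentionally leaves the speaker-id rule to the user when
# hparams.gin_channels > 0. A minimal sketch of one possible rule, assuming
# VCTK-style basenames such as "p225_001" end up in the "index" variable;
# purely illustrative, not the repo's convention.
_speaker_ids = {}

def _speaker_id_from_basename(basename):
    # Map the leading speaker tag (e.g. "p225" in "p225_001") to a stable integer id.
    tag = basename.split('_')[0]
    return _speaker_ids.setdefault(tag, len(_speaker_ids))
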
def _process_utterance(mel_dir, linear_dir, wav_dir, spkid, uttid, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - spkid: the speaker id, used to group outputs into per-speaker subdirectories
        - uttid: the utterance id to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (spkid, audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:
        # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
            wav_path))
        return None

    # Trim lead/trail silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

    # Assert all audio is in [-1, 1]
    if (wav > 1.).any() or (wav < -1.).any():
        raise RuntimeError('wav has invalid value: {}'.format(wav_path))
    if (preem_wav > 1.).any() or (preem_wav < -1.).any():
        raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        preem_wav = preem_wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the pre-emphasized wav
    mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the pre-emphasized wav
    linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams),
                                            hparams.wavenet_pad_sides)

        # Pad audio signal on the right (just like it's done in librosa to avoid frame inconsistency)
        out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to per-speaker subdirectories
    sub_wav_dir = os.path.join(wav_dir, spkid)
    sub_mel_dir = os.path.join(mel_dir, spkid)
    sub_linear_dir = os.path.join(linear_dir, spkid)
    os.makedirs(sub_wav_dir, exist_ok=True)
    os.makedirs(sub_mel_dir, exist_ok=True)
    os.makedirs(sub_linear_dir, exist_ok=True)

    audio_filename = 'audio-{}.npy'.format(uttid)
    mel_filename = 'mel-{}.npy'.format(uttid)
    linear_filename = 'linear-{}.npy'.format(uttid)
    np.save(os.path.join(sub_wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(sub_mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(sub_linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (spkid, audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)

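# The pre-emphasis step above boosts high frequencies before spectrogram
# extraction. The actual helper lives in this repo's audio module; a common
# formulation is the first-order filter below. Shown as a sketch only; the
# 0.97 default mirrors the usual hparams value but is an assumption here.
import numpy as np
from scipy import signal

def preemphasis_sketch(wav, k=0.97, preemphasize=True):
    # y[n] = x[n] - k * x[n-1]  (simple high-pass filter)
    if preemphasize:
        return signal.lfilter([1, -k], [1], wav)
    return wav
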
def _process_utterance(out_dir, index, wav_path, pinyin, hparams):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        pinyin: The pinyin of the Chinese spoken in the input audio file
        hparams: hyper parameters

    Returns:
        A (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, pinyin) tuple to write to train.txt
    '''
    mel_dir = out_dir + "/mels"
    linear_dir = out_dir + "/linear"
    wav_dir = out_dir + "/audio"

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    print("debug wav_path:", wav_path)

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the wav:
    #mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        print("debug --- drop wav_path:", wav_path, "mel_frames:", mel_frames)
        return None

    # Compute the linear-scale spectrogram from the wav:
    #spectrogram = audio.spectrogram(wav).astype(np.float32)
    #n_frames = spectrogram.shape[1]
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))

        # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrograms to disk:
    #spectrogram_filename = 'thchs30-spec-%05d.npy' % index
    #mel_filename = 'thchs30-mel-%05d.npy' % index
    #np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    #np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)
    print("debug save wav file:", os.path.join(wav_dir, audio_filename))

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, pinyin)

def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:
        # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(
            wav_path))
        return None

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start: end]
        out = out[start: end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

    # Zero pad the quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)

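# Why the "multiple of hop size" constraint above matters: the WaveNet
# conditioning network upsamples each mel frame to exactly hop_size audio
# samples, so the two sequences must line up with no remainder. A tiny
# numpy illustration (hop_size 256 is an assumption; the repo takes it
# from hparams via audio.get_hop_size):
import numpy as np

hop_size = 256
mel_frames_demo = 80
time_steps_demo = mel_frames_demo * hop_size
upsampled = np.repeat(np.zeros(mel_frames_demo), hop_size)  # nearest-neighbour upsampling
assert len(upsampled) == time_steps_demo
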
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text):
    """
    Preprocesses a single utterance wav/text pair

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path)
    except FileNotFoundError:
        print('file {} present in csv not in folder'.format(wav_path))
        return None

    # rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav)

    # Mu-law quantize to [0, quantize_channels)
    out = mulaw_quantize(wav, hparams.quantize_channels)

    # Trim silences
    start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
    wav = wav[start: end]
    out = out[start: end]

    constant_values = mulaw_quantize(0, hparams.quantize_channels)
    out_dtype = np.int16

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    l, r = audio.pad_lr(wav, hparams.fft_size, audio.get_hop_size())

    # Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    time_steps = len(out)
    assert time_steps >= mel_frames * audio.get_hop_size()

    # time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size()]
    assert time_steps % audio.get_hop_size() == 0

    # Write the spectrogram and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)

def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
        mel_dir: The directory to write the mel spectrograms into
        linear_dir: The directory to write the linear spectrograms into
        wav_dir: The directory to write the preprocessed audio into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file
        hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path, sr=hparams.sample_rate)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

    # Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)

def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
    wav = _trim_wav(audio.load_wav(wav_path, sr=hparams.sample_rate))

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16

    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32

    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)

    # Derive the speaker id from the wav file basename
    name = os.path.splitext(os.path.basename(wav_path))[0]
    speaker_id = _speaker_re.match(name).group(1)

    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
    l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

    # Zero pad for quantized signal
    out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'speech-audio-{:05d}.npy'.format(index)
    mel_filename = 'speech-mel-{:05d}.npy'.format(index)
    linear_filename = 'speech-linear-{:05d}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, speaker_id, text)

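# The variant above references module-level helpers _trim_wav and _speaker_re
# that are not shown in this file. A plausible minimal sketch, assuming
# VCTK-style basenames such as "p225_001" and a simple energy-based trim;
# the real helpers may differ.
import re
import numpy as np

_speaker_re = re.compile(r'(p\d+)_\d+')

def _trim_wav(wav, threshold=0.01, frame=512):
    # Drop leading/trailing frames whose peak amplitude falls below the threshold.
    if len(wav) == 0:
        return wav
    energies = np.array([np.max(np.abs(wav[i:i + frame])) for i in range(0, len(wav), frame)])
    keep = np.where(energies > threshold)[0]
    if len(keep) == 0:
        return wav
    start = keep[0] * frame
    end = min(len(wav), (keep[-1] + 1) * frame)
    return wav[start:end]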