def _process_utterance(audio_dir, label_dir, index, wav_path, text_path, args):
    """
    Preprocesses a single utterance wav/text_jamo pair.

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - audio_dir: the directory to write the mel spectrograms into
        - label_dir: the directory to write the label files into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text_path: path to the file containing the text_jamo spoken in the input audio
        - args: hyper parameters

    Returns:
        - A tuple: (wav_path, text_path, mel_filename, label_filename, time_steps, mel_frames)
    """
    try:
        # Load the raw PCM audio as a numpy array
        # wav = audio.load_wav(wav_path, sr=args.sample_rate)
        with open(wav_path, 'rb') as pcmfile:
            buf = pcmfile.read()
            wav = np.frombuffer(buf, dtype='int16')
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # Rescale wav
    if args.rescale:
        wav = wav / np.abs(wav).max() * args.rescaling_max

    # M-AILABS extra silence specific
    if args.trim_silence:
        wav = audio.trim_silence(wav, args)

    # [-1, 1]
    out = wav
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, args).astype(out_dtype)
    mel_frames = mel_spectrogram.shape[1]

    # Ensure time resolution adjustment between audio and mel-spectrogram
    pad = audio.librosa_pad_lr(wav, args.n_fft, audio.get_hop_size(args))

    # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
    out = np.pad(out, (0, pad), mode='reflect')
    assert len(out) >= mel_frames * audio.get_hop_size(args)

    # Time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(args)]
    assert len(out) % audio.get_hop_size(args) == 0
    time_steps = len(out)

    # text_jamo sequence
    with open(text_path, 'r', encoding='CP949') as f:
        line = f.readline()
        # ETRI transcription rule
        line = sentence_filter(line).upper()
        label_sequence = normalize(line)
        print(label_sequence)

    # Write the spectrogram and label to disk
    mel_filename = 'mel-{}.npy'.format(index)
    label_filename = 'label-{}.txt'.format(index)
    np.save(os.path.join(audio_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    with open(os.path.join(label_dir, label_filename), 'w', encoding='utf-8') as f_out:
        f_out.write(label_sequence)

    # Return a tuple describing this training example
    return (wav_path, text_path, mel_filename, label_filename, time_steps, mel_frames)
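# --- Hypothetical driver (sketch, not part of the original pipeline) ---
# A minimal sketch of how a build_from_path-style wrapper might call the
# _process_utterance above in parallel. The metadata layout (an iterable of
# (wav_path, text_path) pairs), the function name and the executor usage are
# assumptions for illustration only.
from concurrent.futures import ProcessPoolExecutor


def build_from_path_sketch(args, pairs, audio_dir, label_dir, n_jobs=4):
    # pairs: iterable of (wav_path, text_path) tuples (hypothetical layout)
    executor = ProcessPoolExecutor(max_workers=n_jobs)
    futures = [executor.submit(_process_utterance, audio_dir, label_dir, index, wav_path, text_path, args)
               for index, (wav_path, text_path) in enumerate(pairs, start=1)]
    # _process_utterance returns None for missing files; drop those entries
    results = [f.result() for f in futures]
    return [r for r in results if r is not None]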
def _process_utterance(mel_dir, wav_dir, index, wav_path, speaker_id, hparams):
    """
    Preprocesses a single utterance wav pair.

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - speaker_id: id of the speaker of the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, mel_filename, speaker_id, time_steps, mel_frames)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

    # Rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

    # Assert all audio is in [-1, 1]
    if (wav > 1.).any() or (wav < -1.).any():
        raise RuntimeError('wav has invalid value: {}'.format(wav_path))
    if (preem_wav > 1.).any() or (preem_wav < -1.).any():
        raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        preem_wav = preem_wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))

        # Pad audio signal (just like it's done in librosa to avoid frame inconsistency)
        out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    if hparams.gin_channels > 0:
        speaker_name = hparams.speakers[speaker_id]
        index = speaker_name + '_' + index
    audio_filename = os.path.join(wav_dir, 'audio-{}.npy'.format(index))
    mel_filename = os.path.join(mel_dir, 'mel-{}.npy'.format(index))
    np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
    np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)

    # Global condition features
    if hparams.gin_channels > 0:
        speaker_id = speaker_id  # put the rule to determine how to assign speaker ids (using file names maybe? file basenames are available in the "index" variable)
    else:
        speaker_id = '<no_g>'

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, mel_filename, speaker_id, time_steps, mel_frames)
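# --- Assumed mu-law helpers (sketch) ---
# The helpers used above (is_mulaw_quantize, is_mulaw, mulaw, mulaw_quantize)
# are not defined in this file. This is a minimal sketch consistent with how
# they are called, based on the standard mu-law companding formula; the
# project's actual helpers may differ in detail.
import numpy as np


def is_mulaw_quantize(input_type):
    return input_type == 'mulaw-quantize'


def is_mulaw(input_type):
    return input_type == 'mulaw'


def mulaw(x, quantize_channels=256):
    # Compand [-1, 1] audio with mu = quantize_channels - 1
    mu = quantize_channels - 1
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)


def mulaw_quantize(x, quantize_channels=256):
    # Map companded [-1, 1] values to integer bins in [0, quantize_channels)
    mu = quantize_channels - 1
    y = mulaw(x, quantize_channels)
    return ((y + 1) / 2 * mu).astype(np.int16)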
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair.

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in txt metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # Rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # Sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))

        # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, text)
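# --- Assumed padding helpers (sketch) ---
# audio.get_hop_size and audio.librosa_pad_lr are referenced throughout but not
# defined here. The sketch below shows one plausible implementation of the
# arithmetic that makes the asserts above hold: pad the signal so an integer
# number of hop-size frames covers it, the way librosa centers STFT frames.
# Names and exact behaviour are assumptions; note that the variants in this
# file call librosa_pad_lr with different argument counts and expect either a
# scalar or an (l, r) pair, so the real helpers likely differ per project.
def get_hop_size(hparams):
    hop_size = hparams.hop_size
    if hop_size is None:
        assert hparams.frame_shift_ms is not None
        hop_size = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    return hop_size


def librosa_pad_lr(x, fsize, fshift, pad_sides=1):
    # fsize kept for signature compatibility; total padding is chosen so that
    # len(x) + pad is a multiple of the frame shift (hop size)
    pad = (len(x) // fshift + 1) * fshift - len(x)
    if pad_sides == 1:
        return 0, pad  # pad only on the right
    return pad // 2, pad // 2 + pad % 2  # split padding between both sides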
def _process_utterance(dataset, mel_dir, linear_dir, audio_dir, spk_emb_dir, index,
                       audio_path, text, emt_label, spk_label, sex, hparams):
    """
    Preprocesses a single utterance audio/text pair.

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - dataset: name of the dataset this utterance belongs to
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - audio_dir: the directory to write the preprocessed audio into
        - spk_emb_dir: the directory to write the speaker embeddings into
        - index: the numeric index to use in the spectrogram filename
        - audio_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - emt_label: emotion label of the utterance
        - spk_label: speaker label of the utterance
        - sex: sex of the speaker
        - hparams: hyper parameters

    Returns:
        - A tuple: (dataset, audio_filename, mel_filename, linear_filename, spk_emb_filename,
          time_steps, mel_frames, text, emt_label, spk_label, basename, sex)
    """
    try:
        # Load the audio as numpy array
        aud = audio.load_audio(audio_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(audio_path))
        return None

    # Trim lead/trail silences
    if hparams.trim_silence:
        aud = audio.trim_silence(aud, hparams)

    # Pre-emphasize
    preem_aud = audio.preemphasis(aud, hparams.preemphasis, hparams.preemphasize)

    # Rescale audio
    if hparams.rescale:
        aud = aud / np.abs(aud).max() * hparams.rescaling_max
        preem_aud = preem_aud / np.abs(preem_aud).max() * hparams.rescaling_max

    # Assert all audio is in [-1, 1]
    if (aud > 1.).any() or (aud < -1.).any():
        raise RuntimeError('audio has invalid value: {}'.format(audio_path))
    if (preem_aud > 1.).any() or (preem_aud < -1.).any():
        raise RuntimeError('audio has invalid value: {}'.format(audio_path))

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(aud, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        aud = aud[start:end]
        preem_aud = preem_aud[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(aud, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = aud
        constant_values = 0.
        out_dtype = np.float32

    # Compute the mel scale spectrogram from the audio
    mel_spectrogram = audio.melspectrogram(preem_aud, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
    #     return None

    # Compute the linear scale spectrogram from the audio
    # linear_spectrogram = audio.linearspectrogram(preem_aud, hparams).astype(np.float32)
    # linear_frames = linear_spectrogram.shape[1]

    # Sanity check
    # assert linear_frames == mel_frames

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(aud, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        l_pad, r_pad = audio.librosa_pad_lr(aud, hparams.n_fft, audio.get_hop_size(hparams), hparams.wavenet_pad_sides)

        # Pad audio signal on the right (just like it's done in librosa to avoid frame inconsistency)
        out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Get speaker embedding
    # spk_emb = scoring.get_embedding(spk_emb_model, spk_emb_buckets, audio_path)

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    spk_emb_filename = 'spkemb-{}.npy'.format(index)
    # np.save(os.path.join(audio_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    # np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)
    # np.save(os.path.join(spk_emb_dir, spk_emb_filename), spk_emb, allow_pickle=False)

    basename = os.path.basename(audio_path)

    # Return a tuple describing this training example
    return (dataset, audio_filename, mel_filename, linear_filename, spk_emb_filename,
            time_steps, mel_frames, text, emt_label, spk_label, basename, sex)
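# --- Assumed pre-emphasis helper (sketch) ---
# audio.preemphasis is used by several variants above but not defined in this
# file. A common implementation is a first-order high-pass FIR filter,
# y[n] = x[n] - k * x[n-1], with the matching inverse applied after synthesis.
# This is an assumption about the helper, not the project's verified code.
from scipy import signal


def preemphasis(wav, k, preemphasize=True):
    if preemphasize:
        return signal.lfilter([1, -k], [1], wav)
    return wav


def inv_preemphasis(wav, k, inv_preemphasize=True):
    if inv_preemphasize:
        return signal.lfilter([1], [1, -k], wav)
    return wav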
def _process_utterance(out_dir, index, wav_path, pinyin, hparams):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple
    to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        pinyin: The pinyin of the Chinese spoken in the input audio file
        hparams: hyper parameters

    Returns:
        An (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, pinyin)
        tuple to write to train.txt
    '''
    mel_dir = out_dir + "/mels"
    linear_dir = out_dir + "/linear"
    wav_dir = out_dir + "/audio"

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    print("debug wav_path:", wav_path)

    # Rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Mu-law quantize
    if is_mulaw_quantize(hparams.input_type):
        # [0, quantize_channels)
        out = mulaw_quantize(wav, hparams.quantize_channels)

        # Trim silences
        start, end = audio.start_and_end_indices(out, hparams.silence_threshold)
        wav = wav[start:end]
        out = out[start:end]

        constant_values = mulaw_quantize(0, hparams.quantize_channels)
        out_dtype = np.int16
    elif is_mulaw(hparams.input_type):
        # [-1, 1]
        out = mulaw(wav, hparams.quantize_channels)
        constant_values = mulaw(0., hparams.quantize_channels)
        out_dtype = np.float32
    else:
        # [-1, 1]
        out = wav
        constant_values = 0.
        out_dtype = np.float32

    # Compute a mel-scale spectrogram from the wav:
    # mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        print("debug --- drop wav_path:", wav_path, "mel_frames:", mel_frames)
        return None

    # Compute the linear-scale spectrogram from the wav:
    # spectrogram = audio.spectrogram(wav).astype(np.float32)
    # n_frames = spectrogram.shape[1]
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # Sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))

        # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Write the spectrograms to disk:
    # spectrogram_filename = 'thchs30-spec-%05d.npy' % index
    # mel_filename = 'thchs30-mel-%05d.npy' % index
    # np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    # np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename), out.astype(out_dtype), allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename), linear_spectrogram.T, allow_pickle=False)
    print("debug save wav file:", os.path.join(wav_dir, audio_filename))

    # Return a tuple describing this training example:
    return (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, pinyin)
def re_save_all(wav_path, audio_filename, mel_filename, linear_filename):
    try:
        # Load the audio as numpy array
        aud = audio.load_audio(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # Trim lead/trail silences
    if hparams.trim_silence:
        aud = audio.trim_silence(aud, hparams)

    # Pre-emphasize
    preem_aud = audio.preemphasis(aud, hparams.preemphasis, hparams.preemphasize)

    # Rescale audio
    if hparams.rescale:
        aud = aud / np.abs(aud).max() * hparams.rescaling_max
        preem_aud = preem_aud / np.abs(preem_aud).max() * hparams.rescaling_max

    # Assert all audio is in [-1, 1]
    if (aud > 1.).any() or (aud < -1.).any():
        raise RuntimeError('audio has invalid value: {}'.format(wav_path))
    if (preem_aud > 1.).any() or (preem_aud < -1.).any():
        raise RuntimeError('audio has invalid value: {}'.format(wav_path))

    # [-1, 1]
    out = aud
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the audio
    mel_spectrogram = audio.melspectrogram(preem_aud, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Compute the linear scale spectrogram from the audio
    linear_spectrogram = audio.linearspectrogram(preem_aud, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # Sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    l_pad, r_pad = audio.librosa_pad_lr(aud, hparams.n_fft, audio.get_hop_size(hparams), hparams.wavenet_pad_sides)

    # Pad audio signal on the right (just like it's done in librosa to avoid frame inconsistency)
    out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0

    # Write the spectrogram and audio to disk
    np.save(audio_filename, out.astype(out_dtype), allow_pickle=False)
    np.save(mel_filename, mel_spectrogram.T, allow_pickle=False)
    np.save(linear_filename, linear_spectrogram.T, allow_pickle=False)
    # Sanity check
    assert linear_frames == mel_frames

    if hparams.use_lws:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l, r = audio.pad_lr(wav, fft_size, audio.get_hop_size(hparams))

        # Zero pad audio signal
        out = np.pad(out, (l, r), mode='constant', constant_values=constant_values)
    else:
        # Ensure time resolution adjustment between audio and mel-spectrogram
        pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))

        # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
        out = np.pad(out, pad, mode='reflect')

    assert len(out) >= mel_frames * audio.get_hop_size(hparams)  # time resolution adjustment
def _process_utterance(mel_dir, index, wav_path, start, end, hparams):
    """
    Preprocesses a single utterance wav file with speech start/end annotations.

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - start, end: start and end points of speech
        - hparams: hyper parameters

    Returns:
        - A tuple: (wav_path, mel_filename, time_steps, mel_frames, start, end)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    start += 1 * hparams.sample_rate
    end += 1 * hparams.sample_rate

    # Rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # [-1, 1]
    out = wav
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(out_dtype)
    mel_frames = mel_spectrogram.shape[1]

    # Ensure time resolution adjustment between audio and mel-spectrogram
    pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))

    # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
    out = np.pad(out, (0, pad), mode='reflect')
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    # Convert start/end sample indices to mel-frame indices
    start = round(start / int(time_steps / mel_frames))
    end = round(end / int(time_steps / mel_frames))

    # Write the spectrogram to disk
    mel_filename = 'mel-{}.npy'.format(index)
    np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (wav_path, mel_filename, time_steps, mel_frames, start, end)
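# --- Worked example of the start/end conversion above (illustrative values) ---
# After the length adjustment, time_steps / mel_frames equals the hop size, so
# dividing a sample index by it converts sample positions into mel-frame
# indices. Assuming sample_rate=16000 and hop_size=200 (both hypothetical):
#   start annotated at 1.25 s -> sample 20000, plus the 1 s offset added above -> 36000
#   frame index = round(36000 / 200) = 180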
def build_from_path_ispl(hparams, input_dirs, mel_dir, label_dir, tqdm=lambda x: x):
    """
    Preprocesses the speech dataset from a given input path to given output directories.

    Args:
        - hparams: hyper parameters
        - input_dirs: input directories that contain the files to preprocess
        - mel_dir: output directory of the preprocessed speech mel-spectrogram dataset
        - label_dir: the directory to write the labels into
        - tqdm: Optional, provides a nice progress bar

    Returns:
        - A list of tuples describing the training examples. This should be written to train.txt
    """
    # We could use ProcessPoolExecutor to parallelize across processes; this is just
    # an optimization and it can be omitted
    futures = []
    index = 1
    for input_dir in input_dirs:
        files = find_files(os.path.join(input_dir))
        for wav_path in files:
            file_name = wav_path.split("\\")[-1]
            if int(file_name.split('.')[0]) <= 10:
                label_path = wav_path.split("\\")[0] + '/label.txt'
                with open(label_path, encoding='utf-8') as f:
                    lines = f.readlines()
                    for line in lines:
                        if file_name in line:
                            labels = line.replace('[', '').replace(']', '').split(':')[1].replace(',\n', '').split(',')
                            start = []
                            end = []
                            for idx in range(0, len(labels), 2):
                                start.append(int(labels[idx]))
                                end.append(int(labels[idx + 1]))

            try:
                # Load the audio as numpy array
                wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
            except FileNotFoundError:  # catch missing wav exception
                print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
                return None

            # Rescale wav
            if hparams.rescale:
                wav = wav / np.abs(wav).max() * hparams.rescaling_max

            # M-AILABS extra silence specific
            if hparams.trim_silence:
                wav = audio.trim_silence(wav, hparams)

            # [-1, 1]
            out = wav
            out_dtype = np.float32

            if int(file_name.split('.')[0]) <= 10:
                # Build a per-sample label from the annotated speech segments
                label = np.zeros_like(out)
                for idx in range(len(start)):
                    start[idx] = int(start[idx] / 1000 * hparams.sample_rate)
                    end[idx] = int(end[idx] / 1000 * hparams.sample_rate)
                    label[start[idx]:end[idx]] = 1.
            else:
                # Load a precomputed label file
                label = wav_path.split('.')[0] + '.label'
                with open(label, encoding='utf-8') as f:
                    lines = f.readlines()
                    label = np.asarray([int(line.strip('\n')) for line in lines])

            # Compute the mel scale spectrogram from the wav
            mel_spectrogram = audio.melspectrogram(wav, hparams).astype(out_dtype)
            mel_spectrogram = mel_spectrogram[:, -len(label):]
            mel_frames = mel_spectrogram.shape[1]

            # Ensure time resolution adjustment between audio and mel-spectrogram
            pad = audio.librosa_pad_lr(wav, hparams.n_fft, audio.get_hop_size(hparams))

            if int(file_name.split('.')[0]) <= 10:
                # Reflect pad audio signal (just like it's done in librosa to avoid frame inconsistency)
                out = np.pad(out, (0, pad), mode='reflect')
                label = np.pad(label, (0, pad), mode='reflect')
                assert len(out) >= mel_frames * audio.get_hop_size(hparams)

                # Time resolution adjustment:
                # ensure length of raw audio is a multiple of hop size so that we can use
                # transposed convolution to upsample
                out = out[:mel_frames * audio.get_hop_size(hparams)]
                label = label[:mel_frames * audio.get_hop_size(hparams)]
                assert len(out) % audio.get_hop_size(hparams) == 0
                # Downsample the per-sample label to one value per mel frame
                label = label[::audio.get_hop_size(hparams)]
                time_steps = len(out)
            else:
                time_steps = len(out)

            # Write the spectrogram and label to disk
            mel_filename = 'mel-{}.npy'.format(index)
            label_filename = 'label-{}.npy'.format(index)
            np.save(os.path.join(mel_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
            np.save(os.path.join(label_dir, label_filename), label, allow_pickle=False)

            futures.append((wav_path, mel_filename, time_steps, mel_frames, label_filename))
            index += 1

    return [future for future in tqdm(futures)]
def _process_utterance(out_dir, index, wav_path, text, hparams):
    """
    Preprocesses a single utterance wav/text pair.

    This writes the mel scale spectrogram to disk and returns a tuple to write
    to the train.txt file.

    Args:
        - out_dir: the directory to write the msgpack into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (npz_filename, time_steps, mel_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  # catch missing wav exception
        print('file {} present in csv metadata is not present in wav folder. skipping!'.format(wav_path))
        return None

    # Trim lead/trail silences
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # Pre-emphasize
    preem_wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

    # Rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max
        preem_wav = preem_wav / np.abs(preem_wav).max() * hparams.rescaling_max

    # Assert all audio is in [-1, 1]
    if (wav > 1.).any() or (wav < -1.).any():
        raise RuntimeError('wav has invalid value: {}'.format(wav_path))
    if (preem_wav > 1.).any() or (preem_wav < -1.).any():
        raise RuntimeError('wav has invalid value: {}'.format(wav_path))

    # [-1, 1]
    out = wav
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(preem_wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(preem_wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # Sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram
    l_pad, r_pad = audio.librosa_pad_lr(wav, audio.get_hop_size(hparams), hparams.pad_sides)

    # Pad audio signal on the right (just like it's done in librosa to avoid frame inconsistency)
    out = np.pad(out, (l_pad, r_pad), mode='constant', constant_values=constant_values)
    assert len(out) >= mel_frames * audio.get_hop_size(hparams)

    # Time resolution adjustment:
    # ensure length of raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * audio.get_hop_size(hparams)]
    assert len(out) % audio.get_hop_size(hparams) == 0
    time_steps = len(out)

    npz_filename = '{}.npz'.format(index)
    r = hparams.outputs_per_step
    if hparams.symmetric_mels:
        _pad_value = -hparams.max_abs_value
    else:
        _pad_value = 0.

    # +2r for head and tail silence
    mel_spec = np.pad(mel_spectrogram.T, [[r, r], [0, 0]], 'constant', constant_values=_pad_value)
    linear_spec = np.pad(linear_spectrogram.T, [[r, r], [0, 0]], 'constant', constant_values=_pad_value)
    target_length = len(linear_spec)
    target_frames = (target_length // r + 1) * r
    num_pad = target_frames - target_length
    if num_pad != 0:
        linear_spec = np.pad(linear_spec, ((0, num_pad), (0, 0)), "constant", constant_values=_pad_value)
        mel_spec = np.pad(mel_spec, ((0, num_pad), (0, 0)), "constant", constant_values=_pad_value)

    # Stop token target: 0 for every frame except the last one
    stop_token = np.concatenate(
        [np.zeros(target_frames - 1, dtype=np.float32), np.ones(1, dtype=np.float32)],
        axis=0)

    data = {
        'mel': mel_spec,
        'linear': linear_spec,
        'audio': out.astype(out_dtype),
        'input_data': np.asarray(text_to_sequence(text)),
        'time_steps': time_steps,
        'mel_frames': target_frames,
        'text': text,
        'stop_token': stop_token,
    }
    dumps_msgpack(data, os.path.join(out_dir, npz_filename))

    # Return a tuple describing this training example
    return npz_filename, time_steps, mel_frames, text
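# --- Assumed serialization helper (sketch) ---
# dumps_msgpack is called above but not defined in this file. A minimal sketch
# using msgpack with msgpack_numpy to handle the numpy arrays in the example
# dict; the project's real helper may use a different protocol or file layout.
import msgpack
import msgpack_numpy


def dumps_msgpack(data, path):
    # Serialize a dict of numpy arrays / scalars / strings to a msgpack file
    with open(path, 'wb') as f:
        f.write(msgpack.packb(data, default=msgpack_numpy.encode, use_bin_type=True))


def loads_msgpack(path):
    # Inverse of dumps_msgpack, restoring numpy arrays
    with open(path, 'rb') as f:
        return msgpack.unpackb(f.read(), object_hook=msgpack_numpy.decode, raw=False)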