def get_preprocessed_wav(wav_path, tg_path):
    """Load a wav and trim it to the aligned speech region.

    Returns:
        (trimmed float32 waveform, sample rate reported by the wav reader,
         per-phone duration list from the TextGrid alignment).
    """
    # The phone tier of the TextGrid yields the speech start/end in seconds.
    tg = tgt.io.read_textgrid(tg_path)
    _, duration, start, end = get_alignment(tg.get_tier_by_name('phones'))

    # Read the audio, then keep only the aligned span.
    sample_rate, audio = read(wav_path)
    lo = int(hparams.sampling_rate * start)
    hi = int(hparams.sampling_rate * end)
    trimmed = audio[lo:hi].astype(np.float32)
    return trimmed, sample_rate, duration
def process_utterance(in_dir, out_dir, basename):
    """Preprocess one utterance: trim audio to the alignment, extract f0,
    mel spectrogram and energy, and save them as .npy files.

    Returns a tuple ("basename|text", max f0, min voiced f0, max energy,
    min energy, number of mel frames), or None if the utterance is
    unusable (empty alignment, fully unvoiced, or too long).
    """
    wav_path = os.path.join(in_dir, 'wavn', '{}.wav'.format(basename))
    tg_path = os.path.join(out_dir, 'TextGrid', '{}.TextGrid'.format(basename))

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(textgrid.get_tier_by_name('phones'))
    text = '{' + ' '.join(phone) + '}'
    text = text.replace(' $ ', '} {')  # $ represents silent phones
    if start >= end:
        return None

    n_frames = sum(duration)

    # Read and trim wav files
    _, wav = read(wav_path)
    wav = wav[int(hp.sampling_rate * start):int(hp.sampling_rate * end)].astype(np.float32)

    # Compute fundamental frequency
    f0, _ = pw.dio(wav.astype(np.float64), hp.sampling_rate,
                   frame_period=hp.hop_length / hp.sampling_rate * 1000)
    f0 = f0[:n_frames]
    # Skip fully unvoiced clips: min() over the voiced frames at the return
    # would raise ValueError on an empty list, after files were written.
    voiced = [f for f in f0 if f != 0]
    if not voiced:
        return None

    # Compute mel-scale spectrogram
    mel_spectrogram = Audio.tools.get_mel_from_wav(torch.FloatTensor(wav)).numpy().astype(np.float32)
    mel_spectrogram = mel_spectrogram[:, :n_frames]
    if mel_spectrogram.shape[1] >= hp.max_seq_len:
        return None

    # Compute energy (per-frame L2 norm of the mel columns)
    energy = np.linalg.norm(mel_spectrogram, axis=0)

    # Save alignment
    ali_filename = '{}-ali-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'alignment', ali_filename), duration, allow_pickle=False)

    # Save fundamental frequency
    f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'f0', f0_filename), f0, allow_pickle=False)

    # Save energy
    energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'energy', energy_filename), energy, allow_pickle=False)

    # Save spectrogram (transposed to frames-first for training)
    mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'mel', mel_filename), mel_spectrogram.T, allow_pickle=False)

    return ('|'.join([basename, text]), max(f0), min(voiced),
            max(energy), min(energy), mel_spectrogram.shape[1])
def process_utterance(in_dir, out_dir, dirname, basename):
    """Preprocess one utterance from a sub-directory layout: trim audio to
    the alignment, extract f0, mel spectrogram and energy, save them as
    .npy files.

    Returns a tuple ("basename|text", max f0, min voiced f0, max energy,
    min energy, number of mel frames), or None when the utterance is
    unusable (no TextGrid, empty alignment, too long, inconsistent frame
    counts, or fully unvoiced).
    """
    wav_path = os.path.join(in_dir, dirname, '{}.wav'.format(basename))
    tg_path = os.path.join(out_dir, 'TextGrid', dirname, '{}.TextGrid'.format(basename))
    if not os.path.exists(tg_path):
        return None

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name('phones'))
    text = '{' + '}{'.join(phone) + '}'  # '{A}{B}{$}{C}', $ represents silent phones
    text = text.replace('{$}', ' ')  # '{A}{B} {C}'
    text = text.replace('}{', ' ')  # '{A B} {C}'
    if start >= end:
        return None

    # Read and trim wav files
    wav, _ = librosa.load(wav_path, sr=hp.sampling_rate)
    wav = wav[int(hp.sampling_rate * start):int(hp.sampling_rate * end)].astype(np.float32)

    # Compute fundamental frequency
    f0, _ = pw.dio(wav.astype(np.float64), hp.sampling_rate,
                   frame_period=hp.hop_length / hp.sampling_rate * 1000)
    f0 = f0[:sum(duration)]

    # Compute mel-scale spectrogram and energy
    mel_spectrogram, energy = Audio.tools.get_mel_from_wav(
        torch.FloatTensor(wav))
    mel_spectrogram = mel_spectrogram.cpu().numpy().astype(
        np.float32)[:, :sum(duration)]
    energy = energy.cpu().numpy().astype(np.float32)[:sum(duration)]
    if mel_spectrogram.shape[1] >= hp.max_seq_len:
        return None

    # If the shape is not right, you can check the get_alignment function.
    try:
        assert (f0.shape[0] == energy.shape[0] == mel_spectrogram.shape[1])
    except AssertionError:
        print("duration problem: {}".format(wav_path))
        return None

    # Save alignment
    ali_filename = '{}-ali-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'alignment', ali_filename), duration, allow_pickle=False)

    # Save fundamental frequency
    f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'f0', f0_filename), f0, allow_pickle=False)

    # Save energy
    energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'energy', energy_filename), energy, allow_pickle=False)

    # Save spectrogram
    mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'mel', mel_filename), mel_spectrogram.T, allow_pickle=False)

    try:
        return '|'.join([basename, text]), max(f0), min([
            f for f in f0 if f != 0
        ]), max(energy), min(energy), mel_spectrogram.shape[1]
    except ValueError:
        # min() over an empty list: the clip has no voiced frames.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
        return None
def process_utterance(in_dir, out_dir, basename):
    """Preprocess one utterance: trim audio to the alignment, compute the
    mel spectrogram, and save the duration alignment.

    In this variant the f0/energy extraction and their saves are commented
    out; only the alignment is saved and only "basename|text" is returned.
    Returns None when the alignment is empty or the clip is too long.
    """
    wav_path = os.path.join(in_dir, 'wavs', '{}.wav'.format(basename))
    tg_path = os.path.join(out_dir, 'TextGrid', '{}.TextGrid'.format(basename))
    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    # phone: list of phone strings, leading/trailing silence removed
    # duration: list of per-phone frame counts
    # start, end: floats (seconds) — the interval after trimming the
    #   leading/trailing silence from the audio file
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name('phones'))
    if start >= end:
        return None
    phone, duration = add_pad_between_word(phone, duration, textgrid)
    sum_duration = sum(duration)
    text = '{' + '}{'.join(
        phone) + '}'  # '{A}{B}{$}{C}', $ represents silent phones
    text = text.replace('{$}', ' ')  # '{A}{B} {C}'
    text = text.replace('}{', ' ')  # '{A B} {C}'
    # Read and trim wav files
    # wav: 1-D sample array (e.g. ndarray<212893>)
    _, wav = read(wav_path)
    wav = wav[int(hp.sampling_rate * start):int(hp.sampling_rate *
                                                end)].astype(np.float32)
    # Compute mel-scale spectrogram and energy
    # mel_spectrogram: e.g. ndarray<80, 831> — mel spectrogram; presumably the
    #   0-8000 Hz range split into 80 bands — TODO confirm against Audio.tools
    # energy: e.g. ndarray<831> — per-frame magnitude (observed range ~0-315)
    mel_spectrogram, energy = Audio.tools.get_mel_from_wav(
        torch.FloatTensor(wav))
    mel_spectrogram = mel_spectrogram.numpy().astype(
        np.float32)[:, :sum_duration]
    if mel_spectrogram.shape[1] >= hp.max_seq_len:
        return None
    # energy = energy.numpy().astype(np.float32)[:sum_duration]
    #
    # # Compute fundamental frequency
    # # f0: e.g. ndarray<832>
    # f0, _ = pw.dio(wav.astype(np.float64), hp.sampling_rate, frame_period=hp.hop_length / hp.sampling_rate * 1000)
    # # f0: e.g. ndarray<831> — fundamental frequency (vocal-fold vibration
    # # rate; typically ~140 Hz for humans, here in the 70-800 Hz range)
    # f0 = f0[:sum_duration]
    # Save alignment
    ali_filename = '{}-ali-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'alignment', ali_filename), duration, allow_pickle=False)
    # # Save fundamental frequency
    # f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
    # np.save(os.path.join(out_dir, 'f0', f0_filename), f0, allow_pickle=False)
    #
    # # Save energy
    # energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)
    # np.save(os.path.join(out_dir, 'energy', energy_filename), energy, allow_pickle=False)
    #
    # # Save spectrogram
    # mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
    # np.save(os.path.join(out_dir, 'mel', mel_filename), mel_spectrogram.T, allow_pickle=False)
    # return '|'.join([basename, text]), max(f0), min([f for f in f0 if f != 0]), max(energy), min(energy), \
    #     mel_spectrogram.shape[1]
    return '|'.join([basename, text])
def process_utterance(in_dir, out_dir, basename, scalers):
    """Preprocess one kss utterance: convert the backup wav to mono 22050 Hz
    PCM if needed, trim to the alignment, extract f0/mel/energy (outliers
    removed, averaged per phone), save them, and update the running
    normalization scalers.

    Args:
        scalers: (mel_scaler, f0_scaler, energy_scaler) — sklearn-style
            objects updated in place via partial_fit.

    Returns ("basename|text", number of mel frames), or None when the
    alignment is empty or the clip is too long.
    """
    wav_bak_basename = basename.replace('.wav', '')
    basename = wav_bak_basename[2:]  # drop the 2-char kss directory prefix
    wav_bak_path = os.path.join(in_dir, "wavs_bak", "{}.wav".format(wav_bak_basename))
    wav_path = os.path.join(in_dir, 'wavs', '{}.wav'.format(basename))

    # Convert kss data into PCM encoded wavs.
    if not os.path.isfile(wav_path):
        # Argument list with no shell: a path containing spaces or shell
        # metacharacters cannot break or inject into the command
        # (os.system() with a formatted string could).
        import subprocess
        subprocess.run(
            ['ffmpeg', '-i', wav_bak_path, '-ac', '1', '-ar', '22050', wav_path])

    tg_path = os.path.join(out_dir, 'TextGrid', '{}.TextGrid'.format(basename))

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(textgrid.get_tier_by_name('phones'))
    text = '{' + '}{'.join(phone) + '}'  # '{A}{B}{$}{C}', $ represents silent phones
    text = text.replace('{$}', ' ')  # '{A}{B} {C}'
    text = text.replace('}{', ' ')  # '{A B} {C}'
    if start >= end:
        return None

    # Read and trim wav files
    _, wav = read(wav_path)
    wav = wav[int(hp.sampling_rate * start):int(hp.sampling_rate * end)].astype(np.float32)

    # Compute fundamental frequency
    f0, _ = pw.dio(wav.astype(np.float64), hp.sampling_rate,
                   frame_period=hp.hop_length / hp.sampling_rate * 1000)
    f0 = f0[:sum(duration)]

    # Compute mel-scale spectrogram and energy
    mel_spectrogram, energy = Audio.tools.get_mel_from_wav(torch.FloatTensor(wav))
    mel_spectrogram = mel_spectrogram.numpy().astype(np.float32)[:, :sum(duration)]
    energy = energy.numpy().astype(np.float32)[:sum(duration)]

    # Clean up and reduce f0/energy to one value per phone.
    f0, energy = remove_outlier(f0), remove_outlier(energy)
    f0, energy = average_by_duration(f0, duration), average_by_duration(energy, duration)

    if mel_spectrogram.shape[1] >= hp.max_seq_len:
        return None

    # Save alignment
    ali_filename = '{}-ali-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'alignment', ali_filename), duration, allow_pickle=False)

    # Save fundamental frequency
    f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'f0', f0_filename), f0, allow_pickle=False)

    # Save energy
    energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'energy', energy_filename), energy, allow_pickle=False)

    # Save spectrogram
    mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'mel', mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Update the running statistics with nonzero values only.
    mel_scaler, f0_scaler, energy_scaler = scalers
    mel_scaler.partial_fit(mel_spectrogram.T)
    f0_scaler.partial_fit(f0[f0 != 0].reshape(-1, 1))
    energy_scaler.partial_fit(energy[energy != 0].reshape(-1, 1))

    return '|'.join([basename, text]), mel_spectrogram.shape[1]
def process_utterance(in_dir, out_dir, basename):
    """Preprocess one utterance: trim audio to the alignment, extract f0,
    mel spectrogram and energy, and save each as an .npy file.

    Returns a tuple ("basename|text", max f0, min voiced f0, max energy,
    min energy, number of mel frames), or None when the utterance is
    rejected (empty alignment, too-short audio, all-zero f0, or too long).
    """
    wav_path = os.path.join(in_dir, 'wavs', '{}.wav'.format(basename))
    tg_path = os.path.join(out_dir, 'TextGrid', '{}.TextGrid'.format(basename))

    # Phone-level alignment from the TextGrid.
    tg = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(tg.get_tier_by_name('phones'))

    # '{A}{B}{$}{C}' -> '{A B} {C}'; $ represents silent phones.
    text = '{' + '}{'.join(phone) + '}'
    text = text.replace('{$}', ' ').replace('}{', ' ')

    if start >= end:
        return None

    # Read the audio and keep only the aligned span.
    _, samples = read(wav_path)
    lo = int(hp.sampling_rate * start)
    hi = int(hp.sampling_rate * end)
    samples = samples[lo:hi].astype(np.float32)

    # Reject clips too short to analyse.
    if np.size(samples, 0) < 1024:
        return None

    n_frames = sum(duration)

    # Fundamental frequency, one value per hop.
    f0, _ = pw.dio(samples.astype(np.float64), hp.sampling_rate,
                   frame_period=hp.hop_length / hp.sampling_rate * 1000)
    f0 = f0[:n_frames]
    # Reject fully unvoiced clips (f0 all zero).
    if max(f0) == 0:
        return None

    # Mel spectrogram and per-frame energy, truncated to the alignment length.
    mel, energy = Audio.tools.get_mel_from_wav(torch.FloatTensor(samples))
    mel = mel.numpy().astype(np.float32)[:, :n_frames]
    energy = energy.numpy().astype(np.float32)[:n_frames]

    if mel.shape[1] >= hp.max_seq_len:
        return None

    # Persist alignment, f0, energy, and the (frames-first) spectrogram.
    ali_filename = '{}-ali-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'alignment', ali_filename), duration,
            allow_pickle=False)

    f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'f0', f0_filename), f0, allow_pickle=False)

    energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'energy', energy_filename), energy,
            allow_pickle=False)

    mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'mel', mel_filename), mel.T,
            allow_pickle=False)

    voiced_min = min(f for f in f0 if f > 0)
    return ('|'.join([basename, text]), max(f0), voiced_min,
            max(energy), min(energy), mel.shape[1])
def process_utterance(in_dir, out_dir, dirname, basename):
    """Preprocess one utterance from a sub-directory layout: trim audio to
    the alignment, extract f0 (raw + normalized), mel spectrogram, and
    energy (raw + rescaled to [0, 1]), and save each as an .npy file.

    Returns a tuple ("basename|text", max f0, min voiced f0, max energy,
    min energy, number of mel frames), or None when the utterance is
    unusable (no TextGrid, empty alignment, fully unvoiced, or too long).
    """
    wav_path = os.path.join(in_dir, dirname, '{}.wav'.format(basename))
    tg_path = os.path.join(out_dir, 'TextGrid', dirname, '{}.TextGrid'.format(basename))
    if not os.path.exists(tg_path):
        return None

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = utils.get_alignment(
        textgrid.get_tier_by_name('phones'))
    text = '{' + '}{'.join(phone) + '}'  # '{A}{B}{$}{C}', $ represents silent phones
    text = text.replace('{$}', ' ')  # '{A}{B} {C}'
    text = text.replace('}{', ' ')  # '{A B} {C}'
    if start >= end:
        return None

    # Read and trim wav files (sample rate from the reader is unused;
    # hp.sampling_rate is assumed throughout — TODO confirm they match).
    _, wav = read(wav_path)
    wav = wav[int(hp.sampling_rate * start):int(hp.sampling_rate * end)].astype(np.float32)

    # Compute fundamental frequency
    f0, _ = pw.dio(wav.astype(np.float64), hp.sampling_rate,
                   frame_period=hp.hop_length / hp.sampling_rate * 1000)
    f0 = f0[:sum(duration)]
    # Skip fully unvoiced clips: min() over the voiced frames at the return
    # would raise ValueError on an empty list, after files were written.
    voiced = [f for f in f0 if f != 0]
    if not voiced:
        return None

    # Compute mel-scale spectrogram and energy
    mel_spectrogram, energy, _ = Audio.tools.get_mel_from_wav(
        torch.FloatTensor(wav))
    mel_spectrogram = mel_spectrogram.numpy().astype(
        np.float32)[:, :sum(duration)]
    energy = energy.numpy().astype(np.float32)[:sum(duration)]
    if mel_spectrogram.shape[1] >= hp.max_seq_len:
        return None

    # Save alignment
    ali_filename = '{}-ali-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'alignment', ali_filename), duration, allow_pickle=False)

    # Save fundamental frequency
    f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'f0', f0_filename), f0, allow_pickle=False)

    # Save normalized fundamental frequency
    f0_norm = utils.f0_normalization(f0)
    np.save(os.path.join(out_dir, 'f0_norm', f0_filename), f0_norm, allow_pickle=False)

    # Save energy
    energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'energy', energy_filename), energy, allow_pickle=False)

    # Save rescaled energy
    energy_0to1 = utils.energy_rescaling(energy)
    np.save(os.path.join(out_dir, 'energy_0to1', energy_filename), energy_0to1, allow_pickle=False)

    # Save spectrogram
    mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'mel_clean', mel_filename), mel_spectrogram.T, allow_pickle=False)

    return ('|'.join([basename, text]), max(f0), min(voiced),
            max(energy), min(energy), mel_spectrogram.shape[1])
def process_utterance(in_dir, out_dir, basename):
    """Preprocess one utterance: normalize silence phone labels to '{sp}',
    trim audio to the alignment, extract f0, mel spectrogram and energy,
    and save each as an .npy file.

    Returns a tuple ("basename|text", max f0, min voiced f0, max energy,
    min energy, number of mel frames), or None when the utterance is
    unusable (no TextGrid, all-silence, empty alignment, fully unvoiced,
    or too long).
    """
    wav_path = os.path.join(in_dir, '{}.wav'.format(basename))
    tg_path = os.path.join(out_dir, 'TextGrid', '{}.TextGrid'.format(basename))
    if not os.path.exists(tg_path):
        print(tg_path, ' is not found')
        return None

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name('phones'))

    # Map empty labels and every silence variant ('sp...', 'sil') to '{sp}'.
    for i, v in enumerate(phone):
        if len(v) == 0 or 'sp' in v or v == 'sil':
            phone[i] = '{sp}'
    # Strip trailing silence. The original `while '{sp}' in phone[-1]:`
    # raised IndexError once the list emptied (all-silence utterance).
    while phone and '{sp}' in phone[-1]:
        phone = phone[:-1]
    if not phone:
        return None

    text = ''.join(phone)
    # Keep durations in step with the trimmed phone list.
    duration = duration[:len(phone)]
    if start >= end:
        return None

    # Read and trim wav files
    _, wav = read(wav_path)
    wav = wav[int(hp.sampling_rate * start):int(hp.sampling_rate * end)].astype(np.float32)

    # Compute fundamental frequency
    f0, _ = pw.dio(wav.astype(np.float64), hp.sampling_rate,
                   frame_period=hp.hop_length / hp.sampling_rate * 1000)
    f0 = f0[:sum(duration)]
    # Skip fully unvoiced clips: min() over the voiced frames at the return
    # would raise ValueError on an empty list, after files were written.
    voiced = [f for f in f0 if f != 0]
    if not voiced:
        return None

    # Compute mel-scale spectrogram and energy
    mel_spectrogram, energy = Audio.tools.get_mel_from_wav(
        torch.FloatTensor(wav))
    mel_spectrogram = mel_spectrogram.numpy().astype(
        np.float32)[:, :sum(duration)]
    energy = energy.numpy().astype(np.float32)[:sum(duration)]
    if mel_spectrogram.shape[1] >= hp.max_seq_len:
        return None

    # Save alignment
    ali_filename = '{}-ali-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'alignment', ali_filename), duration, allow_pickle=False)

    # Save fundamental frequency
    f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'f0', f0_filename), f0, allow_pickle=False)

    # Save energy
    energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'energy', energy_filename), energy, allow_pickle=False)

    # Save spectrogram
    mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'mel', mel_filename), mel_spectrogram.T, allow_pickle=False)

    return ('|'.join([basename, text]), max(f0), min(voiced),
            max(energy), min(energy), mel_spectrogram.shape[1])