def _preprocess_speaker_aishell2(speaker_dir, suffix, out_dir: Path, skip_existing: bool, hparams, others_params):
    """Preprocess all utterances of one AISHELL-2 speaker.

    Args:
        speaker_dir: (directory Path, speaker id) tuple for this speaker.
        suffix: audio file extension without the dot, e.g. "wav".
        out_dir: destination directory passed through to process_utterance.
        skip_existing: skip utterances whose outputs already exist.
        hparams: hyper-parameters (uses sample_rate and rescaling_max).
        others_params: dict with "trans_dict" (utterance stem -> transcript)
            and "detach_label_and_embed_utt" (bool).

    Returns:
        List of the non-None metadata entries produced by process_utterance.
    """
    speaker_dir, spk_id = speaker_dir
    trans_dict = others_params["trans_dict"]
    detach_label_and_embed_utt = others_params["detach_label_and_embed_utt"]

    utterances = list(speaker_dir.glob("*." + suffix))
    total = len(utterances)
    results = []
    for fpath in speaker_dir.glob("*." + suffix):
        assert fpath.exists(), str(fpath) + " not exist."
        # Load and peak-normalize; 1e-8 guards an all-zero (silent) clip.
        wav, _ = librosa.load(str(fpath), hparams.sample_rate)
        peak = np.max(np.abs(wav))
        if not peak > 0.0:
            peak = 1e-8
        wav = wav / peak * hparams.rescaling_max
        # Denoise only clips longer than 0.4 s; the noise profile is taken
        # from the first and last 150 ms of the clip.
        if len(wav) > hparams.sample_rate * (0.3 + 0.1):
            edges = np.concatenate([
                wav[:int(hparams.sample_rate * 0.15)],
                wav[-int(hparams.sample_rate * 0.15):],
            ])
            noise_profile = logmmse.profile_noise(edges, hparams.sample_rate)
            wav = logmmse.denoise(wav, noise_profile, eta=0)
        # Trim leading/trailing silence (top_db=30; smaller for noisy audio).
        wav = audio.trim_silence(wav, 30)
        # Transcript -> space-joined pinyin token string.
        pinyin = " ".join(get_pinyin(trans_dict[fpath.stem], std=True, pb=True))
        # Optionally take the speaker embedding from a random utterance of
        # this speaker rather than from the labelled utterance itself.
        embed_basename = None
        if detach_label_and_embed_utt:
            embed_basename = utterances[np.random.randint(total)].stem
        results.append(
            process_utterance(wav, pinyin, out_dir, fpath.stem,
                              skip_existing, hparams, embed_basename))
    return [m for m in results if m is not None]
def _preprocess_speaker_SLR38(speaker_dir, suffix, out_dir: Path, skip_existing: bool, hparams, others_params):
    """Preprocess all utterances of one SLR38 (ST-CMDS) speaker.

    Args:
        speaker_dir: directory Path holding this speaker's wav/txt pairs.
        suffix: audio file extension without the dot, e.g. "wav".
        out_dir: destination directory passed through to process_utterance.
        skip_existing: skip utterances whose outputs already exist.
        hparams: hyper-parameters (uses sample_rate and rescaling_max).
        others_params: dict with "detach_label_and_embed_utt" (bool).

    Returns:
        List of the non-None metadata entries produced by process_utterance.
    """
    detach_label_and_embed_utt = others_params["detach_label_and_embed_utt"]
    metadata = []
    utt_fpath_list = sorted(speaker_dir.glob("*." + suffix))
    utt_num = len(utt_fpath_list)
    for wav_fpath in utt_fpath_list:
        # BUGFIX: pair each wav with the .txt of the same stem instead of
        # zip()-ing two independent glob() streams, whose ordering is not
        # guaranteed to correspond entry-by-entry.
        txt_fpath = wav_fpath.with_suffix(".txt")
        assert wav_fpath.exists(), str(wav_fpath) + " not exist."
        # BUGFIX: the message previously printed the wav path, not the txt path.
        assert txt_fpath.exists(), str(txt_fpath) + " not exist."
        # Process each utt
        wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
        # BUGFIX: guard peak normalization against an all-zero clip
        # (consistent with _preprocess_speaker_aishell2).
        wav_abs_max = np.max(np.abs(wav))
        wav = wav / (wav_abs_max if wav_abs_max > 0.0 else 1e-8) * hparams.rescaling_max
        # denoise: only clips longer than 0.4 s; noise profile is estimated
        # from the first and last 150 ms of the clip
        if len(wav) > hparams.sample_rate * (0.3 + 0.1):
            noise_wav = np.concatenate([
                wav[:int(hparams.sample_rate * 0.15)],
                wav[-int(hparams.sample_rate * 0.15):]
            ])
            profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
            wav = logmmse.denoise(wav, profile, eta=0)
        # trim silence (top_db=30; smaller for noisy audio)
        wav = audio.trim_silence(wav, 30)
        # get text, then convert Chinese to pinyin tokens
        text = txt_fpath.read_text()
        pinyin = " ".join(get_pinyin(text, std=True, pb=True))
        # Optionally take the speaker embedding from a random utterance of
        # this speaker rather than from the labelled utterance itself.
        random_uttBasename_forSpkEmbedding = None
        if detach_label_and_embed_utt:
            random_uttBasename_forSpkEmbedding = utt_fpath_list[
                np.random.randint(utt_num)].stem
        metadata.append(
            process_utterance(wav, pinyin, out_dir, wav_fpath.stem,
                              skip_existing, hparams,
                              random_uttBasename_forSpkEmbedding))
    return [m for m in metadata if m is not None]
def _preprocess_speaker_SLR68(speaker_dir, suffix, out_dir: Path, skip_existing: bool, hparams, others_params):
    """Preprocess all utterances of one SLR68 speaker (no normalize/denoise/trim).

    NOTE(review): an identically-named _preprocess_speaker_SLR68 defined later
    in this module shadows this definition at import time — confirm which of
    the two is intended and remove the other.

    Args:
        speaker_dir: directory Path holding this speaker's audio files.
        suffix: audio file extension without the dot, e.g. "wav".
        out_dir: destination directory passed through to process_utterance.
        skip_existing: skip utterances whose outputs already exist.
        hparams: hyper-parameters (uses sample_rate).
        others_params: dict with "trans_dict" (file name -> {"text": ...}).

    Returns:
        List of the non-None metadata entries produced by process_utterance.
    """
    trans_dict = others_params["trans_dict"]
    results = []
    for fpath in speaker_dir.glob("*." + suffix):
        assert fpath.exists(), str(fpath) + " not exist."
        # Load raw audio, look up the transcript by file name, and convert
        # the Chinese text to a space-joined pinyin token string.
        wav, _ = librosa.load(str(fpath), hparams.sample_rate)
        text = trans_dict[fpath.name]["text"]
        pinyin = " ".join(get_pinyin(text, std=True, pb=True))
        results.append(
            process_utterance(wav, pinyin, out_dir, fpath.stem,
                              skip_existing, hparams))
    return [m for m in results if m is not None]
def _preprocess_speaker_SLR68(speaker_dir, suffix, out_dir: Path, skip_existing: bool, hparams, others_params):
    """Preprocess all utterances of one SLR68 speaker (normalize, denoise, trim).

    NOTE(review): this shadows an earlier, simpler _preprocess_speaker_SLR68
    in the same module — confirm the duplicate is intentional.

    Args:
        speaker_dir: directory Path holding this speaker's audio files.
        suffix: audio file extension without the dot, e.g. "wav".
        out_dir: destination directory passed through to process_utterance.
        skip_existing: skip utterances whose outputs already exist.
        hparams: hyper-parameters (uses sample_rate and rescaling_max).
        others_params: dict with "trans_dict" (file name -> {"text": ...}).

    Returns:
        List of the non-None metadata entries produced by process_utterance.
    """
    trans_dict = others_params["trans_dict"]
    metadata = []
    wav_fpath_list = speaker_dir.glob("*." + suffix)
    # Iterate over each entry in the alignments file
    for wav_fpath in wav_fpath_list:
        assert wav_fpath.exists(), str(wav_fpath) + " not exist."
        # Process each utterance
        wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
        # BUGFIX: guard peak normalization against an all-zero clip
        # (consistent with _preprocess_speaker_aishell2).
        wav_abs_max = np.max(np.abs(wav))
        wav = wav / (wav_abs_max if wav_abs_max > 0.0 else 1e-8) * hparams.rescaling_max  # norm
        # denoise: only clips longer than 0.4 s; noise profile is estimated
        # from the first and last 150 ms of the clip
        if len(wav) > hparams.sample_rate * (0.3 + 0.1):
            noise_wav = np.concatenate([
                wav[:int(hparams.sample_rate * 0.15)],
                wav[-int(hparams.sample_rate * 0.15):]
            ])
            profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
            wav = logmmse.denoise(wav, profile, eta=0)
        # trim silence
        wav = audio.trim_silence(wav, 20)  # top_db: smaller for noisy
        # Look up transcript by file name, then Chinese -> pinyin tokens.
        text = trans_dict[wav_fpath.name]["text"]
        pinyin = " ".join(get_pinyin(text, std=True, pb=True))
        metadata.append(
            process_utterance(wav, pinyin, out_dir, wav_fpath.stem,
                              skip_existing, hparams))
    return [m for m in metadata if m is not None]