Ejemplo n.º 1
0
def _preprocess_speaker_aishell2(speaker_dir, suffix, out_dir: Path,
                                 skip_existing: bool, hparams, others_params):
    """Preprocess every utterance of one AISHELL-2 speaker.

    Args:
        speaker_dir: tuple of (Path to the speaker's directory, speaker id).
        suffix: audio-file extension to glob for (e.g. "wav").
        out_dir: directory that process_utterance() writes features into.
        skip_existing: whether to skip utterances already present in out_dir.
        hparams: hyper-parameters object (reads sample_rate, rescaling_max).
        others_params: dict providing "trans_dict" (utterance stem ->
            transcript text) and "detach_label_and_embed_utt" (bool).

    Returns:
        List of metadata entries from process_utterance(), Nones filtered out.
    """
    speaker_dir, spk_id = speaker_dir
    trans_dict = others_params["trans_dict"]
    detach_label_and_embed_utt = others_params["detach_label_and_embed_utt"]
    metadata = []
    # Glob the directory once and reuse the list both for iteration and for
    # picking a random embedding utterance (the original scanned it twice).
    utt_fpath_list = list(speaker_dir.glob("*." + suffix))
    utt_num = len(utt_fpath_list)
    # Iterate over each wav
    for wav_fpath in utt_fpath_list:
        assert wav_fpath.exists(), str(wav_fpath) + " not exist."

        # Process each utterance
        wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
        wav_abs_max = np.max(np.abs(wav))
        # Guard against an all-zero (silent) clip to avoid division by zero.
        wav_abs_max = wav_abs_max if wav_abs_max > 0.0 else 1e-8
        wav = wav / wav_abs_max * hparams.rescaling_max  # norm

        # Denoise: build a noise profile from the first and last 150 ms,
        # but only when the clip is long enough (> 400 ms) to spare speech.
        if len(wav) > hparams.sample_rate * (0.3 + 0.1):
            noise_wav = np.concatenate([
                wav[:int(hparams.sample_rate * 0.15)],
                wav[-int(hparams.sample_rate * 0.15):]
            ])
            profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
            wav = logmmse.denoise(wav, profile, eta=0)

        # trim silence
        wav = audio.trim_silence(wav, 30)  # top_db: smaller for noisy

        text = trans_dict[wav_fpath.stem]

        # Chinese to Pinyin
        pinyin = " ".join(get_pinyin(text, std=True, pb=True))

        # Optionally pick a random utterance of the same speaker for the
        # speaker embedding, detaching the label and embedding utterances.
        random_uttBasename_forSpkEmbedding = None
        if detach_label_and_embed_utt:
            random_uttBasename_forSpkEmbedding = utt_fpath_list[
                np.random.randint(utt_num)].stem
        metadata.append(
            process_utterance(wav, pinyin, out_dir, wav_fpath.stem,
                              skip_existing, hparams,
                              random_uttBasename_forSpkEmbedding))
    return [m for m in metadata if m is not None]
Ejemplo n.º 2
0
def _preprocess_speaker_SLR38(speaker_dir, suffix, out_dir: Path,
                              skip_existing: bool, hparams, others_params):
    """Preprocess every utterance of one SLR38 speaker.

    Args:
        speaker_dir: Path to the speaker's directory.
        suffix: audio-file extension to glob for (e.g. "wav").
        out_dir: directory that process_utterance() writes features into.
        skip_existing: whether to skip utterances already present in out_dir.
        hparams: hyper-parameters object (reads sample_rate, rescaling_max).
        others_params: dict providing "detach_label_and_embed_utt" (bool).

    Returns:
        List of metadata entries from process_utterance(), Nones filtered out.
    """
    detach_label_and_embed_utt = others_params["detach_label_and_embed_utt"]
    metadata = []
    # Glob once; reused for iteration and random embedding-utterance choice.
    utt_fpath_list = list(speaker_dir.glob("*." + suffix))
    utt_num = len(utt_fpath_list)
    # Iterate over each wav
    for wav_fpath in utt_fpath_list:
        assert wav_fpath.exists(), str(wav_fpath) + " not exist."
        # Pair the transcript by stem rather than zipping two independent
        # glob generators, whose orderings are not guaranteed to match.
        txt_fpath = wav_fpath.with_suffix(".txt")
        assert txt_fpath.exists(), str(txt_fpath) + " not exist."

        # Process each utt
        wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
        wav_abs_max = np.max(np.abs(wav))
        # Guard against an all-zero (silent) clip to avoid division by zero.
        wav_abs_max = wav_abs_max if wav_abs_max > 0.0 else 1e-8
        wav = wav / wav_abs_max * hparams.rescaling_max  # norm

        # Denoise: build a noise profile from the first and last 150 ms,
        # but only when the clip is long enough (> 400 ms) to spare speech.
        if len(wav) > hparams.sample_rate * (0.3 + 0.1):
            noise_wav = np.concatenate([
                wav[:int(hparams.sample_rate * 0.15)],
                wav[-int(hparams.sample_rate * 0.15):]
            ])
            profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
            wav = logmmse.denoise(wav, profile, eta=0)

        # trim silence
        wav = audio.trim_silence(wav, 30)

        # get text
        text = txt_fpath.read_text()

        # Chinese to Pinyin
        pinyin = " ".join(get_pinyin(text, std=True, pb=True))

        # Optionally pick a random utterance of the same speaker for the
        # speaker embedding, detaching the label and embedding utterances.
        random_uttBasename_forSpkEmbedding = None
        if detach_label_and_embed_utt:
            random_uttBasename_forSpkEmbedding = utt_fpath_list[
                np.random.randint(utt_num)].stem
        metadata.append(
            process_utterance(wav, pinyin, out_dir, wav_fpath.stem,
                              skip_existing, hparams,
                              random_uttBasename_forSpkEmbedding))
    return [m for m in metadata if m is not None]
Ejemplo n.º 3
0
def _preprocess_speaker_SLR68(speaker_dir, suffix, out_dir: Path, skip_existing: bool, hparams, others_params):
    """Preprocess one SLR68 speaker: load each audio file, look up its
    transcript, convert the text to pinyin and run process_utterance."""
    trans_dict = others_params["trans_dict"]
    processed = []
    # Walk every audio file belonging to this speaker
    for audio_fpath in speaker_dir.glob("*." + suffix):
        assert audio_fpath.exists(), str(audio_fpath) + " not exist."

        # Load the waveform at the configured sampling rate
        wav, _ = librosa.load(str(audio_fpath), hparams.sample_rate)
        # Transcript entries are keyed by the file name (with extension)
        text = trans_dict[audio_fpath.name]["text"]

        # Convert the Chinese transcript to a pinyin string
        pinyin = " ".join(get_pinyin(text, std=True, pb=True))

        processed.append(process_utterance(wav, pinyin, out_dir,
                                           audio_fpath.stem,
                                           skip_existing, hparams))
    # Drop utterances that were skipped or failed
    return [m for m in processed if m is not None]
def _preprocess_speaker_SLR68(speaker_dir, suffix, out_dir: Path,
                              skip_existing: bool, hparams, others_params):
    """Preprocess every utterance of one SLR68 speaker (with denoising).

    Args:
        speaker_dir: Path to the speaker's directory.
        suffix: audio-file extension to glob for (e.g. "wav").
        out_dir: directory that process_utterance() writes features into.
        skip_existing: whether to skip utterances already present in out_dir.
        hparams: hyper-parameters object (reads sample_rate, rescaling_max).
        others_params: dict providing "trans_dict" (file name -> entry with
            a "text" field).

    Returns:
        List of metadata entries from process_utterance(), Nones filtered out.
    """
    trans_dict = others_params["trans_dict"]
    metadata = []
    wav_fpath_list = speaker_dir.glob("*." + suffix)
    # Iterate over each entry in the alignments file
    for wav_fpath in wav_fpath_list:
        assert wav_fpath.exists(), str(wav_fpath) + " not exist."

        # Process each utterance
        wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate)
        wav_abs_max = np.max(np.abs(wav))
        # Guard against an all-zero (silent) clip to avoid division by zero.
        wav_abs_max = wav_abs_max if wav_abs_max > 0.0 else 1e-8
        wav = wav / wav_abs_max * hparams.rescaling_max  # norm

        # Denoise: build a noise profile from the first and last 150 ms,
        # but only when the clip is long enough (> 400 ms) to spare speech.
        if len(wav) > hparams.sample_rate * (0.3 + 0.1):
            noise_wav = np.concatenate([
                wav[:int(hparams.sample_rate * 0.15)],
                wav[-int(hparams.sample_rate * 0.15):]
            ])
            profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
            wav = logmmse.denoise(wav, profile, eta=0)

        # trim silence
        wav = audio.trim_silence(wav, 20)  # top_db: smaller for noisy

        text = trans_dict[wav_fpath.name]["text"]

        # Chinese to Pinyin
        pinyin = " ".join(get_pinyin(text, std=True, pb=True))

        metadata.append(
            process_utterance(wav, pinyin, out_dir, wav_fpath.stem,
                              skip_existing, hparams))
    return [m for m in metadata if m is not None]