Example 1
def _preprocess_speaker_aishell2(speaker_dir, suffix, out_dir: Path,
                                 skip_existing: bool, hparams, others_params):
    speaker_dir, spk_id = speaker_dir
    trans_dict = others_params["trans_dict"]
    detach_label_and_embed_utt = others_params["detach_label_and_embed_utt"]
    metadata = []
    # Glob once and reuse the list both for iteration and for sampling a
    # random utterance for the speaker embedding
    utt_fpath_list = list(speaker_dir.glob("*." + suffix))
    utt_num = len(utt_fpath_list)
    # Iterate over each wav
    for wav_fpath in utt_fpath_list:
        assert wav_fpath.exists(), str(wav_fpath) + " does not exist."

        # Process each utterance
        wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
        wav_abs_max = np.max(np.abs(wav))
        wav_abs_max = wav_abs_max if wav_abs_max > 0.0 else 1e-8
        wav = wav / wav_abs_max * hparams.rescaling_max  # norm
        # wav_bak = wav

        # denoise
        if len(wav) > hparams.sample_rate * (0.3 + 0.1):
            noise_wav = np.concatenate([
                wav[:int(hparams.sample_rate * 0.15)],
                wav[-int(hparams.sample_rate * 0.15):]
            ])
            profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
            wav = logmmse.denoise(wav, profile, eta=0)

        # trim silence
        wav = audio.trim_silence(wav, 30)  # top_db: smaller for noisy
        # audio.save_wav(wav_bak, str(wav_fpath.name), hparams.sample_rate)
        # audio.save_wav(wav, str(wav_fpath.name).replace('.wav','_trimed.wav'),
        #                hparams.sample_rate)

        text = trans_dict[wav_fpath.stem]

        # Chinese to Pinyin
        pinyin = " ".join(get_pinyin(text, std=True, pb=True))

        # print(wav_fpath.name, wav_fpath.stem)
        random_uttBasename_forSpkEmbedding = None
        if detach_label_and_embed_utt:
            random_uttBasename_forSpkEmbedding = utt_fpath_list[
                np.random.randint(utt_num)].stem
        metadata.append(
            process_utterance(wav, pinyin, out_dir, wav_fpath.stem,
                              skip_existing, hparams,
                              random_uttBasename_forSpkEmbedding))
    return [m for m in metadata if m is not None]
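
A minimal usage sketch for the helper above (all names here are assumptions for illustration, not taken from the snippet): the speaker is passed as a (directory, speaker_id) tuple, and others_params carries a transcript map keyed by utterance stem plus the detach flag.

from pathlib import Path

# Hypothetical inputs; hparams is assumed to come from the project's
# hyperparameter module, and trans_dict from the AISHELL-2 transcript file.
others_params = {
    "trans_dict": {"IC0001W0001": "厨房用具"},  # utterance stem -> transcript text
    "detach_label_and_embed_utt": True,
}
metadata = _preprocess_speaker_aishell2(
    (Path("AISHELL-2/data/wav/C0001"), "C0001"),  # hypothetical speaker tuple
    "wav", Path("out_dir"), skip_existing=False,
    hparams=hparams, others_params=others_params)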
Example 2
def _preprocess_speaker_SLR38(speaker_dir, suffix, out_dir: Path,
                              skip_existing: bool, hparams, others_params):
    detach_label_and_embed_utt = others_params["detach_label_and_embed_utt"]
    # Sort both listings so that wav/txt files pair up deterministically in the zip below
    wav_fpath_list = sorted(speaker_dir.glob("*." + suffix))
    text_fpath_list = sorted(speaker_dir.glob("*.txt"))
    metadata = []
    # Iterate over each wav
    utt_fpath_list = list(speaker_dir.glob("*." + suffix))
    utt_num = len(utt_fpath_list)
    for wav_fpath, txt_fpath in zip(wav_fpath_list, text_fpath_list):
        assert wav_fpath.exists(), str(wav_fpath) + " does not exist."
        assert txt_fpath.exists(), str(txt_fpath) + " does not exist."

        # Process each utt
        wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
        wav_abs_max = np.max(np.abs(wav))
        wav_abs_max = wav_abs_max if wav_abs_max > 0.0 else 1e-8
        wav = wav / wav_abs_max * hparams.rescaling_max  # norm, guard against silent files
        # wav_bak = wav

        # denoise
        if len(wav) > hparams.sample_rate * (0.3 + 0.1):
            noise_wav = np.concatenate([
                wav[:int(hparams.sample_rate * 0.15)],
                wav[-int(hparams.sample_rate * 0.15):]
            ])
            profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
            wav = logmmse.denoise(wav, profile, eta=0)

        # trim silence
        wav = audio.trim_silence(wav, 30)
        # audio.save_wav(wav_bak, str(wav_fpath.name), hparams.sample_rate)
        # audio.save_wav(wav, str(wav_fpath.name).replace('.wav','_trimed.wav'),
        #                hparams.sample_rate)

        # get text
        text = txt_fpath.read_text()

        # Chinese to Pinyin
        pinyin = " ".join(get_pinyin(text, std=True, pb=True))

        # print(wav_fpath.name, wav_fpath.stem)
        random_uttBasename_forSpkEmbedding = None
        if detach_label_and_embed_utt:
            random_uttBasename_forSpkEmbedding = utt_fpath_list[
                np.random.randint(utt_num)].stem
        metadata.append(
            process_utterance(wav, pinyin, out_dir, wav_fpath.stem,
                              skip_existing, hparams,
                              random_uttBasename_forSpkEmbedding))
    return [m for m in metadata if m is not None]
Example 3
def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
                   source_sr: Optional[int] = None):
    """
    Applies the preprocessing operations used in training the Speaker Encoder to a waveform 
    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.

    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not 
    just .wav), either the waveform as a numpy array of floats.
    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before 
    preprocessing. After preprocessing, the waveform's sampling rate will match the data 
    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and 
    this argument will be ignored.
    """
    # Load the wav from disk if needed
    if isinstance(fpath_or_wav, (str, Path)):
        wav, source_sr = librosa.load(str(fpath_or_wav), sr=sampling_rate)
    else:
        wav = fpath_or_wav

    # Resampling is disabled here, so in-memory input is assumed to already be
    # at sampling_rate (source_sr is effectively unused)
    # if source_sr is not None and source_sr != sampling_rate:
    #     wav = librosa.resample(wav, source_sr, sampling_rate)

    wav_abs_max = np.max(np.abs(wav))
    wav_abs_max = wav_abs_max if wav_abs_max > 0.0 else 1e-8
    wav = wav / wav_abs_max * 0.9
    # # Apply the preprocessing: normalize volume and shorten long silences
    # wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
    # wav = trim_long_silences(wav)
    # save_wav(wav, fpath_or_wav.name, sampling_rate) # TODO: rm DEBUG

    # denoise
    if len(wav) > sampling_rate * (0.3 + 0.1):
        noise_wav = np.concatenate([
            wav[:int(sampling_rate * 0.15)], wav[-int(sampling_rate * 0.15):]
        ])
        profile = logmmse.profile_noise(noise_wav, sampling_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # trim silence
    wav = trim_silence(wav, 30)  # top_db: smaller for noisy
    wav = trim_long_silences(wav)
    # save_wav(wav, fpath_or_wav.name.replace(".wav","_trimed.wav"), sampling_rate) # TODO: rm DEBUG
    return wav
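
A brief usage sketch, assuming sampling_rate is the module-level constant used above; the file name and the 16 kHz array are illustrative assumptions. Note that with the resampling step commented out, source_sr has no effect on in-memory input in this snippet.

import numpy as np

wav_a = preprocess_wav("speaker_ref.wav")                 # hypothetical file path
raw = (np.random.randn(16000) * 0.1).astype(np.float32)  # 1 s of fake audio
wav_b = preprocess_wav(raw, source_sr=16000)              # in-memory array input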
Example 4
    def load_preprocess_wav(fpath):
        """
        Loads and preprocesses an audio file under the same conditions the audio files were used to
        train the synthesizer. 
        """
        wav = librosa.load(fpath, sr=hparams.sample_rate)[0]
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

        # denoise
        if len(wav) > hparams.sample_rate * (0.3 + 0.1):
            noise_wav = np.concatenate([
                wav[:int(hparams.sample_rate * 0.15)],
                wav[-int(hparams.sample_rate * 0.15):]
            ])
            profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
            wav = logmmse.denoise(wav, profile, eta=0)

        # trim silence
        wav = audio.trim_silence(wav, 20)  # top_db: smaller for noisy

        return wav
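
load_preprocess_wav above is an inner helper; assuming it is in scope, a usage sketch might look like the following. The file name and the downstream melspectrogram call are assumptions, not shown in the snippet.

wav = load_preprocess_wav("reference.wav")    # hypothetical input file
mel = audio.melspectrogram(wav, hparams)      # assumed downstream step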
Example 5
def _preprocess_speaker_SLR68(speaker_dir, suffix, out_dir: Path,
                              skip_existing: bool, hparams, others_params):
    trans_dict = others_params["trans_dict"]
    metadata = []
    wav_fpath_list = speaker_dir.glob("*." + suffix)
    # Iterate over each entry in the alignments file
    for wav_fpath in wav_fpath_list:
        assert wav_fpath.exists(), str(wav_fpath) + " does not exist."

        # Process each utterance
        wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
        # wav_bak = wav
        wav = wav / np.max(np.abs(wav)) * hparams.rescaling_max  # norm

        # denoise
        if len(wav) > hparams.sample_rate * (0.3 + 0.1):
            noise_wav = np.concatenate([
                wav[:int(hparams.sample_rate * 0.15)],
                wav[-int(hparams.sample_rate * 0.15):]
            ])
            profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
            wav = logmmse.denoise(wav, profile, eta=0)

        # trim silence
        wav = audio.trim_silence(wav, 20)  # top_db: smaller for noisy
        # audio.save_wav(wav_bak, str(wav_fpath.name), hparams.sample_rate)
        # audio.save_wav(wav, str(wav_fpath.name).replace('.wav','_trimed.wav'),
        #                hparams.sample_rate)

        text = trans_dict[wav_fpath.name]["text"]

        # Chinese to Pinyin
        pinyin = " ".join(get_pinyin(text, std=True, pb=True))

        # print(wav_fpath.name, wav_fpath.stem)
        metadata.append(
            process_utterance(wav, pinyin, out_dir, wav_fpath.stem,
                              skip_existing, hparams))
    return [m for m in metadata if m is not None]
Example 6
def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
                   source_sr: Optional[int] = None):

    # Load the audio (this variant always loads from a file path, even though
    # the signature also allows an in-memory array)
    wav, source_sr = librosa.load(str(fpath_or_wav), sr=sampling_rate)
    wav_abs_max = np.max(np.abs(wav))
    wav_abs_max = wav_abs_max if wav_abs_max > 0.0 else 1e-8
    wav = wav / wav_abs_max * 0.9

    # denoise
    if len(wav) > sampling_rate * (0.3 + 0.1):
        noise_wav = np.concatenate([
            wav[:int(sampling_rate * 0.15)], wav[-int(sampling_rate * 0.15):]
        ])
        profile = logmmse.profile_noise(noise_wav, sampling_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # trim silence
    wav = librosa.effects.trim(wav,
                               top_db=30,
                               frame_length=512,
                               hop_length=128)[0]
    return wav
Example 7
def split_on_silences(wav_fpath, words, end_times, hparams):
    # Load the audio waveform
    wav, _ = librosa.load(str(wav_fpath), sr=hparams.sample_rate)
    wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # denoise
    if len(wav) > hparams.sample_rate * (0.3 + 0.1):
        noise_wav = np.concatenate([
            wav[:int(hparams.sample_rate * 0.1)],
            wav[-int(hparams.sample_rate * 0.1):]
        ])
        profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    words = np.array(words)
    start_times = np.array([0.0] + end_times[:-1])
    end_times = np.array(end_times)
    assert len(words) == len(end_times) == len(start_times)
    assert words[0] == "" and words[-1] == ""

    # Find pauses that are too long
    mask = (words == "") & (end_times - start_times >=
                            hparams.silence_min_duration_split)
    mask[0] = mask[-1] = True
    breaks = np.where(mask)[0]  # indices of the long pauses

    # Profile the noise from the silences and perform noise reduction on the waveform
    silence_times = [[start_times[i], end_times[i]] for i in breaks]
    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(int)
    noisy_wav = np.concatenate(
        [wav[stime[0]:stime[1]] for stime in silence_times])
    if len(noisy_wav) > hparams.sample_rate * 0.02:
        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Re-attach (re-join) segments that are too short
    segments = list(zip(breaks[:-1], breaks[1:]))
    segment_durations = [
        start_times[end] - end_times[start] for start, end in segments
    ]
    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < hparams.utterance_min_duration:
            # See if the segment can be re-attached with the right or the left segment
            left_duration = (float("inf") if i == 0
                             else segment_durations[i - 1])
            right_duration = (float("inf") if i == len(segments) - 1
                              else segment_durations[i + 1])
            joined_duration = segment_durations[i] + min(left_duration,
                                                         right_duration)

            # Do not re-attach if it causes the joined utterance to be too long
            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
                i += 1
                continue

            # Re-attach the segment with the neighbour of shortest duration
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            del segments[j + 1], segment_durations[j + 1]
        else:
            i += 1

    # Split the utterance
    segment_times = [[end_times[start], start_times[end]]
                     for start, end in segments]
    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(int)
    wavs = [
        wav[segment_time[0]:segment_time[1]] for segment_time in segment_times
    ]  # [N_seg, seg_time]
    texts = [
        " ".join(words[start + 1:end]).replace("  ", " ")
        for start, end in segments
    ]  # [N_seg]

    # # DEBUG: play the audio segments (run with -n=1)
    # import sounddevice as sd
    # if len(wavs) > 1:
    #     print("This sentence was split in %d segments:" % len(wavs))
    # else:
    #     print("There are no silences long enough for this sentence to be split:")
    # for wav, text in zip(wavs, texts):
    #     # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
    #     # when playing them. You shouldn't need to do that in your parsers.
    #     wav = np.concatenate((wav, [0] * 16000))
    #     print("\t%s" % text)
    #     sd.play(wav, 16000, blocking=True)
    # print("")

    return wavs, texts
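
To make the expected inputs concrete, here is a small hypothetical alignment; the words, times, file name, and hparams are all illustrative assumptions. words holds "" for silences, end_times gives the end of each token in seconds, and both sequences start and end on a silence, as the asserts above require.

words = ["", "printing", "in", "the", "only", "sense", "", "with", "which", "we", "are", ""]
end_times = [0.32, 0.87, 1.02, 1.15, 1.48, 2.10, 2.95, 3.21, 3.55, 3.70, 3.98, 4.60]
wavs, texts = split_on_silences("103-1240-0000.flac", words, end_times, hparams)
# The 0.85 s pause ending at 2.95 s is a split candidate; whether it actually
# produces two segments depends on silence_min_duration_split and
# utterance_min_duration in hparams.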
Example 8
def split_on_silences(wav_fpath, words, end_times, hparams):
    # Load the audio waveform
    wav, _ = librosa.load(wav_fpath, sr=hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Splitting on silences is disabled in this variant: the whole utterance is
    # returned as a single segment, which makes everything below unreachable.
    text = ''.join(words)
    return [wav], [text]
    words = np.array(words)
    start_times = np.array([0.0] + end_times[:-1])
    end_times = np.array(end_times)
    assert len(words) == len(end_times) == len(start_times)
    assert words[0] == "" and words[-1] == ""

    # Find pauses that are too long
    mask = (words == "") & (end_times - start_times >=
                            hparams.silence_min_duration_split)
    mask[0] = mask[-1] = True
    breaks = np.where(mask)[0]

    # Profile the noise from the silences and perform noise reduction on the waveform
    silence_times = [[start_times[i], end_times[i]] for i in breaks]
    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(int)
    noisy_wav = np.concatenate(
        [wav[stime[0]:stime[1]] for stime in silence_times])
    if len(noisy_wav) > hparams.sample_rate * 0.02:
        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Re-attach segments that are too short
    segments = list(zip(breaks[:-1], breaks[1:]))
    segment_durations = [
        start_times[end] - end_times[start] for start, end in segments
    ]
    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < hparams.utterance_min_duration:
            # See if the segment can be re-attached with the right or the left segment
            left_duration = (float("inf") if i == 0
                             else segment_durations[i - 1])
            right_duration = (float("inf") if i == len(segments) - 1
                              else segment_durations[i + 1])
            joined_duration = segment_durations[i] + min(left_duration,
                                                         right_duration)

            # Do not re-attach if it causes the joined utterance to be too long
            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
                i += 1
                continue

            # Re-attach the segment with the neighbour of shortest duration
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            del segments[j + 1], segment_durations[j + 1]
        else:
            i += 1

    # Split the utterance
    segment_times = [[end_times[start], start_times[end]]
                     for start, end in segments]
    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(int)
    wavs = [
        wav[segment_time[0]:segment_time[1]] for segment_time in segment_times
    ]
    texts = [
        " ".join(words[start + 1:end]).replace("  ", " ")
        for start, end in segments
    ]
    return wavs, texts
Example 9
def split_on_silences(wav_fpath, words, end_times, hparams):
    """
    wav_fpath: one single audio file of speaker
    words: all words of that file from alignment file with empty string ("") on silence
    end_times: timing info for that file from alignment file
    hparams: audio processing params -> need to trace back this

    load audio file -> reuired
    find long pauses -> not required
    remove noise from them and reattach them to origin wav -> not required
    split sentense on pauses and return arrays of all sentenses with wav for those sentences -> required
    """

    # Load the audio waveform
    wav, _ = librosa.load(wav_fpath, sr=hparams.sample_rate)
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    words = np.array(words)

    start_times = np.array([0.0] + end_times[:-1])
    end_times = np.array(end_times)
    print(f"words {words} start time {start_times} end time {end_times}")

    assert len(words) == len(end_times) == len(start_times)
    assert words[0] == "" and words[-1] == ""

    # Find pauses that are too long
    mask = (words == "") & (end_times - start_times >=
                            hparams.silence_min_duration_split)
    mask[0] = mask[-1] = True
    breaks = np.where(mask)[0]

    # Profile the noise from the silences and perform noise reduction on the waveform
    silence_times = [[start_times[i], end_times[i]] for i in breaks]
    silence_times = (np.array(silence_times) * hparams.sample_rate).astype(int)
    noisy_wav = np.concatenate(
        [wav[stime[0]:stime[1]] for stime in silence_times])
    if len(noisy_wav) > hparams.sample_rate * 0.02:
        profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Re-attach segments that are too short
    segments = list(zip(breaks[:-1], breaks[1:]))
    segment_durations = [
        start_times[end] - end_times[start] for start, end in segments
    ]
    i = 0
    while i < len(segments) and len(segments) > 1:
        if segment_durations[i] < hparams.utterance_min_duration:
            # See if the segment can be re-attached with the right or the left segment
            left_duration = (float("inf") if i == 0
                             else segment_durations[i - 1])
            right_duration = (float("inf") if i == len(segments) - 1
                              else segment_durations[i + 1])
            joined_duration = segment_durations[i] + min(left_duration,
                                                         right_duration)

            # Do not re-attach if it causes the joined utterance to be too long
            if joined_duration > hparams.hop_size * hparams.max_mel_frames / hparams.sample_rate:
                i += 1
                continue

            # Re-attach the segment with the neighbour of shortest duration
            j = i - 1 if left_duration <= right_duration else i
            segments[j] = (segments[j][0], segments[j + 1][1])
            segment_durations[j] = joined_duration
            del segments[j + 1], segment_durations[j + 1]
        else:
            i += 1

    # Split the utterance
    segment_times = [[end_times[start], start_times[end]]
                     for start, end in segments]
    segment_times = (np.array(segment_times) * hparams.sample_rate).astype(int)
    wavs = [
        wav[segment_time[0]:segment_time[1]] for segment_time in segment_times
    ]
    texts = [
        " ".join(words[start + 1:end]).replace("  ", " ")
        for start, end in segments
    ]

    print(f"length of all wavs {len(wavs)} all texts {texts}")
    # # DEBUG: play the audio segments (run with -n=1)
    # import sounddevice as sd
    # if len(wavs) > 1:
    #     print("This sentence was split in %d segments:" % len(wavs))
    # else:
    #     print("There are no silences long enough for this sentence to be split:")
    # for wav, text in zip(wavs, texts):
    #     # Pad the waveform with 1 second of silence because sounddevice tends to cut them early
    #     # when playing them. You shouldn't need to do that in your parsers.
    #     wav = np.concatenate((wav, [0] * 16000))
    #     print("\t%s" % text)
    #     sd.play(wav, 16000, blocking=True)
    # print("")

    return wavs, texts