Beispiel #1
0
    def load_from_browser(self, fpath=None):
        """Load an utterance wav, either from the browser's current selection or an explicit path."""
        if fpath is None:
            # Build the wav path from the browser's current dataset/speaker/utterance selection.
            fpath = Path(self.datasets_root,
                         self.ui.current_dataset_name,
                         self.ui.current_speaker_name,
                         self.ui.current_utterance_name)

            # Flatten any path separators inside the names so the parts join safely with '#'.
            def _flat(part):
                return part.replace("\\", "#").replace("/", "#")

            dat = _flat(self.ui.current_dataset_name)
            spk = _flat(self.ui.current_speaker_name)
            aud = _flat(self.ui.current_utterance_name)
            speaker_name = "#".join((dat, spk))
            name = "#".join((speaker_name, aud))
            # Advance the browser to the next utterance when auto-next is enabled.
            if self.ui.auto_next_checkbox.isChecked():
                self.ui.browser_select_next()
        elif fpath == "":
            return
        else:
            name = fpath.name
            speaker_name = fpath.parent.name

        # Load the wav from disk in the playback format, so comparisons with
        # generated audio stay fair.
        wav = aukit.load_wav(fpath)
        self.ui.log("Loaded %s" % name)

        self.add_real_utterance(wav, name, speaker_name)
Beispiel #2
0
def run_joint(fpath, sr=_sr, outdir=Path("")):
    """Join chosen audio/text pairs per speaker and write the wavs plus a metadata file.

    :param fpath: path of the pair-list file consumed by ``load_pairs``.
    :param sr: sampling rate used when loading and saving wavs.
    :param outdir: output directory; created (with parents) if missing.
    :return: None
    """
    curdir = Path(fpath).parent
    outdir = Path(outdir)
    outdir.mkdir(exist_ok=True, parents=True)
    metadata_path = outdir.joinpath("metadata.csv")
    with open(metadata_path, "wt", encoding="utf8") as fout:
        for spk, ptpairs_raw in tqdm(load_pairs(fpath), desc="speaker", ncols=100):
            chosen = choice_pairs(ptpairs_raw, n_choice=100)
            for num, ptpairs_joint in enumerate(tqdm(chosen, desc="choice", ncols=100), 1):
                wtpairs = [(aukit.load_wav(p, sr=sr), t) for p, t in ptpairs_joint]
                wav, text = joint_audio_and_text(wtpairs)
                # Mirror the first source file's directory layout relative to curdir.
                parts = list(Path(ptpairs_joint[0][0]).relative_to(curdir).parts)[:-1]
                parts.append("{}_{:06d}.wav".format(spk, num))
                outname = "/".join(parts)
                outpath = outdir.joinpath(outname)
                outpath.parent.mkdir(exist_ok=True, parents=True)
                aukit.save_wav(wav, sr=sr, path=outpath)
                fout.write("{}\t{}\n".format(outname, text))
Beispiel #3
0
def run_normalizer():
    """Demo: strip silence from a wav, normalize its volume, then play it."""
    import aukit
    from aukit.audio_player import play_sound
    from aukit import audio_normalizer as ano
    inpath = r"hello.wav"
    wav, sr = aukit.load_wav(inpath, with_sr=True)
    out = ano.remove_silence(wav)
    # Bug fix: tune the silence-stripped audio, not the raw wav — the
    # remove_silence result was previously discarded.
    out = ano.tune_volume(out, target_dBFS=-10)
    play_sound(out, sr)
def embed_utterance(src, skip_existing=True, encoder_model_fpath=Path()):
    """Compute and save the speaker embedding for one wav file.

    :param src: (wav_fpath, embed_fpath) pair.
    :param skip_existing: when True, do nothing if the embedding file already exists.
    :param encoder_model_fpath: weights to load if the encoder is not loaded yet.
    """
    # Lazily load the encoder model on first use.
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)

    wav_fpath, embed_fpath = src
    if skip_existing and embed_fpath.is_file():
        return

    waveform = aukit.load_wav(wav_fpath, sr=hp.sampling_rate)
    waveform = encoder.preprocess_wav(waveform)
    embedding = encoder.embed_utterance(waveform)
    np.save(embed_fpath, embedding, allow_pickle=False)
Beispiel #5
0
    def record(self):
        """Record ~5 seconds from the mic, save it to disk, reload and register it as a real utterance."""
        wav = self.ui.record_one(encoder.sampling_rate, 5)
        if wav is None:
            return

        # Play back what was just recorded.
        self.ui.play(wav, encoder.sampling_rate)

        speaker_name = "user01"
        name = "{}_rec_{}".format(speaker_name, time_formatter())
        fpath = self._out_record_dir.joinpath(name + '.wav')
        audio.save_wav(wav, fpath, encoder.sampling_rate)  # save
        # Reload from disk so the data format matches utterances loaded from files.
        wav = aukit.load_wav(fpath)
        self.add_real_utterance(wav, name, speaker_name)
Beispiel #6
0
def run_editor():
    """Demo of aukit's audio editor: strip and remove silence, then play the result."""
    import aukit
    from aukit.audio_player import play_sound, play_audio
    from aukit import audio_editor as aed
    inpath = r"hello.wav"
    wav, sr = aukit.load_wav(inpath, with_sr=True)
    # Round-trip through AudioSegment to strip leading/trailing silence.
    segment = aed.wav2audiosegment(wav, sr)
    stripped = aed.strip_audio(segment)
    wav = aed.audiosegment2wav(stripped)

    out = aed.remove_silence_wave(wav, sr=sr)
    out = aed.strip_silence_wave(out, sr=sr)

    # Compare lengths before/after silence removal.
    print(len(wav), len(out))
    play_audio(out, sr)
Beispiel #7
0
def waveglow():
    """Vocode a reference audio's mel spectrogram with a pretrained WaveGlow model."""
    waveglow_model = torch.load(r'../models/waveglow/waveglow_v5_model.pt',
                                map_location='cpu')

    def waveglow_vocoder(mels):
        # Inference only — no gradients needed.
        with torch.no_grad():
            return waveglow_model.infer(mels, sigma=1.0)

    from mellotron.inference import transform_mel

    # NOTE(review): `audio` and `msyner` are resolved from enclosing/module
    # scope — confirm they are defined before this runs.
    audio_ref, sr_ref = aukit.load_wav(audio, sr=None, with_sr=True)
    spec_ref = transform_mel(audio_ref, stft=msyner.stft)

    wav_inputs = waveglow_vocoder(torch.from_numpy(spec_ref[None]))
    wav_ref = wav_inputs[0].cpu().numpy()
def change_speed_one(kwargs: dict):
    """Change the speed of one audio file via a mel/vocoder round trip.

    The speed change is achieved by scaling the mel hop size by ``rate``
    before re-synthesizing with MelGAN.

    :param kwargs: dict with keys ``inpath``, ``outpath`` and ``rate``.
    :return: the ``kwargs`` dict (also on failure, for caller bookkeeping).
    """
    inpath = kwargs.get("inpath")
    outpath = kwargs.get("outpath")
    rate = kwargs.get("rate")
    # Skip files that already produced a plausible (>8 kB) output.
    # Bug fix: return kwargs here too, consistent with every other exit path.
    if Path(outpath).exists() and os.path.getsize(outpath) > 8000:
        return kwargs
    Path(outpath).parent.mkdir(exist_ok=True, parents=True)
    hp = Dict2Obj()
    hp.update(melgan_hparams)
    hp.update({"hop_size": int(melgan_hparams["hop_size"] * rate)})

    try:
        wav = aukit.load_wav(inpath, sr=_sr)
        mel = wav2mel(wav, hparams=hp)
        out = infer_waveform_melgan(mel, load_path=_melgan_load_path)
        aukit.save_wav(out, outpath, sr=_sr)
    except Exception as e:
        # Deliberate best-effort batch processing: report the failure and carry on.
        print(e)
        print(kwargs)
    return kwargs
Beispiel #9
0
def wavs2mels(indir: Path, outdir: Path):
    """Convert every ``*.wav`` in *indir* to a mel spectrogram saved as ``.npy`` in *outdir*."""
    for wav_path in tqdm(indir.glob("*.wav")):
        samples = aukit.load_wav(wav_path, sr=16000)
        # Reflect-pad both ends before computing the spectrogram.
        samples = np.pad(samples.flatten(), (_pad_len, _pad_len), mode="reflect")
        mel = mel_spectrogram(samples, default_hparams)
        np.save(outdir.joinpath(wav_path.stem + ".npy"), mel, allow_pickle=False)
Beispiel #10
0
        cur_text = filename_formatter_re.sub('',
                                             unidecode.unidecode(text))[:15]
        cur_time = time.strftime('%Y%m%d-%H%M%S')
        outpath = os.path.join(output_dir,
                               "demo_{}_{}_out.wav".format(cur_time, cur_text))

        wav_output = wavs.squeeze(0).cpu().numpy()
        aukit.save_wav(wav_output, outpath, sr=args.sampling_rate)

        if isinstance(audio, (Path, str)) and Path(audio).is_file():
            # # 原声
            # refpath_raw = os.path.join(output_dir, "demo_{}_{}_ref_copy.wav".format(cur_time, cur_text))
            # shutil.copyfile(audio, refpath_raw)

            # 重采样
            wav_input, sr = aukit.load_wav(audio, with_sr=True)
            wav_input = librosa.resample(wav_input, sr, args.sampling_rate)
            refpath = os.path.join(
                output_dir, "demo_{}_{}_ref.wav".format(cur_time, cur_text))
            aukit.save_wav(wav_input, refpath, sr=args.sampling_rate)

            # # 声码器
            # wavs_ref = waveglow.generate_wave(mel=mel_data, **waveglow_kwargs)
            # outpath_ref = os.path.join(output_dir, "demo_{}_{}_ref_waveglow.wav".format(cur_time, cur_text))
            # wav_output_ref = wavs_ref.squeeze(0).cpu().numpy()
            # aukit.save_wav(wav_output_ref, outpath_ref, sr=args.sampling_rate)

        fig_path = os.path.join(
            output_dir, "demo_{}_{}_fig.jpg".format(cur_time, cur_text))

        plot_mel_alignment_gate_audio(
Beispiel #11
0
def run_player():
    """Demo: load a wav, shift its pitch, then play it at half volume."""
    import aukit
    source = Path(r"E:\data\aliaudio\examples\ali_Aibao_000001.wav")
    samples = aukit.load_wav(source, sr=16000)
    shifted = aukit.change_voice(samples, mode="assign_pitch", alpha=200)
    aukit.play_audio(shifted, volume=0.5)
Beispiel #12
0
def run_noise_remover():
    """Demo: denoise hello.wav and play the result."""
    import aukit
    source = r"hello.wav"
    noisy = aukit.load_wav(source)
    clean = aukit.remove_noise(noisy)
    aukit.play_audio(clean)
Beispiel #13
0
def remove_noise_audio(inpath, outpath):
    """Denoise an audio file at 16 kHz and write the result to *outpath*."""
    import aukit
    samples = aukit.load_wav(inpath, sr=16000)
    denoised = aukit.remove_noise(samples, sr=16000)
    aukit.save_wav(denoised, outpath, sr=16000)
Beispiel #14
0
def voice_clone_interface(audio: str, text: str, speaker: str) -> str:
    """Synthesize *text* in the voice of *speaker*, styled on *audio*, and return the output wav path.

    Pipeline: denoise input -> mellotron mel generation -> vocoder (WaveGlow
    or Griffin-Lim) -> save wav + diagnostic figure -> denoise output.

    :param audio: path of the reference audio, e.g. '.../SSB00110401.wav'.
    :param text: text to synthesize.
    :param speaker: speaker id, e.g. 'SSB0011'.
    :return: path of the generated (denoised) wav file.
    """
    denoise.noisy_processing(audio, audio)  # denoise the input audio in place

    text_data, style_data, speaker_data, f0_data, mel_data = transform_mellotron_input_data(
        dataloader=dataloader,
        text=text,
        speaker=speaker,
        audio=audio,
        device=_device)

    mels, mels_postnet, gates, alignments = mellotron.generate_mel(
        text_data, style_data, speaker_data, f0_data)

    # Cut the mel at the first frame where the stop-gate fires (> 0.2);
    # argmax yields 0 when no frame fires, so `or` falls back to full length.
    out_gate = gates.cpu().numpy()[0]
    end_idx = np.argmax(out_gate > 0.2) or out_gate.shape[0]

    mels_postnet = mels_postnet[:, :, :end_idx]
    if _use_waveglow:
        print("use waveglow:")
        wavs = waveglow.generate_wave(mel=mels_postnet, **waveglow_kwargs)
    else:
        # Bug fix: this branch uses Griffin-Lim, not WaveGlow — log it correctly.
        print("use griffinlim:")
        wavs = _stft.griffin_lim(mels_postnet, n_iters=5)

    # Save the synthesized audio under a timestamped, filename-safe name.
    cur_text = filename_formatter_re.sub('', unidecode.unidecode(text))[:15]
    cur_time = time.strftime('%Y%m%d-%H%M%S')
    outpath = os.path.join(output_dir,
                           "demo_{}_{}_out.wav".format(cur_time, cur_text))

    wav_output = wavs.squeeze(0).cpu().numpy()
    aukit.save_wav(wav_output, outpath,
                   sr=args.sampling_rate)  # sampling_rate=22050

    if isinstance(audio, (Path, str)) and Path(audio).is_file():
        # Resample the reference audio and save it alongside for comparison.
        wav_input, sr = aukit.load_wav(audio, with_sr=True)
        wav_input = librosa.resample(wav_input, sr, args.sampling_rate)
        refpath = os.path.join(output_dir,
                               "demo_{}_{}_ref.wav".format(cur_time, cur_text))
        aukit.save_wav(wav_input, refpath, sr=args.sampling_rate)

    # Save a diagnostic figure of mel, alignment, gate and (downsampled) audio.
    fig_path = os.path.join(output_dir,
                            "demo_{}_{}_fig.jpg".format(cur_time, cur_text))

    plot_mel_alignment_gate_audio(
        mel=mels_postnet.squeeze(0).cpu().numpy(),
        alignment=alignments.squeeze(0).cpu().numpy(),
        gate=gates.squeeze(0).cpu().numpy(),
        audio=wav_output[::args.sampling_rate // 1000])
    plt.savefig(fig_path)
    plt.close()

    print('Test success done.返回克隆的音频为:', outpath)
    denoise.noisy_processing(outpath, outpath)  # denoise the output audio in place
    return outpath
Beispiel #15
0
def run_noise_remover():
    """Demo: denoise a local wav file and play it back."""
    import aukit
    source = r"E:\data\temp\01.wav"
    noisy = aukit.load_wav(source)
    clean = aukit.remove_noise(noisy)
    aukit.play_audio(clean)