def load_from_browser(self, fpath=None):
    """Load a wav selected in the dataset browser (or given explicitly) and
    register it as a "real" utterance.

    :param fpath: optional wav path. When None, the path is built from the
        browser's current dataset/speaker/utterance selection; when "", the
        call is a no-op; otherwise name/speaker are derived from the path.
    """
    if fpath is None:
        fpath = Path(self.datasets_root, self.ui.current_dataset_name, self.ui.current_speaker_name, self.ui.current_utterance_name)
        # name = '/'.join(fpath.relative_to(self.datasets_root).parts)
        # Build a single-token display name "dataset#speaker#utterance";
        # path separators are replaced with '#' so the name stays flat.
        dat = self.ui.current_dataset_name.replace("\\", "#").replace("/", "#")
        spk = self.ui.current_speaker_name.replace("\\", "#").replace("/", "#")
        aud = self.ui.current_utterance_name.replace("\\", "#").replace("/", "#")
        speaker_name = "#".join((dat, spk))
        name = "#".join((speaker_name, aud))
        # name = '-'.join(fpath.relative_to(self.datasets_root.joinpath(self.ui.current_dataset_name)).parts)
        # speaker_name = self.ui.current_speaker_name.replace("\\", "-").replace("/", "-")
        # Select the next utterance (only when auto-advance is enabled)
        if self.ui.auto_next_checkbox.isChecked():
            self.ui.browser_select_next()
    elif fpath == "":
        return
    else:
        name = fpath.name
        speaker_name = fpath.parent.name
    # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for
    # playback, so as to have a fair comparison with the generated audio
    # wav = Synthesizer.load_preprocess_wav(fpath)
    wav = aukit.load_wav(fpath)
    self.ui.log("Loaded %s" % name)
    self.add_real_utterance(wav, name, speaker_name)
def run_joint(fpath, sr=_sr, outdir=Path("")):
    """Join short (audio path, text) pairs into longer clips and index them.

    For every speaker yielded by ``load_pairs``, groups of pairs chosen by
    ``choice_pairs`` are concatenated via ``joint_audio_and_text``; each joined
    wav is written under ``outdir`` (mirroring the input layout) and listed in
    ``outdir/metadata.csv`` as "<relative path>\\t<text>".

    :param fpath: path to the pair-list file consumed by ``load_pairs``.
    :param sr: sampling rate used to load and save the wavs.
    :param outdir: output directory; created (with parents) if missing.
    :return: None — output is written to disk.
    """
    curdir = Path(fpath).parent
    outdir = Path(outdir)
    outdir.mkdir(exist_ok=True, parents=True)
    with open(outdir.joinpath("metadata.csv"), "wt", encoding="utf8") as fout:
        load_pair = load_pairs(fpath)
        for spk, ptpairs_raw in tqdm(load_pair, desc="speaker", ncols=100):
            gen_pair = choice_pairs(ptpairs_raw, n_choice=100)
            for num, ptpairs_joint in enumerate(tqdm(gen_pair, desc="choice", ncols=100), 1):
                # Materialize each (path, text) pair as (waveform, text).
                wtpairs_joint = [(aukit.load_wav(p, sr=sr), t) for p, t in ptpairs_joint]
                wav, text = joint_audio_and_text(wtpairs_joint)
                # Mirror the first source file's directory layout under outdir,
                # naming the joined clip "<speaker>_<6-digit index>.wav".
                parts = list(Path(ptpairs_joint[0][0]).relative_to(curdir).parts)[:-1]
                parts.append("{}_{:06d}.wav".format(spk, num))
                outname = "/".join(parts)
                outpath = outdir.joinpath(outname)
                outpath.parent.mkdir(exist_ok=True, parents=True)
                aukit.save_wav(wav, sr=sr, path=outpath)
                fout.write("{}\t{}\n".format(outname, text))
def run_normalizer():
    """Demo: remove silence from a wav, normalize its volume, and play it.

    Fix: the original passed the raw ``wav`` to ``tune_volume``, silently
    discarding the result of ``remove_silence``; the steps are now chained.
    """
    import aukit
    from aukit.audio_player import play_sound
    from aukit import audio_normalizer as ano
    inpath = r"hello.wav"
    wav, sr = aukit.load_wav(inpath, with_sr=True)
    out = ano.remove_silence(wav)
    out = ano.tune_volume(out, target_dBFS=-10)
    play_sound(out, sr)
def embed_utterance(src, skip_existing=True, encoder_model_fpath=Path()):
    """Compute the speaker embedding of one utterance and save it as .npy.

    :param src: ``(wav_fpath, embed_fpath)`` pair.
    :param skip_existing: when True, do nothing if the embedding file exists.
    :param encoder_model_fpath: model weights used to lazily load the encoder.
    """
    # Lazily initialise the encoder model on first use.
    if not encoder.is_loaded():
        encoder.load_model(encoder_model_fpath)
    wav_fpath, embed_fpath = src
    # Nothing to do when the embedding has already been computed.
    if skip_existing and embed_fpath.is_file():
        return
    waveform = aukit.load_wav(wav_fpath, sr=hp.sampling_rate)
    waveform = encoder.preprocess_wav(waveform)
    embedding = encoder.embed_utterance(waveform)
    np.save(embed_fpath, embedding, allow_pickle=False)
def record(self):
    """Record a 5-second clip from the microphone, play it back, save it to
    the record directory, and register it as a real utterance."""
    sr = encoder.sampling_rate
    recorded = self.ui.record_one(sr, 5)
    if recorded is None:
        return
    self.ui.play(recorded, sr)
    speaker_name = "user01"
    name = "{}_rec_{}".format(speaker_name, time_formatter())
    fpath = self._out_record_dir.joinpath(name + '.wav')
    # Persist the recording first ...
    audio.save_wav(recorded, fpath, sr)  # save
    # ... then reload it via aukit so the data format matches other utterances.
    # wav = Synthesizer.load_preprocess_wav(fpath)  # keep a consistent data format
    reloaded = aukit.load_wav(fpath)
    self.add_real_utterance(reloaded, name, speaker_name)
def run_editor():
    """Demo: strip and trim silence with aukit's audio editor, then play the result."""
    import aukit
    from aukit.audio_player import play_sound, play_audio
    from aukit import audio_editor as aed
    inpath = r"hello.wav"
    wav, sr = aukit.load_wav(inpath, with_sr=True)
    # Round-trip through an AudioSegment to strip it, then back to a waveform.
    segment = aed.wav2audiosegment(wav, sr)
    stripped = aed.strip_audio(segment)
    wav = aed.audiosegment2wav(stripped)
    # Further silence removal/trimming on the raw waveform.
    out = aed.remove_silence_wave(wav, sr=sr)
    out = aed.strip_silence_wave(out, sr=sr)
    print(len(wav), len(out))
    play_audio(out, sr)
def waveglow():
    """Vocode a reference audio's mel spectrogram through a pre-trained WaveGlow model.

    NOTE(review): reads ``audio`` and ``msyner`` as free variables from the
    enclosing scope, and the final ``wav_ref`` is not returned — confirm how
    the surrounding code consumes these.
    """
    # CPU-mapped load so no GPU is required for inference.
    waveglow_model = torch.load(r'../models/waveglow/waveglow_v5_model.pt', map_location='cpu')

    def waveglow_vocoder(mels):
        # Run WaveGlow inference without tracking gradients.
        with torch.no_grad():
            wavs = waveglow_model.infer(mels, sigma=1.0)
        return wavs

    from mellotron.inference import transform_mel
    # sr=None keeps the file's native sampling rate.
    audio_ref, sr_ref = aukit.load_wav(audio, sr=None, with_sr=True)
    mel = transform_mel(audio_ref, stft=msyner.stft)
    spec_ref = mel
    # [None] adds a batch dimension; take the first (only) vocoded waveform.
    wav_inputs = waveglow_vocoder(torch.from_numpy(spec_ref[None]))
    wav_ref = wav_inputs[0].cpu().numpy()
def change_speed_one(kwargs: dict):
    """Change the speed of one wav by resynthesizing it with a scaled hop size.

    :param kwargs: dict with keys "inpath", "outpath" and "rate".
    :return: ``kwargs`` after processing, or ``None`` when the output file
        already exists and is larger than 8000 bytes (treated as done).
    """
    inpath = kwargs.get("inpath")
    outpath = kwargs.get("outpath")
    rate = kwargs.get("rate")
    target = Path(outpath)
    # Skip work whose output already looks complete.
    if target.exists() and os.path.getsize(outpath) > 8000:
        return
    target.parent.mkdir(exist_ok=True, parents=True)
    # Copy the melgan hyperparameters, scaling hop_size by `rate`.
    hp = Dict2Obj()
    hp.update(melgan_hparams)
    hp.update({"hop_size": int(melgan_hparams["hop_size"] * rate)})
    try:
        mel = wav2mel(aukit.load_wav(inpath, sr=_sr), hparams=hp)
        resynth = infer_waveform_melgan(mel, load_path=_melgan_load_path)
        aukit.save_wav(resynth, outpath, sr=_sr)
    except Exception as e:
        # Best-effort batch item: report the failure and keep going.
        print(e)
        print(kwargs)
    return kwargs
def wavs2mels(indir: Path, outdir: Path):
    """Convert every *.wav directly under ``indir`` to a mel spectrogram and
    save it as ``<stem>.npy`` in ``outdir``."""
    for wav_path in tqdm(indir.glob("*.wav")):
        samples = aukit.load_wav(wav_path, sr=16000)
        # Reflect-pad both ends before computing the spectrogram.
        samples = np.pad(samples.flatten(), (_pad_len, _pad_len), mode="reflect")
        spec = mel_spectrogram(samples, default_hparams)
        np.save(outdir.joinpath(wav_path.stem + ".npy"), spec, allow_pickle=False)
cur_text = filename_formatter_re.sub('', unidecode.unidecode(text))[:15] cur_time = time.strftime('%Y%m%d-%H%M%S') outpath = os.path.join(output_dir, "demo_{}_{}_out.wav".format(cur_time, cur_text)) wav_output = wavs.squeeze(0).cpu().numpy() aukit.save_wav(wav_output, outpath, sr=args.sampling_rate) if isinstance(audio, (Path, str)) and Path(audio).is_file(): # # 原声 # refpath_raw = os.path.join(output_dir, "demo_{}_{}_ref_copy.wav".format(cur_time, cur_text)) # shutil.copyfile(audio, refpath_raw) # 重采样 wav_input, sr = aukit.load_wav(audio, with_sr=True) wav_input = librosa.resample(wav_input, sr, args.sampling_rate) refpath = os.path.join( output_dir, "demo_{}_{}_ref.wav".format(cur_time, cur_text)) aukit.save_wav(wav_input, refpath, sr=args.sampling_rate) # # 声码器 # wavs_ref = waveglow.generate_wave(mel=mel_data, **waveglow_kwargs) # outpath_ref = os.path.join(output_dir, "demo_{}_{}_ref_waveglow.wav".format(cur_time, cur_text)) # wav_output_ref = wavs_ref.squeeze(0).cpu().numpy() # aukit.save_wav(wav_output_ref, outpath_ref, sr=args.sampling_rate) fig_path = os.path.join( output_dir, "demo_{}_{}_fig.jpg".format(cur_time, cur_text)) plot_mel_alignment_gate_audio(
def run_player():
    """Demo: load a wav, shift its pitch, and play it at half volume."""
    import aukit
    inpath = Path(r"E:\data\aliaudio\examples\ali_Aibao_000001.wav")
    audio_data = aukit.load_wav(inpath, sr=16000)
    audio_data = aukit.change_voice(audio_data, mode="assign_pitch", alpha=200)
    aukit.play_audio(audio_data, volume=0.5)
def run_noise_remover():
    """Demo: denoise a wav file and play the result."""
    import aukit
    source = r"hello.wav"
    denoised = aukit.remove_noise(aukit.load_wav(source))
    aukit.play_audio(denoised)
def remove_noise_audio(inpath, outpath):
    """Denoise the audio at ``inpath`` and write the result to ``outpath`` (16 kHz)."""
    import aukit
    sample_rate = 16000
    signal = aukit.load_wav(inpath, sr=sample_rate)
    cleaned = aukit.remove_noise(signal, sr=sample_rate)
    aukit.save_wav(cleaned, outpath, sr=sample_rate)
def voice_clone_interface(audio: str, text: str, speaker: str) -> str:
    """Synthesize ``text`` in ``speaker``'s voice, conditioned on reference ``audio``.

    Pipeline: denoise the input audio in place, run mellotron to generate a mel
    spectrogram, vocode it (WaveGlow or Griffin-Lim), save the output wav plus a
    resampled copy of the reference and a diagnostic figure, denoise the output,
    and return its path.

    Fix: the Griffin-Lim branch previously printed "use waveglow:", which was
    misleading; it now reports the vocoder actually used.

    :param audio: path to the reference wav file.
    :param text: text to synthesize.
    :param speaker: speaker id/name understood by the dataloader.
    :return: path of the generated (denoised) wav file.
    """
    denoise.noisy_processing(audio, audio)  # denoise the input audio in place
    text_data, style_data, speaker_data, f0_data, mel_data = transform_mellotron_input_data(
        dataloader=dataloader, text=text, speaker=speaker, audio=audio, device=_device)
    mels, mels_postnet, gates, alignments = mellotron.generate_mel(
        text_data, style_data, speaker_data, f0_data)
    out_gate = gates.cpu().numpy()[0]
    # Truncate the mel at the first frame where the stop gate fires (> 0.2);
    # np.argmax yields 0 when it never fires, hence the `or` full-length fallback.
    end_idx = np.argmax(out_gate > 0.2) or out_gate.shape[0]
    mels_postnet = mels_postnet[:, :, :end_idx]
    if _use_waveglow:
        print("use waveglow:")
        wavs = waveglow.generate_wave(mel=mels_postnet, **waveglow_kwargs)
    else:
        print("use griffin_lim:")
        wavs = _stft.griffin_lim(mels_postnet, n_iters=5)
    # Save outputs under a timestamped, text-derived filename.
    cur_text = filename_formatter_re.sub('', unidecode.unidecode(text))[:15]
    cur_time = time.strftime('%Y%m%d-%H%M%S')
    outpath = os.path.join(output_dir, "demo_{}_{}_out.wav".format(cur_time, cur_text))
    wav_output = wavs.squeeze(0).cpu().numpy()
    aukit.save_wav(wav_output, outpath, sr=args.sampling_rate)
    if isinstance(audio, (Path, str)) and Path(audio).is_file():
        # Save a resampled copy of the reference audio next to the output
        # so the two can be compared at the same sampling rate.
        wav_input, sr = aukit.load_wav(audio, with_sr=True)
        wav_input = librosa.resample(wav_input, sr, args.sampling_rate)
        refpath = os.path.join(output_dir, "demo_{}_{}_ref.wav".format(cur_time, cur_text))
        aukit.save_wav(wav_input, refpath, sr=args.sampling_rate)
    # Diagnostic figure: mel, alignment, gate, and a subsampled waveform.
    fig_path = os.path.join(output_dir, "demo_{}_{}_fig.jpg".format(cur_time, cur_text))
    plot_mel_alignment_gate_audio(
        mel=mels_postnet.squeeze(0).cpu().numpy(),
        alignment=alignments.squeeze(0).cpu().numpy(),
        gate=gates.squeeze(0).cpu().numpy(),
        audio=wav_output[::args.sampling_rate // 1000])
    plt.savefig(fig_path)
    plt.close()
    print('Test success done.返回克隆的音频为:', outpath)
    denoise.noisy_processing(outpath, outpath)  # denoise the generated audio in place
    return outpath
def run_noise_remover():
    """Demo: denoise a wav file and play the result."""
    import aukit
    source = r"E:\data\temp\01.wav"
    denoised = aukit.remove_noise(aukit.load_wav(source))
    aukit.play_audio(denoised)