def hello():
    waveglow.load_waveglow_torch('../models/waveglow/waveglow_v5_model.pt')
    # melgan.load_melgan_model(r'E:\githup\zhrtvc\models\vocoder\saved_models\melgan\melgan_multi_speaker.pt',
    #                          args_path=r'E:\githup\zhrtvc\models\vocoder\saved_models\melgan\args.yml')
    melgan.load_melgan_torch('../models/melgan/melgan_multi_speaker_model.pt')

    # mellotron.load_mellotron_model(r'E:\githup\zhrtvc\models\mellotron\samples\checkpoint\checkpoint-000000.pt',
    #                                hparams_path=r'E:\githup\zhrtvc\models\mellotron\samples\metadata\hparams.yml')
    # torch.save(mellotron._model, '../models/mellotron/mellotron_samples_model.pt')
    mellotron.load_mellotron_torch('../models/mellotron/mellotron_samples_model.pt')

    # Inputs: text, style (mel), speaker, f0
    text = torch.randint(0, 100, [4, 50]).cuda()
    style = 0  # torch.rand(4, 80, 400).cuda()
    speaker = torch.randint(0, 10, [4]).cuda()
    f0 = None  # torch.rand(4, 400)

    mels = mellotron.generate_mel(text=text, style=style, speaker=speaker, f0=f0)
    for mel in mels:
        print(mel.shape)

    mel = torch.rand(4, 80, 400).cuda()
    wav = waveglow.generate_wave(mel)
    print(wav.shape)
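# A shape sanity check for the vocoder step above (a sketch: assumes the
# common WaveGlow hop length of 256 samples per mel frame; adjust the
# constant if the local model was trained with a different hop).
def check_waveglow_shapes():
    hop_length = 256  # assumption, not read from the model
    mel = torch.rand(4, 80, 400).cuda()
    wav = waveglow.generate_wave(mel)
    print(wav.shape)  # expected: [4, 400 * hop_length] = [4, 102400]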
def tts_sdk_base(text, speaker='biaobei', audio='24', output='', **kwargs):
    """Functional SDK entry point for speech synthesis.

    text: the text to synthesize.
    speaker: a built-in voice name; valid names are the keys of
        _reference_audio_dict, and the default list lives in
        resource/reference_audio/__init__.py.
    audio: if a digit, it indexes the built-in reference audios; if a file
        path, the audio at that path is used as the speaker reference.
    output: if it ends with .wav, the synthesized audio is saved to that
        path; if it starts with "play", the audio is played after synthesis.
    """
    global _dataloader
    if _dataloader is None:
        load_models(**kwargs)
        load_audio(**kwargs)

    if str(audio).isdigit():
        audio = _reference_audio_list[(int(audio) - 1) % len(_reference_audio_list)]
    elif os.path.isfile(audio):
        audio = str(audio)
    elif isinstance(audio, bytes):
        # NamedTemporaryFile (not TemporaryFile) so the file has a usable path.
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(audio)
        tmp_audio.flush()
        audio = tmp_audio.name
    elif isinstance(audio, str) and len(audio) >= 256:
        # Long strings are treated as base64-encoded wav bytes.
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(base64.standard_b64decode(audio))
        tmp_audio.flush()
        audio = tmp_audio.name
    elif speaker in _reference_audio_dict:
        audio = _reference_audio_dict[speaker]
    else:
        raise AssertionError('Unrecognized audio reference: {}'.format(audio))

    text_data, style_data, speaker_data, f0_data, mel_data = transform_mellotron_input_data(
        dataloader=_dataloader, text=text, speaker=speaker, audio=audio, device=_device)

    mels, mels_postnet, gates, alignments = mellotron.generate_mel(
        text_data, style_data, speaker_data, f0_data)

    out_gate = gates.cpu().numpy()[0]
    end_idx = (np.argmax(out_gate > kwargs.get('gate_threshold', 0.2))
               or np.argmax(out_gate) or out_gate.shape[0])
    mels_postnet = mels_postnet[:, :, :end_idx]

    vocoder_name = kwargs.get('vocoder', 'waveglow')
    if vocoder_name == 'waveglow':
        wavs = waveglow.generate_wave(mel=mels_postnet, **kwargs)
    else:
        wavs = _stft.griffin_lim(mels_postnet, n_iters=10)

    wav_output = wavs.squeeze(0).cpu().numpy()

    if output.startswith('play'):
        aukit.play_sound(wav_output, sr=_stft.sampling_rate)
    if output.endswith('.wav'):
        aukit.save_wav(wav_output, output, sr=_stft.sampling_rate)
    wav_output = aukit.anything2bytes(wav_output, sr=_stft.sampling_rate)
    return wav_output
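# Example usage of tts_sdk_base (a sketch: assumes the default models are on
# disk and 'biaobei' is among the built-in voices; the kwargs shown are
# exactly the ones the function reads above).
def demo_tts_sdk_base():
    wav_bytes = tts_sdk_base('这是个试水的例子。', speaker='biaobei', audio='24',
                             output='demo_out.wav', vocoder='waveglow',
                             gate_threshold=0.2)
    print(len(wav_bytes), 'bytes')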
def tts_sdk(text, speaker='biaobei', audio='0', **kwargs):
    global _dataloader
    if _dataloader is None:
        load_models(**kwargs)

    if str(audio).isdigit():
        audio = _reference_audio_list[(int(audio) - 1) % len(_reference_audio_list)]
    elif os.path.isfile(audio):
        audio = str(audio)
    elif isinstance(audio, bytes):
        # NamedTemporaryFile (not TemporaryFile) so the file has a usable path.
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(audio)
        tmp_audio.flush()
        audio = tmp_audio.name
    elif isinstance(audio, str) and len(audio) >= 100:
        # Long strings are treated as base64-encoded wav bytes.
        tmp_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
        tmp_audio.write(base64.standard_b64decode(audio))
        tmp_audio.flush()
        audio = tmp_audio.name
    elif speaker in _reference_audio_dict:
        audio = _reference_audio_dict[speaker]
    else:
        raise AssertionError('Unrecognized audio reference: {}'.format(audio))

    text_data, style_data, speaker_data, f0_data, mel_data = transform_mellotron_input_data(
        dataloader=_dataloader, text=text, speaker=speaker, audio=audio, device=_device)

    mels, mels_postnet, gates, alignments = mellotron.generate_mel(
        text_data, style_data, speaker_data, f0_data)

    out_gate = gates.cpu().numpy()[0]
    end_idx = np.argmax(out_gate > 0.2) or np.argmax(out_gate) or out_gate.shape[0]
    mels_postnet = mels_postnet[:, :, :end_idx]

    if _use_waveglow:
        wavs = waveglow.generate_wave(mel=mels_postnet, **kwargs)
    else:
        wavs = _stft.griffin_lim(mels_postnet, n_iters=5)

    wav_output = wavs.squeeze(0).cpu().numpy()

    output = kwargs.get('output', '')
    if output.startswith('play'):
        aukit.play_sound(wav_output, sr=_stft.sampling_rate)
    if output.endswith('.wav'):
        aukit.save_wav(wav_output, output, sr=_stft.sampling_rate)
    wav_output = aukit.anything2bytes(wav_output, sr=_stft.sampling_rate)
    return wav_output
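# How the gate-based trimming above resolves its end index (illustrative):
# np.argmax over a boolean array returns 0 when nothing exceeds the
# threshold, so the chained `or` falls back first to the raw argmax of the
# gate, then to the full mel length.
def _demo_gate_trimming():
    gate = np.array([0.01, 0.05, 0.10, 0.80, 0.90])
    end_idx = np.argmax(gate > 0.2) or np.argmax(gate) or gate.shape[0]
    assert end_idx == 3  # first frame whose stop-gate exceeds 0.2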
pydub.AudioSegment.silent(3000, frame_rate=args.sampling_rate).export(audio, format='wav')

text = '这是个试水的例子。'  # "This is a trial example."
speaker = 'speaker'
text_data, style_data, speaker_data, f0_data = transform_mellotron_input_data(
    dataloader=dataloader, text=text, speaker=speaker, audio=audio, device=_device)

mels, mels_postnet, gates, alignments = mellotron.generate_mel(
    text_data, style_data, speaker_data, f0_data)

wavs = waveglow.generate_wave(mel=mels, **waveglow_kwargs)

wav_output = wavs.squeeze().cpu().numpy()
aukit.save_wav(wav_output, os.path.join(tmpdir, 'demo_example.wav'), sr=args.sampling_rate)
print('Test success done.')

# Model inference
if os.path.isfile(texts_path):
    text_inputs = [w.strip() for w in open(texts_path, encoding='utf8')]
    if args.is_simple:
        text_inputs = np.random.choice(text_inputs, min(len(text_inputs), 10),
                                       replace=False)  # `replace=False` assumed; the source snippet is cut off here
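# The smoke test above uses 3 s of silence as the reference clip; to exercise
# the speaker encoder with real timbre, resample an actual recording instead.
# A sketch reusing the same calls as the resampling step elsewhere in this
# repo; src_path is illustrative.
def make_real_reference(tmpdir, src_path):
    wav_ref, sr = aukit.load_wav(src_path, with_sr=True)
    wav_ref = librosa.resample(wav_ref, sr, args.sampling_rate)
    ref_path = os.path.join(tmpdir, 'reference.wav')
    aukit.save_wav(wav_ref, ref_path, sr=args.sampling_rate)
    return ref_path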
else:
    _device = args.device

# Load models
load_models(args)

# Model smoke test
text_test = '这是个试水的例子。'  # "This is a trial example."
text_data, style_data, speaker_data, f0_data = transform_mellotron_input_data(
    text=text_test, device=_device)

mels, mels_postnet, gates, alignments = mellotron.generate_mel(
    text_data, style_data, speaker_data, f0_data)

wavs = waveglow.generate_wave(mel=mels)

with tempfile.TemporaryDirectory() as tmpdir:
    wav_output = wavs.squeeze().cpu().numpy()
    aukit.save_wav(wav_output, os.path.join(tmpdir, 'demo_example.wav'), sr=args.sampling_rate)

# Model inference
if os.path.isfile(args.input):
    text_inputs = [w.strip() for w in open(args.input, encoding='utf8')]
else:
    text_inputs = [args.input]

output_dir = args.output
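# For reference, a hypothetical parser matching the args this script reads
# (attribute names inferred from usage above; not the repo's actual parser):
import argparse

def build_demo_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='cuda', help='torch device string')
    parser.add_argument('--input', default='texts.txt', help='text file, or a literal text to synthesize')
    parser.add_argument('--output', default='outputs', help='output directory')
    parser.add_argument('--sampling_rate', type=int, default=22050)
    return parser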
def vocode(self):
    speaker_name, spec, breaks, _ = self.current_generated
    assert spec is not None

    # Synthesize the waveform
    # if not vocoder.is_loaded():
    if not waveglow.is_loaded():
        self.init_vocoder()

    def vocoder_progress(i, seq_len, b_size, gen_rate):
        real_time_factor = (gen_rate / aukit._sr) * 1000  # Synthesizer.sample_rate
        line = "Waveform generation: %d/%d (batch size: %d, rate: %.1fkHz - %.2fx real time)" \
               % (i * b_size, seq_len * b_size, b_size, gen_rate, real_time_factor)
        self.ui.log(line, "overwrite")
        self.ui.set_loading(i, seq_len)

    wav = None
    vocname = ""
    if self.ui.current_vocoder_fpath is not None:
        model_fpath = self.ui.current_vocoder_fpath
        vocname = Path(model_fpath).parent.stem
        wav = waveglow.generate_wave(spec)
        # if Path(model_fpath).parent.stem == "melgan":
        #     self.ui.log("Waveform generation with MelGAN... ")
        #     wav = vocoder_melgan.infer_waveform_melgan(spec, model_fpath)
        # elif Path(model_fpath).parent.stem == "wavernn":
        #     self.ui.log("Waveform generation with WaveRNN... ")
        #     wav = vocoder.infer_waveform(spec, progress_callback=vocoder_progress)
        # elif Path(model_fpath).parent.stem == "waveglow":
        #     wav = waveglow.generate_wave(spec)
    if wav is None:
        vocname = "griffinlim"
        self.ui.log("Waveform generation with Griffin-Lim... ")
        # wav = Synthesizer.griffin_lim(spec)
        wav = aukit.inv_mel_spectrogram(spec)
    self.ui.set_loading(0)
    self.ui.log(" Done!", "append")

    # Play it
    wav = wav / np.abs(wav).max() * 0.97
    self.ui.play(wav, aukit._sr)

    fref = self.ui.selected_utterance.name
    ftime = '{}'.format(time_formatter())
    ftext = self.ui.text_prompt.toPlainText()
    fms = int(len(wav) * 1000 / aukit._sr)
    fvoc = vocname
    fname = filename_formatter('{}_{}_{}_{}ms_{}.wav'.format(fref, ftime, fvoc, fms, ftext))
    audio.save_wav(wav, self._out_wav_dir.joinpath(fname), aukit._sr)  # save

    # Compute the embedding
    # TODO: this is problematic with different sampling rates, gotta fix it
    if not encoder.is_loaded():
        self.init_encoder()
    encoder_wav = encoder.preprocess_wav(wav)
    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    # Add the utterance
    name = speaker_name + "_gen_{}".format(time_formatter())
    utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True)
    np.save(self._out_embed_dir.joinpath(name + '.npy'), embed, allow_pickle=False)  # save
    self.utterances.add(utterance)

    # Plot it
    self.ui.draw_embed(embed, name, "generated")
    self.ui.draw_umap_projections(self.utterances)
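# Peak normalization as used just before playback above, factored out as a
# standalone sketch: scaling to 97% of full scale leaves headroom so a later
# float-to-int16 conversion cannot clip (all-zero guard added).
def peak_normalize(wav, headroom=0.97):
    peak = np.abs(wav).max()
    return wav if peak == 0 else wav / peak * headroom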
    audio, format='wav')

text = '这是个试水的例子。'  # "This is a trial example."
speaker = 'speaker'
text_data, style_data, speaker_data, f0_data, mel_data = transform_mellotron_input_data(
    dataloader=dataloader, text=text, speaker=speaker, audio=audio, device=_device)

mels, mels_postnet, gates, alignments = mellotron.generate_mel(
    text_data, style_data, speaker_data, f0_data)

if _use_waveglow:
    wavs = waveglow.generate_wave(mel=mels_postnet, **waveglow_kwargs)
else:
    wavs = _stft.griffin_lim(mels_postnet)

wav_output = wavs.squeeze().cpu().numpy()
aukit.save_wav(wav_output, os.path.join(tmpdir, 'demo_example.wav'), sr=args.sampling_rate)
print('Test success done.')

# Model inference
if os.path.isfile(texts_path):
    text_inputs = [w.strip() for w in open(texts_path, encoding='utf8')]
    if args.is_simple:
        text_inputs = np.random.choice(text_inputs, min(len(text_inputs), 10),
                                       replace=False)  # arguments assumed from the parallel snippet above; the source is cut off here
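# The Griffin-Lim fallback above, sketched with explicit iteration counts:
# each iteration refines the phase estimate, so more iterations trade latency
# for fidelity. n_iters=5 matches the deploy path elsewhere in this file; 30
# is an illustrative higher-quality setting, not a repo default.
def griffin_lim_preview(mels_postnet):
    quick = _stft.griffin_lim(mels_postnet, n_iters=5)   # fast preview
    fine = _stft.griffin_lim(mels_postnet, n_iters=30)   # slower, cleaner phase
    return quick, fine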
def voice_clone_interface(audio: str, text: str, speaker: str) -> str:
    denoise.noisy_processing(audio, audio)  # denoise the input audio

    # for text_input in tqdm(zip(audio_lst, text_lst, speaker_lst), 'TTS', total=len(audio_lst), ncols=100):
    # for text_input in tqdm(text_inputs, 'TTS', ncols=100):
    #     print('Running: {}'.format(text_input))
    #     audio, text, speaker = text_input  # .split('\t')  # iterate one at a time
    # print("audio:", audio)      # '/home/project/zhrtvc/data/samples/aishell3/wav/SSB00110401.wav'
    # print("text:", text)        # '三百零五千三百三十四。'
    # print("speaker:", speaker)  # 'SSB0011'

    text_data, style_data, speaker_data, f0_data, mel_data = transform_mellotron_input_data(
        dataloader=dataloader, text=text, speaker=speaker, audio=audio, device=_device)

    mels, mels_postnet, gates, alignments = mellotron.generate_mel(
        text_data, style_data, speaker_data, f0_data)

    out_gate = gates.cpu().numpy()[0]
    end_idx = np.argmax(out_gate > 0.2) or out_gate.shape[0]
    mels_postnet = mels_postnet[:, :, :end_idx]

    if _use_waveglow:
        print("use waveglow:")
        wavs = waveglow.generate_wave(mel=mels_postnet, **waveglow_kwargs)
    else:
        print("use griffin-lim:")
        wavs = _stft.griffin_lim(mels_postnet, n_iters=5)

    # Save outputs
    cur_text = filename_formatter_re.sub('', unidecode.unidecode(text))[:15]
    cur_time = time.strftime('%Y%m%d-%H%M%S')
    outpath = os.path.join(output_dir, "demo_{}_{}_out.wav".format(cur_time, cur_text))

    wav_output = wavs.squeeze(0).cpu().numpy()
    aukit.save_wav(wav_output, outpath, sr=args.sampling_rate)  # sampling_rate=22050

    if isinstance(audio, (Path, str)) and Path(audio).is_file():
        # # Copy of the original reference
        # refpath_raw = os.path.join(output_dir, "demo_{}_{}_ref_copy.wav".format(cur_time, cur_text))
        # shutil.copyfile(audio, refpath_raw)

        # Resample the reference audio
        wav_input, sr = aukit.load_wav(audio, with_sr=True)
        wav_input = librosa.resample(wav_input, sr, args.sampling_rate)
        refpath = os.path.join(output_dir, "demo_{}_{}_ref.wav".format(cur_time, cur_text))
        aukit.save_wav(wav_input, refpath, sr=args.sampling_rate)

        # # Vocoder reconstruction of the reference mel
        # wavs_ref = waveglow.generate_wave(mel=mel_data, **waveglow_kwargs)
        # outpath_ref = os.path.join(output_dir, "demo_{}_{}_ref_waveglow.wav".format(cur_time, cur_text))
        # wav_output_ref = wavs_ref.squeeze(0).cpu().numpy()
        # aukit.save_wav(wav_output_ref, outpath_ref, sr=args.sampling_rate)

    fig_path = os.path.join(output_dir, "demo_{}_{}_fig.jpg".format(cur_time, cur_text))
    plot_mel_alignment_gate_audio(
        mel=mels_postnet.squeeze(0).cpu().numpy(),
        alignment=alignments.squeeze(0).cpu().numpy(),
        gate=gates.squeeze(0).cpu().numpy(),
        audio=wav_output[::args.sampling_rate // 1000])
    plt.savefig(fig_path)
    plt.close()

    # Info dump disabled for now
    # yml_path = os.path.join(output_dir, "demo_{}_{}_info.yml".format(cur_time, cur_text))
    # info_dict = locals2dict(locals())
    # with open(yml_path, 'wt', encoding='utf8') as fout:
    #     yaml.dump(info_dict, fout, encoding='utf-8', allow_unicode=True)
    # log_path = os.path.join(output_dir, "info_dict.txt".format(cur_time))
    # with open(log_path, 'at', encoding='utf8') as fout:
    #     fout.write('{}\n'.format(json.dumps(info_dict, ensure_ascii=False)))

    print('Test success done. Cloned audio saved to:', outpath)
    denoise.noisy_processing(outpath, outpath)  # denoise the output audio
    return outpath
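# Example call (a sketch: the reference path and speaker id come from the
# debug comments above; the models, dataloader and output_dir globals must
# already be initialized as in the surrounding script).
def demo_voice_clone():
    cloned = voice_clone_interface(
        audio='/home/project/zhrtvc/data/samples/aishell3/wav/SSB00110401.wav',
        text='这是个试水的例子。',
        speaker='SSB0011')
    print('cloned wav:', cloned)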