# NOTE(review): whitespace-mangled, TRUNCATED fragment of a synthesis-loop body
# (no enclosing `def` visible; the trailing print("""...) triple-quoted HTML
# string is cut off mid-literal, so this line is not valid Python on its own).
# Left byte-identical pending recovery of the original file — do not reformat
# blindly: the missing tail cannot be reconstructed from what is visible here.
model.seq2seq.decoder.max_decoder_steps = max_decoder_steps os.makedirs(dst_dir, exist_ok=True) with open(text_list_file_path, "rb") as f: lines = f.readlines() for idx, line in enumerate(lines): text = line.decode("utf-8")[:-1] words = nltk.word_tokenize(text) waveform, alignment, _, _ = tts( model, text, p=replace_pronunciation_prob, speaker_id=speaker_id, fast=True) dst_wav_path = join(dst_dir, "{}_{}{}.wav".format( idx, checkpoint_name, file_name_suffix)) dst_alignment_path = join( dst_dir, "{}_{}{}_alignment.png".format(idx, checkpoint_name, file_name_suffix)) plot_alignment(alignment.T, dst_alignment_path, info="{}, {}".format(hparams.builder, basename(checkpoint_path))) audio.save_wav(waveform, dst_wav_path) from os.path import basename, splitext name = splitext(basename(text_list_file_path))[0] if output_html: print(""" {} ({} chars, {} words) <audio controls="controls" > <source src="/audio/{}/{}/{}" autoplay/> Your browser does not support the audio element. </audio> <div align="center"><img src="/audio/{}/{}/{}" /></div>
# NOTE(review): second whitespace-mangled, TRUNCATED variant of the same
# synthesis loop (deepvoice3-only paths, no enclosing `def` visible; the line
# ends mid `.format(text, len(text), len(words),` call, so the argument list
# is cut off). Left byte-identical — the missing tail cannot be reconstructed
# from what is visible in this chunk.
model.load_state_dict(checkpoint["state_dict"]) model.decoder.max_decoder_steps = max_decoder_steps model.make_generation_fast_() os.makedirs(dst_dir, exist_ok=True) with open(text_list_file_path, "rb") as f: lines = f.readlines() for idx, line in enumerate(lines): text = line.decode("utf-8")[:-1] words = nltk.word_tokenize(text) # print("{}: {} ({} chars, {} words)".format(idx, text, len(text), len(words))) waveform, alignment, _, _ = tts(model, text) dst_wav_path = join(dst_dir, "{}{}.wav".format(idx, file_name_suffix)) dst_alignment_path = join(dst_dir, "{}{}_alignment.png".format(idx, file_name_suffix)) plot_alignment(alignment.T, dst_alignment_path, info="deepvoice3, {}".format(checkpoint_path)) audio.save_wav(waveform, dst_wav_path) from os.path import basename, splitext name = splitext(basename(text_list_file_path))[0] print(""" {} ({} chars, {} words) <audio controls="controls" > <source src="/audio/deepvoice3/{}/{}{}.wav" autoplay/> Your browser does not support the audio element. </audio> <div align="center"><img src="/audio/deepvoice3/{}/{}{}_alignment.png" /></div> """.format(text, len(text), len(words),
def synthesis(checkpoint_path, preset, dst_dir, srt_path, face_path):
    """Synthesize speech for every subtitle line of an SRT file with a
    deepvoice3 model and write one WAV (plus an attention-alignment PNG)
    per line into *dst_dir*.

    Args:
        checkpoint_path: path to the trained model checkpoint to load.
        preset: optional path to a JSON hyper-parameter preset; ``None``
            skips preset loading.
        dst_dir: output directory (created if missing) for the generated
            ``.wav`` and ``_alignment.png`` files.
        srt_path: subtitle file consumed by ``load_srt``.
        face_path: face/speaker metadata consumed by ``load_srt``.

    Side effects: mutates the module-global ``_frontend`` (and mirrors it
    onto the ``train`` module), writes files under *dst_dir*, and prints
    progress to stdout. Returns ``None``.
    """
    global _frontend

    # NOTE(review): these two are hard-coded to None, so the "separate
    # seq2seq/postnet checkpoints" branch below is currently dead code.
    # Kept as-is in case it is meant to be wired to CLI options later.
    checkpoint_seq2seq_path = None
    checkpoint_postnet_path = None
    max_decoder_steps = 500
    replace_pronunciation_prob = 0.0

    # Load preset hyper parameters if specified, then force-validate.
    if preset is not None:
        with open(preset) as f:
            hparams.parse_json(f.read())
    # Override hyper parameters (no-op parse keeps hparams in a parsed state).
    hparams.parse("")
    assert hparams.name == "deepvoice3"

    # Select the text-processing frontend and share it with the train module,
    # which uses the same global for text-to-sequence conversion.
    _frontend = getattr(frontend, hparams.frontend)
    print(_frontend)
    import train
    train._frontend = _frontend
    from train import plot_alignment, build_model

    # Build the model, then restore weights — either from separate
    # seq2seq/postnet checkpoints (dead path, see NOTE above) or from a
    # single combined checkpoint.
    model = build_model()
    if checkpoint_postnet_path is not None and checkpoint_seq2seq_path is not None:
        checkpoint = _load(checkpoint_seq2seq_path)
        model.seq2seq.load_state_dict(checkpoint["state_dict"])
        checkpoint = _load(checkpoint_postnet_path)
        model.postnet.load_state_dict(checkpoint["state_dict"])
    else:
        checkpoint = _load(checkpoint_path)
        model.load_state_dict(checkpoint["state_dict"])
    model.seq2seq.decoder.max_decoder_steps = max_decoder_steps

    os.makedirs(dst_dir, exist_ok=True)

    # Each task item looks like (?, start, end, speaker_id, text) — indices
    # 1/2 are used for the time range and 3/4 for speaker and text.
    # TODO(review): confirm the tuple layout against load_srt.
    task = load_srt(srt_path, face_path)
    for idx, i in enumerate(task):
        speaker_id = i[3]
        text = i[4]
        words = nltk.word_tokenize(text)
        file_name = "{} speaker_{} {}-{}".format(idx, speaker_id, i[1], i[2])
        print(text)
        waveform, alignment, _, _ = tts(
            model, text, p=replace_pronunciation_prob,
            speaker_id=speaker_id, fast=True)
        dst_wav_path = join(dst_dir, "{}.wav".format(file_name))
        dst_alignment_path = join(dst_dir, "{}_alignment.png".format(file_name))
        plot_alignment(
            alignment.T, dst_alignment_path,
            info="{}, {}".format(hparams.builder, basename(checkpoint_path)))
        audio.save_wav(waveform, dst_wav_path)
        print(
            idx,
            ": {}\n ({} chars, {} words)".format(text, len(text), len(words)))
    print(
        "Finished! Check out {} for generated audio samples.".format(dst_dir))
# NOTE(review): whitespace-mangled fragment of yet another synthesis-loop
# variant (f-string/`cfg` style). No enclosing `def` is visible and it reads
# names (out_dir, text_file, model, speaker_id, checkpoint_name, cfg,
# checkpoint_path) defined in the missing scope, so it cannot be reformatted
# into a standalone unit. Left byte-identical. Also note the latent bug for
# a future fix: lines are first .strip("\n")-ed and then text = line[:-1]
# drops the last *content* character of each line — verify intent once the
# full function is recovered.
model.seq2seq.decoder.max_decoder_steps = max_decoder_steps os.makedirs(out_dir, exist_ok=True) with open(text_file, "r") as file_reader: lines = file_reader.readlines() lines = [line.strip("\n") for line in lines] for idx, line in enumerate(lines): text = line[:-1] waveform, alignment, _, _ = tts(model, text, speaker_id=speaker_id, fast=True) out_wav_path = join(out_dir, f"{idx}_{checkpoint_name}_synthesized.wav") out_alignment_path = join( out_dir, f"{idx}_{checkpoint_name}_synthesized_alignment.png") plot_alignment(alignment.T, out_alignment_path, info=f"{cfg.builder}, {basename(checkpoint_path)}") audio.save_wav(waveform, out_wav_path) print(f"Synthesis complete. Generated audio samples saved in {out_dir}") sys.exit(0)