import datetime
import os
import time

import torch
from scipy.io.wavfile import write

from melgan.utils.hparams import HParam, load_hparam_str
from melgan.model.generator import Generator
# `infer` (text -> mel spectrogram) is assumed to be importable from the
# document-level TTS codebase this script belongs to.


def main(args):
    torch.cuda.manual_seed(13524532)

    print("... Load trained models ...\n")
    print("    Loading checkpoint of document-level TTS model: {}".format(args.tts_ckpt))
    print("    Loading checkpoint of MelGAN vocoder model: {}".format(args.mel_ckpt))

    start = time.time()
    mel_ckpt = torch.load(args.mel_ckpt)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(mel_ckpt['hp_str'])

    model = Generator(hp.audio.n_mel_channels).cuda()
    model.load_state_dict(mel_ckpt['model_g'])
    model.eval(inference=False)
    load_time = time.time() - start
    print("    (model loading took {}s)".format(load_time))

    print('\n... Generate waveform ...\n')
    with torch.no_grad():
        num_of_iter = args.iteration

        # Read the input script; each non-empty line is one text to synthesize.
        texts = []
        with open(args.script_path, "r") as f:
            for line in f:
                line = line.strip()
                if len(line):
                    texts.append(line)
        print("  * input text\n    {}\n".format(texts[0]))

        for i in range(num_of_iter):
            start = time.time()
            # Text -> mel spectrogram with the document-level TTS model.
            mel, length, alignments = infer(args.tts_ckpt, texts[0])
            if len(mel.shape) == 2:
                mel = mel.unsqueeze(0)
            mel = mel.cuda()
            spec_done = time.time()

            # Mel spectrogram -> waveform with the MelGAN vocoder.
            audio = model.inference(mel)
            audio = audio.cpu().detach().numpy()

            save_path = os.path.join(args.out_dir, str(i) + '_audio.wav')
            write(save_path, hp.audio.sampling_rate, audio)
            audio_length = len(audio) / hp.audio.sampling_rate

            print("  {}.".format(i + 1))
            print("    - Path of generated audio file: {}".format(save_path))
            print("    - Length of generated audio file: {}s".format(audio_length))
            print("    - Time taken from text loading to spectrogram generation: {}s".format(spec_done - start))
            print("    - Time taken to generate waveform: {}s\n".format(time.time() - spec_done))
    print("finished generation")
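
# A minimal command-line entry point for main() above, sketched from the
# attributes the function reads off `args`. The flag names and defaults are
# assumptions for illustration, not the original script's CLI.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--tts_ckpt', type=str, required=True,
                        help='checkpoint of the document-level TTS model')
    parser.add_argument('--mel_ckpt', type=str, required=True,
                        help='checkpoint of the MelGAN vocoder')
    parser.add_argument('--config', type=str, default=None,
                        help='optional config; falls back to hp_str stored in the checkpoint')
    parser.add_argument('--script_path', type=str, required=True,
                        help='text file; each non-empty line is an input sentence')
    parser.add_argument('--out_dir', type=str, default='.',
                        help='directory for the generated .wav files')
    parser.add_argument('--iteration', type=int, default=1,
                        help='number of synthesis runs to time')
    main(parser.parse_args())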
def main(args):
    torch.manual_seed(1234)
    torch.cuda.manual_seed(1234)

    text_path = "/media/sh/Workspace/긴문장합성/kor_Document-level_Neural_TTS_length/test/1.txt"
    doc_ckpt_kor = '/media/sh/Workspace/긴문장합성/kor_Document-level_Neural_TTS_length/outdir/checkpoint_29000'
    save_folder = '/media/sh/Workspace/긴문장합성/samples'

    # Build the output name from the checkpoint step and the current timestamp.
    # (`timestamp` avoids shadowing the `time` module.)
    today = datetime.datetime.today()
    timestamp = str(today.month) + str(today.day) + str(today.hour) + \
        str(today.minute) + str(today.second)
    save_name = 'kor_audio_length_regul_' + doc_ckpt_kor.split('_')[-1] + '_' + timestamp + '.wav'
    save_path = os.path.join(save_folder, save_name)

    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = Generator(hp.audio.n_mel_channels).cuda()
    model.load_state_dict(checkpoint['model_g'])
    model.eval(inference=False)

    with torch.no_grad():
        # Read the input script; each non-empty line is one text to synthesize.
        texts = []
        with open(text_path, "r") as f:
            for line in f:
                line = line.strip()
                if len(line):
                    texts.append(line)

        mel, length, alignments = infer(doc_ckpt_kor, texts[0])
        if len(mel.shape) == 2:
            mel = mel.unsqueeze(0)
        mel = mel.cuda()

        audio = model.inference(mel)
        audio = audio.cpu().detach().numpy()
        write(save_path, hp.audio.sampling_rate, audio)
    print('Synthesis finished')
# `device` is used by get_melgan() below; a minimal definition, assuming
# CUDA when available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def get_melgan(full_path=None):
    if not full_path:
        # No local checkpoint: fetch the pretrained vocoder from torch.hub
        # (requires access to seungwonpark/melgan).
        melgan = torch.hub.load('seungwonpark/melgan', 'melgan')
        melgan.eval()
        melgan.to(device)
        return melgan

    print("use local vocoder")
    from melgan.utils.hparams import load_hparam_str
    from melgan.model.generator import Generator

    cp = torch.load(full_path, map_location=device)
    hp = load_hparam_str(cp["hp_str"])
    model = Generator(hp.audio.n_mel_channels)
    if torch.cuda.is_available():
        model = model.cuda()
    model.load_state_dict(cp["model_g"])
    model.eval(inference=False)
    model.to(device)
    return model
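
# Usage sketch for get_melgan(): with no argument it downloads the pretrained
# vocoder via torch.hub; with a path it loads a local Generator checkpoint.
# The checkpoint path below is a placeholder, not a file from this repo.
#
#     vocoder = get_melgan()                       # pretrained, via torch.hub
#     vocoder = get_melgan('chkpt/melgan_0100.pt')   # local checkpoint
#     with torch.no_grad():
#         audio = vocoder.inference(mel)           # mel: (1, n_mel_channels, T)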
def main(args):
    checkpoint = torch.load(args.checkpoint_path)
    if args.config is not None:
        hp = HParam(args.config)
    else:
        hp = load_hparam_str(checkpoint['hp_str'])

    model = Generator(hp.audio.n_mel_channels).cuda()
    model.load_state_dict(checkpoint['model_g'])
    model.eval(inference=False)

    with torch.no_grad():
        # Read the input script; each non-empty line is one text to synthesize.
        texts = []
        with open("/media/qw/data/Experiment/Encoder_selfAtt/test/1.txt", "r") as f:
            for line in f:
                line = line.strip()
                if len(line):
                    texts.append(line)

        for i in range(10):
            mel, length, alignments = infer(
                '/media/qw/data/Experiment/Encoder_selfAtt/tacotron2_statedict.pt',
                texts[0])
            if len(mel.shape) == 2:
                mel = mel.unsqueeze(0)
            mel = mel.cuda()

            audio = model.inference(mel)
            audio = audio.cpu().detach().numpy()
            # Index the output file so each iteration's audio is kept
            # instead of being overwritten.
            write('/media/qw/data/Experiment/Encoder_selfAtt/audio_{}.wav'.format(i),
                  hp.audio.sampling_rate, audio)