def synthesis(text, num):
    m = Model()
    # m_post = ModelPostNet()
    m.load_state_dict(load_checkpoint(num, "transformer"))
    # m_post.load_state_dict(load_checkpoint(args.restore_step2, "postnet"))

    text = np.asarray(text_to_sequence(text, [hp.cleaners]))
    text = t.LongTensor(text).unsqueeze(0)
    text = text.cuda()
    mel_input = t.zeros([1, 1, 80]).cuda()
    pos_text = t.arange(1, text.size(1) + 1).unsqueeze(0)
    pos_text = pos_text.cuda()

    m = m.cuda()
    # m_post = m_post.cuda()
    m.train(False)
    # m_post.train(False)

    # pbar = tqdm(range(args.max_len))
    with t.no_grad():
        # Autoregressive decoding: grow the mel input one frame at a time.
        for _ in range(1000):
            pos_mel = t.arange(1, mel_input.size(1) + 1).unsqueeze(0).cuda()
            mel_pred, postnet_pred, attn, stop_token, _, attn_dec = m.forward(
                text, mel_input, pos_text, pos_mel)
            mel_input = t.cat([mel_input, postnet_pred[:, -1:, :]], dim=1)

    # mag_pred = m_post.forward(postnet_pred)
    # wav = spectrogram2wav(mag_pred.squeeze(0).cpu().numpy())
    mel_postnet = postnet_pred[0].cpu().numpy().T
    plot_data([mel_postnet for _ in range(2)])
    wav = audio.inv_mel_spectrogram(mel_postnet)
    wav = wav[0:audio.find_endpoint(wav)]
    audio.save_wav(wav, "result.wav")
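# Several snippets in this listing call a `plot_data` helper whose definition
# is not shown. A minimal matplotlib sketch, assuming each entry is a 2-D
# array (mel spectrogram or alignment) and that the `file_name` keyword seen
# in later snippets names the saved figure; both are assumptions, not
# confirmed by the source:
import matplotlib
matplotlib.use("Agg")  # headless backend, safe on servers
import matplotlib.pyplot as plt

def plot_data(data, file_name="figure"):
    # Render the arrays side by side as heatmaps and save to disk.
    fig, axes = plt.subplots(1, len(data), figsize=(6 * len(data), 4))
    if len(data) == 1:
        axes = [axes]
    for ax, d in zip(axes, data):
        ax.imshow(d, aspect="auto", origin="lower", interpolation="none")
    fig.savefig(file_name + ".png")
    plt.close(fig)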
def synthesis_griffin_lim(text_seq, model, alpha=1.0, mode="", num=100): text = text_to_sequence(text_seq, hp.hparams.text_cleaners) text = text + [0] text = np.stack([np.array(text)]) text = torch.from_numpy(text).long().to(device) pos = torch.stack([torch.Tensor([i + 1 for i in range(text.size(1))])]) pos = pos.long().to(device) start = int(round(time.time() * 1000)) model.eval() with torch.no_grad(): mel, mel_postnet = model(text, pos, alpha=alpha) end = int(round(time.time() * 1000)) tt = end - start print("Total - making mel : %d ms\n" % tt) mel = mel[0].cpu().numpy().T mel_postnet = mel_postnet[0].cpu().numpy().T #plot_data([mel, mel_postnet]) wav = audio.inv_mel_spectrogram(mel_postnet) print("Wav Have Been Synthesized.\n") if not os.path.exists("results"): os.mkdir("results") new_name = text_seq.replace(" ", "_") audio.save_wav( wav, os.path.join("results", new_name + str(num) + mode + ".wav")) return new_name
def get_tacotron2_alignment_test(text_seq):
    hparams = hp_tacotron2.create_hparams()
    hparams.sampling_rate = hp.sample_rate

    checkpoint_path = os.path.join(
        "Tacotron2", os.path.join("pre_trained_model", "tacotron2_statedict.pt"))
    tacotron2 = train_tacotron2.load_model(hparams)
    tacotron2.load_state_dict(torch.load(checkpoint_path)["state_dict"])
    _ = tacotron2.cuda().eval().half()

    sequence = np.array(text_to_sequence(text_seq, hp.text_cleaners))[None, :]
    print("sequence size", np.shape(sequence))
    sequence = torch.from_numpy(sequence).cuda().long()

    mel, mel_postnet, _, alignment = tacotron2.inference(sequence)
    plot_data((mel.float().data.cpu().numpy()[0],
               mel_postnet.float().data.cpu().numpy()[0],
               alignment.float().data.cpu().numpy()[0].T))

    wav = audio.inv_mel_spectrogram(mel_postnet.float().data.cpu().numpy()[0])
    audio.save_wav(wav, "test.wav")

    alignment = alignment.float().data.cpu().numpy()[0]
    print("alignment size", np.shape(alignment))
    get_D(alignment)
    return alignment
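# The `get_D` call above converts a Tacotron 2 attention alignment into
# per-token durations (for FastSpeech-style training). Its body is not shown
# in this listing; a minimal sketch of the usual argmax-counting approach,
# assuming `alignment` has shape (decoder_steps, encoder_steps); the name
# `get_duration_from_alignment` is hypothetical:
import numpy as np

def get_duration_from_alignment(alignment):
    # For each decoder frame, credit the encoder position it attends to
    # most strongly; the resulting counts sum to the number of frames.
    durations = np.zeros(alignment.shape[1], dtype=np.int64)
    for t in range(alignment.shape[0]):
        durations[np.argmax(alignment[t])] += 1
    return durations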
def synthesis_griffin_lim(text_seq, model):
    text = text_to_sequence(text_seq, hp.text_cleaners)
    text = text + [0]
    text = np.stack([np.array(text)])
    text = torch.from_numpy(text).long().to(device)

    pos = torch.stack([torch.Tensor([i + 1 for i in range(text.size(1))])])
    pos = pos.long().to(device)
    # Decoder positions: the output length is estimated as roughly
    # 5.8 mel frames per input token.
    dec_pos = torch.stack(
        [torch.Tensor([i + 1 for i in range(int(5.8 * text.size(1)))])])
    dec_pos = dec_pos.long().to(device)

    model.eval()
    with torch.no_grad():
        mel, mel_postnet = model(text, pos, dec_pos)

    mel = mel[0].cpu().numpy().T
    mel_postnet = mel_postnet[0].cpu().numpy().T
    plot_data([mel, mel_postnet])

    wav = audio.inv_mel_spectrogram(mel_postnet)
    print("Wav has been synthesized.")

    if not os.path.exists("results"):
        os.mkdir("results")
    audio.save_wav(wav, os.path.join("results", text_seq + ".wav"))
def test():
    wavs_path = os.path.join("data", "LJSpeech-1.1")
    wavs_path = os.path.join(wavs_path, "wavs")
    wav_path = os.path.join(wavs_path, "LJ001-0001.wav")

    wav = audio.load_wav(wav_path)
    mel_spec = audio.melspectrogram(wav)
    wav_after_inv = audio.inv_mel_spectrogram(mel_spec)
    audio.save_wav(wav_after_inv, "test.wav")
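# For reference, `audio.inv_mel_spectrogram` in repos like these typically
# projects the (log-)mel spectrogram back to a linear-frequency magnitude
# spectrogram and runs Griffin-Lim. A minimal librosa sketch under that
# assumption; the hyperparameter values below are illustrative, not taken
# from the source:
import numpy as np
import librosa

def inv_mel_spectrogram_sketch(mel_db, sr=22050, n_fft=1024,
                               hop_length=256, win_length=1024, n_iter=60):
    # Undo dB compression back to a mel power spectrogram.
    mel_power = librosa.db_to_power(mel_db)
    # Least-squares projection from mel bins to linear STFT bins.
    linear = librosa.feature.inverse.mel_to_stft(mel_power, sr=sr, n_fft=n_fft)
    # Griffin-Lim phase reconstruction.
    return librosa.griffinlim(linear, n_iter=n_iter,
                              hop_length=hop_length, win_length=win_length)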
def inference(args):
    hparams = create_hparams()
    sentences = get_sentences(args)
    # sentences = [sentences[i: i + hparams.tacotron_synthesis_batch_size]
    #              for i in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)]

    model = load_model(hparams)
    model.load_state_dict(torch.load(args.checkpoint)['state_dict'])
    model.cuda().eval()  # .half()

    test_set = TextMelLoaderEval(sentences, hparams)
    test_collate_fn = TextMelCollateEval(hparams)
    test_sampler = DistributedSampler(
        test_set) if hparams.distributed_run else None
    test_loader = DataLoader(test_set,
                             num_workers=0,
                             sampler=test_sampler,
                             batch_size=hparams.synth_batch_size,
                             pin_memory=False,
                             drop_last=True,
                             collate_fn=test_collate_fn)

    # taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
    #                          hparams.win_length, sampling_rate=hparams.sampling_rate)
    T2_output_range = ((-hparams.max_abs_value, hparams.max_abs_value)
                       if hparams.symmetric_mels else (0, hparams.max_abs_value))

    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(batch)

            mels = mel_outputs_postnet[0].cpu().numpy()
            # Clip to the training output range before saving.
            mels = np.clip(mels, T2_output_range[0], T2_output_range[1])
            mel_path = os.path.join(args.out_filename,
                                    'sentence_{}_mel.npy'.format(i))
            np.save(mel_path, mels, allow_pickle=False)

            audio_path = os.path.join(args.out_filename,
                                      'sentence_{}.wav'.format(i))
            wav = audio.inv_mel_spectrogram(mels, hparams)
            audio.save_wav(wav, audio_path, sr=hparams.sampling_rate)
def synthesis_griffin_lim(text_seq, model, alpha=1.0, mode="", num=100, check=True): text_seq = text_seq[:-1] text = text_to_sequence(text_seq, hp.hparams.text_cleaners) text = text + [0] text = np.stack([np.array(text)]) text = torch.from_numpy(text).long().to(device) sequence = np.array(text_to_sequence(text_seq, hp.hparams.text_cleaners))[None, 1] pos = torch.stack([torch.Tensor([i + 1 for i in range(text.size(1))])]) pos = pos.long().to(device) model.eval() with torch.no_grad(): mel, mel_postnet = model(text, pos, alpha=alpha) if not os.path.exists("results_kor_0730_nam_95000"): os.mkdir("results_kor_0730_nam_95000") new_name = text_seq.replace(" ", "_") new_name = new_name.replace("?", "_") new_name = new_name[:-1] new_name2 = new_name + str(num) + mode + ".wav" new_name3 = "results_kor_0730_nam_95000/" + new_name2 mel = mel[0].cpu().numpy().T mel_postnet = mel_postnet[0].cpu().numpy().T plot_data([mel, mel_postnet], file_name=new_name) start = int(round(time.time() * 1000)) wav = audio.inv_mel_spectrogram(mel_postnet) end = int(round(time.time() * 1000)) audio.save_wav(wav, os.path.join("results_kor_0730_nam_95000", new_name2)) clean_text = new_name.replace("_", " ") if check: x, _, _, y, _, _ = WERCER([new_name3], [str(clean_text)]) else: x = 0 y = 0 print("Total time : ", end - start) print() return new_name, x, y
# (Snippet truncated: the call producing `embeddings` and `tokens` — a BERT
# forward pass — begins above this fragment and ends with these arguments.)
#     ..., model_bert, tokenizer, return_token=True)

# Strip the [CLS]/[SEP] positions from the BERT outputs.
embeddings = embeddings[1:(embeddings.size(0) - 1)]
tokens = tokens[1:(len(tokens) - 1)]
characters, sep_list = gen_text_sep(tokens)

embeddings = [embeddings]
characters = np.stack([characters])
characters = torch.from_numpy(characters).long().to(device)
# mel_input = np.zeros([1, hparams.num_mels, 1], dtype=np.float32)
# mel_input = torch.Tensor(mel_input).to(device)
sep_list = [sep_list]

with torch.no_grad():
    output = model(characters, embeddings, sep_list)
mel_output = output[0][1]
mel_output = mel_output.cpu().numpy()[0].T

wav = audio.inv_mel_spectrogram(mel_output)
# wav = wav[:audio.find_endpoint(wav)]
audio.save_wav(wav, "result.wav")
def synthesis_griffin_lim(text_seq, model, alpha=1.0, mode="", num=100, check=True, cute=False): text = text_to_sequence(text_seq, hp.hparams.text_cleaners) text = text + [0] text = np.stack([np.array(text)]) text = torch.from_numpy(text).cuda().to(device) pos = torch.stack([torch.Tensor([i + 1 for i in range(text.size(1))])]) pos = pos.long().to(device) model.eval() #mel = generate_mels(model, text, pos, 1, 0) with torch.no_grad(): mel, mel_postnet = model(text, pos, alpha=alpha) if not os.path.exists("results_kor_0730_indiv"): os.mkdir("results_kor_0730_indiv") new_name = text_seq.replace(" ", "_") new_name = new_name.replace("?", "_") if (cute): new_name2 = new_name + "_cute" new_name2 = new_name + str(num) + ".wav" new_name3 = "results_kor_0730_indiv/" + new_name2 if (cute): #high-pitched sound mel_postnet = mel_postnet[0].cpu().numpy().T else: #print('mel', mel.max(), mel.mean(), mel.min()) #print('mel.shape' , mel_postnet.shape) mel_postnet = mel_postnet.data.cpu().numpy()[0].T mel_postnet = mel_postnet[:, :-1] mel_postnet = np.append(mel_postnet, np.ones((80, 0), dtype=np.float32) * -4.0, axis=1) #print(mel.shape) mel = mel[0].cpu().numpy().T #print('mel_postnet', mel_postnet.max(), mel_postnet.mean(), mel_postnet.min()) plot_data([mel, mel_postnet], file_name=new_name) mels = [] mels.append(mel_postnet) if (cute): wav = audio.inv_mel_spectrogram(mel_postnet) else: stft = audio.taco_stft() wav = mels_to_wavs_GL(mels, stft) audio.save_wav( wav, os.path.join("results_kor_0730_indiv", new_name + str(num) + ".wav")) clean_text = new_name.replace("_", " ") if check: x, _, _, y, _, _ = WERCER([new_name3], [str(clean_text)]) else: x = 0 y = 0 return new_name, x, y
# Test
parser = argparse.ArgumentParser()
parser.add_argument('--step', type=int, default=0)
parser.add_argument("--mode", type=int, default=0)
parser.add_argument("--alpha", type=float, default=1.0)
args = parser.parse_args()

if args.mode == 0:
    print("use griffin lim")
    model = get_DNN(args.step)
    data_list, _, _, _ = get_data()
    for i, phn in enumerate(data_list):
        mel, _ = synthesis(model, phn, args.alpha)
        if not os.path.exists("results"):
            os.mkdir("results")
        wav_mel = audio.inv_mel_spectrogram(mel)
        audio.save_wav(
            wav_mel, "results/" + str(args.step) + "_" + str(args.mode) +
            "_" + str(i) + "_mel.wav")
elif args.mode == 1:
    print("use griffin lim + multiband wavernn")
    model = get_DNN(args.step)
    data_list, mel_list, wav_list, durations = get_data()
    for i, phn in enumerate(data_list):
        mel, _ = synthesis(model, phn,
                           ref_mel=mel_list[i],
                           duration=durations[i],
                           alpha=args.alpha)
        if not os.path.exists("results"):
            os.mkdir("results")
def wav_from_mel(self, S):
    # `S` arrives as (frames, n_mels); inv_mel_spectrogram expects (n_mels, frames).
    S = np.transpose(S)
    return audio.inv_mel_spectrogram(S, self.hparams)
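# Hypothetical usage of the wrapper above; `synthesizer` (an instance of the
# owning class) and the (frames, n_mels) storage convention are assumptions,
# not from the source:
mel = np.load("sentence_0_mel.npy").T   # (n_mels, frames) -> (frames, n_mels)
wav = synthesizer.wav_from_mel(mel)
audio.save_wav(wav, "from_mel.wav", sr=synthesizer.hparams.sampling_rate)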
if __name__ == "__main__": # Test model_tacotron2 = network.Tacotron2(hp).to(device) checkpoint_path = "checkpoint_148200.pth.tar" checkpoint_path = os.path.join("batch_big", checkpoint_path) model_tacotron2.load_state_dict( torch.load(os.path.join(hp.checkpoint_path, checkpoint_path))['model']) model_tacotron2.eval() Speaker_Encoder = SpeakerEncoder.get_model().to(device) test_wav = audio.load_wav("test.wav") mel_spec = audio.melspectrogram(test_wav) # print(np.shape(mel_spec)) mel_spec = np.transpose(mel_spec)[0:180] # print(np.shape(mel_spec)) mel_spec = torch.from_numpy(mel_spec).float().to(device) mel_spec = torch.stack([mel_spec]) # print(mel_spec.size()) test_text = "What can you do?" test_text = text.text_to_sequence(test_text, hp.text_cleaners) test_text = torch.Tensor(test_text).long().to(device) test_text = torch.stack([test_text]) # print(test_text.size()) output = test(model_tacotron2, Speaker_Encoder, test_text, mel_spec) output = output.cpu().numpy() wav_results = audio.inv_mel_spectrogram(output) audio.save_wav(wav_results, "result.wav")