Example 1
def synthesis(text, num):
    m = Model()
    # m_post = ModelPostNet()

    m.load_state_dict(load_checkpoint(num, "transformer"))
    # m_post.load_state_dict(load_checkpoint(args.restore_step2, "postnet"))

    text = np.asarray(text_to_sequence(text, [hp.cleaners]))
    text = t.LongTensor(text).unsqueeze(0)
    text = text.cuda()
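    # Seed the autoregressive decoder with a single all-zero mel frame.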
    mel_input = t.zeros([1, 1, 80]).cuda()
    pos_text = t.arange(1, text.size(1) + 1).unsqueeze(0)
    pos_text = pos_text.cuda()

    m = m.cuda()
    # m_post = m_post.cuda()
    m.train(False)
    # m_post.train(False)

    # pbar = tqdm(range(args.max_len))
    with t.no_grad():
        for _ in range(1000):
            pos_mel = t.arange(1, mel_input.size(1) + 1).unsqueeze(0).cuda()
            mel_pred, postnet_pred, attn, stop_token, _, attn_dec = m(
                text, mel_input, pos_text, pos_mel)
            mel_input = t.cat([mel_input, postnet_pred[:, -1:, :]], dim=1)

        # mag_pred = m_post.forward(postnet_pred)

    # wav = spectrogram2wav(mag_pred.squeeze(0).cpu().numpy())
    mel_postnet = postnet_pred[0].cpu().numpy().T
    plot_data([mel_postnet for _ in range(2)])
    wav = audio.inv_mel_spectrogram(mel_postnet)
    wav = wav[0:audio.find_endpoint(wav)]
    audio.save_wav(wav, "result.wav")
Example 2
def synthesis_griffin_lim(text_seq, model, alpha=1.0, mode="", num=100):
    text = text_to_sequence(text_seq, hp.hparams.text_cleaners)
    text = text + [0]
    text = np.stack([np.array(text)])
    text = torch.from_numpy(text).long().to(device)

    pos = torch.stack([torch.Tensor([i + 1 for i in range(text.size(1))])])
    pos = pos.long().to(device)

    start = int(round(time.time() * 1000))

    model.eval()
    with torch.no_grad():
        mel, mel_postnet = model(text, pos, alpha=alpha)

    end = int(round(time.time() * 1000))
    tt = end - start
    print("Total - making mel : %d ms\n" % tt)

    mel = mel[0].cpu().numpy().T
    mel_postnet = mel_postnet[0].cpu().numpy().T
    #plot_data([mel, mel_postnet])

    wav = audio.inv_mel_spectrogram(mel_postnet)
    print("Wav Have Been Synthesized.\n")

    if not os.path.exists("results"):
        os.mkdir("results")
    new_name = text_seq.replace(" ", "_")
    audio.save_wav(
        wav, os.path.join("results", new_name + str(num) + mode + ".wav"))
    return new_name
Example 3
def get_tacotron2_alignment_test(text_seq):
    hparams = hp_tacotron2.create_hparams()
    hparams.sampling_rate = hp.sample_rate

    checkpoint_path = os.path.join(
        "Tacotron2", os.path.join("pre_trained_model",
                                  "tacotron2_statedict.pt"))

    tacotron2 = train_tacotron2.load_model(hparams)
    tacotron2.load_state_dict(torch.load(checkpoint_path)["state_dict"])
    _ = tacotron2.cuda().eval().half()

    sequence = np.array(text_to_sequence(text_seq, hp.text_cleaners))[None, :]
    print("sequence size", np.shape(sequence))

    # torch.autograd.Variable is a deprecated no-op wrapper in modern PyTorch.
    sequence = torch.from_numpy(sequence).cuda().long()

    mel, mel_postnet, _, alignment = tacotron2.inference(sequence)

    plot_data((mel.float().data.cpu().numpy()[0],
               mel_postnet.float().data.cpu().numpy()[0],
               alignment.float().data.cpu().numpy()[0].T))

    wav = audio.inv_mel_spectrogram(mel_postnet.float().data.cpu().numpy()[0])
    audio.save_wav(wav, "test.wav")

    alignment = alignment.float().data.cpu().numpy()[0]
    print("alignment size", np.shape(alignment))

    get_D(alignment)

    return alignment
Example 4
def synthesis_griffin_lim(text_seq, model):
    text = text_to_sequence(text_seq, hp.text_cleaners)
    text = text + [0]
    text = np.stack([np.array(text)])
    text = torch.from_numpy(text).long().to(device)

    pos = torch.stack([torch.Tensor([i + 1 for i in range(text.size(1))])])
    pos = pos.long().to(device)

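    # The decoder length is fixed heuristically at ~5.8 mel frames per input token.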
    dec_pos = torch.stack(
        [torch.Tensor([i + 1 for i in range(int(5.8 * text.size(1)))])])
    dec_pos = dec_pos.long().to(device)

    model.eval()
    with torch.no_grad():
        mel, mel_postnet = model(text, pos, dec_pos)

    mel = mel[0].cpu().numpy().T
    mel_postnet = mel_postnet[0].cpu().numpy().T
    plot_data([mel, mel_postnet])

    wav = audio.inv_mel_spectrogram(mel_postnet)
    print("Wav Have Been Synthesized.")

    if not os.path.exists("results"):
        os.mkdir("results")
    audio.save_wav(wav, os.path.join("results", text_seq + ".wav"))
Example 5
def test():
    wavs_path = os.path.join("data", "LJSpeech-1.1")
    wavs_path = os.path.join(wavs_path, "wavs")
    wav_path = os.path.join(wavs_path, "LJ001-0001.wav")
    wav = audio.load_wav(wav_path)
    mel_spec = audio.melspectrogram(wav)
    wav_after_inv = audio.inv_mel_spectrogram(mel_spec)
    audio.save_wav(wav_after_inv, "test.wav")
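For context, a minimal sketch of the same round trip using librosa's built-in Griffin-Lim mel inversion. The STFT parameters below are illustrative assumptions, not the hparams these repositories actually use.

import librosa
import soundfile as sf

sr = 22050
wav, _ = librosa.load("LJ001-0001.wav", sr=sr)

# Waveform -> mel spectrogram, shape [n_mels, frames].
mel = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=1024,
                                     hop_length=256, n_mels=80)

# Griffin-Lim phase reconstruction back to a waveform.
wav_rec = librosa.feature.inverse.mel_to_audio(mel, sr=sr, n_fft=1024,
                                               hop_length=256, n_iter=60)

sf.write("roundtrip.wav", wav_rec, sr)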
Example 6
def inference(args):
    hparams = create_hparams()

    sentences = get_sentences(args)
    # sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)]

    model = load_model(hparams)
    model.load_state_dict(torch.load(args.checkpoint)['state_dict'])
    model.cuda().eval()  #.half()

    test_set = TextMelLoaderEval(sentences, hparams)
    test_collate_fn = TextMelCollateEval(hparams)
    test_sampler = DistributedSampler(
        test_set) if hparams.distributed_run else None
    test_loader = DataLoader(test_set,
                             num_workers=0,
                             sampler=test_sampler,
                             batch_size=hparams.synth_batch_size,
                             pin_memory=False,
                             drop_last=True,
                             collate_fn=test_collate_fn)

    # taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length, hparams.win_length, sampling_rate=hparams.sampling_rate)
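    # Value range used below to clip predicted mels to what the model saw in training.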
    T2_output_range = (-hparams.max_abs_value,
                       hparams.max_abs_value) if hparams.symmetric_mels else (
                           0, hparams.max_abs_value)

    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            print("CHECK batch", i, batch)
            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
                batch)
            print('synthesize!!!', mel_outputs)
            print('synthesize!!!', mel_outputs_postnet)

            mels = mel_outputs_postnet[0].cpu().numpy()

            print('CHECK MEL SHAPE:', mels.shape)

            mel_path = os.path.join(args.out_filename,
                                    'sentence_{}_mel.npy'.format(i))
            # mels = [mel for gpu_mels in mels for mel in mel_outputs]
            mels = np.clip(mels, T2_output_range[0], T2_output_range[1])
            np.save(mel_path, mels, allow_pickle=False)

            print('CHECK MEL SHAPE:', mels.shape)

            audio_path = os.path.join(args.out_filename,
                                      'sentence_{}.wav'.format(i))
            wav = audio.inv_mel_spectrogram(mels, hparams)
            audio.save_wav(wav, audio_path, sr=hparams.sampling_rate)
Example 7
def synthesis_griffin_lim(text_seq,
                          model,
                          alpha=1.0,
                          mode="",
                          num=100,
                          check=True):
    text_seq = text_seq[:-1]
    text = text_to_sequence(text_seq, hp.hparams.text_cleaners)
    text = text + [0]
    text = np.stack([np.array(text)])
    text = torch.from_numpy(text).long().to(device)

    pos = torch.stack([torch.Tensor([i + 1 for i in range(text.size(1))])])
    pos = pos.long().to(device)

    model.eval()

    with torch.no_grad():
        mel, mel_postnet = model(text, pos, alpha=alpha)

    if not os.path.exists("results_kor_0730_nam_95000"):
        os.mkdir("results_kor_0730_nam_95000")
    new_name = text_seq.replace(" ", "_")
    new_name = new_name.replace("?", "_")

    new_name = new_name[:-1]
    new_name2 = new_name + str(num) + mode + ".wav"
    new_name3 = "results_kor_0730_nam_95000/" + new_name2

    mel = mel[0].cpu().numpy().T
    mel_postnet = mel_postnet[0].cpu().numpy().T
    plot_data([mel, mel_postnet], file_name=new_name)

    start = int(round(time.time() * 1000))
    wav = audio.inv_mel_spectrogram(mel_postnet)
    end = int(round(time.time() * 1000))
    audio.save_wav(wav, os.path.join("results_kor_0730_nam_95000", new_name2))
    clean_text = new_name.replace("_", " ")
    if check:
        x, _, _, y, _, _ = WERCER([new_name3], [str(clean_text)])
    else:
        x = 0
        y = 0
    print("Total time : ", end - start)
    print()
    return new_name, x, y
Example 8
                                            model_bert,
                                            tokenizer,
                                            return_token=True)
    # print(tokens)

    embeddings = embeddings[1:(embeddings.size(0) - 1)]
    tokens = tokens[1:(len(tokens) - 1)]
    characters, sep_list = gen_text_sep(tokens)
    # print(np.shape(characters))
    # print(sep_list)

    embeddings = [embeddings]
    characters = np.stack([characters])
    characters = torch.from_numpy(characters).long().to(device)
    # mel_input = np.zeros([1, hparams.num_mels, 1], dtype=np.float32)
    # mel_input = torch.Tensor(mel_input).to(device)
    sep_list = [sep_list]

    with torch.no_grad():
        output = model(characters, embeddings, sep_list)
        mel_output = output[0][1]
        # print(mel_output.size())
        mel_output = mel_output.cpu().numpy()[0].T
        # print(np.shape(mel_output))
    wav = audio.inv_mel_spectrogram(mel_output)
    # print(np.shape(linear_spec))
    # print(np.shape(wav))
    # wav = wav[:audio.find_endpoint(wav)]
    # print(np.shape(wav))
    audio.save_wav(wav, "result.wav")
Example 9
def synthesis_griffin_lim(text_seq,
                          model,
                          alpha=1.0,
                          mode="",
                          num=100,
                          check=True,
                          cute=False):
    text = text_to_sequence(text_seq, hp.hparams.text_cleaners)
    text = text + [0]
    text = np.stack([np.array(text)])
    text = torch.from_numpy(text).long().to(device)

    pos = torch.stack([torch.Tensor([i + 1 for i in range(text.size(1))])])
    pos = pos.long().to(device)

    model.eval()

    #mel = generate_mels(model, text, pos, 1, 0)

    with torch.no_grad():
        mel, mel_postnet = model(text, pos, alpha=alpha)

    if not os.path.exists("results_kor_0730_indiv"):
        os.mkdir("results_kor_0730_indiv")
    new_name = text_seq.replace(" ", "_")
    new_name = new_name.replace("?", "_")
    if cute:
        new_name = new_name + "_cute"
    new_name2 = new_name + str(num) + ".wav"
    new_name3 = "results_kor_0730_indiv/" + new_name2

    if cute:
        #high-pitched sound
        mel_postnet = mel_postnet[0].cpu().numpy().T
    else:
        #print('mel', mel.max(), mel.mean(), mel.min())
        #print('mel.shape' , mel_postnet.shape)
        mel_postnet = mel_postnet.data.cpu().numpy()[0].T
        mel_postnet = mel_postnet[:, :-1]
        # Replace the trimmed last frame with one frame at the assumed mel
        # floor of -4.0 (the original appended a zero-width array, a no-op).
        mel_postnet = np.append(mel_postnet,
                                np.ones((80, 1), dtype=np.float32) * -4.0,
                                axis=1)
        #print(mel.shape)

    mel = mel[0].cpu().numpy().T
    #print('mel_postnet', mel_postnet.max(), mel_postnet.mean(), mel_postnet.min())
    plot_data([mel, mel_postnet], file_name=new_name)
    mels = []
    mels.append(mel_postnet)

    if cute:
        wav = audio.inv_mel_spectrogram(mel_postnet)
    else:
        stft = audio.taco_stft()
        wav = mels_to_wavs_GL(mels, stft)

    audio.save_wav(wav, os.path.join("results_kor_0730_indiv", new_name2))
    clean_text = new_name.replace("_", " ")

    if check:
        x, _, _, y, _, _ = WERCER([new_name3], [str(clean_text)])
    else:
        x = 0
        y = 0

    return new_name, x, y
Example 10
    # Test
    parser = argparse.ArgumentParser()
    parser.add_argument('--step', type=int, default=0)
    parser.add_argument("--mode", type=int, default=0)
    parser.add_argument("--alpha", type=float, default=1.0)
    args = parser.parse_args()

    if args.mode == 0:
        print("use griffin lim")
        model = get_DNN(args.step)
        data_list, _, _, _ = get_data()
        for i, phn in enumerate(data_list):
            mel, _ = synthesis(model, phn, args.alpha)
            if not os.path.exists("results"):
                os.mkdir("results")
            wav_mel = audio.inv_mel_spectrogram(mel)
            audio.save_wav(
                wav_mel, "results/" + str(args.step) + "_" + str(args.mode) +
                "_" + str(i) + "_mel.wav")
    elif args.mode == 1:
        print("use griffin lim + multiband wavernn")
        model = get_DNN(args.step)
        data_list, mel_list, wav_list, durations = get_data()
        for i, phn in enumerate(data_list):
            mel, _ = synthesis(model,
                               phn,
                               ref_mel=mel_list[i],
                               duration=durations[i],
                               alpha=args.alpha)
            if not os.path.exists("results"):
                os.mkdir("results")
Example 11
    def wav_from_mel(self, S):
        S = np.transpose(S)
        return audio.inv_mel_spectrogram(S, self.hparams)
Example 12
if __name__ == "__main__":
    # Test
    model_tacotron2 = network.Tacotron2(hp).to(device)
    checkpoint_path = "checkpoint_148200.pth.tar"
    checkpoint_path = os.path.join("batch_big", checkpoint_path)
    model_tacotron2.load_state_dict(
        torch.load(os.path.join(hp.checkpoint_path, checkpoint_path))['model'])
    model_tacotron2.eval()
    Speaker_Encoder = SpeakerEncoder.get_model().to(device)

    test_wav = audio.load_wav("test.wav")
    mel_spec = audio.melspectrogram(test_wav)
    # print(np.shape(mel_spec))
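    # Keep at most the first 180 mel frames, presumably as the speaker reference.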
    mel_spec = np.transpose(mel_spec)[0:180]
    # print(np.shape(mel_spec))
    mel_spec = torch.from_numpy(mel_spec).float().to(device)
    mel_spec = torch.stack([mel_spec])
    # print(mel_spec.size())

    test_text = "What can you do?"
    test_text = text.text_to_sequence(test_text, hp.text_cleaners)
    test_text = torch.Tensor(test_text).long().to(device)
    test_text = torch.stack([test_text])
    # print(test_text.size())

    output = test(model_tacotron2, Speaker_Encoder, test_text, mel_spec)

    output = output.cpu().numpy()
    wav_results = audio.inv_mel_spectrogram(output)
    audio.save_wav(wav_results, "result.wav")