Example #1
def synthesis_griffin_lim(text_seq, model, alpha=1.0, mode="", num=100):
    text = text_to_sequence(text_seq, hp.hparams.text_cleaners)
    text = text + [0]
    text = np.stack([np.array(text)])
    text = torch.from_numpy(text).long().to(device)

    pos = torch.stack([torch.Tensor([i + 1 for i in range(text.size(1))])])
    pos = pos.long().to(device)

    start = int(round(time.time() * 1000))

    model.eval()
    with torch.no_grad():
        mel, mel_postnet = model(text, pos, alpha=alpha)

    end = int(round(time.time() * 1000))
    tt = end - start
    print("Total - making mel : %d ms\n" % tt)

    mel = mel[0].cpu().numpy().T
    mel_postnet = mel_postnet[0].cpu().numpy().T
    #plot_data([mel, mel_postnet])

    wav = audio.inv_mel_spectrogram(mel_postnet)
    print("Wav Have Been Synthesized.\n")

    if not os.path.exists("results"):
        os.mkdir("results")
    new_name = text_seq.replace(" ", "_")
    audio.save_wav(
        wav, os.path.join("results", new_name + str(num) + mode + ".wav"))
    return new_name
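The `audio.inv_mel_spectrogram` and `audio.save_wav` helpers used throughout these examples come from each project's own `audio` module, which is not shown on this page. A minimal sketch of what such helpers typically do, assuming a log-scaled (dB) mel input and librosa >= 0.7; all names and parameters below are assumptions, not the original implementation:

def inv_mel_spectrogram_sketch(mel_db, sr=22050, n_fft=1024, hop_length=256, n_iter=60):
    """Invert an (n_mels, T) log-mel spectrogram to a waveform via Griffin-Lim."""
    import librosa
    # Undo the assumed dB scaling, then map mel bins back to a linear STFT magnitude.
    mel_power = librosa.db_to_power(mel_db)
    stft_mag = librosa.feature.inverse.mel_to_stft(mel_power, sr=sr, n_fft=n_fft)
    # Griffin-Lim iteratively estimates phase from the magnitude spectrogram.
    return librosa.griffinlim(stft_mag, n_iter=n_iter, hop_length=hop_length)

def save_wav_sketch(wav, path, sr=22050):
    """Peak-normalize the waveform and write it to disk."""
    import numpy as np
    import soundfile as sf
    sf.write(path, wav / max(1e-8, float(np.max(np.abs(wav)))), sr)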
Example #2
  def synthesize(self, text, speaker_id=0):
    """Convert text to speech waveform given a DeepVoice3 model.

    Args:
        text (str): Input text to be synthesized.
        speaker_id (int): Speaker ID for multi-speaker models. Default is 0.
    """
    sequence = np.array(self._frontend.text_to_sequence(text))
    sequence = Variable(torch.from_numpy(sequence)).unsqueeze(0)
    text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long()
    text_positions = Variable(text_positions)
    speaker_ids = None if speaker_id is None else Variable(torch.LongTensor([speaker_id]))
    if self.use_cuda:
        sequence = sequence.cuda()
        text_positions = text_positions.cuda()
        speaker_ids = None if speaker_ids is None else speaker_ids.cuda()

    # Greedy decoding
    mel_outputs, linear_outputs, alignments, done = self.model(
        sequence, text_positions=text_positions, speaker_ids=speaker_ids)

    linear_output = linear_outputs[0].cpu().data.numpy()

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)
    out = io.BytesIO()
    audio.save_wav(waveform, out)
    return out
Example #3
def eval_model(global_step, writer, model, checkpoint_dir, ismultispeaker):
    # hard-coded
    texts = [
        "Scientists at the CERN laboratory say they have discovered a new particle.",
        "There's a way to measure the acute emotional intelligence that has never gone out of style.",
        "President Trump met with other leaders at the Group of 20 conference.",
        "Generative adversarial network or variational auto-encoder.",
        "Please call Stella.",
        "Some have accepted this as a miracle without any physical explanation.",
    ]
    import synthesis
    synthesis._frontend = _frontend

    eval_output_dir = join(checkpoint_dir, "eval")
    os.makedirs(eval_output_dir, exist_ok=True)

    # hard coded
    speaker_ids = [0, 1, 10] if ismultispeaker else [None]
    for speaker_id in speaker_ids:
        speaker_str = "multispeaker{}".format(
            speaker_id) if speaker_id is not None else "single"

        for idx, text in enumerate(texts):
            signal, alignment, _, mel = synthesis.tts(model,
                                                      text,
                                                      p=0,
                                                      speaker_id=speaker_id,
                                                      fast=False)
            signal /= np.max(np.abs(signal))

            # Alignment
            path = join(
                eval_output_dir, "step{:09d}_text{}_{}_alignment.png".format(
                    global_step, idx, speaker_str))
            save_alignment(path, alignment)
            tag = "eval_averaged_alignment_{}_{}".format(idx, speaker_str)
            writer.add_image(
                tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),
                global_step)

            # Mel
            writer.add_image(
                "(Eval) Predicted mel spectrogram text{}_{}".format(
                    idx, speaker_str), prepare_spec_image(mel), global_step)

            # Audio
            path = join(
                eval_output_dir, "step{:09d}_text{}_{}_predicted.wav".format(
                    global_step, idx, speaker_str))
            audio.save_wav(signal, path)

            try:
                writer.add_audio("(Eval) Predicted audio signal {}_{}".format(
                    idx, speaker_str),
                                 signal,
                                 global_step,
                                 sample_rate=fs)
            except Exception as e:
                warn(str(e))
                pass
Example #4
def get_tacotron2_alignment_test(text_seq):
    hparams = hp_tacotron2.create_hparams()
    hparams.sampling_rate = hp.sample_rate

    checkpoint_path = os.path.join(
        "Tacotron2", os.path.join("pre_trained_model",
                                  "tacotron2_statedict.pt"))

    tacotron2 = train_tacotron2.load_model(hparams)
    tacotron2.load_state_dict(torch.load(checkpoint_path)["state_dict"])
    _ = tacotron2.cuda().eval().half()

    sequence = np.array(text_to_sequence(text_seq, hp.text_cleaners))[None, :]
    print("sequence size", np.shape(sequence))

    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    mel, mel_postnet, _, alignment = tacotron2.inference(sequence)

    plot_data((mel.float().data.cpu().numpy()[0],
               mel_postnet.float().data.cpu().numpy()[0],
               alignment.float().data.cpu().numpy()[0].T))

    wav = audio.inv_mel_spectrogram(mel_postnet.float().data.cpu().numpy()[0])
    audio.save_wav(wav, "test.wav")

    alignment = alignment.float().data.cpu().numpy()[0]
    print("alignment size", np.shape(alignment))

    get_D(alignment)

    return alignment
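The alignment returned above is often reduced to per-phoneme durations (the `get_D` call) for FastSpeech-style training. A minimal sketch of that reduction, assuming the alignment has shape (decoder_steps, encoder_steps); this is an assumption, not the original helper:

import numpy as np

def get_D_sketch(alignment):
    # For each decoder frame, find the encoder step it attends to most strongly,
    # then count how many frames each encoder step received.
    best_encoder_step = np.argmax(alignment, axis=1)
    durations = np.zeros(alignment.shape[1], dtype=np.int64)
    for idx in best_encoder_step:
        durations[idx] += 1
    return durations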
Example #5
 def synth(self, text, save=None):
     inp = clean(text)
     print(inp)
     x = [self.c2i[c] for c in inp + 'E']
     x += [0] * (hp.maxlen - len(x))
     x = np.array(x)
     x = x.reshape(1, -1)
     with self.melsession.as_default():
         preds = np.zeros((1, 1, hp.n_mels), np.float32)
         cnt = hp.Tyr
         for j in range(hp.Tyr):
             sys.stdout.write('\rProcessing %d' % j)
             sys.stdout.flush()
             _preds, a = self.melsession.run(
                 [self.melmodel.mel_output, self.melmodel.A], {
                     self.melmodel.text: x,
                     self.melmodel.mel: preds
                 })
             preds = np.concatenate((np.zeros((1, 1, hp.n_mels)), _preds),
                                    axis=1)
             cnt -= 1
             if np.argmax(a[0, :, -1]) >= len(inp) - 3:
                 cnt = min(cnt, 10)
             if cnt <= 0:
                 break
     with self.magsession.as_default():
         wav = self.magsession.run(self.magmodel.wav_output,
                                   {self.magmodel.mel: preds})
         wav = audio.inv_preemphasis(wav)
         if save is not None:
             audio.save_wav(wav[0], save)
         else:
             out = io.BytesIO()
             audio.save_wav(wav[0], out)
             return out.getvalue()
Example #6
    def make_prediction(self, x, y, epoch):
        _, target_mel, target_linear, _ = y
        _, mel, linear, _, alignment = self.tacotron._predict_with_target(*x)
        mel = mel[0]
        linear = linear[0]
        alignment = alignment[0]
            
        step = (epoch + 1) * self.steps_per_epoch
        mel_filename    = os.path.join(self.mel_dir, "step-{}.npy".format(step))
        linear_filename = os.path.join(self.linear_dir, "step-{}.npy".format(step))
        plot_mel_filename = os.path.join(self.plot_dir, "mel_step-{}.png".format(step))
        plot_linear_filename = os.path.join(self.plot_dir, "linear_step-{}.png".format(step))
        plot_align_filename = os.path.join(self.plot_dir, "align_step-{}.png".format(step))
        wav_mel_filename = os.path.join(self.wav_dir, "wav_from_mel_step-{}.wav".format(step))
        wav_linear_filename = os.path.join(self.wav_dir, "wav_from_linear_step-{}.wav".format(step))

        wav_from_linear = self.tacotron.wav_from_linear(linear)
        wav_from_mel = self.tacotron.wav_from_mel(mel)
            
        np.save(mel_filename, mel)
        np.save(linear_filename, linear)

        audio.save_wav(wav_from_mel, wav_mel_filename, self.tacotron.audio_rate)
        audio.save_wav(wav_from_linear, wav_linear_filename, self.tacotron.audio_rate)

        plot_alignment(alignment, title="Alignments", filename=plot_align_filename, show=False, fontsize=14)
        plot_spectrogram(mel, title="Mel spectrogram", filename=plot_mel_filename, show=False, target_spectrogram=target_mel[0])
        plot_spectrogram(linear, title="Linear spectrogram", filename=plot_linear_filename, show=False, target_spectrogram=target_linear[0])
Example #7
def synthesis_griffin_lim(text_seq, model):
    text = text_to_sequence(text_seq, hp.text_cleaners)
    text = text + [0]
    text = np.stack([np.array(text)])
    text = torch.from_numpy(text).long().to(device)

    pos = torch.stack([torch.Tensor([i + 1 for i in range(text.size(1))])])
    pos = pos.long().to(device)

    dec_pos = torch.stack(
        [torch.Tensor([i + 1 for i in range(int(5.8 * text.size(1)))])])
    dec_pos = dec_pos.long().to(device)

    model.eval()
    with torch.no_grad():
        mel, mel_postnet = model(text, pos, dec_pos)

    mel = mel[0].cpu().numpy().T
    mel_postnet = mel_postnet[0].cpu().numpy().T
    plot_data([mel, mel_postnet])

    wav = audio.inv_mel_spectrogram(mel_postnet)
    print("Wav Have Been Synthesized.")

    if not os.path.exists("results"):
        os.mkdir("results")
    audio.save_wav(wav, os.path.join("results", text_seq + ".wav"))
Example #8
def synthesis(text, num):
    m = Model()
    # m_post = ModelPostNet()

    m.load_state_dict(load_checkpoint(num, "transformer"))
    # m_post.load_state_dict(load_checkpoint(args.restore_step2, "postnet"))

    text = np.asarray(text_to_sequence(text, [hp.cleaners]))
    text = t.LongTensor(text).unsqueeze(0)
    text = text.cuda()
    mel_input = t.zeros([1, 1, 80]).cuda()
    pos_text = t.arange(1, text.size(1) + 1).unsqueeze(0)
    pos_text = pos_text.cuda()

    m = m.cuda()
    # m_post = m_post.cuda()
    m.train(False)
    # m_post.train(False)

    # pbar = tqdm(range(args.max_len))
    with t.no_grad():
        for _ in range(1000):
            pos_mel = t.arange(1, mel_input.size(1) + 1).unsqueeze(0).cuda()
            mel_pred, postnet_pred, attn, stop_token, _, attn_dec = m.forward(
                text, mel_input, pos_text, pos_mel)
            mel_input = t.cat([mel_input, postnet_pred[:, -1:, :]], dim=1)

        # mag_pred = m_post.forward(postnet_pred)

    # wav = spectrogram2wav(mag_pred.squeeze(0).cpu().numpy())
    mel_postnet = postnet_pred[0].cpu().numpy().T
    plot_data([mel_postnet for _ in range(2)])
    wav = audio.inv_mel_spectrogram(mel_postnet)
    wav = wav[0:audio.find_endpoint(wav)]
    audio.save_wav(wav, "result.wav")
Example #9
    def save_states(self, global_epoch, mel_outputs, linear_outputs, ling, mel,
                    linear, lengths):
        print("Save intermediate states at epoch {}".format(global_epoch))

        # idx = np.random.randint(0, len(input_lengths))
        idx = min(1, len(lengths) - 1)

        # Predicted mel spectrogram
        if mel_outputs is not None:
            mel_output = mel_outputs[idx].cpu().data.numpy()
            mel_output = prepare_spec_image(audio._denormalize(mel_output))
            self.writer.add_image("Predicted mel spectrogram", mel_output,
                                  global_epoch)
        # Predicted spectrogram
        if linear_outputs is not None:
            linear_output = linear_outputs[idx].cpu().data.numpy()
            spectrogram = prepare_spec_image(audio._denormalize(linear_output))
            self.writer.add_image("Predicted spectrogram", spectrogram,
                                  global_epoch)
            # Predicted audio signal
            signal = audio.inv_spectrogram(linear_output.T)
            signal /= np.max(np.abs(signal))
            path = join(self.checkpoint_dir,
                        "epoch{:09d}_predicted.wav".format(global_epoch))
            try:
                self.writer.add_audio("Predicted audio signal",
                                      signal,
                                      global_epoch,
                                      sample_rate=self.fs)
            except Exception as e:
                warn(str(e))
                pass
            audio.save_wav(signal, path)

        # Target mel spectrogram

        if mel_outputs is not None:
            #ling = ling[idx].cpu().data.numpy()
            #mel = prepare_spec_image(audio._denormalize(mel))
            #self.writer.add_image("Source mel spectrogram", ling, global_epoch)
            mel = mel[idx].cpu().data.numpy()
            mel = prepare_spec_image(audio._denormalize(mel))
            self.writer.add_image("Target mel spectrogram", mel, global_epoch)
        if linear_outputs is not None:
            linear = linear[idx].cpu().data.numpy()
            spectrogram = prepare_spec_image(audio._denormalize(linear))
            self.writer.add_image("Target spectrogram", spectrogram,
                                  global_epoch)
            # Target audio signal
            signal = audio.inv_spectrogram(linear.T)
            signal /= np.max(np.abs(signal))
            try:
                self.writer.add_audio("Target audio signal",
                                      signal,
                                      global_epoch,
                                      sample_rate=self.fs)
            except Exception as e:
                warn(str(e))
                pass
Example #10
def test():
    wavs_path = os.path.join("data", "LJSpeech-1.1")
    wavs_path = os.path.join(wavs_path, "wavs")
    wav_path = os.path.join(wavs_path, "LJ001-0001.wav")
    wav = audio.load_wav(wav_path)
    mel_spec = audio.melspectrogram(wav)
    wav_after_inv = audio.inv_mel_spectrogram(mel_spec)
    audio.save_wav(wav_after_inv, "test.wav")
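For reference, a forward mel computation matching the inversion sketch near the top of this page; parameters are hypothetical and the real `audio.melspectrogram` is project-specific (librosa >= 0.7 assumed):

import librosa

def melspectrogram_sketch(wav, sr=22050, n_fft=1024, hop_length=256, n_mels=80):
    # Power mel spectrogram followed by the assumed dB scaling.
    mel_power = librosa.feature.melspectrogram(
        y=wav, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    return librosa.power_to_db(mel_power)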
Example #11
    def predict(self, liste_phrases, out_dir, min_iter=5, max_iter=100000):
        mel_dir = os.path.join(out_dir, 'mels')
        linear_dir = os.path.join(out_dir, 'linear')
        plot_dir = os.path.join(out_dir, 'plots')
        wav_dir = os.path.join(out_dir, 'wavs')

        os.makedirs(mel_dir, exist_ok=True)
        os.makedirs(linear_dir, exist_ok=True)
        os.makedirs(plot_dir, exist_ok=True)
        os.makedirs(wav_dir, exist_ok=True)

        outputs = []

        for i, phrase in tqdm(enumerate(liste_phrases)):
            mel, linear, alignment = self._predict(phrase,
                                                   min_iter=min_iter,
                                                   max_iter=max_iter)

            mel_filename = os.path.join(mel_dir, "pred_{}.npy".format(i))
            linear_filename = os.path.join(linear_dir, "pred_{}.npy".format(i))
            plot_mel_filename = os.path.join(
                plot_dir, "mel_spectrogram_{}.png".format(i))
            plot_linear_filename = os.path.join(
                plot_dir, "linear_spectrogram_{}.png".format(i))
            plot_align_filename = os.path.join(plot_dir,
                                               "alignments_{}.png".format(i))
            wav_mel_filename = os.path.join(wav_dir,
                                            "wav_from_mel_{}.wav".format(i))
            wav_linear_filename = os.path.join(
                wav_dir, "wav_from_linear_{}.wav".format(i))

            wav_from_linear = self.wav_from_linear(linear)
            wav_from_mel = self.wav_from_mel(mel)

            np.save(mel_filename, mel)
            np.save(linear_filename, linear)

            audio.save_wav(wav_from_mel, wav_mel_filename, self.audio_rate)
            audio.save_wav(wav_from_linear, wav_linear_filename,
                           self.audio_rate)

            plot_alignment(alignment,
                           title="Alignments for :\n{}".format(phrase),
                           filename=plot_align_filename,
                           show=False,
                           fontsize=16)
            plot_spectrogram(mel,
                             title="Mel spectrogram",
                             filename=plot_mel_filename,
                             show=False)
            plot_spectrogram(linear,
                             title="Linear spectrogram",
                             filename=plot_linear_filename,
                             show=False)

            outputs.append((mel, linear, alignment))

        return outputs
Example #12
def tts(model, text, file_path, p=0, speaker_id=None, fast=True):
    from synthesis import tts as _tts
    import audio

    waveform, alignment, spectrogram, mel = _tts(model, text, p, speaker_id,
                                                 fast)

    # 22050, 353 kbps, 16 bit, mono
    audio.save_wav(waveform, file_path)
Example #13
def synthesis_waveglow(mel, waveglow, num, alpha=1.0):
    wav = waveglow.infer(mel, sigma=0.666)
    print("Wav Have Been Synthesized.")

    if not os.path.exists("results"):
        os.mkdir("results")
    audio.save_wav(wav[0].data.cpu().numpy(),
                   os.path.join("results",
                                str(num) + ".wav"))
Example #14
def eval_model(global_step, writer, model, checkpoint_dir, ismultispeaker):

    # hard-coded
    texts = [
        "This is Informatics Institute of Technology evaluation sentence for Text to speeh for sinhala"
    ]

    import synthesis
    synthesis._frontend = _frontend

    eval_output_dir = join(checkpoint_dir, "eval")
    os.makedirs(eval_output_dir, exist_ok=True)

    # hard coded
    speaker_ids = [0, 1, 10] if ismultispeaker else [None]
    for speaker_id in speaker_ids:
        speaker_str = "multispeaker{}".format(
            speaker_id) if speaker_id is not None else "single"

        for idx, text in enumerate(texts):
            signal, alignment, _, mel = synthesis.tts(model,
                                                      text,
                                                      p=0,
                                                      speaker_id=speaker_id,
                                                      fast=False)
            signal /= np.max(np.abs(signal))

            # Alignment
            path = join(
                eval_output_dir, "step{:09d}_text{}_{}_alignment.png".format(
                    global_step, idx, speaker_str))
            save_alignment(path, alignment)
            tag = "eval_averaged_alignment_{}_{}".format(idx, speaker_str)
            writer.add_image(
                tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),
                global_step)

            # Mel
            writer.add_image(
                "(Eval) Predicted mel spectrogram text{}_{}".format(
                    idx, speaker_str), prepare_spec_image(mel), global_step)

            # Audio
            path = join(
                eval_output_dir, "step{:09d}_text{}_{}_predicted.wav".format(
                    global_step, idx, speaker_str))
            audio.save_wav(signal, path)

            try:
                writer.add_audio("(Eval) Predicted audio signal {}_{}".format(
                    idx, speaker_str),
                                 signal,
                                 global_step,
                                 sample_rate=fs)
            except Exception as e:
                warn(str(e))
                pass
Example #15
def eval_model(device, model, global_step, logs_dir, ismultispeaker):
    """Evaluate the model
    """
    import synthesis

    # Hardcoded sentences for evaluation
    texts = [
        "Scientists at the CERN laboratory say they have discovered a new particle.",
        "There's a way to measure the acute emotional intelligence that has never gone out of style.",
        "President Trump met with other leaders at the Group of Twenty conference.",
        "Generative adversarial network or variational auto-encoder.",
        "Please call Stella.",
        "Some have accepted this as a miracle without any physical explanation.",
    ]

    eval_output_dir = join(logs_dir, "eval")
    os.makedirs(eval_output_dir, exist_ok=True)

    eval_alignment_dir = join(eval_output_dir, "alignment")
    os.makedirs(eval_alignment_dir, exist_ok=True)

    eval_wav_dir = join(eval_output_dir, "wavs")
    os.makedirs(eval_wav_dir, exist_ok=True)

    # Prepare model for evaluation
    model_eval = build_model().to(device)
    model_eval.load_state_dict(model.state_dict())

    # hard coded
    speaker_ids = [0, 1, cfg.n_speakers - 1] if ismultispeaker else [None]

    for speaker_id in speaker_ids:
        speaker_str = "multispeaker{}".format(
            speaker_id) if speaker_id is not None else "single"

        for idx, text in enumerate(texts):
            signal, alignment, _, _ = synthesis.tts(model_eval,
                                                    text,
                                                    speaker_id=speaker_id,
                                                    fast=True)
            signal /= np.max(np.abs(signal))

            # Alignment
            path = join(
                eval_alignment_dir,
                f"step{global_step:09d}_text{idx}_{speaker_str}_alignment.png")
            save_alignment(path, alignment)

            # Audio
            path = join(
                eval_wav_dir,
                f"step{global_step:09d}_text{idx}_{speaker_str}_predicted.wav")
            audio.save_wav(signal, path)
Example #16
def save_states(global_step,
                mel_outputs,
                linear_outputs,
                attn,
                y,
                input_lengths,
                checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))

    # idx = np.random.randint(0, len(input_lengths))
    idx = min(1, len(input_lengths) - 1)
    input_length = input_lengths[idx]

    # Alignment
    # Multi-hop attention
    if attn.dim() == 4:
        for i, alignment in enumerate(attn):
            alignment_dir = join(checkpoint_dir,
                                 "alignment_layer{}".format(i + 1))
            os.makedirs(alignment_dir, exist_ok=True)
            path = join(
                alignment_dir,
                "step{:09d}_layer_{}_alignment.png".format(global_step, i + 1))
            alignment = alignment[idx].cpu().data.numpy()
            save_alignment(path, alignment)

        # Save averaged alignment
        alignment_dir = join(checkpoint_dir, "alignment_ave")
        os.makedirs(alignment_dir, exist_ok=True)
        path = join(alignment_dir,
                    "step{:09d}_alignment.png".format(global_step))
        alignment = attn.mean(0)[idx].cpu().data.numpy()
        save_alignment(path, alignment)
    else:
        assert False

    # Predicted spectrogram
    path = join(checkpoint_dir,
                "step{:09d}_predicted_spectrogram.png".format(global_step))
    linear_output = linear_outputs[idx].cpu().data.numpy()
    save_spectrogram(path, linear_output)

    # Predicted audio signal
    signal = audio.inv_spectrogram(linear_output.T)
    path = join(checkpoint_dir, "step{:09d}_predicted.wav".format(global_step))
    audio.save_wav(signal, path)

    # Target spectrogram
    path = join(checkpoint_dir,
                "step{:09d}_target_spectrogram.png".format(global_step))
    linear_output = y[idx].cpu().data.numpy()
    save_spectrogram(path, linear_output)
Example #17
def inference(args):
    hparams = create_hparams()

    sentences = get_sentences(args)
    # sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)]

    model = load_model(hparams)
    model.load_state_dict(torch.load(args.checkpoint)['state_dict'])
    model.cuda().eval()  #.half()

    test_set = TextMelLoaderEval(sentences, hparams)
    test_collate_fn = TextMelCollateEval(hparams)
    test_sampler = DistributedSampler(
        test_set) if hparams.distributed_run else None
    test_loader = DataLoader(test_set,
                             num_workers=0,
                             sampler=test_sampler,
                             batch_size=hparams.synth_batch_size,
                             pin_memory=False,
                             drop_last=True,
                             collate_fn=test_collate_fn)

    # taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length, hparams.win_length, sampling_rate=hparams.sampling_rate)
    T2_output_range = (-hparams.max_abs_value,
                       hparams.max_abs_value) if hparams.symmetric_mels else (
                           0, hparams.max_abs_value)

    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            print("CHECK batch", i, batch)
            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
                batch)
            print('synthesize!!!', mel_outputs)
            print('synthesize!!!', mel_outputs_postnet)

            mels = mel_outputs_postnet[0].cpu().numpy()

            print('CHECK MEL SHAPE:', mels.shape)

            mel_path = os.path.join(args.out_filename,
                                    'sentence_{}_mel.npy'.format(i))
            # mels = [mel for gpu_mels in mels for mel in mel_outputs]
            mels = np.clip(mels, T2_output_range[0], T2_output_range[1])
            np.save(mel_path, mels, allow_pickle=False)

            print('CHECK MEL SHAPE:', mels.shape)

            audio_path = os.path.join(args.out_filename,
                                      'sentence_{}.wav'.format(i))
            wav = audio.inv_mel_spectrogram(mels, hparams)
            audio.save_wav(wav, audio_path, sr=hparams.sampling_rate)
Example #18
def text_to_speech(text, speaker_id=-1):
    kwargs = {}
    if speaker_id >= 0:
        kwargs["speaker_id"] = speaker_id

    waveform, alignment, spectrogram, mel = tts(model,
                                                text,
                                                fast=False,
                                                **kwargs)

    with tempfile.SpooledTemporaryFile() as f:
        audio.save_wav(waveform, f)
        f.seek(0)
        return f.read()
Example #19
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    sr = hparams.sample_rate
    filename = os.path.basename(wav_path).replace('.wav', '')

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    lab_path = wav_path.replace("wav48/", "lab/").replace(".wav", ".lab")

    # Trim silence from hts labels if available
    if exists(lab_path):
        labels = hts.load(lab_path)
        b = int(start_at(labels) * 1e-7 * sr)
        e = int(end_at(labels) * 1e-7 * sr)
        wav = wav[b:e]
        wav, _ = librosa.effects.trim(wav, top_db=25)
    # Librosa trim seems to cut off the ending part of speech
    else:
        wav, _ = librosa.effects.trim(wav, top_db=15)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Save trimmed wav
    save_wav_path = re.sub('wav48', 'wav_trim_22050', wav_path)
    save_dir = os.path.dirname(save_wav_path)
    os.makedirs(save_dir, exist_ok=True)
    audio.save_wav(wav, save_wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = '{}-spec.npy'.format(filename)
    mel_filename = '{}-mel.npy'.format(filename)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
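The tuples returned by _process_utterance are usually gathered over all utterances and written to a metadata file that the training data loader reads back. A hedged sketch of that step; the file name and delimiter are assumptions:

import os

def write_metadata(metadata, out_dir):
    # Each entry: (spectrogram_filename, mel_filename, n_frames, text, speaker_id)
    with open(os.path.join(out_dir, "train.txt"), "w", encoding="utf-8") as f:
        for spec_name, mel_name, n_frames, text, speaker_id in metadata:
            f.write("|".join([spec_name, mel_name, str(n_frames), text, str(speaker_id)]) + "\n")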
Example #20
def synthesis_griffin_lim(text_seq,
                          model,
                          alpha=1.0,
                          mode="",
                          num=100,
                          check=True):
    text_seq = text_seq[:-1]
    text = text_to_sequence(text_seq, hp.hparams.text_cleaners)
    text = text + [0]
    text = np.stack([np.array(text)])
    text = torch.from_numpy(text).long().to(device)

    pos = torch.stack([torch.Tensor([i + 1 for i in range(text.size(1))])])
    pos = pos.long().to(device)

    model.eval()

    with torch.no_grad():
        mel, mel_postnet = model(text, pos, alpha=alpha)

    if not os.path.exists("results_kor_0730_nam_95000"):
        os.mkdir("results_kor_0730_nam_95000")
    new_name = text_seq.replace(" ", "_")
    new_name = new_name.replace("?", "_")

    new_name = new_name[:-1]
    new_name2 = new_name + str(num) + mode + ".wav"
    new_name3 = "results_kor_0730_nam_95000/" + new_name2

    mel = mel[0].cpu().numpy().T
    mel_postnet = mel_postnet[0].cpu().numpy().T
    plot_data([mel, mel_postnet], file_name=new_name)

    start = int(round(time.time() * 1000))
    wav = audio.inv_mel_spectrogram(mel_postnet)
    end = int(round(time.time() * 1000))
    audio.save_wav(wav, os.path.join("results_kor_0730_nam_95000", new_name2))
    clean_text = new_name.replace("_", " ")
    if check:
        x, _, _, y, _, _ = WERCER([new_name3], [str(clean_text)])
    else:
        x = 0
        y = 0
    print("Total time : ", end - start)
    print()
    return new_name, x, y
Example #21
def copy_synthesis(wav_file, out_path):
    """Perform copy synthesis on the wav file and write the synthesized wav to disk at out_path
    """
    filename = os.path.splitext(os.path.basename(wav_file))[0]

    y = audio.load_wav(wav_file)
    if cfg.rescaling:
        y = y / np.abs(y).max() * cfg.rescaling_max

    mag = audio.spectrogram(y)

    y_hat = audio.inv_spectrogram(mag)

    out_path = os.path.join(out_path, filename + "_synthesized.wav")
    print(f"Writing {out_path} to disk")
    audio.save_wav(y_hat, out_path)
Example #22
def eval_model(global_step, writer, model, checkpoint_dir, ismultispeaker):
    texts = [
        "Scientists at the CERN laboratory say they have discovered a new particle.",
        "There's a way to measure the acute emotional intelligence that has never gone out of style.",
        "Generative adversarial network or variational auto-encoder.",
    ]
    import synthesis
    synthesis._frontend = _frontend

    eval_output_dir = join(checkpoint_dir, "eval")
    os.makedirs(eval_output_dir, exist_ok=True)

    speaker_id = 0 if ismultispeaker else None
    for idx, text in enumerate(texts):
        signal, alignment, _, mel = synthesis.tts(model,
                                                  text,
                                                  p=0,
                                                  speaker_id=speaker_id,
                                                  fast=False)
        signal /= np.max(np.abs(signal))

        # Alignment
        path = join(eval_output_dir,
                    "step{:09d}_text{}_alignment.png".format(global_step, idx))
        save_alignment(path, alignment)
        tag = "eval_averaged_alignment_{}".format(idx)
        writer.add_image(tag,
                         np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),
                         global_step)

        # Mel
        writer.add_image("(Eval) Predicted mel spectrogram text{}".format(idx),
                         prepare_spec_image(mel), global_step)

        # Audio
        path = join(eval_output_dir,
                    "step{:09d}_text{}_predicted.wav".format(global_step, idx))
        audio.save_wav(signal, path)

        try:
            writer.add_audio("(Eval) Predicted audio signal {}".format(idx),
                             signal,
                             global_step,
                             sample_rate=fs)
        except Exception as e:
            warn(str(e))
            pass
Example #23
def eval_model(global_step, device, model, checkpoint_dir, ismultispeaker):
    # hard-coded
    texts = [
        "Scientists at the CERN laboratory say they have discovered a new particle.",
        "There's a way to measure the acute emotional intelligence that has never gone out of style.",
        "President Trump met with other leaders at the Group of 20 conference.",
        "Generative adversarial network or variational auto-encoder.",
        "Please call Stella.",
        "Some have accepted this as a miracle without any physical explanation.",
    ]
    import synthesis
    synthesis._frontend = _frontend

    eval_output_dir = join(checkpoint_dir, "eval")
    os.makedirs(eval_output_dir, exist_ok=True)

    # Prepare model for evaluation
    model_eval = build_model().to(device)
    model_eval.load_state_dict(model.state_dict())

    # hard coded
    speaker_ids = [0, 1, 10] if ismultispeaker else [None]
    for speaker_id in speaker_ids:
        speaker_str = "multispeaker{}".format(
            speaker_id) if speaker_id is not None else "single"

        for idx, text in enumerate(texts):
            signal, alignment, _, mel = synthesis.tts(model_eval,
                                                      text,
                                                      p=0,
                                                      speaker_id=speaker_id,
                                                      fast=True)
            signal /= np.max(np.abs(signal))

            # Alignment
            path = join(
                eval_output_dir, "step{:09d}_text{}_{}_alignment.png".format(
                    global_step, idx, speaker_str))
            save_alignment(path, alignment)
            tag = "eval_averaged_alignment_{}_{}".format(idx, speaker_str)

            # Audio
            path = join(
                eval_output_dir, "step{:09d}_text{}_{}_predicted.wav".format(
                    global_step, idx, speaker_str))
            audio.save_wav(signal, path)
Example #24
def save_states(global_step, attn, linear_outputs, input_lengths, logs_dir):
    """Save intermediate states
    """
    print(f"Save intermediate states at step {global_step:09d}")

    idx = min(1, len(input_lengths) - 1)

    # Alignment
    # Multi-hop attention
    if attn is not None and attn.dim() == 4:
        for i, alignment in enumerate(attn):
            # Save alignment to disk
            alignment = alignment[idx].cpu().data.numpy()

            alignment_dir = join(logs_dir, f"alignment_layer{i + 1}")
            os.makedirs(alignment_dir, exist_ok=True)

            path = join(alignment_dir,
                        f"step{global_step:09d}_layer_{i + 1}_alignment.png")

            save_alignment(path, alignment)

        # Save averaged alignment
        alignment_dir = join(logs_dir, "alignment_ave")
        os.makedirs(alignment_dir, exist_ok=True)

        path = join(alignment_dir,
                    f"step{global_step:09d}_layer_alignment.png")

        alignment = attn.mean(0)[idx].cpu().data.numpy()

        save_alignment(path, alignment)

    linear_output = linear_outputs[idx].cpu().data.numpy()

    # Predicted audio signal
    signal = audio.inv_spectrogram(linear_output.T)
    signal /= np.max(np.abs(signal))

    wavs_dir = join(logs_dir, "wavs")
    os.makedirs(wavs_dir, exist_ok=True)

    path = join(wavs_dir, f"step{global_step:09d}_predicted.wav")

    audio.save_wav(signal, path)
Example #25
def main(args):
    model = CNNVocoder(n_heads=hparams.n_heads,
                       layer_channels=hparams.layer_channels,
                       pre_conv_channels=hparams.pre_conv_channels,
                       pre_residuals=hparams.pre_residuals,
                       up_residuals=hparams.up_residuals,
                       post_residuals=hparams.post_residuals)
    model = model.cuda()

    model, _, _, _ = load_checkpoint(args.model_path, model)
    spec = np.load(args.spec_path)
    spec = torch.FloatTensor(spec).unsqueeze(0).cuda()
    t1 = time()
    _, wav = model(spec)
    dt = time() - t1
    print('Synthesized audio in {}s'.format(dt))
    wav = wav.data.cpu()[0].numpy()
    audio.save_wav(wav, args.out_path)
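A hedged sketch of the argument parser implied by `args` above; the flag names mirror the attributes used in main() but are assumptions:

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description="CNN vocoder inference")
    parser.add_argument("--model_path", required=True, help="Vocoder checkpoint to load")
    parser.add_argument("--spec_path", required=True, help=".npy spectrogram to vocode")
    parser.add_argument("--out_path", default="out.wav", help="Output wav path")
    return parser.parse_args()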
Example #26
File: main.py Project: cy94/ml2
def main():
    rate, data = aud.get_wav("violin_4k.wav")
    print "Original:", data.size

    new_data = np.copy(data)

    # select 5 seconds of audio
    train_data = aud.cut_wav(new_data, 10, 15)
    print "Train:", train_data.size
    aud.save_wav(train_data, "violin_train.wav")

    seed_data = aud.cut_wav(new_data, 16, 17)

    X, Y = get_data_labels(train_data)
    seed_X, seed_Y = get_data_labels(seed_data)

    generated = generate_audio(X, Y, np.array([seed_X[0]]))
    aud.save_wav(generated, "violin_gen.wav")
Example #28
def generate(model_path,model_name, generate_path, generate_name, piece):
    
    """Synthesize audio from an array of embeddings.
    
    Args:
    encodings: Numpy array with shape [batch_size, time, dim].
    save_paths: Iterable of output file names.
    checkpoint_path: Location of the pretrained model. [model.ckpt-200000]
    samples_per_save: Save files after every amount of generated samples.

    """
    
    # Create the output directory if it does not exist
    if not os.path.exists(generate_path):
        os.makedirs(generate_path)

    net = AutoEncoder()
    net = load_model(net,model_path,model_name)
    cuda_available = torch.cuda.is_available()
    if cuda_available is True:
        net = net.cuda()

    net.eval()

    # Load audio for encoding
    piece = audio.load_wav(piece)
    spec = audio.spectrogram(piece).astype(np.float32)
    spec = torch.from_numpy(spec.T)
    spec = torch.FloatTensor(spec)
    
    spec = torch.unsqueeze(spec, 0)
    spec = Variable(spec, volatile=True).contiguous()

    if cuda_available is True:
        spec = spec.cuda()

    generated_spec = net(spec)
    generated_spec = generated_spec.data.cpu().numpy()
    generated_spec = np.squeeze(generated_spec)
    
    waveform = audio.inv_spectrogram(generated_spec.T)
    wav_name = generate_path + generate_name + '.wav'

    audio.save_wav(waveform, wav_name)
Example #29
def synthesis_waveglow(text_seq, model, waveglow, alpha=1.0, mode=""):
    text = text_to_sequence(text_seq, hp.hparams.text_cleaners)
    text = text + [0]
    text = np.stack([np.array(text)])
    text = torch.from_numpy(text).long().to(device)

    pos = torch.stack([torch.Tensor([i + 1 for i in range(text.size(1))])])
    pos = pos.long().to(device)

    model.eval()
    with torch.no_grad():
        _, mel_postnet = model(text, pos, alpha=alpha)
    with torch.no_grad():
        wav = waveglow.infer(mel_postnet, sigma=0.666)
    print("Wav Have Been Synthesized.")

    if not os.path.exists("results"):
        os.mkdir("results")
    audio.save_wav(wav[0].data.cpu().numpy(),
                   os.path.join("results", text_seq + mode + ".wav"))
Example #30
def synthesize(mel_sp, save_path):
    assert len(mel_sp.shape) == 2
    mel_sp = np.expand_dims(mel_sp, axis=0)
    assert mel_sp.shape[1] == hparams.num_mels
    max_time_frame = mel_sp.shape[2]

    audio_len = max_time_frame * hparams.hop_size

    batch = {"c": mel_sp}

    wavenet = tf.estimator.Estimator(
        model_fn=wavenet_fn,
        model_dir=hparams.model_directory,
        params={
            'feature_columns':
            tf.feature_column.numeric_column(
                key="c",
                shape=[hparams.num_mels, max_time_frame],
                dtype=tf.float32),
            'hparams':
            hparams,
            'time_len':
            audio_len
        })

    input_fn = tf.estimator.inputs.numpy_input_fn(x=batch,
                                                  batch_size=1,
                                                  shuffle=False,
                                                  num_epochs=1)

    wavenet_checkpoint = wavenet.latest_checkpoint()
    wavenet_outputs = wavenet.predict(input_fn=input_fn,
                                      checkpoint_path=wavenet_checkpoint)
    for result in wavenet_outputs:
        outputs = result['outputs']

        if hparams.input_type == "mulaw-quantize":
            outputs = inv_mulaw_quantize(outputs)

        save_wav(outputs, save_path, hparams.sample_rate)
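A hypothetical call to the estimator-based synthesize() above, feeding it a mel spectrogram saved as a (num_mels, frames) array; the file names are assumptions:

import numpy as np

mel_sp = np.load("sentence_0_mel.npy").astype(np.float32)
synthesize(mel_sp, "sentence_0.wav")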
Example #31
def main():
    # Target data
    filename = "120_kmeans_obj.pkl"

    kmeans = k.load_pkl(filename)
    spec, label = load_test_data()

    print("spec", spec.shape)
    print("label", label.shape)

    spec_ = np.empty((513, ), np.float32)
    for i in range(len(label)):
        spec_ = np.vstack((spec_, kmeans.cluster_centers_[label[i]]))
    spec_ = np.delete(spec_, 0, 0)

    print("compare data structure ----")
    print("spec: ", spec.shape)
    print("spec_: ", spec_.shape)

    print("spec data:", spec)
    print("spec_ data:", spec_)

    print("min-max spce_ data:", min_max(spec_))

    waveform = audio.inv_spectrogram(spec)
    waveform_ = audio.inv_spectrogram(spec_)
    waveformmm_ = audio.inv_spectrogram(min_max(spec_))

    audio.save_wav(waveform, 'ideal_out.wav')
    audio.save_wav(waveform_, 'ideal_out_.wav')
    audio.save_wav(waveformmm_, 'ideal_outmm_.wav')