Example 1
def tts(model, text, p=0, speaker_id=None, fast=False, wavenet=None):
    """Convert text to speech waveform given a deepvoice3 model.

    Args:
        text (str) : Input text to be synthesized
        p (float) : Probability of replacing a word with its pronunciation if p > 0. Default is 0.
    """
    model = model.to(device)
    model.eval()
    if fast:
        model.make_generation_fast_()

    sequence = np.array(_frontend.text_to_sequence(text, p=p))
    print('sequence to synthesize: ', sequence)
    sequence = torch.from_numpy(sequence).unsqueeze(0).long().to(device)
    text_positions = torch.arange(1, sequence.size(-1) + 1).unsqueeze(0).long().to(device)
    speaker_ids = None if speaker_id is None else torch.LongTensor([speaker_id]).to(device)

    # Greedy decoding
    with torch.no_grad():
        mel_outputs, linear_outputs, alignments, done = model(
            sequence, text_positions=text_positions, speaker_ids=speaker_ids)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()
    mel = mel_outputs[0].cpu().data.numpy()
    mel = audio._denormalize(mel)

    # Predicted audio signal
    if wavenet is not None:
        wavenet = wavenet.to(device)
        wavenet.eval()
        if fast:
            wavenet.make_generation_fast_()

        # TODO: assuming scalar input
        initial_value = 0.0
        initial_input = torch.zeros(1, 1, 1).fill_(initial_value).to(device)
        # (B, T, C) -> (B, C, T)
        c = mel_outputs.transpose(1, 2).contiguous()
        g = None
        Tc = c.size(-1)
        length = Tc * 256
        initial_input = initial_input.to(device)
        c = c.to(device)
        waveform = wavenet.incremental_forward(
            initial_input, c=c, g=g, T=length, tqdm=tqdm, softmax=True, quantize=True,
            log_scale_min=float(np.log(1e-14)))
        waveform = waveform.view(-1).cpu().data.numpy()
    else:
        waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel
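Below is a minimal, hypothetical usage sketch for the function above: build_model() stands in for the project's model constructor, the checkpoint path and its "state_dict" key are assumptions, and audio is the same helper module used throughout these snippets.

import torch

# Hypothetical usage sketch: build_model() and the checkpoint layout
# ("state_dict" key) are assumptions about the surrounding project.
model = build_model()
checkpoint = torch.load("checkpoints/checkpoint_step000100000.pth",
                        map_location="cpu")
model.load_state_dict(checkpoint["state_dict"])

waveform, alignment, spectrogram, mel = tts(
    model, "Hello world.", p=0.5, speaker_id=None, fast=True)
audio.save_wav(waveform, "output.wav")  # repo audio helper, as in the snippets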
Example 2
def tts(model, text, p=0., speaker_id=None):
    """
    Convert text to speech waveform given a deepvoice3 model.

    Args:
        model (DeepVoiceTTS): Model used to synthesize waveform.
        text (str) : Input text to be synthesized
        p (float) : Probability of replacing a word with its pronunciation if p > 0. Default is 0.
    
    Returns:
        waveform (numpy.ndarray): Shape(T_wav, ), predicted waveform, where
            T_wav means the length of the synthesized waveform.
        alignment (numpy.ndarray): Shape(T_dec, T_enc), predicted alignment
            matrix, where T_dec means the time steps of decoder outputs, T_enc
            means the time steps of encoder outputs.
        spectrogram (numpy.ndarray): Shape(T_lin, C_lin), predicted linear
            spectrogram, where T_lin means the time steps of linear
            spectrogram and C_lin means the channels of linear spectrogram.
        mel (numpy.ndarray): Shape(T_mel, C_mel), predicted mel spectrogram,
            where T_mel means the time steps of mel spectrogram and C_mel means
            the channels of mel spectrogram.
    """
    model.eval()

    sequence = np.array(_frontend.text_to_sequence(text, p=p)).astype("int64")
    sequence = np.reshape(sequence, (1, -1))
    text_positions = np.arange(1, sequence.shape[1] + 1, dtype="int64")
    text_positions = np.reshape(text_positions, (1, -1))

    sequence = dg.to_variable(sequence)
    text_positions = dg.to_variable(text_positions)
    speaker_ids = None if speaker_id is None else fluid.layers.fill_constant(
        shape=[1, 1], dtype="int64", value=speaker_id)

    # sequence: shape(1, input_length)
    # text_positions: shape(1, input_length)
    # Greedy decoding
    mel_outputs, linear_outputs, alignments, done = model.transduce(
        sequence, text_positions, speaker_ids)

    # reshape to the desired shape
    linear_output = linear_outputs.numpy().squeeze().T
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments.numpy()[0]
    mel = mel_outputs.numpy().squeeze().T
    mel = audio._denormalize(mel)

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel
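A hypothetical call site for the Paddle dygraph variant above: dg.guard() and the place objects are standard fluid dygraph API, while load_deepvoice3() is only a placeholder for the project-specific code that builds the DeepVoiceTTS model and restores its weights.

import paddle.fluid as fluid
import paddle.fluid.dygraph as dg

place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace()
with dg.guard(place):
    # load_deepvoice3() is a hypothetical placeholder for constructing a
    # DeepVoiceTTS instance and loading a trained checkpoint.
    model = load_deepvoice3()
    waveform, alignment, spectrogram, mel = tts(model, "Hello world.", p=0.5)
audio.save_wav(waveform, "output.wav")  # repo audio helper, as in the snippets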
Example 3
def save_spectrogram(path, linear_output):
    spectrogram = audio._denormalize(linear_output)
    plt.figure(figsize=(16, 10))
    plt.imshow(spectrogram.T, aspect="auto", origin="lower")
    plt.colorbar()
    plt.tight_layout()
    plt.savefig(path, format="png")
    plt.close()
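Note that save_spectrogram() denormalizes internally, so it expects the still-normalized linear output produced by the model, not the spectrogram value that the tts() functions above have already passed through audio._denormalize. A hypothetical call site, following the PyTorch examples:

# linear_outputs is the raw model output, still in the normalized domain that
# save_spectrogram() expects; names follow the PyTorch tts() above.
linear_output = linear_outputs[0].cpu().data.numpy()
save_spectrogram("step000100000_linear.png", linear_output)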
Example 4
def tts(model, text, speaker_id=None, fast=False):
    """Convert text to speech waveform given a deepvoice3 model.
    """
    model = model.to(device)
    model.eval()

    if fast:
        model.make_generation_fast_()

    if cfg.frontend == "en":
        sequence = np.array(english.text_to_sequence(text))
    else:
        raise NotImplementedError

    sequence = torch.from_numpy(sequence).unsqueeze(0).long().to(device)
    text_positions = torch.arange(1,
                                  sequence.size(-1) +
                                  1).unsqueeze(0).long().to(device)

    speaker_ids = None if speaker_id is None else torch.LongTensor(
        [speaker_id]).to(device)

    # Greedy decoding
    with torch.no_grad():
        mel_outputs, linear_outputs, alignments, done = model(
            sequence, text_positions=text_positions, speaker_ids=speaker_ids)

    linear_output = linear_outputs[0].cpu().data.numpy()
    spectrogram = audio._denormalize(linear_output)
    alignment = alignments[0].cpu().data.numpy()
    mel = mel_outputs[0].cpu().data.numpy()
    mel = audio._denormalize(mel)

    # Predicted audio signal
    waveform = audio.inv_spectrogram(linear_output.T)

    return waveform, alignment, spectrogram, mel
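A short sketch of persisting everything this tts() returns; only audio.save_wav comes from these snippets, while the file-name scheme and the matplotlib alignment plot are assumptions.

import matplotlib.pyplot as plt
import numpy as np

def save_tts_outputs(prefix, waveform, alignment, spectrogram, mel):
    # Write the waveform plus the denormalized spectrograms returned by tts().
    audio.save_wav(waveform, prefix + ".wav")  # repo audio helper
    np.save(prefix + "_mel.npy", mel)
    np.save(prefix + "_linear.npy", spectrogram)
    # alignment is (T_dec, T_enc); transpose so encoder steps run along the y axis.
    plt.figure(figsize=(16, 6))
    plt.imshow(alignment.T, aspect="auto", origin="lower")
    plt.xlabel("Decoder timestep")
    plt.ylabel("Encoder timestep")
    plt.colorbar()
    plt.tight_layout()
    plt.savefig(prefix + "_alignment.png", format="png")
    plt.close()

# `model` is assumed to be an already built and loaded DeepVoice3 model.
waveform, alignment, spectrogram, mel = tts(model, "Hello world.", fast=True)
save_tts_outputs("step000100000_text0", waveform, alignment, spectrogram, mel)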
Example 5
def save_states(global_step, writer, mel_outputs, linear_outputs, attn, mel, y,
                input_lengths, checkpoint_dir=None):
    print("Save intermediate states at step {}".format(global_step))

    # idx = np.random.randint(0, len(input_lengths))
    idx = min(1, len(input_lengths) - 1)
    input_length = input_lengths[idx]

    # Alignment
    # Multi-hop attention
    if attn is not None and attn.dim() == 4:
        for i, alignment in enumerate(attn):
            alignment = alignment[idx].cpu().data.numpy()
            tag = "alignment_layer{}".format(i + 1)
            writer.add_image(tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255), global_step)

            # save files as well for now
            alignment_dir = join(checkpoint_dir, "alignment_layer{}".format(i + 1))
            os.makedirs(alignment_dir, exist_ok=True)
            path = join(alignment_dir, "step{:09d}_layer_{}_alignment.png".format(
                global_step, i + 1))
            save_alignment(path, alignment)

        # Save averaged alignment
        alignment_dir = join(checkpoint_dir, "alignment_ave")
        os.makedirs(alignment_dir, exist_ok=True)
        path = join(alignment_dir, "step{:09d}_alignment.png".format(global_step))
        alignment = attn.mean(0)[idx].cpu().data.numpy()
        save_alignment(path, alignment)

        tag = "averaged_alignment"
        writer.add_image(tag, np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255), global_step)

    # Predicted mel spectrogram
    if mel_outputs is not None:
        mel_output = mel_outputs[idx].cpu().data.numpy()
        if hparams.vocoder != "world":
            mel_output = prepare_spec_image(audio._denormalize(mel_output))
            writer.add_image("Predicted mel spectrogram", mel_output, global_step)
        else:
            mel_output_prep = mel_output
            try:
                writer.add_image("Predicted WORLD output", mel_output_prep, global_step)
            except:
                pass

            mel_output = denormalize(mel_output)
            nfft = pw.get_cheaptrick_fft_size(hparams.sample_rate)
            f0 = mel_output[:,0].astype(np.float64)
            sp = pw.decode_spectral_envelope(mel_output[:,1:(hparams.coded_env_dim+1)].astype(np.float64), hparams.sample_rate, nfft)
            ap = pw.decode_aperiodicity(mel_output[:,(hparams.coded_env_dim+1):hparams.num_mels].astype(np.float64), hparams.sample_rate, nfft)

            signal = pw.synthesize(f0, sp, ap, hparams.sample_rate, pw.default_frame_period)
            path = join(checkpoint_dir, "step{:09d}_out.wav".format(
                        global_step))
            audio.save_wav(signal, path)

            try:
                signal /= np.max(np.abs(signal))
                writer.add_audio("Target audio signal", signal, global_step, sample_rate=fs)
            except:
                print("Unexpected error :", sys.exc_info())

            mel_tgt = mel[idx].cpu().data.numpy()
            mel_tgt = denormalize(mel_tgt)

            f0 = mel_tgt[:,0].astype(np.float64)
            sp = pw.decode_spectral_envelope(mel_tgt[:,1:(hparams.coded_env_dim+1)].astype(np.float64), hparams.sample_rate, nfft)
            ap = pw.decode_aperiodicity(mel_tgt[:,(hparams.coded_env_dim+1):hparams.num_mels].astype(np.float64), hparams.sample_rate, nfft)

            signal = pw.synthesize(f0, sp, ap, hparams.sample_rate, pw.default_frame_period)
            try:
                signal /= np.max(np.abs(signal))
                writer.add_audio("Target audio signal", signal, global_step, sample_rate=hparams.sample_rate)
            except:
                print("Unexpected error :", sys.exc_info())
    # Predicted spectrogram
    if linear_outputs is not None:
        linear_output = linear_outputs[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image("Predicted linear spectrogram", spectrogram, global_step)

        # Predicted audio signal
        signal = audio.inv_spectrogram(linear_output.T)
        signal /= np.max(np.abs(signal))
        path = join(checkpoint_dir, "step{:09d}_predicted.wav".format(
            global_step))
        try:
            writer.add_audio("Predicted audio signal", signal, global_step, sample_rate=fs)
        except Exception as e:
            warn(str(e))
            pass
        audio.save_wav(signal, path)

    # Target mel spectrogram
    if mel_outputs is not None:
        mel_output = mel[idx].cpu().data.numpy()
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        writer.add_image("Target mel spectrogram", mel_output, global_step)

    # Target spectrogram
    if linear_outputs is not None:
        linear_output = y[idx].cpu().data.numpy()
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image("Target linear spectrogram", spectrogram, global_step)

    # Save target and predicted mel features as .npy for offline inspection
    path = join(checkpoint_dir, "step{:09d}_mel_target.npy".format(
                global_step))
    mel_output = mel[idx].cpu().data.numpy()
    np.save(path, denormalize(mel_output))

    path = join(checkpoint_dir, "step{:09d}_mel_out.npy".format(
                global_step))
    mel_output = denormalize(mel_outputs[idx].cpu().data.numpy())
    np.save(path, mel_output)
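A hypothetical call site for save_states() inside the training loop; the tensor names mirror the function signature and checkpoint_interval is an assumed hyperparameter name.

# Hypothetical training-loop snippet: mel_outputs, linear_outputs and attn come
# from the forward pass, mel, y and input_lengths from the data loader.
if global_step > 0 and global_step % checkpoint_interval == 0:
    save_states(global_step, writer, mel_outputs, linear_outputs, attn,
                mel, y, input_lengths, checkpoint_dir=checkpoint_dir)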
Example 6
def eval_model(global_step, writer, device, model, checkpoint_dir,
               ismultispeaker):
    # hard-coded evaluation texts
    texts = [
        "And debtors might practically have as much as they liked%if they could only pay for it.",
        "There's a way to measure the acute emotional intelligence that has never gone out of style.",
        "President trump met with other leaders at the group of 20 conference.",
        "Generative adversarial network or variational auto encoder.",
        "Please call stella.",
        "Some have accepted this as a miracle without any physical explanation.",
    ]
    import synthesis
    synthesis._frontend = _frontend

    eval_output_dir = join(checkpoint_dir, "eval")
    os.makedirs(eval_output_dir, exist_ok=True)

    # Prepare model for evaluation
    model_eval = tm.build_model().to(device)
    model_eval.load_state_dict(model.state_dict())

    # hard-coded speaker ids
    speaker_ids = [0, 1, 10] if ismultispeaker else [None]
    for speaker_id in speaker_ids:
        speaker_str = "multispeaker{}".format(
            speaker_id) if speaker_id is not None else "single"

        for idx, text in enumerate(texts, 1):
            model_eval.eval()
            model_eval.make_generation_fast_()

            sequence = np.array(_frontend.text_to_sequence(text, p=0.5))
            sequence = torch.from_numpy(sequence).unsqueeze(0).long().to(
                device)
            text_positions = torch.arange(1,
                                          sequence.size(-1) +
                                          1).unsqueeze(0).long().to(device)
            speaker_ids = None if speaker_id is None else torch.LongTensor(
                [speaker_id]).to(device)

            # Greedy decoding
            with torch.no_grad():
                mel, alignments, done = model_eval(
                    sequence,
                    text_positions=text_positions,
                    speaker_ids=speaker_ids)
            alignments = alignments[0].cpu().data.numpy()
            mel = mel[0].cpu().data.numpy()
            mel = audio._denormalize(mel)

            # Alignment
            for i, alignment in enumerate(alignments, 1):
                alignment_dir = join(eval_output_dir,
                                     "alignment_layer{}".format(i))
                os.makedirs(alignment_dir, exist_ok=True)
                path = join(
                    alignment_dir,
                    "step{:09d}_text{}_{}_layer{}_alignment.png".format(
                        global_step, idx, speaker_str, i))
                tm.save_alignment(path, alignment, global_step)
                tag = "eval_text_{}_alignment_layer{}_{}".format(
                    idx, i, speaker_str)
                writer.add_image(
                    tag,
                    np.uint8(cm.viridis(np.flip(alignment, 1)) * 255).T,
                    global_step)

            # Mel
            writer.add_image(
                "(Eval) Predicted mel spectrogram text{}_{}".format(
                    idx, speaker_str),
                tm.prepare_spec_image(mel).transpose(2, 0, 1), global_step)
Example 7
def save_states(global_step,
                writer,
                mel_outputs,
                linear_outputs,
                attn,
                mel,
                y,
                input_lengths,
                checkpoint_dir=None):
    """
    Save intermediate states for the training process.
    """
    print("[train] Saving intermediate states at step {}".format(global_step))

    idx = min(1, len(input_lengths) - 1)
    input_length = input_lengths[idx]

    # Alignment, Multi-hop attention
    if attn is not None and len(attn.shape) == 4:
        attn = attn.numpy()
        for i in range(attn.shape[0]):
            alignment = attn[i]
            alignment = alignment[idx]
            tag = "alignment_layer{}".format(i + 1)
            writer.add_image(tag,
                             np.uint8(
                                 cm.viridis(np.flip(alignment, 1).T) * 255),
                             global_step,
                             dataformats='HWC')

            alignment_dir = join(checkpoint_dir,
                                 "alignment_layer{}".format(i + 1))
            if not os.path.exists(alignment_dir):
                os.makedirs(alignment_dir)
            path = join(
                alignment_dir,
                "step{:09d}_layer_{}_alignment.png".format(global_step, i + 1))
            save_alignment(global_step, path, alignment)

        alignment_dir = join(checkpoint_dir, "alignment_ave")
        if not os.path.exists(alignment_dir):
            os.makedirs(alignment_dir)
        path = join(alignment_dir,
                    "step{:09d}_alignment.png".format(global_step))
        alignment = np.mean(attn, axis=0)[idx]
        save_alignment(global_step, path, alignment)

        tag = "averaged_alignment"
        writer.add_image(tag,
                         np.uint8(cm.viridis(np.flip(alignment, 1).T) * 255),
                         global_step,
                         dataformats="HWC")

    if mel_outputs is not None:
        mel_output = mel_outputs[idx].numpy().squeeze().T
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        writer.add_image("Predicted_mel_spectrogram",
                         mel_output,
                         global_step,
                         dataformats="HWC")

    if linear_outputs is not None:
        linear_output = linear_outputs[idx].numpy().squeeze().T
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image("Predicted_linear_spectrogram",
                         spectrogram,
                         global_step,
                         dataformats="HWC")

        signal = audio.inv_spectrogram(linear_output.T)
        signal /= np.max(np.abs(signal))
        path = join(checkpoint_dir,
                    "step{:09d}_predicted.wav".format(global_step))
        try:
            writer.add_audio("Predicted_audio_signal",
                             signal,
                             global_step,
                             sample_rate=hparams.sample_rate)
        except Exception as e:
            warn(str(e))
            pass
        audio.save_wav(signal, path)

    if mel_outputs is not None:
        mel_output = mel[idx].numpy().squeeze().T
        mel_output = prepare_spec_image(audio._denormalize(mel_output))
        writer.add_image("Target_mel_spectrogram",
                         mel_output,
                         global_step,
                         dataformats="HWC")

    if linear_outputs is not None:
        linear_output = y[idx].numpy().squeeze().T
        spectrogram = prepare_spec_image(audio._denormalize(linear_output))
        writer.add_image("Target_linear_spectrogram",
                         spectrogram,
                         global_step,
                         dataformats="HWC")