Example #1
def test_nyanko_basics():
    x, y = _test_data()

    # Build the model both with and without routing decoder states into the
    # postnet, and check that a forward pass runs in either configuration.
    for v in [False, True]:
        model = nyanko(n_vocab, mel_dim=num_mels, linear_dim=num_freq, r=1, downsample_step=4,
                       use_decoder_state_for_postnet_input=v)
        mel_outputs, linear_outputs, alignments, done = model(x, y)
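These snippets rely on module-level fixtures that are not reproduced on this page (n_vocab, num_mels, num_freq, _test_data, text_to_sequence, _pad_2d come from the surrounding test module). A minimal sketch of the kind of setup they assume is below; the constant values and the dummy-batch shapes are placeholders, not the repository's actual configuration.

import torch

# Placeholder values; the real ones come from the project's hyper parameters
# and text frontend (assumed here, not taken from the repository).
n_vocab = 256    # vocabulary size of the text frontend
num_mels = 80    # mel-spectrogram channels
num_freq = 513   # linear-spectrogram frequency bins

def _test_data():
    # Dummy (text, mel) batch: batch size 2, 10 text tokens, 12 mel frames.
    x = torch.randint(1, n_vocab, (2, 10), dtype=torch.long)
    y = torch.rand(2, 12, num_mels)
    return x, y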
Example #2
def test_incremental_path_multiple_times():
    texts = ["ni2 hao3 , wo3 shi4 jing3 cha2 ."]
    seqs = np.array([text_to_sequence(t) for t in texts])
    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    r = 1
    mel_dim = 80

    sequence = torch.LongTensor(seqs)
    text_positions = torch.LongTensor(text_positions)

    model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, downsample_step=4,
                   r=r, force_monotonic_attention=False)
    model.eval()

    # first call
    mel_outputs, linear_outputs, alignments, done = model(
        sequence, text_positions=text_positions, speaker_ids=None)

    # second call
    mel_outputs2, linear_outputs2, alignments2, done2 = model(
        sequence, text_positions=text_positions, speaker_ids=None)

    # Should get same result
    c = (mel_outputs - mel_outputs2).abs()
    print(c.mean(), c.max())

    assert np.allclose(mel_outputs.cpu().data.numpy(),
                       mel_outputs2.cpu().data.numpy(), atol=1e-5)
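The pattern in this example (two eval-mode forward passes that must agree within a tolerance) can be wrapped in a small helper so the same check can be reused for other models. This is only a sketch using torch and numpy; run_twice_and_compare is a hypothetical name, not part of the test suite.

import numpy as np
import torch

def run_twice_and_compare(model, args=(), kwargs=None, atol=1e-5):
    # Run the same forward pass twice with dropout disabled and assert that
    # the first returned output (the mel prediction here) is reproducible.
    kwargs = kwargs or {}
    model.eval()
    with torch.no_grad():
        out1 = model(*args, **kwargs)[0]
        out2 = model(*args, **kwargs)[0]
    diff = (out1 - out2).abs()
    print("mean abs diff:", diff.mean().item(), "max abs diff:", diff.max().item())
    assert np.allclose(out1.cpu().numpy(), out2.cpu().numpy(), atol=atol)

# e.g. run_twice_and_compare(model, (sequence,),
#                            {"text_positions": text_positions, "speaker_ids": None})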
Example #3
def test_incremental_correctness():
    texts = [
        "they discarded this for a more completely Roman and far less beautiful letter."
    ]
    seqs = np.array([text_to_sequence(t) for t in texts])
    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    # NOTE: hard-coded absolute path from the original author's machine; point
    # this at a locally extracted mel-spectrogram file to run the test.
    mel = np.load(
        "/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy"
    )
    max_target_len = mel.shape[0]
    r = 1
    mel_dim = 80
    if max_target_len % r != 0:
        max_target_len += r - max_target_len % r
        assert max_target_len % r == 0
    mel = _pad_2d(mel, max_target_len)
    # Variable is the legacy (pre-PyTorch 0.4) autograd wrapper; on newer
    # PyTorch versions plain tensors work, as in Example #4.
    mel = Variable(torch.from_numpy(mel))
    mel_reshaped = mel.view(1, -1, mel_dim * r)
    frame_positions = np.arange(1,
                                mel_reshaped.size(1) + 1).reshape(
                                    1, mel_reshaped.size(1))

    x = Variable(torch.LongTensor(seqs))
    text_positions = Variable(torch.LongTensor(text_positions))
    frame_positions = Variable(torch.LongTensor(frame_positions))

    model = nyanko(n_vocab,
                   mel_dim=mel_dim,
                   linear_dim=513,
                   downsample_step=4,
                   r=r,
                   force_monotonic_attention=False)
    model.eval()

    # Encoder
    encoder_outs = model.seq2seq.encoder(x)

    # Offline (teacher-forced) decoding
    mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder(
        encoder_outs,
        mel_reshaped,
        text_positions=text_positions,
        frame_positions=frame_positions)

    # Online decoding with test inputs
    model.seq2seq.decoder.start_fresh_sequence()
    mel_outputs_online, alignments, dones_online, _ = model.seq2seq.decoder.incremental_forward(
        encoder_outs, text_positions, test_inputs=mel_reshaped)

    # Should get same result
    assert np.allclose(mel_outputs_offline.cpu().data.numpy(),
                       mel_outputs_online.cpu().data.numpy())
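_pad_2d is used throughout these tests but never shown. A plausible implementation pads the (time, mel_dim) array with a constant along the time axis; the sketch below is a guess at what the helper does, not the project's definition.

import numpy as np

def _pad_2d(x, max_len, constant=0):
    # Pad a (T, D) spectrogram along the time axis up to max_len frames.
    return np.pad(x, [(0, max_len - x.shape[0]), (0, 0)],
                  mode="constant", constant_values=constant)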
Example #4
def test_incremental_correctness():
    texts = ["ni2 hao3 , wo3 shi4 jing3 cha2 ."]
    seqs = np.array([text_to_sequence(t) for t in texts])
    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    mel_path = join(dirname(__file__), "data", "ljspeech-mel-00001.npy")
    mel = np.load(mel_path)[::4]
    max_target_len = mel.shape[0]
    r = 1
    mel_dim = 80
    if max_target_len % r != 0:
        max_target_len += r - max_target_len % r
        assert max_target_len % r == 0
    mel = _pad_2d(mel, max_target_len)
    mel = torch.from_numpy(mel)
    mel_reshaped = mel.view(1, -1, mel_dim * r)
    frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(1, mel_reshaped.size(1))

    x = torch.LongTensor(seqs)
    text_positions = torch.LongTensor(text_positions)
    frame_positions = torch.LongTensor(frame_positions)

    model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513, downsample_step=4,
                   r=r, force_monotonic_attention=False)
    model.eval()

    # Encoder
    encoder_outs = model.seq2seq.encoder(x)

    # Offline (teacher-forced) decoding
    mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder(
        encoder_outs, mel_reshaped,
        text_positions=text_positions, frame_positions=frame_positions)

    # Online decoding with test inputs
    model.seq2seq.decoder.start_fresh_sequence()
    mel_outputs_online, alignments, dones_online, _ = model.seq2seq.decoder.incremental_forward(
        encoder_outs, text_positions,
        test_inputs=mel_reshaped)

    # Should get same result
    assert np.allclose(mel_outputs_offline.cpu().data.numpy(),
                       mel_outputs_online.cpu().data.numpy())
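Examples #3 and #4 end with the same offline-vs-incremental comparison, which could be factored into one assertion helper that also reports the size of any discrepancy. assert_offline_online_match is a hypothetical name, not something defined in the repository.

import numpy as np

def assert_offline_online_match(offline, online, atol=1e-6):
    # Compare teacher-forced (offline) decoding against step-by-step
    # (incremental) decoding of the same inputs.
    a = offline.detach().cpu().numpy()
    b = online.detach().cpu().numpy()
    max_diff = np.abs(a - b).max()
    assert np.allclose(a, b, atol=atol), "max abs diff %g exceeds %g" % (max_diff, atol)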
Example #5
def test_nyanko():
    texts = [
        "they discarded this for a more completely Roman and far less beautiful letter."
    ]
    seqs = np.array([text_to_sequence(t) for t in texts])
    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    # NOTE: hard-coded absolute path from the original author's machine; point
    # this at a locally extracted mel-spectrogram file to run the test.
    mel = np.load(
        "/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy"
    )
    max_target_len = mel.shape[0]
    r = 1
    mel_dim = 80
    if max_target_len % r != 0:
        max_target_len += r - max_target_len % r
        assert max_target_len % r == 0
    mel = _pad_2d(mel, max_target_len)
    mel = torch.from_numpy(mel)
    mel_reshaped = mel.view(1, -1, mel_dim * r)
    frame_positions = np.arange(1,
                                mel_reshaped.size(1) + 1).reshape(
                                    1, mel_reshaped.size(1))

    x = torch.LongTensor(seqs)
    text_positions = torch.LongTensor(text_positions)
    frame_positions = torch.LongTensor(frame_positions)

    model = nyanko(n_vocab,
                   mel_dim=mel_dim,
                   linear_dim=513,
                   downsample_step=4,
                   r=r,
                   force_monotonic_attention=False)
    model.eval()

    def _plot(mel, mel_predicted, alignments):
        from matplotlib import pyplot as plt
        plt.figure(figsize=(16, 10))
        plt.subplot(3, 1, 1)
        plt.imshow(mel.data.cpu().numpy().T,
                   origin="lower bottom",
                   aspect="auto",
                   cmap="magma")
        plt.colorbar()

        plt.subplot(3, 1, 2)
        plt.imshow(mel_predicted.view(-1, mel_dim).data.cpu().numpy().T,
                   origin="lower bottom",
                   aspect="auto",
                   cmap="magma")
        plt.colorbar()

        plt.subplot(3, 1, 3)
        if alignments.dim() == 4:
            alignments = alignments.mean(0)
        plt.imshow(alignments[0].data.cpu().numpy().T,
                   origin="lower bottom",
                   aspect="auto")
        plt.colorbar()
        plt.show()

    seq2seq = model.seq2seq

    # Encoder
    encoder_outs = seq2seq.encoder(x)

    # Offline (teacher-forced) decoding
    print("Offline decoding")
    mel_outputs_offline, alignments_offline, done, _ = seq2seq.decoder(
        encoder_outs,
        mel_reshaped,
        text_positions=text_positions,
        frame_positions=frame_positions)

    _plot(mel, mel_outputs_offline, alignments_offline)

    # Online decoding with test inputs
    print("Online decoding")
    seq2seq.decoder.start_fresh_sequence()
    mel_outputs_online, alignments, dones_online, _ = seq2seq.decoder.incremental_forward(
        encoder_outs, text_positions, test_inputs=mel_reshaped)

    a = mel_outputs_offline.cpu().data.numpy()
    b = mel_outputs_online.cpu().data.numpy()
    c = (mel_outputs_offline - mel_outputs_online).abs()
    print(c.mean(), c.max())

    _plot(mel, mel_outputs_offline, alignments_offline)
    _plot(mel, mel_outputs_online, alignments)
    _plot(mel, c, alignments)

    # Should get same result
    assert np.allclose(a, b)

    postnet = model.postnet

    linear_outputs = postnet(mel_outputs_offline)
    print(linear_outputs.size())
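The _plot helper above opens interactive windows via plt.show(), which blocks on a headless CI machine. One option, shown as a sketch rather than a change to the test, is to select a non-interactive backend and write each figure to disk; the function name and filename pattern are arbitrary.

import matplotlib
matplotlib.use("Agg")              # must run before pyplot is imported
from matplotlib import pyplot as plt

def _save_current_figure(name):
    # Replacement for plt.show(): write the active figure to a PNG and close it.
    plt.savefig("%s.png" % name, dpi=150, bbox_inches="tight")
    plt.close()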