def test_incremental_correctness():
    """Offline vs. incremental decoding must agree for the nyanko model (r=1).

    NOTE(review): this definition is shadowed by a later function of the same
    name in this file, so pytest only collects the later one — consider
    renaming one of the two.
    """
    texts = [
        "they discarded this for a more completely Roman and far less beautiful letter."
    ]
    seqs = np.array([text_to_sequence(t) for t in texts])
    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    # Skip gracefully when the reference mel spectrogram is not available;
    # the previous unconditional np.load crashed on any other machine.
    mel_path = "/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy"
    if not exists(mel_path):
        return
    mel = np.load(mel_path)
    max_target_len = mel.shape[0]
    r = 1
    mel_dim = 80
    # Pad the target length up to a multiple of the reduction factor r.
    if max_target_len % r != 0:
        max_target_len += r - max_target_len % r
    assert max_target_len % r == 0
    mel = _pad_2d(mel, max_target_len)
    mel = Variable(torch.from_numpy(mel))
    mel_reshaped = mel.view(1, -1, mel_dim * r)
    frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(
        1, mel_reshaped.size(1))

    x = Variable(torch.LongTensor(seqs))
    text_positions = Variable(torch.LongTensor(text_positions))
    frame_positions = Variable(torch.LongTensor(frame_positions))

    # force_monotonic_attention=False so both decoding paths use the same
    # (soft) attention and their outputs are directly comparable.
    model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513,
                   downsample_step=4, r=r, force_monotonic_attention=False)
    model.eval()

    # Encoder
    encoder_outs = model.seq2seq.encoder(x)

    # Off line decoding
    mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder(
        encoder_outs, mel_reshaped,
        text_positions=text_positions, frame_positions=frame_positions)

    # Online decoding with test inputs (teacher forcing frame by frame)
    model.seq2seq.decoder.start_fresh_sequence()
    mel_outputs_online, alignments, dones_online, _ = \
        model.seq2seq.decoder.incremental_forward(
            encoder_outs, text_positions, test_inputs=mel_reshaped)

    # Should get same result
    assert np.allclose(mel_outputs_offline.cpu().data.numpy(),
                       mel_outputs_online.cpu().data.numpy())
def _test_data():
    """Build a small shared test batch.

    Returns:
        x: LongTensor Variable of padded character-id sequences, one row per
           input text.
        y: random float Variable of shape (batch, 12, 80), standing in for a
           mel-spectrogram target.
    """
    texts = ["Thank you very much.", "Hello.", "Deep voice 3."]
    # np.int was removed in NumPy 1.24; builtin int is the exact equivalent.
    seqs = [np.array(text_to_sequence(t), dtype=int) for t in texts]
    input_lengths = np.array([len(s) for s in seqs])
    max_len = np.max(input_lengths)
    # Right-pad every sequence to the longest one so they stack into a matrix.
    seqs = np.array([_pad(s, max_len) for s in seqs])

    # Test encoder
    x = Variable(torch.LongTensor(seqs))
    y = Variable(torch.rand(x.size(0), 12, 80))
    return x, y
def test_multi_speaker_deepvoice3():
    """Smoke-test the multi-speaker DeepVoice3 forward pass.

    Builds a 3-utterance batch, runs the full model with explicit speaker
    ids, and prints the output tensor sizes (no numerical assertions).
    """
    texts = ["Thank you very much.", "Hello.", "Deep voice 3."]
    # np.int was removed in NumPy 1.24; builtin int is the exact equivalent.
    seqs = [np.array(text_to_sequence(t), dtype=int) for t in texts]
    input_lengths = np.array([len(s) for s in seqs])
    max_len = np.max(input_lengths)
    seqs = np.array([_pad(s, max_len) for s in seqs])

    # Test encoder
    x = Variable(torch.LongTensor(seqs))
    # 4 * 33 decoder frames of mel_dim=80 random targets.
    y = Variable(torch.rand(x.size(0), 4 * 33, 80))

    model = _get_model(n_speakers=32, speaker_embed_dim=16)
    speaker_ids = Variable(torch.LongTensor([1, 2, 3]))

    mel_outputs, linear_outputs, alignments, done = model(
        x, y, speaker_ids=speaker_ids)

    print("Input text:", x.size())
    print("Input mel:", y.size())
    print("Mel:", mel_outputs.size())
    print("Linear:", linear_outputs.size())
    print("Alignments:", alignments.size())
    print("Done:", done.size())
def test_nyanko():
    """End-to-end check of the nyanko model.

    Runs offline and incremental (online) decoding with teacher forcing,
    plots the results, asserts the two decodings agree, then runs the
    postnet on the decoder output.
    """
    texts = [
        "they discarded this for a more completely Roman and far less beautiful letter."
    ]
    seqs = np.array([text_to_sequence(t) for t in texts])
    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    # Skip gracefully when the reference mel spectrogram is not available;
    # the previous unconditional np.load crashed on any other machine.
    mel_path = "/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy"
    if not exists(mel_path):
        return
    mel = np.load(mel_path)
    max_target_len = mel.shape[0]
    r = 1
    mel_dim = 80
    # Pad the target length up to a multiple of the reduction factor r.
    if max_target_len % r != 0:
        max_target_len += r - max_target_len % r
    assert max_target_len % r == 0
    mel = _pad_2d(mel, max_target_len)
    mel = Variable(torch.from_numpy(mel))
    mel_reshaped = mel.view(1, -1, mel_dim * r)
    frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(
        1, mel_reshaped.size(1))

    x = Variable(torch.LongTensor(seqs))
    text_positions = Variable(torch.LongTensor(text_positions))
    frame_positions = Variable(torch.LongTensor(frame_positions))

    model = nyanko(n_vocab, mel_dim=mel_dim, linear_dim=513,
                   downsample_step=4, r=r, force_monotonic_attention=False)
    model.eval()

    def _plot(mel, mel_predicted, alignments):
        from matplotlib import pylab as plt
        plt.figure(figsize=(16, 10))
        plt.subplot(3, 1, 1)
        # origin must be "upper" or "lower"; the previous "lower bottom"
        # raises on modern matplotlib.
        plt.imshow(mel.data.cpu().numpy().T, origin="lower",
                   aspect="auto", cmap="magma")
        plt.colorbar()

        plt.subplot(3, 1, 2)
        plt.imshow(mel_predicted.view(-1, mel_dim).data.cpu().numpy().T,
                   origin="lower", aspect="auto", cmap="magma")
        plt.colorbar()

        plt.subplot(3, 1, 3)
        # Multi-hop attention comes back as (hops, B, T_dec, T_enc);
        # average over hops before plotting.
        if alignments.dim() == 4:
            alignments = alignments.mean(0)
        plt.imshow(alignments[0].data.cpu().numpy().T,
                   origin="lower", aspect="auto")
        plt.colorbar()
        plt.show()

    seq2seq = model.seq2seq

    # Encoder
    encoder_outs = seq2seq.encoder(x)

    # Off line decoding
    print("Offline decoding")
    mel_outputs_offline, alignments_offline, done, _ = seq2seq.decoder(
        encoder_outs, mel_reshaped,
        text_positions=text_positions, frame_positions=frame_positions)
    _plot(mel, mel_outputs_offline, alignments_offline)

    # Online decoding with test inputs (teacher forcing frame by frame)
    print("Online decoding")
    seq2seq.decoder.start_fresh_sequence()
    mel_outputs_online, alignments, dones_online, _ = \
        seq2seq.decoder.incremental_forward(
            encoder_outs, text_positions, test_inputs=mel_reshaped)

    a = mel_outputs_offline.cpu().data.numpy()
    b = mel_outputs_online.cpu().data.numpy()
    c = (mel_outputs_offline - mel_outputs_online).abs()
    print(c.mean(), c.max())

    _plot(mel, mel_outputs_offline, alignments_offline)
    _plot(mel, mel_outputs_online, alignments)
    _plot(mel, c, alignments)

    # Should get same result
    assert np.allclose(a, b)

    postnet = model.postnet
    linear_outputs = postnet(mel_outputs_offline)
    print(linear_outputs.size())
def test_incremental_forward():
    """Run free-running incremental decoding with a trained checkpoint.

    Skips silently when the checkpoint file is not present. Loads weights,
    decodes a single sentence both offline (teacher forced) and online
    (free running), and plots the results.
    """
    checkpoint_path = join(dirname(__file__),
                           "../test_whole/checkpoint_step000265000.pth")
    if not exists(checkpoint_path):
        return
    model = _get_model()

    use_cuda = False

    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint["state_dict"])
    model.make_generation_fast_()
    model = model.cuda() if use_cuda else model

    texts = [
        "they discarded this for a more completely Roman and far less beautiful letter."
    ]
    seqs = np.array([text_to_sequence(t) for t in texts])
    input_lengths = [len(s) for s in seqs]

    use_manual_padding = False
    if use_manual_padding:
        max_input_len = np.max(input_lengths) + 10  # manual padding
        # np.int was removed in NumPy 1.24; builtin int is equivalent.
        seqs = np.array([_pad(x, max_input_len) for x in seqs], dtype=int)
        input_lengths = torch.LongTensor(input_lengths)
        # Fixed: the else arm previously referenced the misspelled name
        # "input_lenghts", a NameError whenever use_cuda was False here.
        input_lengths = input_lengths.cuda() if use_cuda else input_lengths
    else:
        input_lengths = None

    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    # Skip gracefully when the reference mel spectrogram is not available;
    # the previous unconditional np.load crashed on any other machine.
    mel_path = "/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy"
    if not exists(mel_path):
        return
    mel = np.load(mel_path)
    max_target_len = mel.shape[0]
    r = 4
    mel_dim = 80
    # Pad the target length up to a multiple of the reduction factor r.
    if max_target_len % r != 0:
        max_target_len += r - max_target_len % r
    assert max_target_len % r == 0
    mel = _pad_2d(mel, max_target_len)
    mel = Variable(torch.from_numpy(mel))
    mel_reshaped = mel.view(1, -1, mel_dim * r)
    frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(
        1, mel_reshaped.size(1))

    x = Variable(torch.LongTensor(seqs))
    text_positions = Variable(torch.LongTensor(text_positions))
    frame_positions = Variable(torch.LongTensor(frame_positions))

    if use_cuda:
        x = x.cuda()
        text_positions = text_positions.cuda()
        frame_positions = frame_positions.cuda()
        mel_reshaped = mel_reshaped.cuda()

    model.eval()

    def _plot(mel, mel_predicted, alignments):
        from matplotlib import pylab as plt
        plt.figure(figsize=(16, 10))
        plt.subplot(3, 1, 1)
        # origin must be "upper" or "lower"; the previous "lower bottom"
        # raises on modern matplotlib.
        plt.imshow(mel.data.cpu().numpy().T, origin="lower",
                   aspect="auto", cmap="magma")
        plt.colorbar()

        plt.subplot(3, 1, 2)
        plt.imshow(mel_predicted.view(-1, mel_dim).data.cpu().numpy().T,
                   origin="lower", aspect="auto", cmap="magma")
        plt.colorbar()

        plt.subplot(3, 1, 3)
        # Multi-hop attention comes back as (hops, B, T_dec, T_enc);
        # average over hops before plotting.
        if alignments.dim() == 4:
            alignments = alignments.mean(0)
        plt.imshow(alignments[0].data.cpu().numpy().T,
                   origin="lower", aspect="auto")
        plt.colorbar()
        plt.show()

    # Encoder
    encoder_outs = model.seq2seq.encoder(x, lengths=input_lengths)

    # Off line decoding
    # NOTE(review): 3-value unpack here, while other tests in this file
    # unpack 4 values from the decoder — verify against the decoder API.
    mel_output_offline, alignments_offline, done = model.seq2seq.decoder(
        encoder_outs, mel_reshaped,
        text_positions=text_positions, frame_positions=frame_positions,
        lengths=input_lengths)

    _plot(mel, mel_output_offline, alignments_offline)

    # Online decoding (free running unless test_inputs is supplied)
    test_inputs = None
    # test_inputs = mel_reshaped
    model.seq2seq.decoder.start_fresh_sequence()
    mel_outputs, alignments, dones_online = \
        model.seq2seq.decoder.incremental_forward(
            encoder_outs, text_positions,
            # initial_input=mel_reshaped[:, :1, :],
            test_inputs=test_inputs)

    if test_inputs is not None:
        c = (mel_output_offline - mel_outputs).abs()
        print(c.mean(), c.max())
        _plot(mel, c, alignments)

    _plot(mel, mel_outputs, alignments)
def test_incremental_correctness():
    """Offline vs. incremental decoding must agree (r=4), with and without
    speaker conditioning.

    NOTE(review): this shadows an earlier function of the same name in this
    file; only this definition is collected by pytest.
    """
    texts = [
        "they discarded this for a more completely Roman and far less beautiful letter."
    ]
    seqs = np.array([text_to_sequence(t) for t in texts])
    text_positions = np.arange(1, len(seqs[0]) + 1).reshape(1, len(seqs[0]))

    # Skip gracefully when the reference mel spectrogram is not available;
    # the previous unconditional np.load crashed on any other machine.
    mel_path = "/home/ryuichi/Dropbox/sp/deepvoice3_pytorch/data/ljspeech/ljspeech-mel-00035.npy"
    if not exists(mel_path):
        return
    mel = np.load(mel_path)
    max_target_len = mel.shape[0]
    r = 4
    mel_dim = 80
    # Pad the target length up to a multiple of the reduction factor r.
    if max_target_len % r != 0:
        max_target_len += r - max_target_len % r
    assert max_target_len % r == 0
    mel = _pad_2d(mel, max_target_len)
    mel = Variable(torch.from_numpy(mel))
    mel_reshaped = mel.view(1, -1, mel_dim * r)
    frame_positions = np.arange(1, mel_reshaped.size(1) + 1).reshape(
        1, mel_reshaped.size(1))

    x = Variable(torch.LongTensor(seqs))
    text_positions = Variable(torch.LongTensor(text_positions))
    frame_positions = Variable(torch.LongTensor(frame_positions))

    # Exercise both the single-speaker and the multi-speaker model.
    for model, speaker_ids in [
            (_get_model(force_monotonic_attention=False), None),
            (_get_model(force_monotonic_attention=False,
                        n_speakers=32, speaker_embed_dim=16),
             Variable(torch.LongTensor([1])))]:
        model.eval()

        if speaker_ids is not None:
            speaker_embed = model.embed_speakers(speaker_ids)
        else:
            speaker_embed = None

        # Encoder
        encoder_outs = model.seq2seq.encoder(x, speaker_embed=speaker_embed)

        # Off line decoding
        mel_outputs_offline, alignments_offline, done, _ = model.seq2seq.decoder(
            encoder_outs, mel_reshaped, speaker_embed=speaker_embed,
            text_positions=text_positions, frame_positions=frame_positions)

        # Online decoding with test inputs (teacher forcing frame by frame)
        model.seq2seq.decoder.start_fresh_sequence()
        mel_outputs_online, alignments, dones_online, _ = \
            model.seq2seq.decoder.incremental_forward(
                encoder_outs, text_positions, speaker_embed=speaker_embed,
                test_inputs=mel_reshaped)

        # Should get same result (small float tolerance for the two paths)
        c = (mel_outputs_offline - mel_outputs_online).abs()
        print(c.mean(), c.max())

        assert np.allclose(mel_outputs_offline.cpu().data.numpy(),
                           mel_outputs_online.cpu().data.numpy(), atol=1e-5)