def say(sentence):
    # Spell out digits (e.g. "3" -> "three") and drop characters outside the vocabulary.
    new_sentence = " ".join([num2words(w) if w.isdigit() else w for w in sentence.split()])
    normalized_sentence = "".join([c if c.lower() in vocab else '' for c in new_sentence])
    print(normalized_sentence)

    sentences = [normalized_sentence]
    max_N = len(normalized_sentence)
    L = torch.from_numpy(get_test_data(sentences, max_N))
    zeros = torch.from_numpy(np.zeros((1, hp.n_mels, 1), np.float32))
    Y = zeros
    A = None

    # Autoregressively generate mel frames until attention reaches the EOS symbol.
    for t in range(hp.max_T):
        _, Y_t, A = text2mel(L, Y, monotonic_attention=True)
        Y = torch.cat((zeros, Y_t), -1)
        _, attention = torch.max(A[0, :, -1], 0)
        attention = attention.item()
        if L[0, attention] == vocab.index('E'):  # EOS
            break

    # Upsample the mel spectrogram to a magnitude spectrogram and write it out.
    _, Z = ssrn(Y)
    i = 0
    Z = Z.cpu().detach().numpy()
    save_to_wav(Z[0, :, :].T, '%d.wav' % (i + 1))  # i is fixed to 0, so this writes '1.wav'
    playsound('1.wav')
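# Usage sketch (hypothetical sentence; assumes the models, vocab and helpers
# above are already loaded); uncomment to try it:
# say("I have 3 cats")  # "3" becomes "three" via num2words before synthesis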
sys.exit(1)  # early exit: the synthesis loop below is currently disabled

# Synthesize one sentence at a time because there is a batch-processing bug!
for i in range(len(SENTENCES)):
    sentences = [SENTENCES[i]]
    max_N = len(SENTENCES[i])
    L = torch.from_numpy(get_test_data(sentences, max_N))
    zeros = torch.from_numpy(np.zeros((1, hp.n_mels, 1), np.float32))
    Y = zeros
    A = None

    # Autoregressive text2mel loop; stop once attention reaches the EOS symbol.
    for t in tqdm(range(hp.max_T)):
        _, Y_t, A = text2mel(L, Y, monotonic_attention=True)
        Y = torch.cat((zeros, Y_t), -1)
        _, attention = torch.max(A[0, :, -1], 0)
        attention = attention.item()
        if L[0, attention] == vocab.index('E'):  # EOS
            break

    _, Z = ssrn(Y)
    Y = Y.cpu().detach().numpy()
    A = A.cpu().detach().numpy()
    Z = Z.cpu().detach().numpy()

    # Save attention, mel and magnitude plots plus the synthesized waveform.
    save_to_png('samples/%d-att.png' % (i + 1), A[0, :, :])
    save_to_png('samples/%d-mel.png' % (i + 1), Y[0, :, :])
    save_to_png('samples/%d-mag.png' % (i + 1), Z[0, :, :])
    save_to_wav(Z[0, :, :].T, 'samples/%d-wav.wav' % (i + 1))
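# A small refactoring sketch (hypothetical helper, not used by the scripts
# here): every synthesis loop in this file repeats the same stopping test,
# which checks whether the argmax of the newest attention column points at
# the 'E' (end-of-sentence) symbol in the vocabulary.
def reached_eos(L, A):
    # A has shape (batch, text_len, mel_steps); look at the last decoder step.
    attended_char = torch.max(A[0, :, -1], 0)[1].item()
    return L[0, attended_char].item() == vocab.index('E')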
# text2mel = text2mel.eval()
for sentence in SENTENCES:
    with torch.no_grad():
        L = torch.from_numpy(get_test_data(sentence)).to(device)
        zeros = torch.from_numpy(np.zeros((1, hp.n_mels, 1), np.float32)).to(device)
        Y = zeros
        # A = None
        # Generate until attention reaches the EOS symbol (no hard step limit here).
        while True:
            _, Y_t, A = text2mel(L, Y, monotonic_attention=True)
            Y = torch.cat((zeros, Y_t), -1)
            _, attention = torch.max(A[0, :, -1], 0)
            attention = attention.item()
            if L[0, attention] == vocab.index('E'):  # EOS
                print(f'{sentence} ok!')
                break

        _, Z = ssrn(Y)
        # Y = Y.cpu().detach().numpy()
        # A = A.cpu().detach().numpy()
        Z = Z.cpu().detach().numpy()

    if not os.path.isdir(f'samples/{folder}'):
        os.mkdir(f'samples/{folder}')
    if not os.path.isdir(f'samples/{folder}/{filename}'):
        os.mkdir(f'samples/{folder}/{filename}')
    # save_to_png('samples/%d-att.png' % (i + 1), A[0, :, :])
    # save_to_png('samples/%d-mel.png' % (i + 1), Y[0, :, :])
    # save_to_png('samples/%d-mag.png' % (i + 1), Z[0, :, :])
    save_to_wav(Z[0, :, :].T, f'samples/{folder}/{filename}/{sentence}.wav')
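# A more concise equivalent of the isdir/mkdir pairs above (sketch, Python
# 3.2+): os.makedirs creates intermediate directories and exist_ok=True makes
# it a no-op when the path already exists, as a later variant here already does.
# os.makedirs(f'samples/{folder}/{filename}', exist_ok=True)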
# Multi-speaker variant: text2mel additionally conditions on `speakers`.
for t in tqdm(range(hp.max_T)):
    _, Y_t, A = text2mel(L, Y, speakers, monotonic_attention=True)
    Y = torch.cat((zeros, Y_t), -1)
    _, attention = torch.max(A[0, :, -1], 0)
    attention = attention.item()
    if L[0, attention] == vocab.index('E'):  # EOS
        break

_, Z = ssrn(Y)
Y = Y.cpu().detach().numpy()
A = A.cpu().detach().numpy()
Z = Z.cpu().detach().numpy()

save_to_png('samples/%d-att.png' % (i + 1), A[0, :, :])
save_to_png('samples/%d-mel.png' % (i + 1), Y[0, :, :])
save_to_png('samples/%d-mag.png' % (i + 1), Z[0, :, :])

# Leftover attention-plotting experiment, kept disabled (note: it references
# `self`, so it would not run at module level as-is):
# import matplotlib.pyplot as plt
# a = self.embeddings(torch.tensor([x for x in range(10)]).cuda())
# fig, ax = plt.subplots()
# plt.imshow(A[0, :, :23])
# labels = [item.get_text() for item in ax.get_xticklabels()]
# labels = [x for x in sentence]
# # ax.set_yticklabels(labels[::-1])
# plt.show()
# save_to_wav(Z[0, :, :].T, 'samples/%d-wav.wav' % (i + 1))

print('saving for speaker: ', speaker)
save_to_wav(Z[0, :, :].T, 'samples/%d-%s-wav.wav' % ((i + 1), speaker))
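# A self-contained sketch of the attention plot the disabled block above was
# aiming for (hypothetical helper, not called anywhere). It assumes `A` is the
# numpy attention matrix of shape (1, text_len, mel_steps) computed above and
# `sentence` is the raw input text used to label the character axis.
def plot_attention(A, sentence):
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    ax.imshow(A[0, :len(sentence), :], aspect='auto')  # rows: characters, cols: decoder steps
    ax.set_yticks(range(len(sentence)))
    ax.set_yticklabels(list(sentence))
    ax.set_xlabel('decoder timestep')
    ax.set_ylabel('input character')
    plt.show()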
sentences = [SENTENCES[i]]
max_N = len(SENTENCES[i])
L = torch.from_numpy(get_test_data(sentences, max_N))
zeros = torch.from_numpy(np.zeros((1, hp.n_mels, 1), np.float32))
Y = zeros
A = None

for t in tqdm(range(hp.max_T)):
    _, Y_t, A = text2mel(L, Y, monotonic_attention=True)
    Y = torch.cat((zeros, Y_t), -1)
    _, attention = torch.max(A[0, :, -1], 0)
    attention = attention.item()
    if L[0, attention] == vocab.index('E'):  # EOS
        break

_, Z = ssrn(Y)
Y = Y.cpu().detach().numpy()
A = A.cpu().detach().numpy()
Z = Z.cpu().detach().numpy()

save_to_png('samples/samples_bea_sleepiness_3k/%d-att.png' % (i + 1), A[0, :, :])
save_to_png('samples/samples_bea_sleepiness_3k/%d-mel.png' % (i + 1), Y[0, :, :])
save_to_png('samples/samples_bea_sleepiness_3k/%d-mag.png' % (i + 1), Z[0, :, :])
save_to_wav(Z[0, :, :].T, 'samples/samples_bea_sleepiness_3k/%d-wav.wav' % (i + 1))
max_N = len(SENTENCES[i])
L = torch.from_numpy(get_test_data(sentences, max_N))
zeros = torch.from_numpy(np.zeros((1, hp.n_mels, 1), np.float32))
Y = zeros
A = None

for t in tqdm(range(hp.max_T)):
    _, Y_t, A = text2mel(L, Y, monotonic_attention=True)
    Y = torch.cat((zeros, Y_t), -1)
    _, attention = torch.max(A[0, :, -1], 0)
    attention = attention.item()
    if L[0, attention] == vocab.index('E'):  # EOS
        break

_, Z = ssrn(Y)
Y = Y.cpu().detach().numpy()
A = A.cpu().detach().numpy()
Z = Z.cpu().detach().numpy()
# print("Z", Z[0, :, :])

# Name the output directory after the text2mel and SSRN checkpoint steps.
save_dir = hp.synthesize_samples_dir
file_name = save_dir + '/' + t2mel_step_str + '_' + ssrn_step_str
os.makedirs(file_name, exist_ok=True)
save_to_png(file_name + '/%d-att.png' % (i + 1), A[0, :, :])
save_to_png(file_name + '/%d-mel.png' % (i + 1), Y[0, :, :])
save_to_png(file_name + '/%d-mag.png' % (i + 1), Z[0, :, :])
save_to_wav(Z[0, :, :].T, file_name + '/%d-wav.wav' % (i + 1))
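# Equivalent path construction with os.path.join (sketch; purely stylistic):
# file_name = os.path.join(save_dir, t2mel_step_str + '_' + ssrn_step_str)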