def synthesis(m, s, CONFIG, use_cuda, ap, language=None):
    """Synthesise audio for the text ``s`` using model ``m`` and ``wavernn``.

    The text is converted to an integer id sequence, passed through
    ``m.forward`` and the resulting mel spectrogram is handed to the
    module-level ``wavernn`` object to generate the waveform.

    Args:
        m: Model whose ``forward`` accepts a batch of integer ids and returns
            the 4-tuple ``(mel_spec, linear_spec, alignments, stop_tokens)``.
        s: Input text to synthesise.
        CONFIG: Configuration object; ``phoneme_language``, ``text_cleaner``
            and ``use_phonemes`` are read from it.
        use_cuda: When true, the input id tensor is moved to CUDA.
        ap: Not used by this variant; kept so the signature stays compatible
            with existing callers.
        language: Phoneme language; defaults to ``CONFIG.phoneme_language``.

    Returns:
        The value returned by ``wavernn.generate`` for the predicted mel
        spectrogram.
    """
    if language is None:
        language = CONFIG.phoneme_language
    text_cleaner = [CONFIG.text_cleaner]

    if CONFIG.use_phonemes:
        seq = phoneme_to_sequence(s, text_cleaner, language)
    else:
        seq = text_to_sequence(s, text_cleaner)
    seq = np.asarray(seq, dtype=np.int32)

    # Add a leading batch dimension of size 1.
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()

    # Inference only: no autograd graph is needed for the forward pass.
    # Only the mel output is consumed by this variant; the linear
    # spectrogram, alignments and stop tokens are intentionally discarded.
    with torch.no_grad():
        mel_spec, _, _, _ = m.forward(chars_var.long())
    mel_spec = mel_spec[0].detach().cpu().numpy()

    # Transpose the first batch item and re-add a batch dimension.
    mel_tensor = torch.FloatTensor(mel_spec.T).unsqueeze(0)
    # NOTE(review): this checks torch.cuda.is_available() rather than
    # ``use_cuda``; presumably it mirrors the device ``wavernn`` was placed
    # on -- confirm against the code that creates ``wavernn``.
    if torch.cuda.is_available():
        mel_tensor = mel_tensor.cuda()

    wav = wavernn.generate(mel_tensor, batched=True, target=11000, overlap=550)
    return wav
def visualize(alignment, spectrogram_postnet, stop_tokens, text, hop_length,
              CONFIG, spectrogram=None, output_path=None):
    """Draw alignment, stop-token and spectrogram panels in one figure.

    Panels, top to bottom: the transposed ``alignment`` image, the
    ``stop_tokens`` curve, ``spectrogram_postnet`` and, only when it is
    given, ``spectrogram``.

    Args:
        alignment: Array-like shown (transposed) with ``plt.imshow``.
        spectrogram_postnet: Array-like shown (transposed) with
            ``librosa.display.specshow``.
        stop_tokens: Tensor; squeezed, detached and moved to CPU before
            plotting.
        text: Input text used for the y tick labels. When
            ``CONFIG.use_phonemes`` is set it is first round-tripped through
            ``phoneme_to_sequence`` / ``sequence_to_phoneme``.
        hop_length: Hop length forwarded to ``specshow``.
        CONFIG: Configuration object; ``use_phonemes``, ``text_cleaner``,
            ``phoneme_language``, ``enable_eos_bos_chars`` and
            ``audio['sample_rate']`` are read from it.
        spectrogram: Optional second spectrogram for a fourth panel.
        output_path: When truthy, the path is printed, the figure is saved
            there and the current figure is closed.
    """
    # The third panel is always present; the fourth one is optional.
    spec_panels = [spectrogram_postnet]
    if spectrogram is not None:
        spec_panels.append(spectrogram)
    num_plot = len(spec_panels) + 2
    label_fontsize = 16

    fig = plt.figure(figsize=(8, 24))

    # Panel 1: alignment.
    plt.subplot(num_plot, 1, 1)
    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
    if CONFIG.use_phonemes:
        # Label the axis with the phoneme string instead of the raw text.
        phoneme_ids = phoneme_to_sequence(text, [CONFIG.text_cleaner],
                                          CONFIG.phoneme_language,
                                          CONFIG.enable_eos_bos_chars)
        text = sequence_to_phoneme(phoneme_ids)
        print(text)
    plt.yticks(range(len(text)), list(text))
    plt.colorbar()

    # Panel 2: stop-token predictions.
    stop_values = stop_tokens.squeeze().detach().to('cpu').numpy()
    plt.subplot(num_plot, 1, 2)
    plt.plot(range(len(stop_values)), list(stop_values))

    # Panels 3 (and 4): spectrograms, all drawn the same way.
    for panel_index, spec in enumerate(spec_panels, start=3):
        plt.subplot(num_plot, 1, panel_index)
        librosa.display.specshow(spec.T,
                                 sr=CONFIG.audio['sample_rate'],
                                 hop_length=hop_length,
                                 x_axis="time",
                                 y_axis="linear")
        plt.xlabel("Time", fontsize=label_fontsize)
        plt.ylabel("Hz", fontsize=label_fontsize)
        plt.tight_layout()
        plt.colorbar()

    if output_path:
        print(output_path)
        fig.savefig(output_path)
        plt.close()
def _generate_and_cache_phoneme_sequence(self, text, cache_path):
    """Convert ``text`` to a phoneme id array and store it at ``cache_path``.

    BOS/EOS characters are deliberately never added here because the result
    is cached for later reuse; they are added dynamically afterwards,
    depending on the config option.

    Args:
        text: Text to convert.
        cache_path: Destination passed to ``np.save``.

    Returns:
        The phoneme ids as an ``np.int32`` array (the same array that was
        saved).
    """
    phoneme_ids = phoneme_to_sequence(
        text,
        [self.cleaners],
        language=self.phoneme_language,
        enable_eos_bos=False,
    )
    encoded = np.asarray(phoneme_ids, dtype=np.int32)
    np.save(cache_path, encoded)
    return encoded
def synthesis(m, s, CONFIG, use_cuda, ap, language=None):
    """Synthesise audio for the text ``s`` using model ``m`` and ``ap``.

    The text is converted to an integer id sequence, passed through
    ``m.forward`` and the resulting linear spectrogram is inverted to a
    waveform with ``ap.inv_spectrogram``.

    Args:
        m: Model whose ``forward`` accepts a batch of integer ids and returns
            the 4-tuple ``(mel_spec, linear_spec, alignments, stop_tokens)``.
        s: Input text to synthesise.
        CONFIG: Configuration object; ``phoneme_language``, ``text_cleaner``
            and ``use_phonemes`` are read from it.
        use_cuda: When true, the input id tensor is moved to CUDA.
        ap: Audio processor providing ``inv_spectrogram`` and
            ``find_endpoint``.
        language: Phoneme language; defaults to ``CONFIG.phoneme_language``.

    Returns:
        The waveform returned by ``ap.inv_spectrogram``, cut off at the index
        returned by ``ap.find_endpoint``.
    """
    if language is None:
        language = CONFIG.phoneme_language
    text_cleaner = [CONFIG.text_cleaner]

    if CONFIG.use_phonemes:
        seq = phoneme_to_sequence(s, text_cleaner, language)
    else:
        seq = text_to_sequence(s, text_cleaner)
    seq = np.asarray(seq, dtype=np.int32)

    # Add a leading batch dimension of size 1.
    chars_var = torch.from_numpy(seq).unsqueeze(0)
    if use_cuda:
        chars_var = chars_var.cuda()

    # Inference only: no autograd graph is needed for the forward pass.
    # Only the linear output is consumed by this variant; the mel
    # spectrogram, alignments and stop tokens are intentionally discarded.
    with torch.no_grad():
        _, linear_spec, _, _ = m.forward(chars_var.long())
    linear_spec = linear_spec[0].detach().cpu().numpy()

    wav = ap.inv_spectrogram(linear_spec.T)
    # Keep only the samples before the endpoint reported by the processor.
    return wav[:ap.find_endpoint(wav)]
def visualize(alignment, postnet_output, stop_tokens, text, hop_length,
              CONFIG, decoder_output=None, output_path=None,
              figsize=(8, 24)):
    """Draw alignment, stop-token and spectrogram panels in one figure.

    Panels, top to bottom: the transposed ``alignment`` image, the
    ``stop_tokens`` curve, ``postnet_output`` and, only when it is given,
    ``decoder_output``.

    Args:
        alignment: Array-like shown (transposed) with ``plt.imshow``.
        postnet_output: Array-like shown (transposed) with
            ``librosa.display.specshow``.
        stop_tokens: Sequence plotted as-is against its index.
        text: Input text used for the y tick labels. When
            ``CONFIG.use_phonemes`` is set it is first round-tripped through
            ``phoneme_to_sequence`` / ``sequence_to_phoneme``.
        hop_length: Hop length forwarded to ``specshow``.
        CONFIG: Configuration object; ``use_phonemes``, ``text_cleaner``,
            ``phoneme_language``, ``enable_eos_bos_chars``, the optional
            ``characters`` entry and ``audio`` (``sample_rate``,
            ``mel_fmin``, ``mel_fmax``) are read from it.
        decoder_output: Optional second spectrogram for a fourth panel.
        output_path: When truthy, the path is printed, the figure is saved
            there and the current figure is closed.
        figsize: Figure size passed to ``plt.figure``.
    """
    # The third panel is always present; the fourth one is optional.
    spec_panels = [postnet_output]
    if decoder_output is not None:
        spec_panels.append(decoder_output)
    num_plot = len(spec_panels) + 2
    label_fontsize = 16

    fig = plt.figure(figsize=figsize)

    # Panel 1: alignment.
    plt.subplot(num_plot, 1, 1)
    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
    if CONFIG.use_phonemes:
        # Convert the text to phoneme ids and back so the tick labels show
        # the phoneme string; the custom character set is used if configured.
        char_set = CONFIG.characters if 'characters' in CONFIG.keys() else None
        phoneme_ids = phoneme_to_sequence(text, [CONFIG.text_cleaner],
                                          CONFIG.phoneme_language,
                                          CONFIG.enable_eos_bos_chars,
                                          tp=char_set)
        text = sequence_to_phoneme(phoneme_ids, tp=char_set)
        print(text)
    plt.yticks(range(len(text)), list(text))
    plt.colorbar()

    # Panel 2: stopnet predictions.
    plt.subplot(num_plot, 1, 2)
    plt.plot(range(len(stop_tokens)), list(stop_tokens))

    # Panels 3 (and 4): postnet / decoder spectrograms, drawn the same way.
    for panel_index, spec in enumerate(spec_panels, start=3):
        plt.subplot(num_plot, 1, panel_index)
        librosa.display.specshow(spec.T,
                                 sr=CONFIG.audio['sample_rate'],
                                 hop_length=hop_length,
                                 x_axis="time",
                                 y_axis="linear",
                                 fmin=CONFIG.audio['mel_fmin'],
                                 fmax=CONFIG.audio['mel_fmax'])
        plt.xlabel("Time", fontsize=label_fontsize)
        plt.ylabel("Hz", fontsize=label_fontsize)
        plt.tight_layout()
        plt.colorbar()

    if output_path:
        print(output_path)
        fig.savefig(output_path)
        plt.close()