def visualize(alignment, postnet_output, stop_tokens, text, hop_length, CONFIG, decoder_output=None, output_path=None, figsize=(8, 24), output_fig=False): if decoder_output is not None: num_plot = 4 else: num_plot = 3 label_fontsize = 16 fig = plt.figure(figsize=figsize) plt.subplot(num_plot, 1, 1) plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None) plt.xlabel("Decoder timestamp", fontsize=label_fontsize) plt.ylabel("Encoder timestamp", fontsize=label_fontsize) # compute phoneme representation and back if CONFIG.use_phonemes: seq = phoneme_to_sequence( text, [CONFIG.text_cleaner], CONFIG.phoneme_language, CONFIG.enable_eos_bos_chars, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None) text = sequence_to_phoneme( seq, tp=CONFIG.characters if 'characters' in CONFIG.keys() else None) print(text) plt.yticks(range(len(text)), list(text)) plt.colorbar() # plot stopnet predictions plt.subplot(num_plot, 1, 2) plt.plot(range(len(stop_tokens)), list(stop_tokens)) # plot postnet spectrogram plt.subplot(num_plot, 1, 3) librosa.display.specshow(postnet_output.T, sr=CONFIG.audio['sample_rate'], hop_length=hop_length, x_axis="time", y_axis="linear", fmin=CONFIG.audio['mel_fmin'], fmax=CONFIG.audio['mel_fmax']) plt.xlabel("Time", fontsize=label_fontsize) plt.ylabel("Hz", fontsize=label_fontsize) plt.tight_layout() plt.colorbar() if decoder_output is not None: plt.subplot(num_plot, 1, 4) librosa.display.specshow(decoder_output.T, sr=CONFIG.audio['sample_rate'], hop_length=hop_length, x_axis="time", y_axis="linear", fmin=CONFIG.audio['mel_fmin'], fmax=CONFIG.audio['mel_fmax']) plt.xlabel("Time", fontsize=label_fontsize) plt.ylabel("Hz", fontsize=label_fontsize) plt.tight_layout() plt.colorbar() if output_path: print(output_path) fig.savefig(output_path) plt.close() if not output_fig: plt.close()
def _test_phoneme_to_sequence(self, add_blank): """Verify en-us sentence phonemes""" text_cleaner = ["phoneme_cleaners"] sequence = phoneme_to_sequence(EXAMPLE_TEXT, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) gt = EXPECTED_PHONEMES.replace("|", "") self.assertEqual(text_hat, text_hat_with_params) self.assertEqual(text_hat, gt) # multiple punctuations text = "Be a voice, not an! echo?" sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?" print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params) self.assertEqual(text_hat, gt) # not ending with punctuation text = "Be a voice, not an! echo" sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ" print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params) self.assertEqual(text_hat, gt) # original text = "Be a voice, not an echo!" sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) gt = "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!" print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params) self.assertEqual(text_hat, gt) # extra space after the sentence text = "Be a voice, not an! echo. " sequence = phoneme_to_sequence(text, text_cleaner, LANG, add_blank=add_blank, use_espeak_phonemes=True) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) gt = "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ." print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params) self.assertEqual(text_hat, gt) # extra space after the sentence text = "Be a voice, not an! echo. " sequence = phoneme_to_sequence(text, text_cleaner, LANG, enable_eos_bos=True, add_blank=add_blank, use_espeak_phonemes=True) text_hat = sequence_to_phoneme(sequence) text_hat_with_params = sequence_to_phoneme(sequence) gt = "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~" print(text_hat) print(len(sequence)) self.assertEqual(text_hat, text_hat_with_params) self.assertEqual(text_hat, gt)