Exemple #1
0
def visualize(alignment,
              postnet_output,
              stop_tokens,
              text,
              hop_length,
              CONFIG,
              decoder_output=None,
              output_path=None,
              figsize=(8, 24),
              output_fig=False):
    """Plot alignment, stop-token predictions and spectrogram(s) in one figure.

    The figure has three vertically stacked panels (four when
    ``decoder_output`` is given): the attention alignment, the stop-token
    curve, the postnet spectrogram and, optionally, the decoder spectrogram.

    Args:
        alignment: 2-D array-like; its transpose is drawn with ``imshow`` so
            the x axis is labelled "Decoder timestamp" and the y axis
            "Encoder timestamp".
            NOTE(review): presumably shaped (decoder_steps, encoder_steps) --
            confirm against the caller.
        postnet_output: 2-D array-like; its transpose is passed to
            ``librosa.display.specshow``.
        stop_tokens: sequence of stop-token values, plotted against their
            index.
        text: input text. Its characters become the y tick labels of the
            alignment panel. When ``CONFIG.use_phonemes`` is true it is first
            converted to a phoneme sequence and back, and the phoneme string
            is used (and printed) instead.
        hop_length: hop length forwarded to ``specshow``.
        CONFIG: configuration object. Reads ``use_phonemes``,
            ``text_cleaner``, ``phoneme_language``, ``enable_eos_bos_chars``,
            the optional ``characters`` entry, and
            ``audio['sample_rate']`` / ``audio['mel_fmin']`` /
            ``audio['mel_fmax']``.
        decoder_output: optional 2-D array-like drawn as a fourth panel in the
            same way as ``postnet_output``.
        output_path: if truthy, the path is printed, the figure is saved
            there and then closed.
        figsize: figure size handed to ``plt.figure``.
        output_fig: if false, the figure is closed before returning. If true
            and no ``output_path`` is given, the figure is left open in
            pyplot.

    Returns:
        None.
    """
    num_plot = 4 if decoder_output is not None else 3
    label_fontsize = 16
    fig = plt.figure(figsize=figsize)

    def _draw_spectrogram(panel_index, spectrogram):
        # Draw one spectrogram panel; shared by the postnet and decoder outputs.
        plt.subplot(num_plot, 1, panel_index)
        librosa.display.specshow(spectrogram.T,
                                 sr=CONFIG.audio['sample_rate'],
                                 hop_length=hop_length,
                                 x_axis="time",
                                 y_axis="linear",
                                 fmin=CONFIG.audio['mel_fmin'],
                                 fmax=CONFIG.audio['mel_fmax'])
        plt.xlabel("Time", fontsize=label_fontsize)
        plt.ylabel("Hz", fontsize=label_fontsize)
        plt.tight_layout()
        plt.colorbar()

    # plot the attention alignment
    plt.subplot(num_plot, 1, 1)
    plt.imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
    plt.xlabel("Decoder timestamp", fontsize=label_fontsize)
    plt.ylabel("Encoder timestamp", fontsize=label_fontsize)
    # compute phoneme representation and back
    if CONFIG.use_phonemes:
        # The custom character set is optional in the config.
        custom_chars = CONFIG.characters if 'characters' in CONFIG.keys() else None
        seq = phoneme_to_sequence(
            text, [CONFIG.text_cleaner],
            CONFIG.phoneme_language,
            CONFIG.enable_eos_bos_chars,
            tp=custom_chars)
        text = sequence_to_phoneme(seq, tp=custom_chars)
        print(text)
    plt.yticks(range(len(text)), list(text))
    plt.colorbar()

    # plot stopnet predictions
    plt.subplot(num_plot, 1, 2)
    plt.plot(range(len(stop_tokens)), list(stop_tokens))

    # plot postnet spectrogram
    _draw_spectrogram(3, postnet_output)

    # plot decoder spectrogram, when provided
    if decoder_output is not None:
        _draw_spectrogram(4, decoder_output)

    if output_path:
        print(output_path)
        fig.savefig(output_path)

    # Close exactly this figure, and only once. A bare plt.close() acts on the
    # *current* figure, so calling it a second time after this figure is
    # already gone could close an unrelated figure the caller still has open.
    if output_path or not output_fig:
        plt.close(fig)
    def _test_phoneme_to_sequence(self, add_blank):
        """Round-trip en-us sentences through the phonemizer and check the result.

        Every sentence is encoded with ``phoneme_to_sequence`` (espeak
        phonemes, ``add_blank`` forwarded as given), decoded twice with
        ``sequence_to_phoneme``, and the decoded text is compared with both
        the second decoding and the expected phoneme string.
        """
        cleaners = ["phoneme_cleaners"]

        def roundtrip(sentence, **extra):
            # Encode once, decode twice; returns (ids, decoded, decoded_again).
            ids = phoneme_to_sequence(sentence,
                                      cleaners,
                                      LANG,
                                      add_blank=add_blank,
                                      use_espeak_phonemes=True,
                                      **extra)
            return ids, sequence_to_phoneme(ids), sequence_to_phoneme(ids)

        # Reference sentence: compared against the module-level expectation,
        # with the "|" separators stripped. Nothing is printed for this case.
        _, decoded, decoded_again = roundtrip(EXAMPLE_TEXT)
        expected = EXPECTED_PHONEMES.replace("|", "")
        self.assertEqual(decoded, decoded_again)
        self.assertEqual(decoded, expected)

        # (input sentence, extra phoneme_to_sequence kwargs, expected phonemes)
        cases = (
            # multiple punctuations
            ("Be a voice, not an! echo?", {},
             "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ?"),
            # not ending with punctuation
            ("Be a voice, not an! echo", {},
             "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ"),
            # original
            ("Be a voice, not an echo!", {},
             "biː ɐ vɔɪs, nɑːt ɐn ɛkoʊ!"),
            # extra space after the sentence
            ("Be a voice, not an! echo.  ", {},
             "biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ."),
            # extra space after the sentence, with BOS/EOS markers enabled
            ("Be a voice, not an! echo.  ", {"enable_eos_bos": True},
             "^biː ɐ vɔɪs, nɑːt ɐn! ɛkoʊ.~"),
        )
        for sentence, extra, expected in cases:
            ids, decoded, decoded_again = roundtrip(sentence, **extra)
            print(decoded)
            print(len(ids))
            self.assertEqual(decoded, decoded_again)
            self.assertEqual(decoded, expected)