def __init__(self,
                 text_cleaners=None,
                 use_phonemes=True,
                 n_jobs=1,
                 with_stress=True):
        """
        Text sequences preprocessor with G2P support.
        :param text_cleaners: list of text cleaner names; ``None`` (the default) selects
            ``['basic_cleaners']``. Cleaner types:
            * `basic_cleaners`: basic pipeline that lowercases and collapses whitespace without transliteration.
            * `transliteration_cleaners`: pipeline for non-English text that transliterates to ASCII.
            * `english_cleaners`: pipeline for English text, including number and abbreviation expansion.
        :param use_phonemes: when truthy the symbol table is built from `_PHONEMES`,
            otherwise from `_GRAPHEMES`
        :param n_jobs: number of workers for phonemization
        :param with_stress: set `True` to stress words during phonemization
        """
        # Build the default list per call: a list literal in the signature would be
        # one shared object mutated by every instance that touches `text_cleaners`.
        if text_cleaners is None:
            text_cleaners = ['basic_cleaners']

        self.text_cleaners = text_cleaners
        self.use_phonemes = use_phonemes
        self.n_jobs = n_jobs
        self.with_stress = with_stress

        # Character inventory depends on whether phonemes or graphemes are modelled.
        CHARS = _PHONEMES if self.use_phonemes else _GRAPHEMES

        self.SYMBOLS = [_PAD, _EOS, _SPACE] + _PUNCTUATIONS + _NUMBERS + CHARS

        # Mappings from symbol to numeric ID and vice versa:
        self._symbol_to_id = {s: i for i, s in enumerate(self.SYMBOLS)}
        self._id_to_symbol = {i: s for i, s in enumerate(self.SYMBOLS)}

        # Separator handed to the phonemizer; syllables are not marked.
        self._separator = Separator(word=_WORD_SEP,
                                    syllable='',
                                    phone=_PHONEME_SEP)
Example #2
0
def test_french():
    """Phonemize a French sentence with espeak using a custom word separator."""
    espeak = EspeakBackend('fr-fr')
    sentences = ['bonjour le monde']
    separator = Separator(word=';eword ', syllable=None, phone=' ')

    # The third positional argument (False) is forwarded unchanged;
    # presumably it is the `strip` flag -- confirm against the backend API.
    result = espeak.phonemize(sentences, separator, False)

    assert result == ['b ɔ̃ ʒ u ʁ ;eword l ə ;eword m ɔ̃ d ;eword ']
Example #3
0
    def __init__(
        self,
        backend,
        word_separator: Optional[str] = None,
        syllable_separator: Optional[str] = None,
        phone_separator: Optional[str] = " ",
        strip=False,
        split_by_single_token: bool = False,
        **phonemizer_kwargs,
    ):
        """Create the separator and instantiate the requested phonemizer backend.

        :param backend: key looked up in ``phonemizer.backend.BACKENDS``
        :param word_separator: word separator given to ``Separator``
        :param syllable_separator: syllable separator given to ``Separator``
        :param phone_separator: phone separator given to ``Separator``
        :param strip: stored on the instance as ``self.strip``
        :param split_by_single_token: stored on the instance
        :param phonemizer_kwargs: extra keyword arguments for the backend class
        """
        # Lazy imports: phonemizer is only needed once an instance is built.
        from phonemizer.backend import BACKENDS
        from phonemizer.separator import Separator

        self.separator = Separator(
            phone=phone_separator,
            syllable=syllable_separator,
            word=word_separator,
        )

        # Hand the backend a logger limited to ERROR so that phonemizer
        # warnings are suppressed.
        quiet_logger = logging.getLogger("phonemizer")
        quiet_logger.setLevel(logging.ERROR)

        backend_cls = BACKENDS[backend]
        self.phonemizer = backend_cls(**phonemizer_kwargs, logger=quiet_logger)

        self.strip = strip
        self.split_by_single_token = split_by_single_token
Example #4
0
def test_separator_3():
    """Segments backend with word=' ' and phone='_', with and without strip."""
    cree = SegmentsBackend('cree')
    utterances = ['achi acho']
    sep = Separator(word=' ', syllable=None, phone='_')

    unstripped = cree.phonemize(utterances, separator=sep)
    assert unstripped == [u'ʌ_tʃ_ɪ_ ʌ_tʃ_ʊ_ ']

    stripped = cree.phonemize(utterances, separator=sep, strip=True)
    assert stripped == [u'ʌ_tʃ_ɪ ʌ_tʃ_ʊ']
def test_no_switch(policy, caplog):
    """Text without language switches yields the same output and no log records."""
    sentences = ["j'aime l'anglais", "tu parles le français"]
    espeak = EspeakBackend('fr-fr', language_switch=policy)

    phonemized = espeak.phonemize(sentences, separator=Separator(), strip=True)
    assert phonemized == ['ʒɛm lɑ̃ɡlɛ', 'ty paʁl lə fʁɑ̃sɛ']

    # Third field of each captured record tuple is the message text.
    logged = [record[2] for record in caplog.record_tuples]
    assert not logged
Example #6
0
def test_separator_5():
    """Segments backend with phone=' ' and word='_', with and without strip."""
    cree = SegmentsBackend('cree')
    utterances = ['achi acho']
    sep = Separator(phone=' ', word='_')

    unstripped = cree.phonemize(utterances, separator=sep)
    assert unstripped == [u'ʌ tʃ ɪ _ʌ tʃ ʊ _']

    stripped = cree.phonemize(utterances, separator=sep, strip=True)
    assert stripped == [u'ʌ tʃ ɪ_ʌ tʃ ʊ']
Example #7
0
def test_separator_4():
    """Segments backend with phone=' ' and an empty word separator."""
    cree = SegmentsBackend('cree')
    utterances = ['achi acho']

    # TODO bug when sep.phone == ' ' with no sep.word
    sep = Separator(phone=' ', word='')

    unstripped = cree.phonemize(utterances, separator=sep)
    assert unstripped == [u'ʌ tʃ ɪ ʌ tʃ ʊ ']

    stripped = cree.phonemize(utterances, separator=sep, strip=True)
    assert stripped == [u'ʌ tʃ ɪʌ tʃ ʊ']
Example #8
0
def test_tie_simple(caplog, tie, expected):
    """Phonemize 'Jackie Chan' with the given tie setting and check the log."""
    espeak = EspeakBackend('en-us', tie=tie)
    sep = Separator(word=' ', phone='_')

    first_utterance = espeak.phonemize(['Jackie Chan'], separator=sep)[0]
    assert first_utterance == expected

    if not tie:
        return

    # With a tie requested, a message about the ignored phone separator
    # must have been logged.
    logged = [record[2] for record in caplog.record_tuples]
    warning = 'cannot use ties AND phone separation, ignoring phone separator'
    assert warning in logged
Example #9
0
def test_phone_separator_simple():
    """Espeak output with phone='_' for both strip=True and strip=False."""
    sentences = ['The lion and the tiger ran']
    sep = Separator(phone='_')
    espeak = EspeakBackend('en-us')

    # (strip flag, expected phonemization), checked in this order.
    cases = (
        (True, ['ð_ə l_aɪə_n æ_n_d ð_ə t_aɪ_ɡ_ɚ ɹ_æ_n']),
        (False, ['ð_ə_ l_aɪə_n_ æ_n_d_ ð_ə_ t_aɪ_ɡ_ɚ_ ɹ_æ_n_ ']),
    )
    for strip, wanted in cases:
        produced = espeak.phonemize(sentences, separator=sep, strip=strip)
        assert wanted == produced
Example #10
0
def test_arabic():
    """Phonemize an Arabic greeting; the expected output depends on the espeak version."""
    espeak = EspeakBackend('ar')
    sentences = ['السلام عليكم']
    separator = Separator()

    # Arabic seems to have changed starting at espeak-ng-1.49.3
    is_recent = EspeakBackend.version() >= (1, 49, 3)
    wanted = ['ʔassalaːm ʕliːkm '] if is_recent else ['ʔassalaam ʕaliijkum ']

    produced = espeak.phonemize(sentences, separator, False)
    assert produced == wanted
Example #11
0
def _phonemize(text, language):
    """Phonemize ``text`` with espeak, falling back to Epitran on RuntimeError.

    :param text: text passed to ``phonemize`` (or to ``Epitran.transliterate``
        in the fallback path)
    :param language: language code given to whichever engine is used
    :return: the phonemized text with its first newline replaced by a space
    """
    try:
        separators = Separator(word=' ', phone='')
        phonemes = phonemize(text,
                             separator=separators,
                             backend='espeak',
                             language=language)
    except RuntimeError:
        # espeak could not handle the request; use Epitran instead.
        epi = epitran.Epitran(language)
        phonemes = epi.transliterate(text, normpunc=True)
    # str.replace returns a new string. The previous code dropped that
    # result, so the newline was never actually replaced.
    return phonemes.replace('\n', ' ', 1)
Example #12
0
def test_french_sampa(backend):
    """SAMPA output for French with phone=' ' and no word separator."""
    sentences = ['bonjour le monde']
    sep = Separator(word=None, phone=' ')

    # (strip flag, expected phonemization), checked in this order.
    cases = (
        (False, ['b o~ Z u R l @ m o~ d ']),
        (True, ['b o~ Z u R l @ m o~ d']),
    )
    for strip, wanted in cases:
        produced = backend.phonemize(sentences, separator=sep, strip=strip)
        assert produced == wanted

    # An empty string and a lone double quote both phonemize to ''.
    for degenerate in ('', '"'):
        assert backend.phonemize(
            [degenerate], separator=sep, strip=True) == ['']
def test_language_switch_remove_utterance(caplog, langswitch_text, njobs):
    """The 'remove-utterance' policy blanks utterances with language switches."""
    espeak = EspeakBackend('fr-fr', language_switch='remove-utterance')
    phonemized = espeak.phonemize(
        langswitch_text, separator=Separator(), strip=True, njobs=njobs)
    assert phonemized == ['ʒɛm lɑ̃ɡlɛ', '', '', '', '']

    logged = [record[2] for record in caplog.record_tuples]
    removal_notice = ('removed 4 utterances containing language switches '
                      '(applying "remove-utterance" policy)')
    assert removal_notice in logged

    # An unknown policy name is rejected when the backend is constructed.
    with pytest.raises(RuntimeError):
        EspeakBackend('fr-fr', language_switch='foo')
def extract_phonemes(filename):
    """Phonemize the text in ``filename`` with festival and write it to a ``.phones`` file.

    The output path is ``filename`` with every ``".txt"`` replaced by
    ``".phones"``.
    """
    from phonemizer.phonemize import phonemize
    from phonemizer.backend import FestivalBackend
    from phonemizer.separator import Separator

    with open(filename) as source:
        text = source.read()

    # Phones separated by spaces; no syllable or word markers.
    phone_only = Separator(phone=' ', syllable='', word='')
    phones = phonemize(text,
                       language='en-us',
                       backend='festival',
                       separator=phone_only)

    target = filename.replace(".txt", ".phones")
    with open(target, "w") as outfile:
        print(phones, file=outfile)
Example #15
0
    def __init__(
        self,
        word_separator: Optional[str] = None,
        syllable_separator: Optional[str] = None,
        **phonemize_kwargs,
    ):
        """Store the ``phonemize`` function, its separator and extra keyword arguments.

        :param word_separator: word separator given to ``Separator``
        :param syllable_separator: syllable separator given to ``Separator``
        :param phonemize_kwargs: kept as ``self.phonemize_kwargs``
        """
        # Lazy imports: phonemizer is only needed once an instance is built.
        from phonemizer import phonemize
        from phonemizer.separator import Separator

        self.phonemize = phonemize
        # The phone separator is always a single space here.
        self.separator = Separator(
            phone=" ",
            syllable=syllable_separator,
            word=word_separator,
        )
        self.phonemize_kwargs = phonemize_kwargs
def test_language_switch_remove_flags(caplog, langswitch_text, njobs):
    """The 'remove-flags' policy keeps the utterances but drops the switch flags."""
    espeak = EspeakBackend('fr-fr', language_switch='remove-flags')
    phonemized = espeak.phonemize(
        langswitch_text, separator=Separator(), strip=True, njobs=njobs)

    wanted = [
        'ʒɛm lɑ̃ɡlɛ',
        'ʒɛm lə fʊtbɔːl',
        'fʊtbɔːl',
        'syʁtu lə ɹiəl madʁid',
        'nytiliz pa ɡuːɡəl',
    ]
    assert phonemized == wanted

    logged = [record[2] for record in caplog.record_tuples]
    switch_notice = (
        '4 utterances containing language switches on lines 2, 3, 4, 5')
    policy_notice = ('language switch flags have been removed '
                     '(applying "remove-flags" policy)')
    assert switch_notice in logged
    assert policy_notice in logged
Example #17
0
def extract_phonemes(filename):
    """Phonemize ``filename`` with festival and write the phones under a relocated path.

    The output path is derived from ``filename`` by replacing
    ``'/data06_2/'`` with ``'/data07/zhoukun/'`` and ``".txt"`` with
    ``".phones"``.
    """
    from phonemizer.phonemize import phonemize

    from phonemizer.backend import FestivalBackend

    from phonemizer.separator import Separator
    #FestivalBackend.set_festival_path("/home/zhoukun/merlin/tools/festival/src/main/festival")
    with open(filename) as source:
        text = source.read()

    phone_only = Separator(phone=' ', syllable='', word='')
    phones = phonemize(
        text, language='en-us', backend='festival', separator=phone_only)

    relocated = filename.replace('/data06_2/', '/data07/zhoukun/')
    # Dropping the last 13 characters presumably leaves the parent
    # directory of the file -- verify against the dataset's file naming.
    target_dir = relocated[:-13]
    if not os.path.isdir(target_dir):
        os.mkdir(target_dir)

    with open(relocated.replace(".txt", ".phones"), "w") as outfile:
        print(phones, file=outfile)
Example #18
0
    def phonemize(self, text: str, phonemizer_lang: Optional[str] = None) -> str:
        """Phonemize ``text`` with ``self.backend`` and return the stripped result.

        If ``phonemizer_lang`` is given and differs from
        ``self.phonemizer_lang``, ``self.init_backend`` is called with it
        before phonemizing.
        """
        from phonemizer.separator import Separator

        # The word delimiter is followed by a space when one is configured.
        if self.word_delimiter_token is None:
            word_delimiter = ""
        else:
            word_delimiter = self.word_delimiter_token + " "

        wants_other_lang = (
            phonemizer_lang is not None
            and phonemizer_lang != self.phonemizer_lang
        )
        if wants_other_lang:
            self.init_backend(phonemizer_lang)
        else:
            phonemizer_lang = self.phonemizer_lang

        separator = Separator(
            word=word_delimiter,
            syllable="",
            phone=self.phone_delimiter_token,
        )
        # The backend works on a list of utterances; only one is sent.
        results = self.backend.phonemize([text], separator=separator)
        return results[0].strip()
Example #19
0
def test_bad_backend():
    """Invalid backend names or tie/backend combinations raise RuntimeError."""
    # Backend names expected to be rejected.
    for unknown in ('fetiv', 'foo'):
        with pytest.raises(RuntimeError):
            phonemize('', backend=unknown)

    # tie=True is expected to be rejected with these backends.
    for backend_name in ('festival', 'mbrola', 'segments'):
        with pytest.raises(RuntimeError):
            phonemize('', tie=True, backend=backend_name)

    # tie=True with espeak and this explicit separator is expected to fail.
    with pytest.raises(RuntimeError):
        phonemize('',
                  tie=True,
                  backend='espeak',
                  separator=Separator(' ', None, '-'))
def test_language_switch_default(caplog, langswitch_text, njobs):
    """Without an explicit policy the language switch flags are kept."""
    # default behavior is to keep the flags
    espeak = EspeakBackend('fr-fr')
    phonemized = espeak.phonemize(
        langswitch_text, separator=Separator(), strip=True, njobs=njobs)

    wanted = [
        'ʒɛm lɑ̃ɡlɛ',
        'ʒɛm lə (en)fʊtbɔːl(fr)',
        '(en)fʊtbɔːl(fr)',
        'syʁtu lə (en)ɹiəl(fr) madʁid',
        'nytiliz pa (en)ɡuːɡəl(fr)',
    ]
    assert phonemized == wanted

    logged = [record[2] for record in caplog.record_tuples]
    switch_notice = (
        '4 utterances containing language switches on lines 2, 3, 4, 5')
    policy_notice = (
        'language switch flags have been kept (applying "keep-flags" policy)')
    assert switch_notice in logged
    assert policy_notice in logged
Example #21
0
    def __init__(
        self,
        word_separator: Optional[str] = None,
        syllable_separator: Optional[str] = None,
        phone_separator: Optional[str] = " ",
        split_by_single_token: bool = False,
        **phonemize_kwargs,
    ):
        """Store the ``phonemize`` function, its separator and call options.

        :param word_separator: word separator given to ``Separator``
        :param syllable_separator: syllable separator given to ``Separator``
        :param phone_separator: phone separator given to ``Separator``
        :param split_by_single_token: stored on the instance
        :param phonemize_kwargs: kept as ``self.phonemize_kwargs``
        """
        # Lazy imports: phonemizer is only needed once an instance is built.
        from phonemizer import phonemize
        from phonemizer.separator import Separator

        self.phonemize = phonemize
        self.separator = Separator(
            phone=phone_separator,
            syllable=syllable_separator,
            word=word_separator,
        )
        self.phonemize_kwargs = phonemize_kwargs
        self.split_by_single_token = split_by_single_token
Example #22
0
def ipa_phonemize(text, lang="en-us", use_g2p=False):
    """Phonemize ``text`` with g2p_en or with phonemizer's espeak backend.

    :param text: text to phonemize
    :param lang: language code; must be ``"en-us"`` when ``use_g2p`` is set
    :param use_g2p: use ``g2p_en`` instead of ``phonemizer``
    :return: with ``use_g2p``, the g2p tokens joined by spaces, with each
        ``" "`` token replaced by ``"|"``; otherwise the value returned by
        ``phonemizer.phonemize`` called with word separator ``"| "`` and
        phone separator ``" "``
    :raises ImportError: if the package needed for the chosen path is missing
    :raises AssertionError: if ``use_g2p`` is set and ``lang`` is not ``"en-us"``
    """
    if use_g2p:
        assert lang == "en-us", "g2pE phonemizer only works for en-us"
        # Only the import is guarded, so an ImportError raised while
        # phonemizing is not misreported as a missing package.
        try:
            from g2p_en import G2p
        except ImportError as err:
            raise ImportError(
                "Please install g2p_en: pip install g2p_en") from err
        g2p = G2p()
        return " ".join("|" if p == " " else p for p in g2p(text))

    try:
        from phonemizer import phonemize
        from phonemizer.separator import Separator
    except ImportError as err:
        raise ImportError(
            "Please install phonemizer: pip install phonemizer") from err
    return phonemize(text,
                     backend='espeak',
                     language=lang,
                     separator=Separator(word="| ", phone=" "))
Example #23
0
    def phonemize(self, text: str, phonemizer_lang: Optional[str] = None) -> str:
        """Phonemize ``text`` with the ``phonemizer`` package and strip the result.

        ``phonemizer_lang`` falls back to ``self.phonemizer_lang`` when it is
        ``None``. Language switch flags are removed via the
        ``"remove-flags"`` policy.
        """
        requires_backends(self, "phonemizer")

        from phonemizer import phonemize
        from phonemizer.separator import Separator

        # The word delimiter is followed by a space when one is configured.
        if self.word_delimiter_token is None:
            word_delimiter = ""
        else:
            word_delimiter = self.word_delimiter_token + " "

        if phonemizer_lang is None:
            phonemizer_lang = self.phonemizer_lang

        separator = Separator(
            word=word_delimiter,
            syllable="",
            phone=self.phone_delimiter_token,
        )
        raw_phonemes = phonemize(
            text,
            language=phonemizer_lang,
            backend=self.phonemizer_backend,
            separator=separator,
            language_switch="remove-flags",
        )
        return raw_phonemes.strip()
Example #24
0
    def phonemize(
        cls,
        text: str,
        lang: Optional[str],
        phonemizer: Optional[str] = None,
        preserve_punct: bool = False,
        to_simplified_zh: bool = False,
    ):
        """Phonemize ``text`` with the engine named by ``phonemizer``.

        Supported engine names are ``"g2p"``, ``"g2pc"`` and ``"ipa"``; any
        other value (including ``None``) returns the text as-is, after the
        optional conversion to simplified Chinese.

        :param text: input text
        :param lang: language code; required (asserted non-None) for ``"ipa"``
        :param phonemizer: engine name
        :param preserve_punct: for ``"g2p"``, keep every token and replace
            ``" "`` tokens with ``"|"`` instead of filtering
        :param to_simplified_zh: run ``HanziConv.toSimplified`` on the text first
        """
        if to_simplified_zh:
            import hanziconv

            text = hanziconv.HanziConv.toSimplified(text)

        if phonemizer == "g2p":
            import g2p_en

            tokens = g2p_en.G2p()(text)
            if preserve_punct:
                return " ".join("|" if tok == " " else tok for tok in tokens)
            # Map "," and ";" to "sp", then keep only alphanumeric tokens.
            pause_map = {",": "sp", ";": "sp"}
            mapped = [pause_map.get(tok, tok) for tok in tokens]
            return " ".join(tok for tok in mapped if tok.isalnum())

        if phonemizer == "g2pc":
            import g2pc

            converter = g2pc.G2pC()
            # The fourth field of each item returned by G2pC is joined.
            return " ".join([item[3] for item in converter(text)])

        if phonemizer == "ipa":
            assert lang is not None
            # Aliased so the package does not rebind the `phonemizer` argument.
            import phonemizer as phonemizer_pkg
            from phonemizer.separator import Separator

            lang_map = {"en": "en-us", "fr": "fr-fr"}
            espeak_lang = lang_map.get(lang, lang)
            return phonemizer_pkg.phonemize(
                text,
                backend="espeak",
                language=espeak_lang,
                separator=Separator(word="| ", phone=" "),
            )

        return text
def test_equal():
    """Separator equality: defaults match, a different word separator does not."""
    first, second = Separator(), Separator()
    assert first == second

    explicit_default = Separator(phone='', syllable='', word=' ')
    assert default_separator == explicit_default

    double_space_word = Separator(word='  ')
    assert double_space_word != default_separator
def test_str():
    """str() of a Separator lists phone, syllable and word in that order."""
    custom = Separator(word='w', syllable='s', phone='p')
    custom_text = str(custom)
    assert custom_text == '(phone: "p", syllable: "s", word: "w")'

    default_text = str(default_separator)
    assert default_text == '(phone: "", syllable: "", word: " ")'
def test_same():
    """Using the same string for the word and phone separators raises ValueError."""
    shared = ' '
    with pytest.raises(ValueError):
        Separator(word=shared, phone=shared)
def test_empty(val):
    """Every separator built from ``val`` ends up as the empty string."""
    sep = Separator(val, val, val)
    # Checked in the same order as before: phone, syllable, word.
    for level in ('phone', 'syllable', 'word'):
        assert getattr(sep, level) == ''
Example #29
0
def test_sampa_fr(text, expected):
    """The 'mb-fr1' espeak-mbrola voice yields ``expected`` for ``text``."""
    mbrola = EspeakMbrolaBackend('mb-fr1')
    produced = mbrola.phonemize(
        text, strip=True, separator=Separator(phone=''))
    assert expected == produced
Example #30
0
def test_im():
    """"I'm" and "Im" must phonemize differently at the start of a sentence."""
    sep = Separator(word=' ', syllable='', phone='')

    # (input sentence, expected phonemization), checked in this order.
    cases = (
        ("I'm looking for an image", 'aym luhkaxng faor axn ihmaxjh'),
        ("Im looking for an image", 'ihm luhkaxng faor axn ihmaxjh'),
    )
    for sentence, wanted in cases:
        assert _test([sentence], sep) == [wanted]