def test_split_diacritics(self):
    """Check Phonemes.split when one phoneme is a diacritic-bearing variant of another."""
    split_result = Phonemes.from_language("fr-fr").split("/ɑɑ̃/", keep_stress=False)

    # The bare ɑ must come out as "a", while the nasalised ɑ̃ stays untouched
    actual_texts = [phoneme.text for phoneme in split_result]
    self.assertEqual(actual_texts, ["a", "ɑ̃"])
def test_split_dipthong(self):
    """Check Phonemes.split on a pronunciation containing diphthongs."""
    expected_texts = ["n", "eu̯", "r", "t͡ʃ", "ɪ", "t", "ou̯"]

    czech_phonemes = Phonemes.from_language("cs-cz")
    split_result = czech_phonemes.split("/neu̯rt͡ʃɪtou̯/", keep_stress=False)

    # eu̯ must survive as a single phoneme rather than being broken apart
    actual_texts = [phoneme.text for phoneme in split_result]
    self.assertEqual(actual_texts, expected_texts)
def test_split_substring(self):
    """Check Phonemes.split when a replacement could match inside a longer phoneme."""
    expected_texts = ["v", "iː", "t", "ɛ", "t͡ʃ", "n", "aː"]

    czech_phonemes = Phonemes.from_language("cs-cz")
    split_result = czech_phonemes.split("/viːtɛt͡ʃnaː/", keep_stress=False)

    # iː must be left alone instead of being rewritten to ɪː
    actual_texts = [phoneme.text for phoneme in split_result]
    self.assertEqual(actual_texts, expected_texts)
def test_tones(self):
    """Check that Phonemes.split keeps tone marks attached to their phonemes."""
    # Pronunciation of "á khôi"
    tonal_pron = "/a˨˦xoj˧˧/"

    split_result = Phonemes.from_language("vi-n").split(tonal_pron)

    # Tone letters must still be present in the output
    actual_texts = [phoneme.text for phoneme in split_result]
    self.assertEqual(actual_texts, ["a˨˦", "x", "oj˧˧"])
def test_dipthong(self):
    """Check Phonemes.split on a pronunciation ending in a nasal diphthong."""
    # Pronunciation of "ampliam"
    nasal_pron = "/ɐ̃pliɐ̃w̃/"
    expected_texts = ["ɐ̃", "p", "l", "i", "ɐ̃w̃"]

    split_result = Phonemes.from_language("pt").split(nasal_pron)

    # Both "ɐ̃" and the longer "ɐ̃w̃" must be kept as distinct phonemes
    actual_texts = [phoneme.text for phoneme in split_result]
    self.assertEqual(actual_texts, expected_texts)
def test_split(self):
    """Check Phonemes.split on an English phrase with stress kept."""
    # Pronunciation of "Just a cow."
    phrase_pron = "/dʒʌst ə kˈaʊ/"
    expected_texts = ["d͡ʒ", "ʌ", "s", "t", "ə", "k", "ˈaʊ"]

    english_phonemes = Phonemes.from_language("en-us")
    split_result = english_phonemes.split(phrase_pron, keep_stress=True)

    # "d ʒ" must merge into "d͡ʒ" and "a ʊ" into "aʊ"
    actual_texts = [phoneme.text for phoneme in split_result]
    self.assertEqual(actual_texts, expected_texts)
def __init__(
    self,
    config,
    language: typing.Optional[str] = None,
    preload_lexicon: bool = False,
    custom_tokenize: typing.Optional[TokenizeFunc] = None,
    custom_post_tokenize: typing.Optional[PostTokenizeFunc] = None,
):
    """Build the tokenizer, phonemizer and phoneme set for one language.

    Args:
        config: Language configuration; read with ``pydash.get`` for the
            ``language.*`` settings and with ``.get`` for ``accents``.
        language: Language code. When None, the code is taken from
            ``language.code`` in ``config``.
        preload_lexicon: Forwarded to ``Phonemizer``.
        custom_tokenize: Forwarded to ``Tokenizer``.
        custom_post_tokenize: Forwarded to ``Tokenizer``.
    """
    # An explicit language code wins over the one stored in the config
    self.language = (
        pydash.get(config, "language.code") if language is None else language
    )
    self.config = config

    self.tokenizer = Tokenizer(
        config,
        custom_tokenize=custom_tokenize,
        custom_post_tokenize=custom_post_tokenize,
    )

    self.phonemizer = Phonemizer(config, preload_lexicon=preload_lexicon)
    # The phonemizer reuses the tokenizer's is_word
    self.phonemizer.is_word = self.tokenizer.is_word  # type: ignore

    self.phonemes = Phonemes.from_language(self.language)
    self.accents: typing.Dict[str, typing.Dict[str, typing.List[str]]] = {}

    # Whether primary/secondary stress is kept during phonemization
    keep_stress_setting = pydash.get(self.config, "language.keep_stress", False)
    self.keep_stress = bool(keep_stress_setting)

    # Whether acute/grave accents are kept during phonemization
    keep_accents_setting = pydash.get(self.config, "language.keep_accents", False)
    self.keep_accents = bool(keep_accents_setting)

    # Tones allowed in the language
    self.tones: typing.List[str] = pydash.get(self.config, "language.tones", [])

    # Language-specific "accents" (not the acute/grave kind): every target
    # is stored as a list, so a lone string becomes a one-item list.
    configured_accents = self.config.get("accents", {})
    for accent_lang, accent_map in configured_accents.items():
        self.accents[accent_lang] = {
            source: ([targets] if isinstance(targets, str) else targets)
            for source, targets in accent_map.items()
        }
def __init__(self, config, language: typing.Optional[str] = None):
    """Build the tokenizer, phonemizer and phoneme set for one language.

    Args:
        config: Language configuration; read with ``pydash.get`` for the
            ``language.*`` settings and with ``.get`` for ``accents``.
        language: Language code. When None, the code is taken from
            ``language.code`` in ``config``.
    """
    if language is None:
        self.language = pydash.get(config, "language.code")
    else:
        self.language = language

    self.config = config

    # Language-specific loading.
    # Check the resolved language (self.language), not the raw argument, so
    # the custom tokenizer is also used when the code comes from the config.
    custom_tokenize: typing.Optional[TOKENIZE_FUNC] = None
    if self.language == "fa":
        custom_tokenize = Language.make_fa_tokenize()

    self.tokenizer = Tokenizer(config, custom_tokenize=custom_tokenize)
    self.phonemizer = Phonemizer(config)
    self.phonemes = Phonemes.from_language(self.language)
    self.accents: typing.Dict[str, typing.Dict[str, typing.List[str]]] = {}

    # If True, primary/secondary stress should be kept during phonemization
    self.keep_stress = bool(
        pydash.get(self.config, "language.keep_stress", False))

    # If True, acute/grave accents should be kept during phonemization
    self.keep_accents = bool(
        pydash.get(self.config, "language.keep_accents", False))

    # Allowable tones in the language
    self.tones: typing.List[str] = pydash.get(self.config, "language.tones", [])

    # Load language-specific "accents" (different than acute/grave).
    # Each target is stored as a list; a lone string becomes a one-item list.
    accents = self.config.get("accents", {})
    for accent_lang, accent_map in accents.items():
        final_map = {}
        for from_phoneme, to_phonemes in accent_map.items():
            if isinstance(to_phonemes, str):
                to_phonemes = [to_phonemes]

            final_map[from_phoneme] = to_phonemes

        self.accents[accent_lang] = final_map
def setUpClass(cls):
    """Load the "de-de" phoneme set and store it on the class as ``de_phonemes``."""
    # NOTE(review): no @classmethod decorator is visible in this excerpt —
    # confirm it is present in the file.
    loaded_phonemes = Phonemes.from_language("de-de")
    cls.de_phonemes = loaded_phonemes