def test_filter_chars(self):
    """Check ``filter_chars`` against the expected result for each selector.

    Selectors that the expectations treat as unrecognised (``None``, empty
    containers, unknown names, integers) must leave the string unchanged.
    """
    s = u"əˈkiːn æˌkænˈθɑ.lə.d͡ʒi"
    s_ipa = IPAString(unicode_string=s)
    cases = [
        (None, s),
        ([], s),
        ({}, s),
        (u"", s),
        (u"foo", s),
        (u"bar", s),
        (0, s),
        (1, s),
        (u"cns", u"knknθld͡ʒ"),
        (u"consonants", u"knknθld͡ʒ"),
        (u"vwl", u"əiææɑəi"),
        (u"vowels", u"əiææɑəi"),
        (u"cns_vwl", u"əkinækænθɑləd͡ʒi"),
        (u"letters", u"əkinækænθɑləd͡ʒi"),
        (u"cns_vwl_pstr", u"əˈkinækænˈθɑləd͡ʒi"),
        (u"cvp", u"əˈkinækænˈθɑləd͡ʒi"),
        (u"cns_vwl_pstr_long", u"əˈkiːnækænˈθɑləd͡ʒi"),
        (u"cvpl", u"əˈkiːnækænˈθɑləd͡ʒi"),
        (u"cns_vwl_str", u"əˈkinæˌkænˈθɑləd͡ʒi"),
        (u"cvs", u"əˈkinæˌkænˈθɑləd͡ʒi"),
        (u"cns_vwl_str_len", u"əˈkiːnæˌkænˈθɑləd͡ʒi"),
        (u"cvsl", u"əˈkiːnæˌkænˈθɑləd͡ʒi"),
        (u"cns_vwl_str_len_wb", u"əˈkiːn æˌkænˈθɑləd͡ʒi"),
        (u"cvslw", u"əˈkiːn æˌkænˈθɑləd͡ʒi"),
        (u"cns_vwl_str_len_wb_sb", u"əˈkiːn æˌkænˈθɑ.lə.d͡ʒi"),
        (u"cvslws", u"əˈkiːn æˌkænˈθɑ.lə.d͡ʒi"),
    ]
    for selector, expected in cases:
        filtered = s_ipa.filter_chars(selector)
        expected_ipa = IPAString(unicode_string=expected)
        self.assertTrue(filtered.is_equivalent(expected_ipa))
def test_filter_chars(self):
    """Verify that each filter name (long or short alias) keeps the expected characters."""
    s = u"əˈkiːn æˌkænˈθɑ.lə.d͡ʒi"
    s_ipa = IPAString(unicode_string=s)
    # (filter argument, Unicode string the filtered result must be equivalent to)
    table = (
        (None, s),
        ([], s),
        ({}, s),
        (u"", s),
        (u"foo", s),
        (u"bar", s),
        (0, s),
        (1, s),
        (u"cns", u"knknθld͡ʒ"),
        (u"consonants", u"knknθld͡ʒ"),
        (u"vwl", u"əiææɑəi"),
        (u"vowels", u"əiææɑəi"),
        (u"cns_vwl", u"əkinækænθɑləd͡ʒi"),
        (u"letters", u"əkinækænθɑləd͡ʒi"),
        (u"cns_vwl_pstr", u"əˈkinækænˈθɑləd͡ʒi"),
        (u"cvp", u"əˈkinækænˈθɑləd͡ʒi"),
        (u"cns_vwl_pstr_long", u"əˈkiːnækænˈθɑləd͡ʒi"),
        (u"cvpl", u"əˈkiːnækænˈθɑləd͡ʒi"),
        (u"cns_vwl_str", u"əˈkinæˌkænˈθɑləd͡ʒi"),
        (u"cvs", u"əˈkinæˌkænˈθɑləd͡ʒi"),
        (u"cns_vwl_str_len", u"əˈkiːnæˌkænˈθɑləd͡ʒi"),
        (u"cvsl", u"əˈkiːnæˌkænˈθɑləd͡ʒi"),
        (u"cns_vwl_str_len_wb", u"əˈkiːn æˌkænˈθɑləd͡ʒi"),
        (u"cvslw", u"əˈkiːn æˌkænˈθɑləd͡ʒi"),
        (u"cns_vwl_str_len_wb_sb", u"əˈkiːn æˌkænˈθɑ.lə.d͡ʒi"),
        (u"cvslws", u"əˈkiːn æˌkænˈθɑ.lə.d͡ʒi"),
    )
    for chars, wanted in table:
        outcome = s_ipa.filter_chars(chars).is_equivalent(
            IPAString(unicode_string=wanted))
        self.assertTrue(outcome)
def _extract_symbols(input_symbols: List[str],
                     ignore_tones: bool,
                     ignore_arcs: bool,
                     replace_unknown_ipa_by: str = PADDING_SYMBOL) -> List[str]:
    """Re-segment a word into IPA symbols with diacritics attached.

    The input symbols are joined into one word and parsed strictly as IPA.
    Diacritics and tones are appended to the preceding symbol; every other
    IPA character starts a new symbol.

    :param input_symbols: the symbols making up one word
    :param ignore_tones: if ``True``, tone characters are dropped
    :param ignore_arcs: if ``True``, ``ARC`` characters are dropped and
        characters containing ``ARC`` are split around it
    :param replace_unknown_ipa_by: filler symbol used when the word cannot
        be parsed strictly as IPA
    :return: the list of extracted symbols, or a list with one
        ``replace_unknown_ipa_by`` per input symbol when strict parsing fails
    """
    input_word = ''.join(input_symbols)
    try:
        ipa = IPAString(unicode_string=input_word, ignore=False)
    except ValueError:
        # Only the parse failure is handled here; a bare ``except`` would also
        # swallow KeyboardInterrupt/SystemExit. The lenient parse is done
        # solely to report what the result would have been.
        ipa = IPAString(unicode_string=input_word, ignore=True)
        print(f"{input_word} conversion to IPA failed. Result would be: {ipa}.")
        return [replace_unknown_ipa_by] * len(input_symbols)

    symbols: List[str] = []
    for char in ipa.ipa_chars:
        text = char.unicode_repr
        if char.is_diacritic or char.is_tone:
            if not symbols:
                # Nothing to attach a leading diacritic/tone to: drop it.
                continue
            if char.is_tone and ignore_tones:
                continue
            # I think it is a bug in IPAString that the arc sometimes gets
            # classified as diacritic and sometimes not
            if text == ARC:
                if not ignore_arcs:
                    symbols.append(ARC)
            else:
                symbols[-1] += text
        elif ignore_arcs:
            # Splitting a representation that starts or ends with ARC (or is
            # ARC itself) yields empty strings; those are not symbols.
            symbols.extend(part for part in text.split(ARC) if part)
        else:
            symbols.append(text)
    return symbols
def test_ipa_chars_set(self):
    """Assigning ``ipa_chars`` accepts a list of IPAChar and rejects anything else."""
    # A well-formed list must be accepted without raising.
    IPAString().ipa_chars = [U2I[u"f"], U2I[u"o"], U2I[u"o"]]
    rejected = (
        u"foo",
        [U2I[u"f"], None],
        [U2I[u"f"], u"o", u"o"],
    )
    for bad_value in rejected:
        with self.assertRaises(TypeError):
            IPAString().ipa_chars = bad_value
def test_add(self):
    """Concatenating two IPAStrings yields the expected number of IPA characters."""
    cases = (
        (None, None, 0),
        (None, u"a", 1),
        (u"a", None, 1),
        (u"a", u"b", 2),
        (u"f\u006e\u0361\u006doo", u"", 4),
        (u"f\u006e\u0361\u006doo", u"foo", 7),
    )
    for left, right, expected_len in cases:
        left_ipa = IPAString(unicode_string=left)
        right_ipa = IPAString(unicode_string=right)
        self.assertEqual(len(left_ipa + right_ipa), expected_len)
def test_init_ipa_chars_bad(self):
    """Constructing an IPAString from an invalid ``ipa_chars`` value raises TypeError."""
    invalid_inputs = [
        1,
        u"",
        "",
        u"foo",
        "foo",
        {"k": "v"},
        [None],
        [1],
        [u""],
        [""],
        [u"foo"],
        ["foo"],
        [{"k": "v"}],
        [U2I[u"f"], None],
        ["f", U2I[u"o"], U2I[u"o"]],
    ]
    for candidate in invalid_inputs:
        with self.assertRaises(TypeError):
            IPAString(ipa_chars=candidate)
def test_map_ipa_string_ignore(self):
    """Kirshenbaum mapping with ``ignore=True`` skips characters that are not valid IPA."""
    mapper = KirshenbaumMapper()
    cases = (
        (u"", u""),
        (u"foo", u"foo"),
        (u"\u0070\u032A", u"p["),
        (u"\u025F", u"J"),
        (u"\u0294", u"?"),
        (u"foo\u025F\u0294", u"fooJ?"),
        (u"fo\u02C8o\u025F\u0294", u"fo'oJ?"),
        (u"foo bar", u"foo#bar<trl>"),
        (u"\u0261\u0067", u"gg"),
        (u"ma\u0272ana", u"man^ana"),
        (u"\u02A3", u"dz"),
        (u"\u02A7", u"tS"),
        (u"L", u""),
        (u"foo", u"foo"),
        (u"\u0070\u032AL", u"p["),
        (u"L\u025FM", u"J"),
        (u"L\u0294M", u"?"),
        (u"fLoo\u025F\u0294M", u"fooJ?"),
        (u"fo\u02C8oL\u025F\u0294M", u"fo'oJ?"),
        (u"fooL MbarN", u"foo#bar<trl>"),
        (u"\u0261L\u0067", u"gg"),
        (u"mLa\u0272Mana", u"man^ana"),
        (u"L\u02A3", u"dz"),
        (u"\u02A7M", u"tS"),
    )
    for source, expected in cases:
        parsed = IPAString(unicode_string=source, ignore=True)
        mapped = mapper.map_ipa_string(parsed, ignore=True)
        self.assertEqual(mapped, expected)
def test_is_equivalent(self):
    """``is_equivalent`` treats empty/None inputs and tie-bar variants as equivalent."""
    cases = (
        (None, None, True),
        (None, u"", True),
        (u"", None, True),
        (u"", u"", True),
        (u"f", u"f", True),
        (u"f\u006e\u0361\u006d", u"f\u006e\u0361\u006d", True),
        (u"f\u006e\u0361\u006d", u"f\u006e\u035C\u006d", True),
        (u"f\u006e\u0361\u006d", u"f\u006e\u006d", True),
        (u"\u0074\u0361\u026C", u"\u019B", True),
    )
    for first, second, expected in cases:
        first_ipa = IPAString(unicode_string=first)
        second_ipa = IPAString(unicode_string=second)
        self.assertEqual(first_ipa.is_equivalent(second_ipa), expected)
def map_unicode_string(self,
                       unicode_string,
                       ignore=False,
                       single_char_parsing=False,
                       return_as_list=False,
                       return_can_map=False):
    """
    Map a Unicode string holding IPA text to this mapper's representation.

    The string is first parsed into an ``IPAString`` and the result is then
    handed to :func:`map_ipa_string`, whose return value is passed through.

    Return ``None`` if ``unicode_string`` is ``None``.

    :param str unicode_string: the Unicode string to be parsed
    :param bool ignore: if ``True``, ignore Unicode characters that are not IPA valid
    :param bool single_char_parsing: if ``True``, parse one Unicode character at a time
    :param bool return_as_list: if ``True``, return a list of strings, one per IPAChar,
                                instead of a single concatenated string
    :param bool return_can_map: if ``True``, return a pair ``(bool, str)`` whose first
                                element tells whether every IPA character could be mapped
                                and whose second element is ``None`` or the mapped string/list
    :rtype: str or (bool, str) or (bool, list)
    """
    if unicode_string is None:
        return None
    parsed = IPAString(
        unicode_string=unicode_string,
        ignore=ignore,
        single_char_parsing=single_char_parsing,
    )
    return self.map_ipa_string(
        ipa_string=parsed,
        ignore=ignore,
        return_as_list=return_as_list,
        return_can_map=return_can_map,
    )
def _process_phonology(self, string):
    """Strip whitespace, canonicalise through IPAString and segment the result.

    Returns ``None`` when a ``ValueError`` is raised while parsing or
    segmenting.
    """
    compact = "".join(string.split())
    try:
        canonical = "".join(str(ch) for ch in IPAString(unicode_string=compact))
        return segment_phonology(canonical, to_keep=self.diacritics)
    except ValueError:
        return None
def get_syllable_features(syl):
    """Split a syllable's IPA text into ``(onset, nucleus, coda)``.

    ``syl['ipa']`` is parsed with ipapy to locate the first run of vowels; the
    returned pieces are slices of the original ``syl['ipa']`` string.

    Returns ``None`` for the ``'R'`` marker. When no vowel is found the whole
    string is returned as the nucleus. Raises ``ValueError`` when a vowel
    appears after the nucleus has already been closed by a consonant.
    """
    from ipapy.ipastring import IPAString
    import unicodedata

    text = syl['ipa']
    if text == 'R':
        # represents silence syllables
        return None

    try:
        ipa = IPAString(unicode_string=text.replace('I', 'ɪ'))
    except ValueError:
        # manually fix some errors
        manual_fixes = {
            'Nis': 'nis',
            'ɾoU': 'ɾou',
            'Vin': 'vin',
            'vIN': 'vɪn',
            'ɾe-': 'ɾe'
        }
        ipa = IPAString(unicode_string=manual_fixes[text])

    nucleus_start = None
    nucleus_end = None
    offset = 0
    for ph in ipa:
        if ph.is_vowel:
            if nucleus_start is None:
                nucleus_start = offset
            if nucleus_end is not None:
                raise ValueError("Discontinued nucleus in: {}".format(str(ph)))
        elif ph.is_consonant:
            if nucleus_start is not None and nucleus_end is None:
                nucleus_end = offset
        # Advance by the number of code points that are not non-spacing marks.
        offset += sum(
            1 for c in ph.unicode_repr if unicodedata.category(c) != 'Mn')

    if nucleus_start is None:
        assert nucleus_end is None
        return '', text, ''

    onset = text[:nucleus_start]
    nucleus = text[nucleus_start:nucleus_end]
    coda = text[nucleus_end:] if nucleus_end is not None else ''
    return onset, nucleus, coda
def test_init_ipa_chars(self):
    """IPAString can be built from ``None``, an empty list, or a list of IPAChar."""
    accepted = (
        None,
        [],
        [U2I[u"f"]],
        [U2I[u"f"], U2I[u"o"], U2I[u"o"]],
    )
    for chars in accepted:
        IPAString(ipa_chars=chars)
def test_init_unicode_string_ignore(self):
    """With ``ignore=True`` the constructor must not raise on these inputs."""
    inputs = (
        u"L",
        u"fL",
        u"fLooM",
        u"/\u0066\u02BCoo/",
        u"[f\u006e\u0361\u006doo]",
    )
    for text in inputs:
        IPAString(unicode_string=text, ignore=True)
def _parse_phonemes(phonemes):
    """Parse phonemes as IPA strings and split them into vowels and the rest.

    Each phoneme is classified by its first IPA character; anything whose
    first character is not a vowel lands in the second list.
    """
    parsed = [
        IPAString(unicode_string=p, single_char_parsing=True) for p in phonemes
    ]
    vowels = [item for item in parsed if item[0].is_vowel]
    consonants = [item for item in parsed if not item[0].is_vowel]
    return vowels, consonants
def test_init_unicode_string(self):
    """The constructor must not raise on ``None``, empty, or valid IPA strings."""
    inputs = (
        None,
        u"",
        u"f",
        u"foo",
        u"\u0066\u02BCoo",
        u"f\u006e\u0361\u006doo",
    )
    for text in inputs:
        IPAString(unicode_string=text)
def test_init_unicode_len(self):
    """``len()`` of a parsed IPAString counts IPA characters, not code points."""
    cases = (
        (None, 0),
        (u"", 0),
        (u"f", 1),
        (u"foo", 3),
        (u"\u0066\u02BCoo", 3),
        (u"f\u006e\u0361\u006doo", 4),
    )
    for text, expected_len in cases:
        parsed = IPAString(unicode_string=text)
        self.assertEqual(len(parsed), expected_len)
def i2t(ipa):
    """Tokenize an IPA string and canonicalise every token through IPAString.

    The input is NFD-normalized and a leading ``*`` is removed before
    tokenization. Returns a list of strings, one per token.
    """
    cleaned = re.sub(r'^\*', '', unicodedata.normalize('NFD', ipa))
    result = []
    for token in ipa2tokens(cleaned, merge_vowels=False, merge_geminates=False):
        # NOTE(j_luo) Stress symbol is not handled by `ipapy`'s canonicalization process.
        token = token.replace("'", 'ˈ')
        # NOTE(j_luo) Not sure what these symbols mean.
        for mark in ('̣', '̧', '̦'):
            token = token.replace(mark, '')
        result.append(str(IPAString(unicode_string=token)))
    return result
def _parse_phonemes(self, phonemes):
    """Parse and validate phonemes, then split them into vowels and consonants.

    Raises ``ValueError`` for the first phoneme rejected by ``self._is_valid``.
    Classification looks only at the first IPA character of each phoneme, so
    a phoneme that is neither vowel nor consonant appears in neither list.
    """
    parsed = [
        IPAString(unicode_string=p, single_char_parsing=True) for p in phonemes
    ]
    for candidate in parsed:
        if not self._is_valid(candidate):
            raise ValueError("{} was not a valid phoneme.".format(candidate))
    vowels = [item for item in parsed if item[0].is_vowel]
    consonants = [item for item in parsed if item[0].is_consonant]
    return vowels, consonants
def test_canonical_representation(self):
    """The canonical representation of a single-char-parsed string has the expected length."""
    cases = (
        (None, 0),
        (u"", 0),
        (u"f", 1),
        (u"foo", 3),
        (u"\u0066\u02BCoo", 3),
        (u"f\u006e\u0361\u006doo", 4),
    )
    for text, expected_len in cases:
        parsed = IPAString(unicode_string=text, single_char_parsing=True)
        canonical = parsed.canonical_representation
        self.assertEqual(len(canonical), expected_len)
def test_init_unicode_string_bad(self):
    """Byte strings and strings with non-IPA characters must raise ValueError."""
    rejected = (
        b"",
        b"f",
        b"foo",
        b"\u0066\u02BCoo",
        b"f\u006e\u0361\u006doo",
        u"L",
        u"fL",
        u"fLooM",
        u"/\u0066\u02BCoo/",
        u"[f\u006e\u0361\u006doo]",
    )
    for text in rejected:
        with self.assertRaises(ValueError):
            IPAString(unicode_string=text)
def test_map_ipa_string(self):
    """ARPABETMapper maps each IPA string to the expected ARPABET string."""
    mapper = ARPABETMapper()
    cases = (
        (u"", u""),
        (u"p", u"P"),
        (u"p\u03B8", u"PTH"),
        (u"\u027E", u"DX"),
        (u"p\u0258\u026A", u"PEY"),
        (u"p\u0258\u026Aw", u"PEYW"),
        (u"p\u0258\u026A\u0258\u026Aw", u"PEYEYW"),
        (u"p\u0258\u026A\u0251w", u"PEYAAW"),
        (u"\u006A\u0075", u"YUW"),
    )
    for source, expected in cases:
        parsed = IPAString(unicode_string=source)
        self.assertEqual(mapper.map_ipa_string(parsed), expected)
def command_canonize(string, vargs):
    """
    Print the canonical representation of the given string.

    Non-canonical compound characters are replaced with their canonical
    synonym. The characters are joined with ``vargs["separator"]``; a
    ``ValueError`` is reported through ``print_error`` instead of propagating.

    :param str string: the string to act upon
    :param dict vargs: the command line arguments
    """
    try:
        parsed = IPAString(
            unicode_string=string,
            ignore=vargs["ignore"],
            single_char_parsing=vargs["single_char_parsing"])
        separator = vargs["separator"]
        pieces = [(u"%s" % char) for char in parsed]
        print(separator.join(pieces))
    except ValueError as exc:
        print_error(str(exc))
def test_can_map_ipa_string(self):
    """``can_map_ipa_string`` reports whether ARPABETMapper can map every character."""
    mapper = ARPABETMapper()
    cases = (
        (u"", True),
        (u"p", True),
        (u"p\u03B8", True),
        (u"\u027E", True),
        (u"\u0258\u026A", True),
        (u"p\u0258\u026A", True),
        (u"p\u0258\u026Aw", True),
        (u"p\u0258\u026A\u0258\u026Aw", True),
        (u"p\u0258\u026A\u0251w", True),
        (u"\u006A\u0075", True),
        # valid IPA char, expected to be unmappable by this mapper
        (u"\u1DC6", False),
        # same character embedded between mappable ones
        (u"p\u1DC6b", False),
    )
    for source, expected in cases:
        parsed = IPAString(unicode_string=source)
        self.assertEqual(mapper.can_map_ipa_string(parsed), expected)
def test_map_ipa_string(self):
    """KirshenbaumMapper maps each IPA string to the expected Kirshenbaum string."""
    mapper = KirshenbaumMapper()
    cases = (
        (u"", u""),
        (u"foo", u"foo"),
        (u"\u0070\u032A", u"p["),
        (u"\u025F", u"J"),
        (u"\u0294", u"?"),
        (u"foo\u025F\u0294", u"fooJ?"),
        (u"fo\u02C8o\u025F\u0294", u"fo'oJ?"),
        (u"foo bar", u"foo#bar<trl>"),
        (u"\u0261\u0067", u"gg"),
        (u"ma\u0272ana", u"man^ana"),
        (u"\u02A3", u"dz"),
        (u"\u02A7", u"tS"),
    )
    for source, expected in cases:
        parsed = IPAString(unicode_string=source)
        self.assertEqual(mapper.map_ipa_string(parsed), expected)
def extract_symbols(ipa: str):
    """Group the characters of ``ipa`` into symbols, gluing diacritics to their base.

    Each character is parsed on its own with ``ignore=True``. Characters that
    the parser drops are kept as standalone symbols; diacritics are appended
    to the previous symbol (and discarded when there is none); every other
    IPA character starts a new symbol.
    """
    symbols = []
    for ch in ipa:
        parsed = IPAString(unicode_string=ch, ignore=True)
        count = len(parsed)
        if count == 0:
            # Not recognised as IPA: keep the raw character.
            symbols.append(ch)
        elif count == 1:
            if not parsed[0].is_diacritic:
                symbols.append(ch)
            elif symbols:
                symbols[-1] += ch
        else:
            assert False
    return symbols
def command_chars(string, vargs):
    """
    Print a list of all IPA characters in the given string.

    One line is printed per IPA character, containing its Unicode
    representation, its full IPA name, and the "U+"-prefixed hexadecimal
    codepoint representation. A ``ValueError`` is reported through
    ``print_error`` instead of propagating.

    :param str string: the string to act upon
    :param dict vargs: the command line arguments
    """
    try:
        parsed = IPAString(
            unicode_string=string,
            ignore=vargs["ignore"],
            single_char_parsing=vargs["single_char_parsing"])
        for char in parsed:
            text = char.unicode_repr
            row = (text, char.name, unicode_to_hex(text))
            print(u"'%s'\t%s (%s)" % row)
    except ValueError as exc:
        print_error(str(exc))
def test_can_map_ipa_string(self):
    """``can_map_ipa_string`` reports whether KirshenbaumMapper can map every character."""
    mapper = KirshenbaumMapper()
    cases = (
        (u"", True),
        (u"foo", True),
        (u"\u0070\u032A", True),
        (u"\u025F", True),
        (u"\u0294", True),
        (u"foo\u025F\u0294", True),
        (u"fo\u02C8o\u025F\u0294", True),
        (u"foo bar", True),
        (u"\u0261\u0067", True),
        (u"ma\u0272ana", True),
        (u"\u02A3", True),
        (u"\u02A7", True),
        # valid IPA char, unmapped in Kirshenbaum
        (u"\u1DC6", False),
        # valid IPA char, unmapped in Kirshenbaum
        (u"foo\u1DC6bar", False),
    )
    for source, expected in cases:
        parsed = IPAString(unicode_string=source)
        self.assertEqual(mapper.can_map_ipa_string(parsed), expected)
def convert_stress(ipa: str) -> List[str]:
    """Tokenize ``ipa`` and turn leading stress marks into vowel suffixes.

    A primary-stress mark (``ˈ`` or ``'``) at the start of a token is removed
    and remembered; a secondary-stress mark (``ˌ``) is just removed. Every
    token whose processed segment is an ``Nphthong`` or a vowel ``Segment``
    gets ``{+}`` appended when a primary stress is pending, ``{-}`` otherwise.
    Asserts that no primary stress is left pending at the end.
    """
    pending_stress = False
    converted = []
    for token in i2t(ipa):
        if token.startswith(('ˈ', "'")):
            token = token[1:]
            pending_stress = True
        elif token.startswith('ˌ'):
            token = token[1:]

        decomposed = unicodedata.normalize('NFD', token)
        token = str(IPAString(unicode_string=decomposed, ignore=True))

        seg = _processor.process(token)
        is_vocalic = isinstance(seg, Nphthong) or (
            isinstance(seg, Segment) and seg.is_vowel())
        if is_vocalic:
            token += '{+}' if pending_stress else '{-}'
            pending_stress = False
        converted.append(token)
    assert not pending_stress
    return converted
def test():
    """Print a sample IPA string and the result of ``i2a`` on it.

    Uses ``print(...)`` call syntax with a single argument, which produces the
    same output on Python 2 and is valid on Python 3 (the statement form
    ``print x`` is a syntax error there).
    """
    a = u"ˈɑkən"
    print(IPAString(unicode_string=a))
    print(i2a(a))
def standardize(ph: str, ignore: bool = False) -> str:
    """Return the IPAString rendering of ``ph`` in NFD-normalized form.

    ``ignore`` is forwarded to ``IPAString``.
    """
    return unicodedata.normalize(
        'NFD', str(IPAString(unicode_string=ph, ignore=ignore)))
def extract_symbols(ipa: str):
    """Split ``ipa`` into symbols, attaching each diacritic to the symbol before it.

    Every character is parsed individually with ``ignore=True``: characters
    the parser discards become their own symbol, a diacritic extends the last
    symbol (or is skipped if there is none yet), and any other IPA character
    opens a new symbol.
    """
    symbols = []
    for ch in ipa:
        single = IPAString(unicode_string=ch, ignore=True)
        n_chars = len(single)
        if n_chars == 0:
            symbols.append(ch)
        elif n_chars == 1:
            is_diacritic = single[0].is_diacritic
            if is_diacritic and symbols:
                symbols[-1] += ch
            elif not is_diacritic:
                symbols.append(ch)
        else:
            assert False
    return symbols


if __name__ == "__main__":
    y = u"ˈprɪnɪŋ, ɪn ðə ˈoʊnli sɛns wɪθ wɪʧ wi ər æt ˈprɛzənt kənˈsərnd, ˈdɪfərz frəm moʊst ɪf nɑt frəm ɔl ðə ɑrts ənd kræfts ˌrɛprɪˈzɛnɪd ɪn ðə ˌɛksəˈbɪʃən."
    # Shorter alternative inputs for manual experiments:
    #y = u"wɪʧ"
    #y = "ɪʃn̩'"
    s_ipa = IPAString(unicode_string=y, ignore=True)
    tmp = extract_symbols(y)
    print(tmp)