def jyutping2tipa(jp_str):
    """
    Convert *jp_str* to a list of LaTeX TIPA strings.

    One TIPA string is produced for each parsed syllable that
    ``parse_jyutping`` returns for *jp_str*.
    """
    tipa_list = []

    for onset, nucleus, coda, tone in parse_jyutping(jp_str):
        # TODO: Separate "final" as "nucleus" and "coda" instead?
        segments = ONSETS_TIPA[onset] + FINALS_TIPA[nucleus + coda]
        # Strip the segmental part before the tone mark is attached.
        tipa_list.append(segments.strip() + TONES_TIPA[tone])

    return tipa_list
def _add_yale_tone_diacritic(letter, tone):
    """
    Return *letter* carrying the Yale tone diacritic for Jyutping *tone*.

    Tone 1 takes a macron, tones 2 and 5 an acute accent, and tone 4 a grave
    accent; any other tone (3 or 6) leaves the letter bare.  If Unicode has
    no precomposed character for the accented letter, the combining accent
    is appended to the letter instead.
    """
    if tone == "1":
        accent_name, combining_accent = " WITH MACRON", "\u0304"
    elif tone in {"2", "5"}:
        accent_name, combining_accent = " WITH ACUTE", "\u0301"
    elif tone == "4":
        accent_name, combining_accent = " WITH GRAVE", "\u0300"
    else:  # either tone 3 or tone 6
        return letter

    accented_name = unicodedata.name(letter) + accent_name
    try:
        return unicodedata.lookup(accented_name)
    except KeyError:
        return letter + combining_accent


def jyutping2yale(jp_str, as_list=False):
    """
    Convert *jp_str* to Yale.

    :param jp_str: A Jyutping string of one or more syllables, as accepted
        by ``parse_jyutping``.
    :param as_list: If True (default: False), return a list of Yale strings
        for individual syllables.
    :return: A list of Yale syllables if *as_list* is True; otherwise a
        single string in which an apostrophe is inserted between two
        syllables whenever plain concatenation would be ambiguous.
    """
    if PY2 and isinstance(jp_str, str):  # pragma: no cover
        jp_str = unicode(jp_str)  # noqa F821 ('unicode' undefined in py >= 3)

    jp_parsed_list = parse_jyutping(jp_str)
    yale_list = []

    for jp_parsed in jp_parsed_list:
        onset = ONSETS_YALE[jp_parsed[0]]
        nucleus = NUCLEI_YALE[jp_parsed[1]]
        coda = CODAS_YALE[jp_parsed[2]]
        tone = jp_parsed[3]  # still in Jyutping

        # The Yale system uses "h" to mark the three low tones.
        low_tone_h = "h" if tone in {"4", "5", "6"} else ""

        # In Yale, the long "aa" vowel with no coda is denoted by "a".
        if nucleus == "aa" and coda == "":
            nucleus = "a"

        # When the nucleus is "yu"...
        # 1. disallow "yyu" (when the onset is "y")
        # 2. change nucleus "yu" into "u" -- this is a hack for adding the
        #    tone diacritic, since we don't want "y" to bear the diacritic
        if nucleus == "yu":
            if onset == "y":
                onset = ""
            nucleus = "u"

        # When the nucleus is "ng", the tone diacritic has to be on "g" but
        # not "n".  Pretend that the nucleus is "g" for now; the "n" is
        # prepended back below.
        if nucleus == 'ng':
            nucleus = 'g'

        # Keep the nucleus as it looks *before* the diacritic is added: the
        # "eu" -> "ew" rule below must compare against the bare vowel,
        # otherwise it would only ever fire for the unmarked tones 3 and 6.
        plain_nucleus = nucleus

        # Add the Yale tone diacritic to the first nucleus letter.
        nucleus = _add_yale_tone_diacritic(nucleus[0], tone) + nucleus[1:]

        # Add back "y" if the Jyutping nucleus is "yu"
        # ("y" was taken away for convenience in adding the tone diacritic).
        if jp_parsed[1] == "yu":
            nucleus = "y" + nucleus

        # Add back "n" if the Jyutping nucleus is "ng"
        # ("n" was taken away so that the diacritic sits on "g", not "n").
        if jp_parsed[1] == 'ng':
            nucleus = 'n' + nucleus

        # Jyutping final "eu" should be Yale "ew" (not "eu"), whatever the
        # tone is.
        if coda == "u" and plain_nucleus == "e":
            coda = "w"

        # The low-tone "h" follows a vocalic coda but precedes any other one.
        if coda in {"i", "u", "w"} and tone in {"4", "5", "6"}:
            yale = onset + nucleus + coda + low_tone_h
        else:
            yale = onset + nucleus + low_tone_h + coda

        yale_list.append(yale)

    if as_list:
        return yale_list

    # Output yale_list as a string.
    # Check if there's potential ambiguity when Yale strings are concatenated.
    # Ambiguity case 1:
    #   1st syllable coda is one of the "ambiguous_consonants"
    #   and 2nd syllable starts with a vowel *letter*
    # Ambiguity case 2:
    #   1st syllable has no coda and 2nd syllable starts with one of the
    #   "ambiguous_consonants"
    #   e.g., hei3hau6 'climate' --> heihauh
    #   (middle "h" for tone in 1st syllable or being onset of 2nd syllable?)
    ambiguous_consonants = {'h', 'p', 't', 'k', 'm', 'n', 'ng'}
    vowel_letters = {'a', 'e', 'i', 'o', 'u',
                     'á', 'é', 'í', 'ó', 'ú',
                     'à', 'è', 'ì', 'ò', 'ù',
                     'ā', 'ē', 'ī', 'ō', 'ū'}

    pieces = []
    for yale1, yale2 in zip(yale_list, yale_list[1:]):
        if endswithoneof(yale1, ambiguous_consonants):
            # case 1
            ambiguous = startswithoneof(yale2, vowel_letters)
        else:
            # case 2
            ambiguous = startswithoneof(yale2, ambiguous_consonants)

        pieces.append(yale1)
        if ambiguous:
            pieces.append('\'')

    # The last syllable never needs a separator after it; slicing (rather
    # than indexing) keeps an empty parse from raising IndexError.
    pieces.extend(yale_list[-1:])
    return ''.join(pieces)
def _clip_window(items, index, left, right):
    """Return items[index - left : index + right + 1], clipped to *items*."""
    start = max(index - left, 0)
    stop = min(index + right + 1, len(items))
    return items[start:stop]


def _mor_matches_jyutping(c_mor, jp_search_tuple):
    """Tell whether any syllable extracted from *c_mor* fits the search."""
    c_jyutping = get_jyutping_from_mor(c_mor)
    if not c_jyutping:
        return False

    try:
        c_parsed_jyutpings = parse_jyutping(c_jyutping)
    except ValueError:
        # Jyutping that cannot be parsed can never match.
        return False

    for c_parsed_jyutping in c_parsed_jyutpings:
        if all(_jp_element_match(search_, current_)
               for search_, current_ in zip(jp_search_tuple,
                                            c_parsed_jyutping)):
            return True
    return False


def perform_search(fn_to_tagged_sents,
                   onset=None, nucleus=None, coda=None, tone=None,
                   initial=None, final=None, jyutping=None,
                   character=None, pos=None,
                   word_range=(0, 0), sent_range=(0, 0),
                   tagged=True, sents=False):
    r"""
    Search tagged sentences by Jyutping elements, character, and/or POS.

    overall strategy: deal with jp (and all jp-related elements) first, and
                      then the character

    1. jp

    hierarchy of jp and associated search elements:
                      jp
                 /     |    \
      onset/initial  final  tone
                     /   \
                nucleus  coda

    lower search elements cannot be used together with dominating higher
    elements

    :param fn_to_tagged_sents: Dict mapping a key to a list of tagged
        sentences. Each sentence is a list of 4-item tagged words, of which
        the first three items are the characters, the part-of-speech tag,
        and the "mor" field the Jyutping is extracted from.
    :param onset, nucleus, coda, tone, final, jyutping: Jyutping search
        elements. *final* excludes *nucleus*/*coda*; *jyutping* excludes
        all the other Jyutping elements and must be a single syllable.
    :param initial: Alias of *onset*; the two must agree if both are given.
    :param character: A word matches if this is contained in its characters.
    :param pos: Regular expression searched for in the word's POS tag.
    :param word_range: Tuple ``(left, right)`` -- number of words to include
        before and after each matching word.
    :param sent_range: Tuple ``(left, right)`` -- number of sentences to
        include before and after the sentence of each matching word. Any
        positive value switches *sents* on.
    :param tagged: If False, results contain only the first item of each
        tagged word instead of the whole tagged word.
    :param sents: If True, results are sentences rather than words.
    :return: Dict mapping each key of *fn_to_tagged_sents* to a list with
        one result per matching word. A result window of length one is
        unwrapped into its single element.
    :raises ValueError: If the ranges are not tuples of ints, if no search
        element is given, or if the Jyutping elements are incompatible or
        invalid.
    """
    # ensure tuple type: word_range and sent_range
    if not (isinstance(word_range, tuple) and isinstance(sent_range, tuple)):
        raise ValueError('word_range and sent_range must be tuples')

    words_left, words_right = word_range
    sents_left, sents_right = sent_range

    # ensure int type: words_left, words_right, sents_left, sents_right
    if not all(isinstance(n, int) for n in
               (words_left, words_right, sents_left, sents_right)):
        raise ValueError('int required for {words, sents}_{left, right}')

    if sents_left > 0 or sents_right > 0:
        sents = True

    # determine what kinds of search we are doing
    character_search = bool(character)
    pos_search = bool(pos)
    # "initial" is an alias of "onset", so on its own it must be enough to
    # trigger a Jyutping search.
    jp_search = bool(onset or initial or nucleus or coda or tone or
                     final or jyutping)

    if not (character_search or jp_search or pos_search):
        raise ValueError('no search elements')

    # check if jyutping search is valid
    jp_search_tuple = (None, None, None, None)

    if jp_search:
        # ensure compatible jyutping search elements
        if final and (nucleus or coda):
            raise ValueError('final cannot be used together with '
                             'either nucleus or coda (or both)')
        if jyutping and (onset or initial or final or nucleus or coda or
                         tone):
            raise ValueError('jyutping cannot be used together with other '
                             'Jyutping elements')
        if onset and initial and onset != initial:
            raise ValueError('onset conflicts with initial')

        # onset/initial
        if initial:
            onset = initial

        # determine jp_search_tuple
        if jyutping:
            try:
                jp_search_list = parse_jyutping(jyutping)
            except ValueError:
                raise ValueError('invalid jyutping -- %s' % (repr(jyutping)))
            if len(jp_search_list) > 1:
                raise ValueError('only jyutping for one character is allowed')
            jp_search_tuple = jp_search_list[0]
        else:
            if final:
                nucleus, coda = parse_final(final)
            jp_search_tuple = (onset, nucleus, coda, tone)

    fn_to_results = {}

    for fn, tagged_sents in fn_to_tagged_sents.items():
        # (sentence index, word index) of every matching word
        sent_word_index_pairs = []

        for i_sent, tagged_sent in enumerate(tagged_sents):
            for i_word, tagged_word in enumerate(tagged_sent):
                c_characters, c_pos, c_mor, _ = tagged_word  # c = current

                if character_search and character not in c_characters:
                    continue
                if pos_search and not re.search(pos, c_pos):
                    continue
                # Jyutping is extracted and parsed only when a Jyutping
                # search was actually requested.
                if jp_search and not _mor_matches_jyutping(c_mor,
                                                           jp_search_tuple):
                    continue

                sent_word_index_pairs.append((i_sent, i_word))

        results_list = []

        for i_sent, i_word in sent_word_index_pairs:
            if sents:
                wanted = _clip_window(tagged_sents, i_sent,
                                      sents_left, sents_right)
                if not tagged:
                    wanted = [[x[0] for x in sent] for sent in wanted]
            else:
                wanted = _clip_window(tagged_sents[i_sent], i_word,
                                      words_left, words_right)
                if not tagged:
                    wanted = [x[0] for x in wanted]

            # a window holding a single item is returned as that item
            if len(wanted) == 1:
                wanted = wanted[0]

            results_list.append(wanted)

        fn_to_results[fn] = results_list

    return fn_to_results
def test_unicode_str_compatibility():
    """Plain and ``u``-prefixed string literals must parse to the same result."""
    expected = [('w', 'u', 'i', '5')]
    for jp in ('wui5', u'wui5'):  # note prefix u on the second literal
        assert parse_jyutping(jp) == expected
def test_no_noda():
    """A syllable without a coda parses with an empty-string coda."""
    # NOTE(review): the function name looks like a typo for "test_no_coda".
    parsed = parse_jyutping('gaa1')
    assert parsed == [('g', 'aa', '', '1')]
def test_coda_ng():
    """The two-letter coda "ng" is kept together as one coda."""
    parsed = parse_jyutping('hoeng1')
    assert parsed == [('h', 'oe', 'ng', '1')]
def test_invalid_onset():
    """'shaa1' must be rejected with a ValueError mentioning an onset error."""
    with pytest.raises(ValueError) as exc_info:
        parse_jyutping('shaa1')
    message = str(exc_info.value)
    assert 'onset error' in message
def test_invalid_nucleus():
    """'sk3' must be rejected with a ValueError mentioning a nucleus error."""
    with pytest.raises(ValueError) as exc_info:
        parse_jyutping('sk3')
    message = str(exc_info.value)
    assert 'nucleus error' in message
def test_invalid_coda():
    """'leil3' must be rejected with a ValueError mentioning a coda error."""
    with pytest.raises(ValueError) as exc_info:
        parse_jyutping('leil3')
    message = str(exc_info.value)
    assert 'coda error' in message
def test_fewer_than_2_characters():
    """A one-character input must be rejected with a ValueError saying so."""
    with pytest.raises(ValueError) as exc_info:
        parse_jyutping('3')
    message = str(exc_info.value)
    assert 'fewer than 2 characters' in message
def test_no_tone():
    """'lei' (no tone digit) must be rejected with a ValueError mentioning a tone error."""
    with pytest.raises(ValueError) as exc_info:
        parse_jyutping('lei')
    message = str(exc_info.value)
    assert 'tone error' in message
def test_syllabic_nasals():
    """Syllabic nasals parse as the nucleus, with an empty coda."""
    # TODO
    cases = [
        ('hm4', [('h', 'm', '', '4')]),
        ('ng5', [('', 'ng', '', '5')]),
        ('m4', [('', 'm', '', '4')]),
        ('n3', [('', 'n', '', '3')]),
    ]
    for jp, expected in cases:
        assert parse_jyutping(jp) == expected
def test_wrong_data_type():
    """An int argument must be rejected with a ValueError."""
    not_a_string = 123
    with pytest.raises(ValueError):
        parse_jyutping(not_a_string)
def test_basic_case_gwong2dung1waa2():
    """A three-syllable string is split into one 4-tuple per syllable."""
    expected = [
        ('gw', 'o', 'ng', '2'),
        ('d', 'u', 'ng', '1'),
        ('w', 'aa', '', '2'),
    ]
    assert parse_jyutping('gwong2dung1waa2') == expected