Python parse_jyutpingの例、pycantonese.jyutping.parse_jyutping Pythonの例

コード例 #1

0

ファイルを表示

ファイル: tipa.py プロジェクト: pycantonese/pycantonese

def jyutping2tipa(jp_str):
    """
    Convert *jp_str* to a list of LaTeX TIPA strings.

    *jp_str* is parsed with ``parse_jyutping``; every parsed syllable
    yields one string made of its ``ONSETS_TIPA`` and ``FINALS_TIPA``
    entries (whitespace-stripped) followed by its ``TONES_TIPA`` entry.
    """
    def _syllable_to_tipa(syllable):
        # A parsed syllable is indexed as (onset, nucleus, coda, tone).
        onset_key = syllable[0]
        # TODO: Separate "final" as "nucleus" and "coda" instead?
        final_key = syllable[1] + syllable[2]
        tone_key = syllable[3]
        segments = ONSETS_TIPA[onset_key] + FINALS_TIPA[final_key]
        return segments.strip() + TONES_TIPA[tone_key]

    return [_syllable_to_tipa(syllable)
            for syllable in parse_jyutping(jp_str)]

コード例 #2

0

ファイルを表示

ファイル: tipa.py プロジェクト: tisttsf/pycantonese

def jyutping2tipa(jp_str):
    """
    Convert *jp_str* to a list of LaTeX TIPA strings.

    *jp_str* is parsed with ``parse_jyutping``; every parsed syllable
    yields one string made of its ``ONSETS_TIPA`` and ``FINALS_TIPA``
    entries (whitespace-stripped) followed by its ``TONES_TIPA`` entry.
    """
    def _syllable_to_tipa(syllable):
        # A parsed syllable is indexed as (onset, nucleus, coda, tone).
        onset_key = syllable[0]
        # TODO: Separate "final" as "nucleus" and "coda" instead?
        final_key = syllable[1] + syllable[2]
        tone_key = syllable[3]
        segments = ONSETS_TIPA[onset_key] + FINALS_TIPA[final_key]
        return segments.strip() + TONES_TIPA[tone_key]

    return [_syllable_to_tipa(syllable)
            for syllable in parse_jyutping(jp_str)]

コード例 #3

0

ファイルを表示

def jyutping2yale(jp_str, as_list=False):
    """
    Convert *jp_str* to Yale.

    :param jp_str: Jyutping string; it is parsed with ``parse_jyutping``.
    :param as_list: If True (default: False), return a list of Yale strings
        for individual syllables.
    :return: A list with one Yale string per syllable if *as_list* is True.
        Otherwise a single string of the concatenated syllables, with an
        apostrophe inserted between two adjacent syllables whose boundary
        would otherwise be ambiguous.
    """
    if PY2 and isinstance(jp_str, str):  # pragma: no cover
        jp_str = unicode(jp_str)  # noqa F821 ('unicode' undefined in py >= 3)

    jp_parsed_list = parse_jyutping(jp_str)
    yale_list = []

    # Jyutping tone --> (suffix of the precomposed letter's unicode name,
    #                    combining accent used when no such letter exists)
    # Jyutping tone 1      --> macron
    # Jyutping tone 2 or 5 --> acute
    # Jyutping tone 4      --> grave
    # Jyutping tone 3 or 6 --> (no diacritic, hence not listed)
    tone_diacritics = {
        "1": (" WITH MACRON", "\u0304"),
        "2": (" WITH ACUTE", "\u0301"),
        "5": (" WITH ACUTE", "\u0301"),
        "4": (" WITH GRAVE", "\u0300"),
    }

    for jp_parsed in jp_parsed_list:
        onset = ONSETS_YALE[jp_parsed[0]]
        nucleus = NUCLEI_YALE[jp_parsed[1]]
        coda = CODAS_YALE[jp_parsed[2]]
        tone = jp_parsed[3]  # still in Jyutping

        # Yale uses "h" to mark the three low tones
        low_tone_h = "h" if tone in {"4", "5", "6"} else ""

        # in Yale, long "aa" vowel with no coda is denoted by "a"
        if nucleus == "aa" and coda == "":
            nucleus = "a"

        # Jyutping final "eu" should be Yale "ew" (not "eu").
        # This has to be decided while the nucleus is still the plain letter
        # "e": once the tone diacritic is attached below, the nucleus of a
        # tone 1/2/4/5 syllable is an accented letter and no longer equals
        # "e", so a later check would only ever fire for tones 3 and 6.
        if coda == "u" and nucleus == "e":
            coda = "w"

        # when nucleus is "yu"...
        # 1. disallow "yyu" (when onset is "y")
        # 2. change nucleus "yu" into "u" -- this is a hack for adding tone
        #       diacritic, since we don't want "y" to bear the diacritic
        if nucleus == "yu":
            if onset == "y":
                onset = ""
            nucleus = "u"

        # when nucleus is "ng"
        # the tone diacritic has to be on "g" but not "n"
        # now we pretend that the nucleus is "g", and will prepend the "n" back
        # at the end
        if nucleus == 'ng':
            nucleus = 'g'

        # add the Yale tone diacritic to the first nucleus letter.
        # If the accented letter doesn't exist in unicode, use the combining
        # accent instead.
        letter = nucleus[0]  # nucleus 1st letter
        if tone in tone_diacritics:
            name_suffix, combining_accent = tone_diacritics[tone]
            try:
                letter_with_diacritic = unicodedata.lookup(
                    unicodedata.name(letter) + name_suffix)
            except KeyError:
                letter_with_diacritic = letter + combining_accent
        else:
            # either tone 3 or tone 6
            letter_with_diacritic = letter
        nucleus = letter_with_diacritic + nucleus[1:]

        # add back "y" if the nucleus is "yu"
        # ("y" was taken away for convenience in adding tone diacritic)
        if jp_parsed[1] == "yu":
            nucleus = "y" + nucleus

        # add back "n" if the nucleus is "ng"
        # ('n' was taken away so that tone diacritic is on "g" but not "n")
        if jp_parsed[1] == 'ng':
            nucleus = 'n' + nucleus

        # save the resultant Yale syllable; with a glide coda the low-tone
        # "h" goes after the coda, otherwise right after the nucleus
        if coda in {"i", "u", "w"} and tone in {"4", "5", "6"}:
            yale = onset + nucleus + coda + low_tone_h
        else:
            yale = onset + nucleus + low_tone_h + coda
        yale_list.append(yale)

    if as_list:
        return yale_list

    # Output yale_list as a string
    # Check if there's potential ambiguity when Yale strings are concatenated

    # Ambiguity case 1:
    #   1st syllable coda is one of the "ambiguous_consonants"
    #   and 2nd syllable starts with a vowel *letter*

    # Ambiguity case 2:
    #   1st syllable has no coda and 2nd syllable starts with one of the
    #   "ambiguous_consonants"
    #   e.g., hei3hau6 'climate' --> heihauh
    #   (middle "h" for tone in 1st syllable or being onset of 2nd syllable?)

    if len(yale_list) == 1:
        return yale_list[0]

    ambiguous_consonants = {'h', 'p', 't', 'k', 'm', 'n', 'ng'}
    vowel_letters = {'a', 'e', 'i', 'o', 'u',
                     'á', 'é', 'í', 'ó', 'ú',
                     'à', 'è', 'ì', 'ò', 'ù',
                     'ā', 'ē', 'ī', 'ō', 'ū'}

    pieces = []

    for yale1, yale2 in zip(yale_list, yale_list[1:]):
        if endswithoneof(yale1, ambiguous_consonants):
            # test case 1
            ambiguous = startswithoneof(yale2, vowel_letters)
        else:
            # test case 2
            ambiguous = startswithoneof(yale2, ambiguous_consonants)

        pieces.append(yale1)

        if ambiguous:
            pieces.append('\'')

    pieces.append(yale_list[-1])

    return ''.join(pieces)

コード例 #4

0

ファイルを表示

ファイル: search.py プロジェクト: tisttsf/pycantonese

def perform_search(fn_to_tagged_sents,
                   onset=None,
                   nucleus=None,
                   coda=None,
                   tone=None,
                   initial=None,
                   final=None,
                   jyutping=None,
                   character=None,
                   pos=None,
                   word_range=(0, 0),
                   sent_range=(0, 0),
                   tagged=True,
                   sents=False):
    r"""
    Search tagged sentences for words matching the given search elements.

    overall strategy: deal with jp (and all jp-related elements) first, and
                      then the character

    1. jp

    hierarchy of jp and associated search elements:
                      jp
                 /     |    \
      onset/initial  final  tone
                     /   \
                nucleus  coda

    lower search elements cannot be used together with dominating higher
    elements

    :param fn_to_tagged_sents: Mapping whose values are sequences of tagged
        sentences. Each tagged word is unpacked as a 4-tuple
        ``(characters, pos, mor, _)``.
    :param onset: Onset to match.
    :param nucleus: Nucleus to match.
    :param coda: Coda to match.
    :param tone: Tone to match.
    :param initial: Alias of *onset*; on its own it triggers a Jyutping
        search just like *onset* does.
    :param final: Final to match; split into nucleus and coda by
        ``parse_final``. Cannot be combined with *nucleus* or *coda*.
    :param jyutping: Jyutping of exactly one character. Cannot be combined
        with any other Jyutping element.
    :param character: Matched with ``in`` against a word's characters.
    :param pos: Pattern passed to ``re.search`` against a word's pos tag.
    :param word_range: Tuple ``(left, right)`` of ints; number of context
        words returned around each match when sentences are not requested.
    :param sent_range: Tuple ``(left, right)`` of ints; number of context
        sentences returned around each match. A positive value forces
        *sents* to True.
    :param tagged: If False, only the first item of every tagged word is
        kept in the results.
    :param sents: If True, return sentences instead of words.
    :return: Dict mapping every key of *fn_to_tagged_sents* to its list of
        results. A result holding a single word/sentence is unwrapped from
        its one-element list.
    :raises ValueError: If the ranges are not tuples of ints, if no search
        element is given, or if the Jyutping elements are incompatible or
        invalid.
    """
    # ensure tuple type: word_range and sent_range
    if not (isinstance(word_range, tuple) and isinstance(sent_range, tuple)):
        raise ValueError('word_range and sent_range must be tuples')

    words_left, words_right = word_range
    sents_left, sents_right = sent_range

    # ensure int type: words_left, words_right, sents_left, sents_right
    if not all(isinstance(n, int) for n in
               (words_left, words_right, sents_left, sents_right)):
        raise ValueError('int required for {words, sents}_{left, right}')

    if sents_left > 0 or sents_right > 0:
        sents = True

    # determine what kinds of search we are doing
    character_search = bool(character)
    # "initial" is an alias of "onset", so it must count as a Jyutping
    # search element too; otherwise initial-only searches are rejected or
    # silently ignored.
    jp_search = bool(onset or initial or nucleus or coda or tone or final or
                     jyutping)
    pos_search = bool(pos)

    if not (character_search or jp_search or pos_search):
        raise ValueError('no search elements')

    # check if jyutping search is valid
    jp_search_tuple = (None, None, None, None)

    if jp_search:

        # ensure compatible jyutping search elements
        if final and (nucleus or coda):
            raise ValueError('final cannot be used together with '
                             'either nucleus or coda (or both)')
        if jyutping and (onset or initial or final or nucleus or coda or
                         tone):
            raise ValueError('jyutping cannot be used together with other '
                             'Jyutping elements')
        if (onset != initial) and onset and initial:
            raise ValueError('onset conflicts with initial')

        # onset/initial
        if initial:
            onset = initial

        # determine jp_search_tuple
        if jyutping:
            try:
                jp_search_list = parse_jyutping(jyutping)
            except ValueError:
                raise ValueError('invalid jyutping -- %s' % (repr(jyutping)))
            if len(jp_search_list) > 1:
                raise ValueError('only jyutping for one character is allowed')
            jp_search_tuple = jp_search_list[0]
        else:
            if final:
                nucleus, coda = parse_final(final)
            jp_search_tuple = (onset, nucleus, coda, tone)

    fn_to_results = {}

    for fn, tagged_sents in fn_to_tagged_sents.items():
        sent_word_index_pairs = []

        for i_sent, tagged_sent in enumerate(tagged_sents):

            for i_word, tagged_word in enumerate(tagged_sent):
                c_characters, c_pos, c_mor, _ = tagged_word  # c = current
                c_jyutping = get_jyutping_from_mor(c_mor)

                # determine character_search and pos_search
                if character_search:
                    character_match = character in c_characters
                else:
                    character_match = True

                if pos_search:
                    pos_match = bool(re.search(pos, c_pos))
                else:
                    pos_match = True

                if not (character_match and pos_match):
                    continue

                # determine if jyutping matches c_jyutping
                jyutping_match = False

                if not jp_search:
                    jyutping_match = True
                elif c_jyutping:
                    try:
                        c_parsed_jyutpings = parse_jyutping(c_jyutping)
                    except ValueError:
                        # words whose jyutping cannot be parsed never match
                        continue

                    for c_parsed_jyutping in c_parsed_jyutpings:

                        booleans = [
                            _jp_element_match(search_, current_)
                            for search_, current_ in zip(
                                jp_search_tuple, c_parsed_jyutping)
                        ]

                        if all(booleans):
                            jyutping_match = True
                        # NOTE(review): this break is unconditional, so only
                        # the first syllable of a word is ever compared --
                        # confirm whether later syllables should be tried.
                        break

                if jyutping_match:
                    sent_word_index_pairs.append((i_sent, i_word))

        results_list = []

        for i_sent, i_word in sent_word_index_pairs:
            if not sents:
                tagged_sent = tagged_sents[i_sent]
                # clamp the context window to the sentence boundaries
                i_word_start = max(i_word - words_left, 0)
                i_word_end = min(i_word + words_right + 1, len(tagged_sent))

                words_wanted = tagged_sent[i_word_start:i_word_end]

                if not tagged:
                    words_wanted = [x[0] for x in words_wanted]

                if len(words_wanted) == 1:
                    words_wanted = words_wanted[0]

                results_list.append(words_wanted)
            else:
                # clamp the context window to the available sentences
                i_sent_start = max(i_sent - sents_left, 0)
                i_sent_end = min(i_sent + sents_right + 1, len(tagged_sents))

                sents_wanted = tagged_sents[i_sent_start:i_sent_end]

                if not tagged:
                    sents_wanted = [[x[0] for x in sent]
                                    for sent in sents_wanted]

                if len(sents_wanted) == 1:
                    sents_wanted = sents_wanted[0]

                results_list.append(sents_wanted)

        fn_to_results[fn] = results_list

    return fn_to_results

コード例 #5

0

ファイルを表示

ファイル: test_parse_jyutping.py プロジェクト: tisttsf/pycantonese

def test_unicode_str_compatibility():
    """A plain literal and a u-prefixed literal must parse identically."""
    expected = [('w', 'u', 'i', '5')]
    for jp_str in ('wui5', u'wui5'):  # note prefix u on the second literal
        assert parse_jyutping(jp_str) == expected

コード例 #6

0

ファイルを表示

ファイル: test_parse_jyutping.py プロジェクト: tisttsf/pycantonese

def test_no_noda():
    """A syllable without a coda is parsed with an empty-string coda."""
    parsed = parse_jyutping('gaa1')
    expected = [('g', 'aa', '', '1')]
    assert parsed == expected

コード例 #7

0

ファイルを表示

ファイル: test_parse_jyutping.py プロジェクト: tisttsf/pycantonese

def test_coda_ng():
    """The two-letter coda "ng" is kept together as a single coda."""
    parsed = parse_jyutping('hoeng1')
    expected = [('h', 'oe', 'ng', '1')]
    assert parsed == expected

コード例 #8

0

ファイルを表示

ファイル: test_parse_jyutping.py プロジェクト: tisttsf/pycantonese

def test_invalid_onset():
    """An invalid onset raises ValueError mentioning 'onset error'."""
    with pytest.raises(ValueError) as excinfo:
        parse_jyutping('shaa1')
    message = str(excinfo.value)
    assert 'onset error' in message

コード例 #9

0

ファイルを表示

ファイル: test_parse_jyutping.py プロジェクト: tisttsf/pycantonese

def test_invalid_nucleus():
    """An invalid nucleus raises ValueError mentioning 'nucleus error'."""
    with pytest.raises(ValueError) as excinfo:
        parse_jyutping('sk3')
    message = str(excinfo.value)
    assert 'nucleus error' in message

コード例 #10

0

ファイルを表示

ファイル: test_parse_jyutping.py プロジェクト: tisttsf/pycantonese

def test_invalid_coda():
    """An invalid coda raises ValueError mentioning 'coda error'."""
    with pytest.raises(ValueError) as excinfo:
        parse_jyutping('leil3')
    message = str(excinfo.value)
    assert 'coda error' in message

コード例 #11

0

ファイルを表示

ファイル: test_parse_jyutping.py プロジェクト: tisttsf/pycantonese

def test_fewer_than_2_characters():
    """A one-character input raises ValueError about its length."""
    with pytest.raises(ValueError) as excinfo:
        parse_jyutping('3')
    message = str(excinfo.value)
    assert 'fewer than 2 characters' in message

コード例 #12

0

ファイルを表示

ファイル: test_parse_jyutping.py プロジェクト: tisttsf/pycantonese

def test_no_tone():
    """A syllable without a tone digit raises ValueError ('tone error')."""
    with pytest.raises(ValueError) as excinfo:
        parse_jyutping('lei')
    message = str(excinfo.value)
    assert 'tone error' in message

コード例 #13

0

ファイルを表示

ファイル: test_parse_jyutping.py プロジェクト: tisttsf/pycantonese

def test_syllabic_nasals():
    """Syllabic nasals are parsed as a nucleus with empty onset and coda."""
    # TODO assert parse_jyutping('hm4') == [('h', 'm', '', '4')]
    cases = [
        ('ng5', ('', 'ng', '', '5')),
        ('m4', ('', 'm', '', '4')),
        ('n3', ('', 'n', '', '3')),
    ]
    for jp_str, syllable in cases:
        assert parse_jyutping(jp_str) == [syllable]

コード例 #14

0

ファイルを表示

ファイル: test_parse_jyutping.py プロジェクト: tisttsf/pycantonese

def test_wrong_data_type():
    """A non-string argument is rejected with ValueError."""
    not_a_string = 123
    with pytest.raises(ValueError):
        parse_jyutping(not_a_string)

コード例 #15

0

ファイルを表示

ファイル: test_parse_jyutping.py プロジェクト: tisttsf/pycantonese

def test_basic_case_gwong2dung1waa2():
    """A three-syllable string is split into three parsed syllables."""
    expected = [
        ('gw', 'o', 'ng', '2'),
        ('d', 'u', 'ng', '1'),
        ('w', 'aa', '', '2'),
    ]
    parsed = parse_jyutping('gwong2dung1waa2')
    assert parsed == expected

コード例 #16

0

ファイルを表示

ファイル: yale.py プロジェクト: pycantonese/pycantonese

def jyutping2yale(jp_str, as_list=False):
    """
    Convert *jp_str* to Yale.

    :param jp_str: Jyutping string; it is parsed with ``parse_jyutping``.
    :param as_list: If True (default: False), return a list of Yale strings
        for individual syllables.
    :return: A list with one Yale string per syllable if *as_list* is True.
        Otherwise a single string of the concatenated syllables, with an
        apostrophe inserted between two adjacent syllables whose boundary
        would otherwise be ambiguous.
    """
    if PY2 and isinstance(jp_str, str):  # pragma: no cover
        jp_str = unicode(jp_str)  # noqa F821 ('unicode' undefined in py >= 3)

    jp_parsed_list = parse_jyutping(jp_str)
    yale_list = []

    # Jyutping tone --> (suffix of the precomposed letter's unicode name,
    #                    combining accent used when no such letter exists)
    # Jyutping tone 1      --> macron
    # Jyutping tone 2 or 5 --> acute
    # Jyutping tone 4      --> grave
    # Jyutping tone 3 or 6 --> (no diacritic, hence not listed)
    tone_diacritics = {
        "1": (" WITH MACRON", "\u0304"),
        "2": (" WITH ACUTE", "\u0301"),
        "5": (" WITH ACUTE", "\u0301"),
        "4": (" WITH GRAVE", "\u0300"),
    }

    for jp_parsed in jp_parsed_list:
        onset = ONSETS_YALE[jp_parsed[0]]
        nucleus = NUCLEI_YALE[jp_parsed[1]]
        coda = CODAS_YALE[jp_parsed[2]]
        tone = jp_parsed[3]  # still in Jyutping

        # Yale uses "h" to mark the three low tones
        low_tone_h = "h" if tone in {"4", "5", "6"} else ""

        # in Yale, long "aa" vowel with no coda is denoted by "a"
        if nucleus == "aa" and coda == "":
            nucleus = "a"

        # Jyutping final "eu" should be Yale "ew" (not "eu").
        # This has to be decided while the nucleus is still the plain letter
        # "e": once the tone diacritic is attached below, the nucleus of a
        # tone 1/2/4/5 syllable is an accented letter and no longer equals
        # "e", so a later check would only ever fire for tones 3 and 6.
        if coda == "u" and nucleus == "e":
            coda = "w"

        # when nucleus is "yu"...
        # 1. disallow "yyu" (when onset is "y")
        # 2. change nucleus "yu" into "u" -- this is a hack for adding tone
        #       diacritic, since we don't want "y" to bear the diacritic
        if nucleus == "yu":
            if onset == "y":
                onset = ""
            nucleus = "u"

        # when nucleus is "ng"
        # the tone diacritic has to be on "g" but not "n"
        # now we pretend that the nucleus is "g", and will prepend the "n" back
        # at the end
        if nucleus == 'ng':
            nucleus = 'g'

        # add the Yale tone diacritic to the first nucleus letter.
        # If the accented letter doesn't exist in unicode, use the combining
        # accent instead.
        letter = nucleus[0]  # nucleus 1st letter
        if tone in tone_diacritics:
            name_suffix, combining_accent = tone_diacritics[tone]
            try:
                letter_with_diacritic = unicodedata.lookup(
                    unicodedata.name(letter) + name_suffix)
            except KeyError:
                letter_with_diacritic = letter + combining_accent
        else:
            # either tone 3 or tone 6
            letter_with_diacritic = letter
        nucleus = letter_with_diacritic + nucleus[1:]

        # add back "y" if the nucleus is "yu"
        # ("y" was taken away for convenience in adding tone diacritic)
        if jp_parsed[1] == "yu":
            nucleus = "y" + nucleus

        # add back "n" if the nucleus is "ng"
        # ('n' was taken away so that tone diacritic is on "g" but not "n")
        if jp_parsed[1] == 'ng':
            nucleus = 'n' + nucleus

        # save the resultant Yale syllable; with a glide coda the low-tone
        # "h" goes after the coda, otherwise right after the nucleus
        if coda in {"i", "u", "w"} and tone in {"4", "5", "6"}:
            yale = onset + nucleus + coda + low_tone_h
        else:
            yale = onset + nucleus + low_tone_h + coda
        yale_list.append(yale)

    if as_list:
        return yale_list

    # Output yale_list as a string
    # Check if there's potential ambiguity when Yale strings are concatenated

    # Ambiguity case 1:
    #   1st syllable coda is one of the "ambiguous_consonants"
    #   and 2nd syllable starts with a vowel *letter*

    # Ambiguity case 2:
    #   1st syllable has no coda and 2nd syllable starts with one of the
    #   "ambiguous_consonants"
    #   e.g., hei3hau6 'climate' --> heihauh
    #   (middle "h" for tone in 1st syllable or being onset of 2nd syllable?)

    if len(yale_list) == 1:
        return yale_list[0]

    ambiguous_consonants = {'h', 'p', 't', 'k', 'm', 'n', 'ng'}
    vowel_letters = {'a', 'e', 'i', 'o', 'u',
                     'á', 'é', 'í', 'ó', 'ú',
                     'à', 'è', 'ì', 'ò', 'ù',
                     'ā', 'ē', 'ī', 'ō', 'ū'}

    pieces = []

    for yale1, yale2 in zip(yale_list, yale_list[1:]):
        if endswithoneof(yale1, ambiguous_consonants):
            # test case 1
            ambiguous = startswithoneof(yale2, vowel_letters)
        else:
            # test case 2
            ambiguous = startswithoneof(yale2, ambiguous_consonants)

        pieces.append(yale1)

        if ambiguous:
            pieces.append('\'')

    pieces.append(yale_list[-1])

    return ''.join(pieces)