def extract_components(input, orthography='honzi_jcz'):
    """Split a mixed honzi/Latin dictionary entry into output tokens.

    Returns a list of strings: all Jyutping syllables when orthography is
    'jcz_only', otherwise a honzi/Jyutping mix where a honzi glyph is
    replaced by its Jyutping syllable only when it appears in mouth_list.
    Assumes the whole input maps to a single characters_to_jyutping entry.
    """
    assert orthography in {'honzi_jcz', 'jcz_only'}

    def _is_latin(ch):
        # ASCII letters only; sinoglyphs are alphabetic in Unicode but not ASCII
        return ch.isascii() and ch.isalpha()

    parses = pc.characters_to_jyutping(input)
    assert len(parses) == 1
    syllables = pc.parse_jyutping(parses[0][1])
    if orthography == 'jcz_only':
        return [str(syl) for syl in syllables]

    # One (glyph, jyutping) pair for each non-Latin, non-hyphen glyph.
    glyph_targets = [pc.characters_to_jyutping(g)[0]
                     for g in input if not _is_latin(g) and g != "-"]
    result = []
    cursor = 0
    for pos, syl in enumerate(syllables):
        if cursor >= len(glyph_targets):
            # No glyphs left to match: emit the remaining syllables verbatim.
            result.extend(str(rest) for rest in syllables[pos:])
            return result
        glyph, jping = glyph_targets[cursor]
        # Compare with the trailing tone digit stripped.
        if str(syl)[:-1] == jping[:-1]:
            result.append(str(syl) if glyph in mouth_list else glyph)
            cursor += 1
        else:
            result.append(str(syl))
    return result
 def search_single_char(self, word_dict, m_char):
     """Find *m_char* inside any key of *word_dict* and return its
     joined jyutping syllable, or None when no key contains it."""
     for entry, jyutping_str in word_dict.items():
         if m_char not in entry:
             continue
         position = list(entry).index(m_char)
         parsed = pc.parse_jyutping(jyutping_str)
         # Join onset/nucleus/coda/tone back into one syllable string.
         return ''.join(list(parsed[position]))
def get_word_phone_list(word_dict, word_list):
    """
        get phone list and phone array of word_list

    Parameters:
        word_dict -- mapping from word (or single char) to its jyutping string
        word_list -- list of words in a sentence ["我","是个"]

    Returns:
        phone_list -- flat list of lexicon phones for the whole sentence
        tone_list -- one tone digit (str) per syllable
        syl_map -- OrderedDict: phone index -> syllable (char) index
        word_map -- OrderedDict: syllable (char) index -> word index
    """

    phone_list = []
    tone_list = []
    syl_map = OrderedDict()
    word_map = OrderedDict()
    phone_index = 0  # running index over phones in the sentence
    char_index = 0  # running index over syllables/characters
    word_index = 0  # running index over words
    # syllable (tone stripped) -> phone sequence, loaded from the lexicon file
    lex_dict = mld.get_lexicon_dict('./lexicon.txt')
    for word in word_list:
        word = word.strip()
        try:
            word_phone = word_dict[word]
        except Exception as e:
            # Whole word missing from the dictionary: build its jyutping
            # character by character instead.
            word_phone_list = []
            # need find one by one
            for char in list(word):
                try:
                    char_phone = word_dict[char]
                except Exception as e:
                    # Fall back to scanning dictionary keys for the character.
                    char_phone = mld.search_single_char(word_dict, char)
                    if char_phone is None:
                        # NOTE(review): lookup failed entirely; the .strip()
                        # below will raise AttributeError on None -- confirm
                        # whether a hard failure here is intended.
                        print(char)
                char_phone_list = char_phone.strip().split(" ")
                word_phone_list.extend(char_phone_list)
            word_phone = ''.join(word_phone_list)
        jp = pc.parse_jyutping(word_phone)
        for phone_t in jp:
            char_phone = list(phone_t)
            char_phone = [e_phone for e_phone in char_phone if e_phone != '']
            # The last element of a parsed syllable is the tone digit.
            assert char_phone[-1].isdigit()
            char_phone_list = lex_dict[''.join(char_phone[:-1])]
            for my_phone in char_phone_list:
                syl_map[phone_index] = char_index
                phone_index = phone_index + 1
            phone_list.extend(char_phone_list)
            tone_list.append(char_phone[-1])
            word_map[char_index] = word_index
            char_index = char_index + 1
            # non_tone_line_phones.append(''.join(char_phone[:-1]))
        word_index = word_index + 1
    #     logging.debug("phone_list:" + ' '.join(phone_list))
    return phone_list, tone_list, syl_map, word_map
def pycantonese_converter(input,
                          cur_cmu,
                          orthography='honzi_jcz',
                          sep_eng_words=True):
    """Convert mixed Cantonese/English text into a list of output tokens.

    Parameters:
        input -- raw text to convert
        cur_cmu -- pronunciation dictionary passed through to words_to_jyutping
        orthography -- 'jcz_only' emits jyutping for every honzi;
                       'honzi_jcz' keeps honzi except glyphs in mouth_list
        sep_eng_words -- whether whitespace between English words is kept

    Returns:
        (outs, unparsed) -- token list and the set of unconvertible words.
    """
    assert orthography in {'honzi_jcz', 'jcz_only'}
    frags = re.split(r'(\s+)',
                     input)  # preserve whitespace except space character
    frags = list(filter(lambda x: x not in {'', ' '}, frags))
    outs, unparsed = [], set()
    prev_was_eng_word = False  # used for adding space between words
    for frag in frags:
        # Keep non-space whitespace (tabs/newlines) as literal tokens.
        if frag.isspace() and (frag != ' ' and sep_eng_words):
            outs.append(frag)
            prev_was_eng_word = False
            continue
        for glyphs, jpings in pc.characters_to_jyutping(frag):
            if jpings is not None:
                jping_arr = pc.parse_jyutping(jpings)
                if re.search('[a-zA-Z]', glyphs):
                    # this is a dictionary entry with both honzi and latin characters
                    outs.extend(
                        extract_components(glyphs, orthography=orthography))
                else:
                    if orthography == 'jcz_only':
                        outs.extend([str(x) for x in jping_arr])
                    else:
                        # honzi_jcz: replace only "mouth" glyphs with jyutping.
                        for i, glyph in enumerate(glyphs):
                            outs.append(
                                str(jping_arr[i]) if glyph in
                                mouth_list else glyph)
                prev_was_eng_word = False
            elif len(glyphs) == 1 and not is_alphanum(
                    glyphs):  #glyphs in puncs or glyphs.isdigit():
                outs.append(glyphs)
                prev_was_eng_word = False
            else:
                # Unconverted English word: fall back to CMU-based conversion.
                outs, unparsed, prev_was_eng_word = words_to_jyutping(
                    glyphs, cur_cmu, outs, unparsed, prev_was_eng_word
                    and sep_eng_words)
    return outs, unparsed
def test_invalid_onset():
    """An unknown onset ("sh") is rejected with an onset error."""
    with pytest.raises(ValueError) as excinfo:
        parse_jyutping("shaa1")
    message = str(excinfo.value)
    assert "onset error" in message
def test_invalid_nucleus():
    """A syllable with no valid nucleus ("sk3") is rejected."""
    with pytest.raises(ValueError) as excinfo:
        parse_jyutping("sk3")
    message = str(excinfo.value)
    assert "nucleus error" in message
def test_invalid_coda():
    """An impossible coda ("l" in "leil3") is rejected."""
    with pytest.raises(ValueError) as excinfo:
        parse_jyutping("leil3")
    message = str(excinfo.value)
    assert "coda error" in message
def test_fewer_than_2_characters():
    """A one-character input cannot form a syllable and is rejected."""
    with pytest.raises(ValueError) as excinfo:
        parse_jyutping("3")
    message = str(excinfo.value)
    assert "fewer than 2 characters" in message
def test_no_tone():
    """A syllable missing its trailing tone digit is rejected."""
    with pytest.raises(ValueError) as excinfo:
        parse_jyutping("lei")
    message = str(excinfo.value)
    assert "tone error" in message
def tojyutping_converter(input,
                         cur_cmu,
                         orthography='honzi_jcz',
                         sep_eng_words=True):
    # input: original input from transliterate, cur_cmu
    # output: array of syllables/glyphs (outs)
    """Convert text to a honzi/jyutping token list using ToJyutping.

    Same contract as pycantonese_converter, but pronunciations come from
    ToJyutping; runs of unparsed Latin characters are regrouped into whole
    words before conversion.  Returns (outs, unparsed).
    """

    outs, unparsed, word = [], set(), ""

    # ToJyutping has better pronunciation values compared to PyCantonese
    tj_parse = ToJyutping.get_jyutping_list(input)

    # parse words properly
    # i.e. convert [('做', 'zou6'), ('g', None), ('y', None), ('m', None)]
    # to [('做', 'zou6'), ('gym', None)]
    new_tj_parse, word, ptr = [], "", 0
    # A-Z and a-z and 0-9 only
    while ptr < len(tj_parse):
        glyph, jping = tj_parse[ptr]
        # print(glyph, jping)
        if not is_loweralphanum(
                glyph):  # note sinoglyphs are alphabetic in unicode
            if glyph != " ":
                new_tj_parse.append(tj_parse[ptr])
            ptr += 1
        else:
            # Accumulate a run of unparsed alphanumerics into one word.
            # NOTE(review): if a loweralphanum glyph ever arrives with a
            # non-None jyutping, ptr is not advanced on this path and the
            # outer loop never terminates -- confirm ToJyutping cannot
            # produce that combination.
            word = ""
            while is_loweralphanum(
                    glyph) and jping is None and ptr < len(tj_parse):
                # print("word =", word)
                glyph, jping = tj_parse[ptr]
                if is_loweralphanum(glyph) and jping is None:
                    word += glyph
                    ptr += 1
                else:
                    break
            # NOTE(review): word can never equal " " (only alphanumerics are
            # appended), so this guard always passes; presumably it was
            # meant to be `word != ""` -- confirm before changing.
            if word != " ":
                new_tj_parse.append((word, None))

    prev_was_eng_word = False  # used for adding space between words

    # perform the conversion to Honzi-Jyutping mix
    # i.e. convert [('做', 'zou6'), ('gym', None)] to ['做','zim1']
    for i, pair in enumerate(new_tj_parse):
        glyphs, jpings = pair
        if jpings is not None:
            jping_arr = pc.parse_jyutping(jpings.replace(" ", ""))
            for j, glyph in enumerate(glyphs):
                if orthography == 'jcz_only':
                    # NOTE(review): extends once per glyph of the entry;
                    # harmless only while each entry holds a single glyph
                    # (as ToJyutping's per-character output suggests) --
                    # confirm, otherwise syllables are duplicated.
                    outs.extend(jpings.split(" "))
                elif orthography == 'honzi_jcz':
                    outs.append(
                        str(jping_arr[j]) if glyph in mouth_list else glyph)
            prev_was_eng_word = False
        elif len(glyphs) == 1 and not is_alphanum(
                glyphs):  #glyphs in puncs or glyphs.isdigit():
            outs.append(glyphs)
            prev_was_eng_word = False
        else:
            # English word: convert via the CMU-dictionary helper.
            outs, unparsed, prev_was_eng_word = words_to_jyutping(
                glyphs, cur_cmu, outs, unparsed, prev_was_eng_word
                and sep_eng_words)
    return outs, unparsed
        if (not (re.match('X.+', word[1]) == None) and not (word[0] == '鴨寮街')):
            allwords[file].pop(i)
        i = i + 1
# 揸fit, call機x3, 操fitx2 were also excluded
        
# Parse every collected word's jyutping, recording which file it came from
# and keeping the words that fail to parse for later inspection.
parsed_words = []
unparsed_words = []
parsed_word_files = []

for file in allwords.keys():
    i = 0  # NOTE(review): never read below; kept to preserve the namespace
    # Strip the absolute "X:\...\hkcancor\" prefix, keeping just the file name.
    filename = re.sub(r"[A-Z]:\\.*\\hkcancor\\", "", file)
    for word in allwords[file]:
        try:
            print("Word parsed: ", word)
            parsed_words.append(pc.parse_jyutping(word[2]))
            parsed_word_files.append(filename)
        except (ValueError, IndexError):
            # ValueError: malformed jyutping; IndexError: short/empty record.
            # Both handlers were identical, so they are merged here.
            unparsed_words.append(word)
            print("Error: The word ", word[2], " cannot be parsed.")

# Flatten parsed words into individual syllables.
parsed_syls = []
parsed_syls_files = []

i = 0  # NOTE(review): never read below; kept to preserve the namespace
for word in parsed_words:
    parsed_syls.extend(word)
def test_wrong_data_type():
    """A non-string input (int) is rejected with ValueError."""
    pytest.raises(ValueError, parse_jyutping, 123)
def test_basic_case_gwong2dung1waa2():
    """Three syllables parse into (onset, nucleus, coda, tone) tuples."""
    expected = [
        ("gw", "o", "ng", "2"),
        ("d", "u", "ng", "1"),
        ("w", "aa", "", "2"),
    ]
    assert parse_jyutping("gwong2dung1waa2") == expected
# Beispiel #14
# 0
    def get_word_phone_list(self,word_dict, word_list):
        """
            get phone list and phone array of word_list

        Parameters:
            word_dict -- mapping from word to its jyutping string
            word_list -- list of words in a sentence ["我","是个"]

        Returns:
            phone_list -- flat list of lexicon phones for the sentence
            tone_list -- one tone digit (str) per syllable
            syl_map -- OrderedDict: phone index -> syllable index
            word_map -- OrderedDict: syllable index -> word index
            non_tone_line_phones -- syllables with the tone digit stripped
        """

        phone_list = []
        tone_list = []
        syl_map = OrderedDict()
        word_map = OrderedDict()
        phone_index = 0  # running index over phones in the sentence
        char_index = 0  # running index over syllables/characters
        word_index = 0  # running index over words
        non_tone_line_phones = []
        for word in word_list:
            word = word.strip()
            try:
                word_phone = word_dict[word]
            except Exception as e:
                # Word missing from dict: fall back to per-character lookup.
                temp_word_phone = jyutping.get(word)
                temp_word_phone_renew = []
                # if polyphone appear, just pick first one
                for char_phone in temp_word_phone:
                    if isinstance(char_phone,list):
                        temp_word_phone_renew.append(char_phone[0])
                    else:
                        temp_word_phone_renew.append(char_phone)
                if temp_word_phone[0] == None:
                    # No pronunciation at all: drop into the debugger, then quit.
                    # NOTE(review): pdb/exit(0) left in -- debug-only path?
                    print(word)
                    pdb.set_trace()
                    exit(0)
                word_phone = ''.join(temp_word_phone_renew)
            # Special-cased values -- presumably forms pc.parse_jyutping
            # rejects; confirm against the pycantonese version in use.
            if word_phone == 'hng1':
                jp = [('h','ng','1')]
            elif word_phone == 'ung2':
                jp = [('u','ng','2')]
            else:
                try:
                    jp = pc.parse_jyutping(word_phone)
                except Exception as e:
                    pdb.set_trace()  # NOTE(review): debugger left in on parse failure
            for phone_t in jp:
                char_phone = list(phone_t)
                char_phone = [e_phone for e_phone in char_phone if e_phone != '']
                # The last element of a parsed syllable is the tone digit.
                assert char_phone[-1].isdigit()
                try:
                    char_phone_list = self.lex_dict[''.join(char_phone[:-1])]
                except Exception as e:
                    pdb.set_trace()  # NOTE(review): syllable missing from lexicon
                for my_phone in char_phone_list:
                    syl_map[phone_index] = char_index
                    phone_index = phone_index + 1
                phone_list.extend(char_phone_list)
                tone_list.append(char_phone[-1])
                word_map[char_index] = word_index
                char_index = char_index + 1
                non_tone_line_phones.append(''.join(char_phone[:-1]))
            word_index = word_index + 1
        #     logging.debug("phone_list:" + ' '.join(phone_list))
        return phone_list, tone_list, syl_map, word_map,non_tone_line_phones
# Beispiel #15
# 0
    
    age_to_tones[age] = Counter()
    
    for tagged_word in tagged_words:
        
        # jyutping should be like "gaa1jau2" (two syllables), "ngo5" (one syllable) etc
        mor = tagged_word[2]
        jyutping, _, _ = mor.partition('-')
        jyutping, _, _ = jyutping.partition('&')

        if not jyutping:
            continue
        
        # use PyCantonese to parse the "jyutping" str
        try:
            jyutping_parsed_list = pc.parse_jyutping(jyutping)
        except:
            continue
        
        for jyutping_parsed in jyutping_parsed_list:
            _, _, _, tone = jyutping_parsed  # (onset, nucleus, coda, tone)
            age_to_tones[age][tone] += 1


# Creating the dataframe for plotting the desired heatmap
# ---------------------------------------------------------------
#
# The dataframe has three columns and is built from `data_dict`.

# In[17]:
def test_coda_ng():
    """The velar-nasal coda "ng" is split off correctly."""
    result = parse_jyutping("hoeng1")
    assert result == [("h", "oe", "ng", "1")]
def test_no_noda():
    """A syllable without a coda yields an empty coda slot.

    NOTE(review): "noda" is presumably a typo for "coda"; the name is
    kept unchanged so test discovery/selection is unaffected.
    """
    result = parse_jyutping("gaa1")
    assert result == [("g", "aa", "", "1")]
def test_null_input(input_):
    """Null-ish input parses to an empty list.

    NOTE(review): *input_* is supplied externally; the parametrize
    decorator/fixture is not visible in this chunk -- confirm it exists.
    """
    result = parse_jyutping(input_)
    assert result == []
# Beispiel #19
# 0
    def get_word_phone_list(self, word_list, using_tool):
        """
            get phone list and phone array of word_list

        Parameters:
            word_list -- list of words in a sentence ["我","是个"] without non-verbal information
            using_tool -- whether use tool (pypinyin) instead of dictionary to fetch phone sequence
        TO DO: add more functions for language support
        :return
            phone list : ph e m e j
            tone list : 1 2 3
            syl_map: [p1:s1,p2,s1,p3,s1,p4,p4]
            word_map
            non_tone_line_phones
        """
        flag = False  # NOTE(review): never used in this method
        phone_list = []
        tone_list = []
        syl_map = OrderedDict()
        word_map = OrderedDict()
        # phone index the index of phone in one sentence
        phone_index = 0
        # char index the index of char in one sentence
        char_index = 0

        word_index = 0
        non_tone_line_phones = []
        for word in word_list:
            word = word.strip()
            # get the phone or word
            if not using_tool:
                try:
                    # NOTE(review): word_dict is not a parameter of this
                    # method (the docstring suggests it once was) -- this
                    # raises NameError unless a global/class attribute of
                    # that name exists; confirm.
                    word_phone = word_dict[word]
                except Exception as e:
                    temp_word_phone = jyutping.get(word)
                    temp_word_phone_renew = []
                    # if polyphone appear, just pick first one
                    for char_phone in temp_word_phone:
                        if isinstance(char_phone, list):
                            temp_word_phone_renew.append(char_phone[0])
                        else:
                            temp_word_phone_renew.append(char_phone)
                    word_phone = ''.join(temp_word_phone_renew)
                    # word_phone [('j', 'a', 't', '1'),
                    #  ('g', 'a', 'u', '2'),
                    #  ('s', 'e', 'i', '3'),
                    #  ('g', 'a', 'u', '2'),
                    #  ('n', 'i', 'n', '4')]
                    # Special-cased values -- presumably forms that
                    # pc.parse_jyutping rejects; confirm.
                    if word_phone == 'hng1':
                        word_phone_list = [('h', 'ng', '1')]
                    elif word_phone == 'ung2':
                        word_phone_list = [('u', 'ng', '2')]
                    else:
                        try:
                            word_phone_list = pc.parse_jyutping(word_phone)
                        except Exception as e:
                            pdb.set_trace()  # NOTE(review): debugger left in
            else:
                word_phone_list = []
                # word = HanziConv.toSimplified(word)
                for character in pinyin(word, style=Style.TONE3):
                    if not character[0][-1].isdigit():
                        # neutral tone is treated as the fifth tone
                        character[0] += '5'
                    # assert character[0][-1].isdigit()
                    char_phone_sequence = []
                    char_phone_sequence = self.chinese_dict[character[0]
                                                            [:-1]].copy()
                    char_phone_sequence.append(character[0][-1])
                    word_phone_list.append(char_phone_sequence)

            for phone_t in word_phone_list:
                char_phone = phone_t
                char_phone = [
                    e_phone for e_phone in char_phone if e_phone != ''
                ]
                # The last element of a parsed syllable is the tone digit.
                assert char_phone[-1].isdigit()
                char_phone_list = char_phone[:-1]
                for my_phone in char_phone_list:
                    syl_map[phone_index] = char_index
                    phone_index = phone_index + 1
                phone_list.extend(char_phone_list)
                tone_list.append(char_phone[-1])
                word_map[char_index] = word_index
                char_index = char_index + 1
                non_tone_line_phones.append(''.join(char_phone[:-1]))
            word_index = word_index + 1
        #     logging.debug("phone_list:" + ' '.join(phone_list))
        return phone_list, tone_list, syl_map, word_map, non_tone_line_phones
def test_syllabic_nasals():
    """Syllabic nasals parse with empty onset and coda; nasal is the nucleus."""
    # TODO assert parse_jyutping('hm4') == [('h', 'm', '', '4')]
    cases = {
        "ng5": [("", "ng", "", "5")],
        "m4": [("", "m", "", "4")],
        "n3": [("", "n", "", "3")],
    }
    for jyutping_str, expected in cases.items():
        assert parse_jyutping(jyutping_str) == expected