def get_all_jyutping(word):
    """Collect candidate jyutping readings for ``word``.

    Candidates are gathered and prioritised as follows:

    1. everything returned by ``get_all_yue(word)``;
    2. the entries stored for ``word`` in ``get_word_to_jyutping_list()``,
       when present;
    3. the reading assembled from ``jyutping.get(word)`` is inserted at the
       front, but only when it has no missing (``None``) syllable,
       ``is_unambiguous`` accepts it, and the candidates collected so far
       number anything other than exactly one;
    4. the value of ``get_word_to_jyutping_corpus_mostfreq(word)``, when it is
       not ``None``, is inserted at the very front.

    :return: list of reading strings with surrounding whitespace and inner
        spaces removed. Duplicates are not removed.
    """
    output = []
    output.extend(get_all_yue(word))

    word_to_jyutping_list = get_word_to_jyutping_list()
    if word in word_to_jyutping_list:
        output.extend(word_to_jyutping_list[word])

    jyut = jyutping.get(word)
    if jyut is not None and None not in jyut:
        if is_unambiguous(jyut) and len(output) != 1:
            # Whitespace is stripped from every entry in the return
            # statement, so the raw joined reading can be inserted here.
            output.insert(0, ''.join(get_first_of_all(jyut)))

    jyut = get_word_to_jyutping_corpus_mostfreq(word)
    if jyut is not None:
        output.insert(0, jyut)

    return [x.strip().replace(' ', '') for x in output]
def get_word_phone_list(self, word_list, using_tool):
    """Convert the words of a sentence into phone, tone and alignment data.

    Parameters:
        word_list -- list of words in a sentence, e.g. ["我", "是个"],
            without non-verbal information
        using_tool -- when falsy, each word is looked up in ``word_dict``
            (falling back to ``jyutping.get``) and the resulting string is
            parsed with ``pc.parse_jyutping``; when truthy, ``pinyin`` and
            ``self.chinese_dict`` are used instead

    NOTE(review): ``word_dict`` is not a parameter of this method; it has to
    be resolvable from an enclosing/global scope -- confirm against the rest
    of the file.

    TODO: add more functions for language support

    :return: a 5-tuple
        phone_list -- flat list of phones, e.g. ph e m e j
        tone_list -- one tone per syllable, e.g. 1 2 3
        syl_map -- OrderedDict mapping phone index -> syllable (char) index
        word_map -- OrderedDict mapping syllable (char) index -> word index
        non_tone_line_phones -- for each syllable, its phones joined
            together without the tone
    :raises ValueError: if ``jyutping.get`` has no reading for a character
        of a word, or ``pc.parse_jyutping`` cannot parse the reading
    """
    phone_list = []
    tone_list = []
    syl_map = OrderedDict()
    word_map = OrderedDict()
    # phone_index: index of the current phone within the sentence
    phone_index = 0
    # char_index: index of the current character (syllable) in the sentence
    char_index = 0
    word_index = 0
    non_tone_line_phones = []

    for word in word_list:
        word = word.strip()
        if not using_tool:
            try:
                word_phone = word_dict[word]
            except Exception:
                # Broad catch kept on purpose: any failed lookup falls back
                # to jyutping, exactly as before.
                readings = jyutping.get(word)
                if readings is None or None in readings:
                    raise ValueError(
                        'no jyutping reading for word {!r}'.format(word)
                    ) from None
                # If a polyphone appears, just pick the first reading.
                word_phone = ''.join(
                    reading[0] if isinstance(reading, list) else reading
                    for reading in readings
                )

            # Parsed form is one tuple per syllable, tone last, e.g.
            # [('j', 'a', 't', '1'), ('g', 'a', 'u', '2'), ...]
            # 'hng1' and 'ung2' are special-cased instead of being parsed.
            if word_phone == 'hng1':
                word_phone_list = [('h', 'ng', '1')]
            elif word_phone == 'ung2':
                word_phone_list = [('u', 'ng', '2')]
            else:
                try:
                    word_phone_list = pc.parse_jyutping(word_phone)
                except Exception as e:
                    # Fail loudly instead of dropping into a debugger and
                    # continuing with an unbound or stale phone list.
                    raise ValueError(
                        'cannot parse jyutping {!r} for word {!r}'.format(
                            word_phone, word)
                    ) from e
        else:
            word_phone_list = []
            for character in pinyin(word, style=Style.TONE3):
                syllable = character[0]
                if not syllable[-1].isdigit():
                    # The neutral tone is treated as the fifth tone.
                    syllable += '5'
                char_phone_sequence = self.chinese_dict[syllable[:-1]].copy()
                char_phone_sequence.append(syllable[-1])
                word_phone_list.append(char_phone_sequence)

        for phone_t in word_phone_list:
            # Drop empty components; the last remaining item is the tone.
            char_phone = [e_phone for e_phone in phone_t if e_phone != '']
            assert char_phone[-1].isdigit()
            char_phone_list = char_phone[:-1]
            for _ in char_phone_list:
                syl_map[phone_index] = char_index
                phone_index += 1
            phone_list.extend(char_phone_list)
            tone_list.append(char_phone[-1])
            word_map[char_index] = word_index
            char_index += 1
            non_tone_line_phones.append(''.join(char_phone_list))
        word_index += 1

    return phone_list, tone_list, syl_map, word_map, non_tone_line_phones
parser = argparse.ArgumentParser( description= 'Adds jyutping to all entries in the standard Chinese dictionary') parser.add_argument('--input', dest='inputPath', help='Input dictionary file') parser.add_argument('--output', dest='outputPath', help='Output dictionary file') args = parser.parse_args() with open(args.inputPath) as input: with open(args.outputPath, "w+") as output: lines = input.readlines() for line in lines: characters = line.split(" ")[0] pronunciation = jyutping.get(characters) if None in pronunciation: pronunciation = [] for i in range(len(pronunciation)): if isinstance(pronunciation[i], list): pronunciation[i] = "/".join(pronunciation[i]) replPattern = r"\1 \2 [\3] {{{0}}} /\4/".format( " ".join(pronunciation)) newLine = re.sub(r"(.+?) (.+?) \[(.+?)\] \/(.+)\/", replPattern, line) # This way we can re-run on an existing dictionary to update the existing entries # if the first sub did not do anything. Otherwise the entry should be the same newLine = re.sub(r"(.+?) (.+?) \[(.+?)\] {.*?} \/(.+)\/",
def get_word_phone_list(self, word_dict, word_list):
    """Convert the words of a sentence into phone, tone and alignment data.

    Parameters:
        word_dict -- mapping from a word to its jyutping string; words that
            are missing fall back to ``jyutping.get`` (first reading of
            every polyphone)
        word_list -- list of words in a sentence, e.g. ["我", "是个"]

    Each parsed syllable (without its tone) is looked up in
    ``self.lex_dict`` to obtain the phones that are emitted.

    :return: a 5-tuple
        phone_list -- flat list of phones taken from ``self.lex_dict``
        tone_list -- one tone per syllable
        syl_map -- OrderedDict mapping phone index -> syllable (char) index
        word_map -- OrderedDict mapping syllable (char) index -> word index
        non_tone_line_phones -- for each syllable, its parsed components
            joined together without the tone
    :raises ValueError: if ``jyutping.get`` has no reading for a character
        of a word, or ``pc.parse_jyutping`` cannot parse the reading
    :raises KeyError: if a syllable is missing from ``self.lex_dict``
    """
    phone_list = []
    tone_list = []
    syl_map = OrderedDict()
    word_map = OrderedDict()
    phone_index = 0
    char_index = 0
    word_index = 0
    non_tone_line_phones = []

    for word in word_list:
        word = word.strip()
        try:
            word_phone = word_dict[word]
        except (KeyError, TypeError):
            # TypeError keeps the fallback working when no usable
            # dictionary (e.g. None) is supplied.
            readings = jyutping.get(word)
            if readings is None or None in readings:
                # Previously only the first character was checked and the
                # process exited with status 0; report the failure instead.
                raise ValueError(
                    'no jyutping reading for word {!r}'.format(word)
                ) from None
            # If a polyphone appears, just pick the first reading.
            word_phone = ''.join(
                reading[0] if isinstance(reading, list) else reading
                for reading in readings
            )

        # 'hng1' and 'ung2' are special-cased instead of being parsed.
        if word_phone == 'hng1':
            jp = [('h', 'ng', '1')]
        elif word_phone == 'ung2':
            jp = [('u', 'ng', '2')]
        else:
            try:
                jp = pc.parse_jyutping(word_phone)
            except Exception as e:
                # Fail loudly rather than continuing with an unbound or
                # stale ``jp`` left over from the previous word.
                raise ValueError(
                    'cannot parse jyutping {!r} for word {!r}'.format(
                        word_phone, word)
                ) from e

        for phone_t in jp:
            # Drop empty components; the last remaining item is the tone.
            char_phone = [e_phone for e_phone in phone_t if e_phone != '']
            assert char_phone[-1].isdigit()
            syllable = ''.join(char_phone[:-1])
            try:
                char_phone_list = self.lex_dict[syllable]
            except KeyError as e:
                raise KeyError(
                    'syllable {!r} of word {!r} is not in lex_dict'.format(
                        syllable, word)
                ) from e
            for _ in char_phone_list:
                syl_map[phone_index] = char_index
                phone_index += 1
            phone_list.extend(char_phone_list)
            tone_list.append(char_phone[-1])
            word_map[char_index] = word_index
            char_index += 1
            non_tone_line_phones.append(syllable)
        word_index += 1

    return phone_list, tone_list, syl_map, word_map, non_tone_line_phones