def inputText_client_(self, string, client):
    """IMKit entry point: fold one keystroke into the current composition.

    Appends *string* to the raw key buffer, re-runs bogo over the whole
    buffer, and rewrites only the suffix of the on-screen text that changed.
    Returns YES when the event was consumed, NO to let the client handle it.
    """
    # First event may arrive before __init__ ran; initialize lazily.
    if not hasattr(self, 'initialized'):
        self.__init__()
    # Space ends the current word: clear state and pass the key through.
    if string == ' ':
        self.reset()
        return NO
    self.raw_string += string
    result = bogo.process_sequence(self.raw_string)
    # Length of the common prefix between what is displayed and the new text.
    prefix_len = 0
    for shown, fresh in zip(self.composing_string, result):
        if shown != fresh:
            break
        prefix_len += 1
    n_backspace = len(self.composing_string) - prefix_len
    to_commit = result[prefix_len:]
    start = self.client().length() - n_backspace
    # NOTE(review): the replacement range length is the commit string's
    # length, not n_backspace — confirm against IMKTextInput semantics.
    self.client().insertText_replacementRange_(
        to_commit, NSMakeRange(start, len(to_commit)))
    self.composing_string = result
    return YES
def inputText_client_(self, string, client):
    """Consume one keystroke and update the composed Vietnamese text.

    The whole raw key sequence is reprocessed with bogo on every key; only
    the divergent tail of the displayed string is replaced in the client.
    Returns YES if handled, NO when the key (space) is passed through.
    """
    # Guard against events delivered before initialization.
    if not hasattr(self, 'initialized'):
        self.__init__()
    # A space commits the word: drop composition state, let the client act.
    if string == ' ':
        self.reset()
        return NO
    self.raw_string = self.raw_string + string
    new_text = bogo.process_sequence(self.raw_string)
    # Count leading characters shared by the old and new renderings.
    shared = len(list(takewhile(
        lambda pair: pair[0] == pair[1],
        zip(self.composing_string, new_text))))
    erase_count = len(self.composing_string) - shared
    tail = new_text[shared:]
    insert_at = self.client().length() - erase_count
    # NOTE(review): range length mirrors the inserted tail, not erase_count —
    # verify this is what insertText:replacementRange: expects.
    rng = NSMakeRange(insert_at, len(tail))
    self.client().insertText_replacementRange_(tail, rng)
    self.composing_string = new_text
    return YES
def clean_tone(str_in):
    """Normalize tone typing in *str_in*, word by word.

    Words containing no mappable characters pass through untouched. Other
    words are split into sub-words; each sub-word that contains mappable
    characters is converted via bogo, and the conversion is accepted only
    when it actually changed the text while preserving both the tone-less
    form and the round-tripped enter-code form.
    """
    str_in = ftfy.fix_text(str_in)
    out_words = []
    for word in str_in.split():
        # Fast path: nothing in this word can be remapped.
        if not (set_map_char & set(word)):
            out_words.append(word)
            continue
        pieces = subword_split.split(word)
        for idx in range(len(pieces)):
            piece = pieces[idx]
            if not (set_map_char & set(piece)):
                continue
            enter = get_enter_code(piece)
            no_tone = remove_tone(piece)
            converted = bogo.process_sequence(enter)
            converted_enter = get_enter_code(converted)
            converted_no_tone = remove_tone(converted)
            # Accept the bogo result only when it differs from the raw
            # input yet round-trips to the same base forms.
            if converted != enter and \
                    converted_no_tone == no_tone and \
                    converted_enter == enter:
                pieces[idx] = converted
        out_words.append(''.join(pieces))
    return " ".join(out_words)
# NOTE(review): this chunk starts mid-loop; the `for` header that binds
# `line` sits above this view, as do `weights`, `get_variants`, `rules`
# and `outfile` — indentation below is a best-effort reconstruction.
        continue
    # Strip an inline '#' comment and surrounding whitespace.
    if '#' in line:
        line = line[:line.index('#')].strip()
    parts = line.split('\t')
    # Optional third tab-separated column is an integer weight; default 1.
    weight = 1
    if len(parts) == 3:
        weight = int(parts[2])
    elif len(parts) != 2:
        # Malformed row: skip it.
        continue
    telex = parts[1].strip()
    if telex == '':
        continue
    # Accumulate the weight under every typing variant of this telex form.
    for telex in get_variants(telex):
        weights[telex] += weight

# Rime dictionary YAML front matter.
print('''# Rime dictionary
# encoding: utf-8
---
name: vietnamese
version: "2013.07.10"
sort: original
use_preset_vocabulary: false
max_phrase_length: 7
min_phrase_weight: 100
...
''', file=outfile)

# One tab-separated row per telex sequence: rendered Vietnamese (telex with
# 'z' stripped, run through bogo), the telex sequence itself, and its weight.
for telex, weight in weights.items():
    vietnamese = bogo.process_sequence(telex.replace('z', ''), rules=rules).strip()
    print('\t'.join(map(str, [vietnamese + ' ', telex, weight])), file=outfile)
#for telex in get_variants(telex): # vietnamese = telex #bogo.process_sequence(telex.replace('z', ''), rules=rules).strip() # vietnamese_to_hannom_to_weight[vietnamese][hannom] += weight print('''# Rime dictionary # encoding: utf-8 --- name: vietnamese version: "2013.07.10" sort: original use_preset_vocabulary: false ... ''', file=outfile) def sorted_descending_by_weight(d): return [ y[0] for y in sorted(list(d.items()), key=lambda x: x[1], reverse=True) ] full_width_space = ' ' for vietnamese, hannom_to_weight in vietnamese_to_hannom_to_weight.items(): weight = sum(hannom_to_weight.values()) for telex in get_variants(vietnamese): quocngu = bogo.process_sequence(telex.replace('z', ''), rules=rules).strip() print('\t'.join(map(str, [quocngu + ' ', telex, weight])), file=outfile)
def process_data(text_raw):
    """Tokenise *text_raw* into tagged and plain word records.

    Punctuation marks are wrapped in ``<...>`` and, together with any other
    ``<...>`` tokens, kept verbatim (type 1). Every other token (type 0) is
    additionally converted with ``bg.process_sequence`` for the "sen" view.

    Returns a tuple ``(list_sen, list_raw, sentence_raw)``:
      * list_sen  - dicts ``{index, type, word}`` with type-0 words converted,
      * list_raw  - the same records with the original word text,
      * sentence_raw - space-joined type-0 words taken from ``list_sen``.
    """
    dau = '.,;?:!'
    # Raw strings: '\s' etc. inside plain strings trigger SyntaxWarning on
    # Python 3.12+. Patterns themselves are unchanged ('|' inside [] is a
    # literal character; kept for compatibility with existing behaviour).
    pattern = r'\s'
    pattern1 = r'^<'
    pattern2 = r'>$'
    pattern3 = r'^<[\s|\w|\W]{1,10000}>$'
    # Put spaces around tag brackets, then drop underline tags.
    text = text_raw.replace('>', '> ')
    text = text.replace('<', ' <')
    text = text.replace('<u>', '')
    text = text.replace('</u>', '')
    # Wrap each punctuation mark in <...>. str.replace is a no-op when the
    # mark is absent, so the original find() == -1 guard was redundant.
    for mark in dau:
        text = text.replace(mark, ' <' + mark + '> ')
    # Split on whitespace, dropping empty fragments.
    list_words = [w for w in re.split(pattern, text) if w != '']
    # Re-join '<'-opening fragments with their closing part; drop stray
    # '...>' closers (already consumed by the join above).
    words = []
    index = 0
    while index < len(list_words):
        token = list_words[index]
        if re.match(pattern3, token):
            # Complete <...> token: keep as-is.
            words.append(token)
            index += 1
        elif re.match(pattern1, token) and not re.match(pattern2, token):
            # Opening fragment without its closer: merge with the next
            # token. Guard the lookahead — a dangling opener at the very
            # end raised IndexError in the original.
            if index + 1 < len(list_words):
                words.append(token + ' ' + list_words[index + 1])
                index += 2
            else:
                words.append(token)
                index += 1
        elif re.match(pattern2, token) and not re.match(pattern1, token):
            index += 1
        else:
            words.append(token)
            index += 1
    # (The original also tested for '' tokens here; empties were already
    # filtered out above, so that branch was unreachable and is dropped.)
    list_sen = []
    list_raw = []
    for index, word in enumerate(words):
        if re.match(pattern3, word):
            # Tagged token: kept verbatim in both views.
            list_sen.append({"index": index, "type": 1, "word": word})
            list_raw.append({"index": index, "type": 1, "word": word})
        else:
            list_sen.append({
                "index": index,
                "type": 0,
                "word": bg.process_sequence(word)
            })
            list_raw.append({"index": index, "type": 0, "word": word})
    sentence_raw = ''
    for wd in list_sen:
        if wd['type'] == 0:
            sentence_raw = sentence_raw + wd['word'] + ' '
    return list_sen, list_raw, sentence_raw