예제 #1
0
    def inputText_client_(self, string, client):
        """Feed one typed string to the engine and update the client text.

        The string is appended to ``self.raw_string``, the whole sequence is
        re-run through ``bogo.process_sequence``, and only the part of the
        result that differs from ``self.composing_string`` is sent to the
        client.

        Args:
            string: The newly typed text.
            client: Not used here; the text client is obtained from
                ``self.client()`` instead.

        Returns:
            ``NO`` after calling ``self.reset()`` when ``string`` is a single
            space, otherwise ``YES``.
        """
        # Run __init__ on first use if the instance has no 'initialized'
        # attribute yet.
        if not hasattr(self, 'initialized'):
            self.__init__()

        if string == ' ':
            self.reset()
            return NO

        self.raw_string += string
        result = bogo.process_sequence(self.raw_string)

        # Number of leading characters shared by the previous and new result;
        # those are already in the client and need not be resent.
        n_same = sum(1 for _ in takewhile(lambda pair: pair[0] == pair[1],
                                          zip(self.composing_string, result)))

        n_backspace = len(self.composing_string) - n_same
        string_to_commit = result[n_same:]

        # The changed tail is located relative to self.client().length(),
        # i.e. it is assumed to sit at the very end of the client text.
        start = self.client().length() - n_backspace

        # The replacement range describes the text being replaced, so its
        # length is the number of stale characters (n_backspace), not the
        # length of the new string. Using the new string's length asked the
        # client to replace text past the end on every plain append.
        self.client().insertText_replacementRange_(
            string_to_commit, NSMakeRange(start, n_backspace))

        self.composing_string = result

        return YES
예제 #2
0
파일: main.py 프로젝트: BoGoEngine/bogo-osx
	def inputText_client_(self, string, client):
		"""Feed one typed string to the engine and update the client text.

		The string is appended to ``self.raw_string``, the whole sequence is
		re-run through ``bogo.process_sequence``, and only the part of the
		result that differs from ``self.composing_string`` is sent to the
		client.

		Args:
			string: The newly typed text.
			client: Not used here; the text client is obtained from
				``self.client()`` instead.

		Returns:
			``NO`` after calling ``self.reset()`` when ``string`` is a single
			space, otherwise ``YES``.
		"""
		# Run __init__ on first use if the instance has no 'initialized'
		# attribute yet.
		if not hasattr(self, 'initialized'):
			self.__init__()

		if string == ' ':
			self.reset()
			return NO

		self.raw_string += string
		result = bogo.process_sequence(self.raw_string)

		# Length of the prefix shared by the previous and the new result;
		# that part is already in the client and is left alone.
		shared_prefix = takewhile(
			lambda pair: pair[0] == pair[1],
			zip(self.composing_string, result))
		n_same = sum(1 for _ in shared_prefix)

		n_backspace = len(self.composing_string) - n_same
		string_to_commit = result[n_same:]

		# The changed tail is located relative to self.client().length(),
		# i.e. it is assumed to sit at the very end of the client text.
		start = self.client().length() - n_backspace

		# The replacement range covers the stale characters being replaced,
		# so its length is n_backspace rather than the length of the new
		# string (which pointed past the end of the text on a plain append).
		self.client().insertText_replacementRange_(
			string_to_commit,
			NSMakeRange(start, n_backspace))

		self.composing_string = result

		return YES
예제 #3
0
def clean_tone(str_in):
    """Rewrite words of ``str_in`` through a bogo round-trip where it is safe.

    The input is passed through ``ftfy.fix_text`` and split on whitespace.
    A word that shares no character with ``set_map_char`` is kept unchanged.
    Any other word is cut into pieces with ``subword_split.split``; a piece
    that shares a character with ``set_map_char`` is replaced by
    ``bogo.process_sequence(get_enter_code(piece))`` when that result

    * differs from the enter code itself,
    * gives the same ``remove_tone`` value as the piece, and
    * gives the same ``get_enter_code`` value as the piece.

    Returns:
        The processed words joined by single spaces.
    """
    cleaned_words = []
    for word in ftfy.fix_text(str_in).split():
        # Nothing to convert in this word: keep it verbatim.
        if not (set_map_char & set(word)):
            cleaned_words.append(word)
            continue

        pieces = subword_split.split(word)
        for pos, piece in enumerate(pieces):
            if not (set_map_char & set(piece)):
                continue

            enter_code = get_enter_code(piece)
            toneless = remove_tone(piece)
            converted = bogo.process_sequence(enter_code)
            converted_enter_code = get_enter_code(converted)
            converted_toneless = remove_tone(converted)

            # Accept bogo's output only if it actually changed something and
            # round-trips to the same enter code and toneless form; otherwise
            # the raw piece stays as it was.
            unchanged_by_bogo = converted == enter_code
            round_trips = (converted_toneless == toneless
                           and converted_enter_code == enter_code)
            if round_trips and not unchanged_by_bogo:
                pieces[pos] = converted

        cleaned_words.append(''.join(pieces))

    return " ".join(cleaned_words)
예제 #4
0
    continue
  if '#' in line:
    line = line[:line.index('#')].strip()
  parts = line.split('\t')
  weight = 1
  if len(parts) == 3:
    weight = int(parts[2])
  elif len(parts) != 2:
    continue
  telex = parts[1].strip()
  if telex == '':
    continue
  for telex in get_variants(telex):
    weights[telex] += weight

# Write the Rime dictionary header first; the entries follow it.
print('''# Rime dictionary
# encoding: utf-8
---
name: vietnamese
version: "2013.07.10"
sort: original
use_preset_vocabulary: false
max_phrase_length: 7
min_phrase_weight: 100
...
''', file=outfile)

# One tab-separated row per accumulated key: converted text (followed by a
# space), the key itself, and its summed weight.
for telex, weight in weights.items():
  # Every 'z' is removed from the key before it is handed to bogo.
  keystrokes = telex.replace('z', '')
  vietnamese = bogo.process_sequence(keystrokes, rules=rules).strip()
  print('\t'.join((vietnamese + ' ', telex, str(weight))), file=outfile)
예제 #5
0
    #for telex in get_variants(telex):
    #  vietnamese = telex #bogo.process_sequence(telex.replace('z', ''), rules=rules).strip()
    #  vietnamese_to_hannom_to_weight[vietnamese][hannom] += weight

# Write the Rime dictionary header to the output file before any entries.
print('''# Rime dictionary
# encoding: utf-8
---
name: vietnamese
version: "2013.07.10"
sort: original
use_preset_vocabulary: false
...
''', file=outfile)


def sorted_descending_by_weight(d):
    """Return the keys of ``d`` ordered from the largest value to the smallest.

    Keys with equal values keep the order in which ``d`` iterates them,
    because the sort is stable.
    """
    return sorted(d, key=d.get, reverse=True)


# NOTE(review): full_width_space is not referenced anywhere in the visible
# part of this script; the rows below append a literal ' ' instead -- confirm
# whether the constant was meant to be used there.
full_width_space = ' '
for vietnamese, hannom_to_weight in vietnamese_to_hannom_to_weight.items():
    # The row weight is the sum over all values recorded for this key.
    weight = sum(hannom_to_weight.values())
    # One tab-separated row per value yielded by get_variants(): converted
    # text (followed by a space), the variant itself, and the summed weight.
    for telex in get_variants(vietnamese):
        keystrokes = telex.replace('z', '')
        quocngu = bogo.process_sequence(keystrokes, rules=rules).strip()
        print('\t'.join((quocngu + ' ', telex, str(weight))), file=outfile)
def process_data(text_raw):
    """Split marked-up text into tag tokens and plain words.

    ``<u>`` / ``</u>`` markers are removed, every punctuation mark in
    ``.,;?:!`` is wrapped as its own ``<x>`` token, and the text is split on
    whitespace. A token of the form ``<...>`` is kept verbatim (type 1);
    every other token is a plain word (type 0) and is converted with
    ``bg.process_sequence``.

    Args:
        text_raw: Input string, possibly containing ``<...>`` tags.

    Returns:
        A tuple ``(list_sen, list_raw, sentence_raw)``:

        * ``list_sen``: one dict per token with keys ``"index"``, ``"type"``
          and ``"word"``; plain words hold their converted form.
        * ``list_raw``: the same entries with the unconverted words.
        * ``sentence_raw``: the converted plain words, each followed by a
          single space.
    """
    punctuation = '.,;?:!'
    # Raw strings: the escapes below are regex escapes, not string escapes.
    whitespace = r'\s'
    opens_tag = r'^<'
    closes_tag = r'>$'
    whole_tag = r'^<[\s|\w|\W]{1,10000}>$'

    # Pad the angle brackets with spaces so tags separate from adjacent text,
    # then drop the underline markers.
    text = text_raw.replace('>', '> ').replace('<', ' <')
    text = text.replace('<u>', '').replace('</u>', '')

    # Wrap each punctuation mark in <> so it becomes a tag token of its own.
    for mark in punctuation:
        text = text.replace(mark, ' <' + mark + '> ')

    tokens = [tok for tok in re.split(whitespace, text) if tok != '']

    # Re-join tags that the whitespace split broke apart.
    words = []
    pos = 0
    while pos < len(tokens):
        token = tokens[pos]
        starts_open = re.match(opens_tag, token)
        # NOTE(review): re.match anchors at the start of the token, so this
        # only matches a token that is exactly '>'. If "ends with '>'" was
        # intended, re.search would be needed -- behaviour left unchanged.
        is_close = re.match(closes_tag, token)
        if re.match(whole_tag, token):
            words.append(token)
            pos += 1
        elif starts_open and not is_close:
            if pos + 1 < len(tokens):
                # Glue the opening half onto the following token.
                words.append(token + ' ' + tokens[pos + 1])
            else:
                # Unclosed tag at the very end: keep it on its own instead
                # of indexing past the end of the token list.
                words.append(token)
            pos += 2
        elif is_close and not starts_open:
            # Stray closing bracket: dropped.
            pos += 1
        else:
            words.append(token)
            pos += 1

    list_sen = []
    list_raw = []
    for index, word in enumerate(words):
        if re.match(whole_tag, word):
            kind, converted = 1, word
        else:
            kind, converted = 0, bg.process_sequence(word)
        list_sen.append({"index": index, "type": kind, "word": converted})
        list_raw.append({"index": index, "type": kind, "word": word})

    sentence_raw = ''.join(
        entry['word'] + ' ' for entry in list_sen if entry['type'] == 0)

    return list_sen, list_raw, sentence_raw