def separate(string):
    """
    Split a Vietnamese word into its start, middle and end sounds.

    The start sound is everything before the first vowel (as decided by
    utils.is_vowel), the middle sound is the run of vowels that follows,
    and the end sound is whatever remains.
    Eg: toán -> [u't', u'oá', u't']

    An empty string yields [u'', u'', u'']. When is_valid_combination()
    rejects the split, None is returned instead of the component list.
    """
    parts = [u'', u'', u'']
    if string == u'':
        return parts

    # Locate the first vowel; everything in front of it is the start sound.
    first_vowel = next(
        (pos for pos, char in enumerate(string) if utils.is_vowel(char)),
        None,
    )
    if first_vowel is None:
        # No vowel at all: the whole word is the start sound.
        parts[0] = string
        string = u''
    else:
        # The u'' + prefix is kept on purpose (it forces a unicode result
        # when a byte string is passed on Python 2).
        parts[0] = u'' + string[:first_vowel]
        string = u'' + string[first_vowel:]

    # Locate the first consonant after the vowels and cut there.
    first_consonant = next(
        (pos for pos, char in enumerate(string)
         if not utils.is_vowel(char)),
        None,
    )
    if first_consonant is not None:
        parts[1] = string[:first_consonant]
        parts[2] = string[first_consonant:]

    # No ending consonant? Then the remainder is the vowel part.
    if parts[1] == u'':
        parts[1] = string

    # 'gi' and 'qu' are treated as start sounds, so their vowel moves over:
    #     ['g', 'ia', ''] -> ['gi', 'a', '']
    # 'gi' only qualifies when another vowel follows the 'i'.
    # NOTE(review): `in u'gG'` / `in u'qQ'` are substring tests, so a start
    # sound of u'gG' or u'qQ' matches as well.
    start, middle = parts[0], parts[1]
    if start != u'' and middle != u'':
        is_gi = start in u'gG' and middle[0] in 'iI' and len(middle) > 1
        is_qu = start in u'qQ' and middle[0] in 'uU'
        if is_gi or is_qu:
            parts[0] += middle[:1]
            parts[1] = middle[1:]

    return parts if is_valid_combination(parts) else None
def get_end(word, span):
    """
    Return the tail of `word` that starts at the `span`-th counted vowel
    from the end.

    The word is scanned right to left. A position is counted when
    is_vowel() accepts its character and none of the callables in RULES
    returns a truthy value for (word, position). Scanning stops as soon as
    `span` positions have been counted or the beginning of the word is
    reached, and the slice from that position to the end is returned.

    An empty `word` is returned unchanged (the previous implementation
    raised IndexError on it).
    """
    if not word:
        # Nothing to scan; indexing word[-1] would raise IndexError.
        return word

    counted = 0
    # len(word) >= 1 here, so the loop body runs at least once and
    # `index` is always bound after the loop.
    for index in range(len(word) - 1, -1, -1):
        if is_vowel(word[index]) and not any(
            rule(word, index) for rule in RULES
        ):
            counted += 1
        if counted == span:
            break
    return word[index:]
def gen_error(word):
    """
    Return `word` with randomly chosen per-character alterations.

    Non-alphabetic characters are copied unchanged. For every alphabetic
    character an integer from 1 to 4 is drawn and compared with the
    module constants VOWEL, CAPITALIZATION and REPETITION:

    - VOWEL (and the character is a vowel per utils.is_vowel): replace it
      with a random element of `vowels`.
    - CAPITALIZATION: replace it with utils.change_case(character).
    - REPETITION: emit the character one or more times.
    - otherwise (including VOWEL drawn for a non-vowel): keep it as is.
    """
    pieces = []
    for letter in word:
        if letter.isalpha():
            draw = random.randint(1, 4)
            if draw == VOWEL and utils.is_vowel(letter):
                pieces.append(random.choice(vowels))
            elif draw == CAPITALIZATION:
                pieces.append(utils.change_case(letter))
            elif draw == REPETITION:
                # Always emit once, then keep repeating for as long as
                # the coin flip comes up 0.
                pieces.append(letter)
                while not random.randint(0, 1):
                    pieces.append(letter)
            else:
                pieces.append(letter)
        else:
            pieces.append(letter)
    return ''.join(pieces)
def transform(comps, trans):
    """
    Apply one transformation `trans` to the components `comps`.

    All work is done on a shallow copy of `comps`. `trans[0]` selects
    what happens:

    - u'<': only acts when the end component is empty. If the middle
      component already ends with `trans[1]`, the copy is returned as is
      (undo). Otherwise `trans[1]` may be appended to the middle
      component in the cases described inline, and processing continues
      with get_action() below.
    - u'+': append `trans[1]` to the start, middle or end component,
      re-apply any accent found in the middle component, and return.
    - anything else: ask get_action() for an (action, factor) pair and
      add the accent or the mark accordingly.

    Returns the resulting component list.
    """
    components = list(comps)
    op = trans[0]

    # A disabled special case used to sit here that turned '<' into '+'
    # unless trans[1] was ư/ơ; the original author marked it
    # "(Not our job)".

    # trans[1] is deliberately read only inside the '<' and '+' branches:
    # other transformations are passed straight to get_action().
    if op == u'<' and not components[2]:
        middle = components[1]
        if middle[-1:] == trans[1]:
            # Undo operation
            return components
        if not middle or (
            middle.lower() == u'ư' and trans[1].lower() == u'ơ'
        ):
            # Only allow ư, ơ or ươ sitting alone in the middle part
            components[1] += trans[1]
        elif middle.lower() == 'i' and components[0].lower() == 'g':
            # Quite a hack. If you want to type gi[f = 'giờ', separate()
            # will create ['g', 'i', '']. Therefore components[1] == 'i'
            # has to be allowed, and the result is split again.
            components[1] += trans[1]
            components = separate(utils.join(components))

    if op == u'+':
        # Where the new character lands:
        #   transform([u'nn', '', ''], '+n') = [u'nnn', '', '']
        #   transform([u'c', '', ''], '+o')  = [u'c', 'o', '']
        #   transform([u'c', 'o', ''], '+o') = [u'c', 'oo', '']
        #   transform([u'c', 'o', ''], '+n') = [u'c', 'o', 'n']
        new_char = trans[1]
        if components[1] == u'':
            slot = 1 if utils.is_vowel(new_char) else 0
        elif components[2] == u'' and utils.is_vowel(new_char):
            slot = 1
        else:
            slot = 2
        components[slot] += new_char

        # If there is any accent, remove and reapply it, because it is
        # likely to be misplaced by the previous transformations.
        found = accent.Accent.NONE
        for char in components[1]:
            found = accent.get_accent_char(char)
            if found:
                break

        if found != accent.Accent.NONE:
            # NOTE(review): the removal uses the bare name `Accent` while
            # the rest uses `accent.Accent`; both must be in scope.
            components = accent.add_accent(components, Accent.NONE)
            components = accent.add_accent(components, found)
        return components

    action, factor = get_action(trans)
    if action == Action.ADD_ACCENT:
        return accent.add_accent(components, factor)
    if action == Action.ADD_MARK and mark.is_valid_mark(components, trans):
        return mark.add_mark(components, factor)
    return components
def transform(comps, trans):
    """
    Apply one transformation `trans` to the components `comps`.

    All work is done on a shallow copy of `comps`. `trans[0]` selects
    what happens:

    - u'<': only acts when the end component is empty. If the middle
      component already ends with `trans[1]`, the copy is returned as is
      (undo). Otherwise `trans[1]` may be appended to the middle
      component in the cases described inline, and processing continues
      with get_action() below.
    - u'+': append `trans[1]` to the start, middle or end component,
      re-apply any accent found in the middle component, and return.
    - anything else: ask get_action() for an (action, factor) pair and
      add the accent or the mark accordingly.

    Returns the resulting component list.
    """
    components = list(comps)
    op = trans[0]

    # A disabled special case used to sit here that turned '<' into '+'
    # unless trans[1] was ư/ơ; the original author marked it
    # "(Not our job)".

    # trans[1] is deliberately read only inside the '<' and '+' branches:
    # other transformations are passed straight to get_action().
    if op == u'<' and not components[2]:
        middle = components[1]
        if middle[-1:] == trans[1]:
            # Undo operation
            return components
        if not middle or (
            middle.lower() == u'ư' and trans[1].lower() == u'ơ'
        ):
            # Only allow ư, ơ or ươ sitting alone in the middle part
            components[1] += trans[1]
        elif middle.lower() == 'i' and components[0].lower() == 'g':
            # Quite a hack. If you want to type gi[f = 'giờ', separate()
            # will create ['g', 'i', '']. Therefore components[1] == 'i'
            # has to be allowed, and the result is split again.
            components[1] += trans[1]
            components = separate(utils.join(components))

    if op == u'+':
        # Where the new character lands:
        #   transform([u'nn', '', ''], '+n') = [u'nnn', '', '']
        #   transform([u'c', '', ''], '+o')  = [u'c', 'o', '']
        #   transform([u'c', 'o', ''], '+o') = [u'c', 'oo', '']
        #   transform([u'c', 'o', ''], '+n') = [u'c', 'o', 'n']
        new_char = trans[1]
        if components[1] == u'':
            slot = 1 if utils.is_vowel(new_char) else 0
        elif components[2] == u'' and utils.is_vowel(new_char):
            slot = 1
        else:
            slot = 2
        components[slot] += new_char

        # If there is any accent, remove and reapply it, because it is
        # likely to be misplaced by the previous transformations.
        found = accent.Accent.NONE
        for char in components[1]:
            found = accent.get_accent_char(char)
            if found:
                break

        if found != accent.Accent.NONE:
            # NOTE(review): the removal uses the bare name `Accent` while
            # the rest uses `accent.Accent`; both must be in scope.
            components = accent.add_accent(components, Accent.NONE)
            components = accent.add_accent(components, found)
        return components

    action, factor = get_action(trans)
    if action == Action.ADD_ACCENT:
        return accent.add_accent(components, factor)
    if action == Action.ADD_MARK and mark.is_valid_mark(components, trans):
        return mark.add_mark(components, factor)
    return components
def atomic_separate(string, last_chars, last_is_vowel):
    """
    Peel the trailing run of same-class characters off `string`.

    Characters are taken from the end of `string` for as long as
    is_vowel(character) equals `last_is_vowel`; the peeled run is placed
    in front of `last_chars`.

    Returns a tuple (remaining_prefix, peeled_run + last_chars). An empty
    `string` yields ("", last_chars) without calling is_vowel().

    Implemented as a loop rather than recursion so that a long run cannot
    exhaust the interpreter's recursion limit, and the result is built
    with a single slice instead of one concatenation per character.
    """
    cut = len(string)
    # Walk left while the character just before the cut is of the
    # requested class (vowel vs. non-vowel).
    while cut > 0 and last_is_vowel == is_vowel(string[cut - 1]):
        cut -= 1
    return (string[:cut], string[cut:] + last_chars)