def add_mark_char(char, mark):
    """
    Add mark to a single char.

    The accent and the case of `char` are read first. The mark is then
    chosen for the lower-cased letter returned by
    accent.add_accent_char(..., Accent.NONE), and finally the saved accent
    and case are applied to the result.

    Input:
        char - a single character; u'' is returned unchanged
        mark - one of the Mark constants
    Output:
        The resulting character. When the mark has no rule for the
        letter's family, the letter itself goes through the final
        accent/case step.
    """
    if char == u'':
        return u''

    was_upper = char.isupper()
    saved_accent = accent.get_accent_char(char)
    bare = accent.add_accent_char(char.lower(), Accent.NONE)

    # (letter family, resulting letter) pairs for the requested mark,
    # listed in the order in which they are tried.
    if mark == Mark.HAT:
        rules = ((FAMILY_A, u"â"), (FAMILY_O, u"ô"), (FAMILY_E, u"ê"))
    elif mark == Mark.HORN:
        rules = ((FAMILY_O, u"ơ"), (FAMILY_U, u"ư"))
    elif mark == Mark.BREVE:
        rules = ((FAMILY_A, u"ă"),)
    elif mark == Mark.BAR:
        rules = ((FAMILY_D, u"đ"),)
    elif mark == Mark.NONE:
        rules = (
            (FAMILY_A, u"a"),
            (FAMILY_E, u"e"),
            (FAMILY_O, u"o"),
            (FAMILY_U, u"u"),
            (FAMILY_D, u"d"),
        )
    else:
        rules = ()

    marked = bare
    for family, replacement in rules:
        if bare in family:
            marked = replacement
            break

    marked = accent.add_accent_char(marked, saved_accent)
    return utils.change_case(marked, was_upper)
def is_valid_combination(components):
    """Check if a character combination complies to Vietnamese spelling.

    Input:
        components - a list of the form [u'c', u'a', u'm']
                     (leading sound, vowel part, ending sound)
    Output:
        True if OK, False otherwise.
    """
    # We only work with lower case
    parts = [utils.change_case(part, 0) for part in components]

    # A leading part with neither vowel nor ending is accepted as is, so
    # that 'đ' can appear in abbreviations like 'đm', 'đc', 'kgcđ', etc.
    # (A stricter variant that checked every character against CONSONANTS
    # and excluded 'gi'/'qu' used to live here but is disabled.)
    if parts[0] and not parts[1] and not parts[2]:
        return True

    # The start sound, when present, must be a proper consonant
    if parts[0] != u'' and parts[0] not in CONSONANTS:
        return False

    # Likewise the ending sound must be a proper ending consonant
    if parts[2] != u'' and parts[2] not in ENDING_CONSONANTS:
        return False

    ending = parts[2]
    vowel = accent.remove_accent_string(parts[1])
    in_open = vowel in OPEN_COMPOUND_VOWELS
    in_closed = vowel in CLOSED_COMPOUND_VOWELS

    # A vowel part longer than one character has to be a known compound
    if len(vowel) > 1 and not (in_open or in_closed):
        return False

    # A compound listed only as closed cannot take an ending sound
    if in_closed and not in_open and ending != u'':
        return False

    # 'ăch'? Note that the first test is a substring test on the string
    # u'ăâeôơuư', not a membership test on a collection.
    if ending == u'ch' and (vowel in u'ăâeôơuư' or (in_open and not in_closed)):
        return False

    # 'ương' is ok but 'ơng' ?
    # This is a substring test against the one-character string u'ơ'
    # (the original parentheses did not make a tuple), so an empty vowel
    # matches here as well.
    if ending == u'ng' and vowel in u'ơ':
        return False

    # A rule rejecting 'ê' followed by 'c' is intentionally absent:
    # sadly, it interferes with 'nhếch'.

    # Get the first accent found in the vowel part
    first_accent = Accent.NONE
    for letter in parts[1]:
        found = accent.get_accent_char(letter)
        if found != Accent.NONE:
            first_accent = found
            break

    # These consonants can only go with ACUTE, DOT or NONE accents
    if ending in (u'c', u'p', u't', u'ch') and \
            first_accent not in (Accent.NONE, Accent.ACUTE, Accent.DOT):
        return False

    return True
def transform(comps, trans):
    """
    Transform the given string with transform type trans

    `comps` is the three-part component list (it is copied, never
    modified in place) and `trans` is a string whose first character
    selects the operation:

        u'<'  - conditionally append trans[1] to the middle part
        u'+'  - append trans[1] to whichever part it belongs to
        other - resolved through get_action() into an accent or a mark

    Returns the resulting component list.
    """
    parts = list(comps)
    kind = trans[0]

    # The special-casing of 'ư, ơ' for '<' (turning other letters into a
    # plain '+') is not done here. (Not our job)

    if kind == u'<' and not parts[2]:
        letter = trans[1]
        if parts[1][-1:] == letter:
            # Undo operation
            return parts
        if not parts[1] or \
                (parts[1].lower() == u'ư' and letter.lower() == u'ơ'):
            # Only allow ư, ơ or ươ sitting alone in the middle part
            parts[1] += letter
        elif parts[1].lower() == 'i' and parts[0].lower() == 'g':
            # Quite a hack. If you want to type gi[f = 'giờ', separate()
            # will create ['g', 'i', '']. Therefore we have to allow
            # parts[1] == 'i' and then split the joined string again.
            parts[1] += letter
            parts = separate(utils.join(parts))

    if kind == u'+':
        # See this and you'll understand:
        #   transform([u'nn', '', ''],'+n') = [u'nnn', '', '']
        #   transform([u'c', '', ''],'+o') = [u'c', 'o', '']
        #   transform([u'c', 'o', ''],'+o') = [u'c', 'oo', '']
        #   transform([u'c', 'o', ''],'+n') = [u'c', 'o', 'n']
        if parts[1] == u'':
            slot = 1 if utils.is_vowel(trans[1]) else 0
        elif parts[2] == u'' and utils.is_vowel(trans[1]):
            slot = 1
        else:
            slot = 2
        parts[slot] += trans[1]

        # If there is any accent, remove and reapply it
        # because it is likely to be misplaced in previous transformations
        found = accent.Accent.NONE
        for letter in parts[1]:
            found = accent.get_accent_char(letter)
            if found:
                break

        if found != accent.Accent.NONE:
            # Remove the accent, then put it back
            parts = accent.add_accent(parts, Accent.NONE)
            parts = accent.add_accent(parts, found)
        return parts

    action, factor = get_action(trans)
    if action == Action.ADD_ACCENT:
        return accent.add_accent(parts, factor)
    if action == Action.ADD_MARK and mark.is_valid_mark(parts, trans):
        return mark.add_mark(parts, factor)
    return parts
def transform(comps, trans):
    """
    Transform the given string with transform type trans

    `comps` is the three-part component list (it is copied, never
    modified in place) and `trans` is a string whose first character
    selects the operation:

        u'<'  - conditionally append trans[1] to the middle part
        u'+'  - append trans[1] to whichever part it belongs to
        other - resolved through get_action() into an accent or a mark

    Returns the resulting component list.
    """
    parts = list(comps)
    kind = trans[0]

    # The special-casing of 'ư, ơ' for '<' (turning other letters into a
    # plain '+') is not done here. (Not our job)

    if kind == u'<' and not parts[2]:
        letter = trans[1]
        if parts[1][-1:] == letter:
            # Undo operation
            return parts
        if not parts[1] or \
                (parts[1].lower() == u'ư' and letter.lower() == u'ơ'):
            # Only allow ư, ơ or ươ sitting alone in the middle part
            parts[1] += letter
        elif parts[1].lower() == 'i' and parts[0].lower() == 'g':
            # Quite a hack. If you want to type gi[f = 'giờ', separate()
            # will create ['g', 'i', '']. Therefore we have to allow
            # parts[1] == 'i' and then split the joined string again.
            parts[1] += letter
            parts = separate(utils.join(parts))

    if kind == u'+':
        # See this and you'll understand:
        #   transform([u'nn', '', ''],'+n') = [u'nnn', '', '']
        #   transform([u'c', '', ''],'+o') = [u'c', 'o', '']
        #   transform([u'c', 'o', ''],'+o') = [u'c', 'oo', '']
        #   transform([u'c', 'o', ''],'+n') = [u'c', 'o', 'n']
        if parts[1] == u'':
            slot = 1 if utils.is_vowel(trans[1]) else 0
        elif parts[2] == u'' and utils.is_vowel(trans[1]):
            slot = 1
        else:
            slot = 2
        parts[slot] += trans[1]

        # If there is any accent, remove and reapply it
        # because it is likely to be misplaced in previous transformations
        found = accent.Accent.NONE
        for letter in parts[1]:
            found = accent.get_accent_char(letter)
            if found:
                break

        if found != accent.Accent.NONE:
            # Remove the accent, then put it back
            parts = accent.add_accent(parts, Accent.NONE)
            parts = accent.add_accent(parts, found)
        return parts

    action, factor = get_action(trans)
    if action == Action.ADD_ACCENT:
        return accent.add_accent(parts, factor)
    if action == Action.ADD_MARK and mark.is_valid_mark(parts, trans):
        return mark.add_mark(parts, factor)
    return parts