Example #1
0
def add_mark_char(char, mark):
    """
    Add mark to a single char.

    Input:
        char - the character to modify; u'' is returned as u''
        mark - the Mark value to apply (HAT, HORN, BREVE, BAR or NONE)
    Output:
        The character produced by the mark's replacement rules. The accent
        reported by accent.get_accent_char() for the input is passed back
        through accent.add_accent_char(), and the input's isupper() result
        is passed to utils.change_case(). When the mark has no rule for
        the character, the character itself goes through those two steps.
    """
    if char == u'':
        return u''

    # Remember how the input looked, then work on a lower-case copy whose
    # accent has been reset to Accent.NONE.
    was_upper = char.isupper()
    original_accent = accent.get_accent_char(char)
    base = accent.add_accent_char(char.lower(), Accent.NONE)

    # For every mark: (family, replacement) pairs, tried in this order.
    replacements = (
        (Mark.HAT, ((FAMILY_A, u"â"), (FAMILY_O, u"ô"), (FAMILY_E, u"ê"))),
        (Mark.HORN, ((FAMILY_O, u"ơ"), (FAMILY_U, u"ư"))),
        (Mark.BREVE, ((FAMILY_A, u"ă"),)),
        (Mark.BAR, ((FAMILY_D, u"đ"),)),
        (Mark.NONE, (
            (FAMILY_A, u"a"),
            (FAMILY_E, u"e"),
            (FAMILY_O, u"o"),
            (FAMILY_U, u"u"),
            (FAMILY_D, u"d"),
        )),
    )

    marked = base
    for candidate, rules in replacements:
        if mark == candidate:
            for family, replacement in rules:
                if base in family:
                    marked = replacement
                    break
            # Only the first matching mark is consulted.
            break

    marked = accent.add_accent_char(marked, original_accent)
    return utils.change_case(marked, was_upper)
Example #2
0
def add_mark_char(char, mark):
    """
    Add mark to a single char.

    Input:
        char - the character to modify; an empty string yields u''
        mark - one of Mark.HAT, Mark.HORN, Mark.BREVE, Mark.BAR, Mark.NONE
    Output:
        The marked character, after the input's accent (as reported by
        accent.get_accent_char) has been handed back to
        accent.add_accent_char and its isupper() flag to utils.change_case.
        A character that the mark has no mapping for only goes through
        those two final steps.
    """
    if char == u'':
        return u''

    upper = char.isupper()
    old_accent = accent.get_accent_char(char)
    # Lower-case copy with the accent reset to Accent.NONE.
    plain = accent.add_accent_char(char.lower(), Accent.NONE)

    def pick(*choices):
        # Replacement of the first family that contains `plain`,
        # or `plain` itself when none does.
        for family, replacement in choices:
            if plain in family:
                return replacement
        return plain

    if mark == Mark.HAT:
        result = pick((FAMILY_A, u"â"), (FAMILY_O, u"ô"), (FAMILY_E, u"ê"))
    elif mark == Mark.HORN:
        result = pick((FAMILY_O, u"ơ"), (FAMILY_U, u"ư"))
    elif mark == Mark.BREVE:
        result = pick((FAMILY_A, u"ă"))
    elif mark == Mark.BAR:
        result = pick((FAMILY_D, u"đ"))
    elif mark == Mark.NONE:
        result = pick(
            (FAMILY_A, u"a"),
            (FAMILY_E, u"e"),
            (FAMILY_O, u"o"),
            (FAMILY_U, u"u"),
            (FAMILY_D, u"d"),
        )
    else:
        result = plain

    result = accent.add_accent_char(result, old_accent)
    return utils.change_case(result, upper)
Example #3
0
def is_valid_combination(components):
    """Check if a character combination complies to Vietnamese spelling.

    Input:
        components - a list of the form [u'c', u'a', u'm']; indexes 0, 1
                     and 2 are read
    Output:
        True if OK, False otherwise.
    """
    # We only work with lower case
    parts = [utils.change_case(part, 0) for part in components]

    # A first part standing alone is always accepted. This allows 'đ' to
    # appear in abbreviations like 'đm', 'đc', 'kgcđ', etc.
    if parts[0] and not parts[1] and not parts[2]:
        return True

    # The start sound must be a proper consonant...
    if parts[0] != u'' and parts[0] not in CONSONANTS:
        return False

    # ...and the ending sound a proper ending consonant.
    if parts[2] != u'' and parts[2] not in ENDING_CONSONANTS:
        return False

    vowel = accent.remove_accent_string(parts[1])
    in_open = vowel in OPEN_COMPOUND_VOWELS
    in_closed = vowel in CLOSED_COMPOUND_VOWELS

    # A vowel longer than one character has to be a known compound.
    if len(vowel) > 1 and not (in_open or in_closed):
        return False

    # A compound listed only as closed cannot be followed by an ending.
    if in_closed and not in_open and parts[2] != u'':
        return False

    # 'ăch'?
    if parts[2] == u'ch' and \
            (vowel in u'ăâeôơuư' or (in_open and not in_closed)):
        return False

    # 'ương' is ok but 'ơng' ?
    # NOTE(review): this is a substring test against a one-character
    # string, so an empty vowel matches as well.
    if parts[2] == u'ng' and vowel in u'ơ':
        return False

    # There is deliberately no rule rejecting 'ê' before a final 'c':
    # sadly, it interferes with 'nhếch'.

    # Get the first accent
    accents = (accent.get_accent_char(letter) for letter in parts[1])
    first_accent = next(
        (found for found in accents if found != Accent.NONE), Accent.NONE)

    # These consonants can only go with ACUTE, DOT or NONE accents
    return not (
        parts[2] in (u'c', u'p', u't', u'ch') and
        first_accent not in (Accent.NONE, Accent.ACUTE, Accent.DOT)
    )
def transform(comps, trans):
    """
    Transform the given string with transform type trans

    Input:
        comps - the word split into components; indexes 0, 1 and 2 are
                read, e.g. [u'c', u'a', u'm']. The work is done on a
                list(comps) copy.
        trans - the transformation. trans[0] selects the operation: u'<'
                and u'+' are handled here, anything else is decoded by
                get_action(). trans[1] is the character operand.
    Output:
        The list of components after the transformation.
    """
    parts = list(comps)
    operation = trans[0]

    if operation == u'<':
        if not parts[2]:
            letter = trans[1]
            if parts[1][-1:] == letter:
                # Undo operation
                return parts
            if not parts[1] or \
                    (parts[1].lower() == u'ư' and letter.lower() == u'ơ'):
                # Only allow ư, ơ or ươ sitting alone in the middle part
                parts[1] += letter
            elif parts[1].lower() == 'i' and parts[0].lower() == 'g':
                # Quite a hack. If you want to type gi[f = 'giờ',
                # separate() will create ['g', 'i', '']. Therefore we
                # have to allow parts[1] == 'i'.
                parts[1] += letter
                parts = separate(utils.join(parts))
    elif operation == u'+':
        # See this and you'll understand:
        #   transform([u'nn', '', ''],'+n') = [u'nnn', '', '']
        #   transform([u'c', '', ''],'+o') = [u'c', 'o', '']
        #   transform([u'c', 'o', ''],'+o') = [u'c', 'oo', '']
        #   transform([u'c', 'o', ''],'+n') = [u'c', 'o', 'n']
        letter = trans[1]
        if parts[1] == u'':
            slot = 1 if utils.is_vowel(letter) else 0
        elif parts[2] == u'' and utils.is_vowel(letter):
            slot = 1
        else:
            slot = 2
        parts[slot] += letter

        # If there is any accent, remove and reapply it
        # because it is likely to be misplaced in previous transformations
        found = accent.Accent.NONE
        for vowel_char in parts[1]:
            found = accent.get_accent_char(vowel_char)
            if found:
                break
        if found != accent.Accent.NONE:
            parts = accent.add_accent(
                accent.add_accent(parts, Accent.NONE), found)
        return parts

    # Everything that did not return above goes through get_action().
    action, factor = get_action(trans)
    if action == Action.ADD_ACCENT:
        return accent.add_accent(parts, factor)
    if action == Action.ADD_MARK and mark.is_valid_mark(parts, trans):
        return mark.add_mark(parts, factor)
    return parts
def transform(comps, trans):
    """
    Transform the given string with transform type trans

    Input:
        comps - sequence of word components (indexes 0, 1 and 2 are
                used), e.g. [u'c', u'a', u'm']; it is copied with
                list(comps) before anything is changed
        trans - transformation string: trans[0] is the operation (u'<',
                u'+', or something get_action() understands) and trans[1]
                the character it operates with
    Output:
        The resulting list of components.
    """
    word = list(comps)

    if trans[0] == u'<' and not word[2]:
        middle, key = word[1], trans[1]
        if middle[-1:] == key:
            # Undo operation
            return word
        horn_pair = middle.lower() == u'ư' and key.lower() == u'ơ'
        if not middle or horn_pair:
            # Only allow ư, ơ or ươ sitting alone in the middle part
            word[1] = middle + key
        elif middle.lower() == 'i' and word[0].lower() == 'g':
            # Quite a hack. If you want to type gi[f = 'giờ', separate()
            # will create ['g', 'i', '']. Therefore we have to allow
            # word[1] == 'i'.
            word[1] = middle + key
            word = separate(utils.join(word))

    if trans[0] == u'+':
        # See this and you'll understand:
        #   transform([u'nn', '', ''],'+n') = [u'nnn', '', '']
        #   transform([u'c', '', ''],'+o') = [u'c', 'o', '']
        #   transform([u'c', 'o', ''],'+o') = [u'c', 'oo', '']
        #   transform([u'c', 'o', ''],'+n') = [u'c', 'o', 'n']
        key = trans[1]
        if word[1] != u'':
            target = 1 if word[2] == u'' and utils.is_vowel(key) else 2
        else:
            target = 1 if utils.is_vowel(key) else 0
        word[target] = word[target] + key

        # If there is any accent, remove and reapply it
        # because it is likely to be misplaced in previous transformations
        current = accent.Accent.NONE
        for letter in word[1]:
            current = accent.get_accent_char(letter)
            if current:
                break
        if current != accent.Accent.NONE:
            # Remove accent, then put it back
            word = accent.add_accent(word, Accent.NONE)
            word = accent.add_accent(word, current)
        return word

    action, factor = get_action(trans)
    if action == Action.ADD_ACCENT:
        word = accent.add_accent(word, factor)
    elif action == Action.ADD_MARK and mark.is_valid_mark(word, trans):
        word = mark.add_mark(word, factor)
    return word
def is_valid_combination(components):
    """Check if a character combination complies to Vietnamese spelling.

    Input:
        components - a list of the form [u'c', u'a', u'm'] (the items at
                     indexes 0, 1 and 2 are inspected)
    Output:
        True if OK, False otherwise.
    """
    # We only work with lower case
    lowered = list(components)
    for index, piece in enumerate(lowered):
        lowered[index] = utils.change_case(piece, 0)

    # Anything that consists of a first part only is let through; this
    # allows 'đ' to appear in abbreviations like 'đm', 'đc', 'kgcđ', etc.
    if lowered[0] and not lowered[1] and not lowered[2]:
        return True

    # Check if our start sound is a proper consonant
    if lowered[0] != u'' and lowered[0] not in CONSONANTS:
        return False

    # And if our ending sound is a proper ending consonant
    if lowered[2] != u'' and lowered[2] not in ENDING_CONSONANTS:
        return False

    vowel = accent.remove_accent_string(lowered[1])

    # Multi-character vowels must be one of the known compounds.
    if len(vowel) > 1:
        known = vowel in OPEN_COMPOUND_VOWELS or \
            vowel in CLOSED_COMPOUND_VOWELS
        if not known:
            return False

    # A closed-only compound vowel does not take an ending sound.
    if lowered[2] != u'' and vowel in CLOSED_COMPOUND_VOWELS and \
            vowel not in OPEN_COMPOUND_VOWELS:
        return False

    # 'ăch'?
    if lowered[2] == u'ch':
        if vowel in u'ăâeôơuư':
            return False
        if vowel in OPEN_COMPOUND_VOWELS and \
                vowel not in CLOSED_COMPOUND_VOWELS:
            return False

    # 'ương' is ok but 'ơng' ?
    # NOTE(review): substring test against a one-character string, so an
    # empty vowel matches too.
    if lowered[2] == u'ng' and vowel in u'ơ':
        return False

    # No check for 'ê' followed by 'c' here: sadly, such a rule
    # interferes with 'nhếch'.

    # Get the first accent
    tone = Accent.NONE
    for letter in lowered[1]:
        candidate = accent.get_accent_char(letter)
        if candidate != Accent.NONE:
            tone = candidate
            break

    # These consonants can only go with ACUTE, DOT or NONE accents
    if lowered[2] not in [u'c', u'p', u't', u'ch']:
        return True
    return tone in [Accent.NONE, Accent.ACUTE, Accent.DOT]