Ejemplo n.º 1
0
def separate(string):
    """
    Split a word into its three sound components.

    Returns a list [start, middle, end]: the characters before the first
    vowel, the run of vowels that follows, and everything after that run.
    Eg: toán -> [u't', u'oá', u'n'] (assuming utils.is_vowel accepts
    accented vowels).

    The empty string yields [u'', u'', u''] without any validation. For
    every other input the result is passed to is_valid_combination() and
    None is returned when that check fails.
    """
    if string == u'':
        return [u'', u'', u'']

    onset = u''
    rest = string

    # Everything in front of the first vowel is the start sound
    for pos, char in enumerate(string):
        if utils.is_vowel(char):
            onset = u'' + string[:pos]
            rest = u'' + string[pos:]
            break

    # Nothing split off and the word does not open with a vowel:
    # the word has no vowel at all, so all of it is the start sound
    if onset == u'' and not utils.is_vowel(rest[0]):
        onset, rest = rest, u''

    nucleus = u''
    coda = u''

    # The vowel run stops at the first non-vowel character
    for pos, char in enumerate(rest):
        if not utils.is_vowel(char):
            nucleus = rest[:pos]
            coda = rest[pos:]
            break

    # No trailing consonant: whatever is left is the vowel part
    if nucleus == u'':
        nucleus = rest

    # 'gi' and 'qu' are start sounds of their own, so the leading vowel
    # moves from the middle to the start:
    #     ['g', 'ia', ''] -> ['gi', 'a', '']
    # For 'gi' this only happens when another vowel follows the 'i'.
    if onset != u'' and nucleus != u'':
        lead = nucleus[0]
        gi_case = onset in u'gG' and lead in 'iI' and len(nucleus) > 1
        qu_case = onset in u'qQ' and lead in 'uU'
        if gi_case or qu_case:
            onset += nucleus[:1]
            nucleus = nucleus[1:]

    comps = [onset, nucleus, coda]
    if not is_valid_combination(comps):
        return None
    return comps
Ejemplo n.º 2
0
def separate(string):
    """
    Break a word into [start sound, middle sound, end sound].

    The start sound is everything before the first vowel, the middle
    sound is the vowel run that follows, and the end sound is the rest.
    Eg: toán -> [u't', u'oá', u'n'] (assuming utils.is_vowel accepts
    accented vowels).

    An empty string gives [u'', u'', u''] straight away. Any other word
    is validated with is_valid_combination(); None is returned when the
    components do not form a valid combination.
    """
    if string == u'':
        return [u'', u'', u'']

    # Index of the first vowel, or None when the word has no vowel
    vowel_at = next(
        (idx for idx in range(len(string)) if utils.is_vowel(string[idx])),
        None)
    if vowel_at is None:
        # No vowel: the whole word is the start sound
        head, tail = string, u''
    else:
        head = u'' + string[:vowel_at]
        tail = u'' + string[vowel_at:]

    # Index of the first consonant after the vowel run, if there is one
    cut_at = next(
        (idx for idx in range(len(tail)) if not utils.is_vowel(tail[idx])),
        None)
    if cut_at is None:
        body, coda = u'', u''
    else:
        body, coda = tail[:cut_at], tail[cut_at:]

    # No ending consonant: the remainder is the vowel part
    if body == u'':
        body = tail

    # Special treatment for 'gi' and 'qu', e.g.
    #     ['g', 'ia', ''] -> ['gi', 'a', '']
    # ('gi' needs at least one more vowel after the 'i'; 'qu' does not.)
    if head != u'' and body != u'':
        starts_gi = head in u'gG' and body[0] in 'iI' and len(body) > 1
        if starts_gi or (head in u'qQ' and body[0] in 'uU'):
            head = head + body[:1]
            body = body[1:]

    comps = [head, body, coda]
    if is_valid_combination(comps):
        return comps
    return None
Ejemplo n.º 3
0
def get_end(word, span):
    """
    Return the tail of `word` starting at the `span`-th counted vowel
    from the end.

    The word is scanned from its last character towards the first. A
    position is counted when is_vowel() accepts its character and none of
    the predicates in RULES (each called as rule(word, index)) returns a
    true value for it. Scanning stops once `span` positions have been
    counted or the first character has been examined, and the slice from
    the stopping position to the end of the word is returned.

    An empty `word` is returned unchanged; the previous implementation
    indexed word[-1] in that case and raised IndexError.
    """
    length = len(word)
    if length == 0:
        # Nothing to scan; avoid indexing into an empty sequence.
        return word

    counted = 0
    start = length - 1
    for start in range(length - 1, -1, -1):
        if is_vowel(word[start]) and not any(rule(word, start) for rule in RULES):
            counted += 1
        if counted == span:
            break

    return word[start:]
Ejemplo n.º 4
0
def gen_error(word):
    """
    Return a copy of `word` with random errors injected per character.

    Non-alphabetic characters are copied as they are. For every
    alphabetic character a number from 1 to 4 is drawn and compared with
    the module-level constants VOWEL, CAPITALIZATION and REPETITION:

    * VOWEL (only when the character is a vowel): replaced by a random
      element of `vowels`.
    * CAPITALIZATION: replaced by utils.change_case(char).
    * REPETITION: emitted one or more times; after each copy there is a
      one-in-two chance of stopping.
    * anything else: copied unchanged.
    """
    pieces = []
    for letter in word:
        if not letter.isalpha():
            pieces.append(letter)
            continue

        roll = random.randint(1, 4)

        if roll == VOWEL and utils.is_vowel(letter):
            pieces.append(random.choice(vowels))
            continue

        if roll == CAPITALIZATION:
            pieces.append(utils.change_case(letter))
            continue

        # Both the REPETITION case and the fall-through case emit the
        # letter at least once.
        pieces.append(letter)
        if roll == REPETITION:
            # Keep repeating until the coin flip comes up 1
            while not random.randint(0, 1):
                pieces.append(letter)

    return ''.join(pieces)
Ejemplo n.º 5
0
def transform(comps, trans):
    """
    Apply the transformation `trans` to a copy of `comps` and return it.

    `comps` is a 3-item sequence [start, middle, end]; it is copied first
    and never modified in place. `trans[0]` selects the operation:

    * u'<': conditionally append `trans[1]` to the middle part (only when
      the end part is empty), then continue with the generic handling
      below. If the middle part already ends with `trans[1]` the copy is
      returned untouched (undo).
    * u'+': append `trans[1]` as a plain character to the appropriate
      part, re-apply any accent found in the middle part, and return.
    * anything else (and u'<' after its special handling): the action is
      resolved with get_action(trans) and applied as an accent or a mark.
    """
    parts = list(comps)
    op = trans[0]

    if op == u'<' and not parts[2]:
        # Undo operation
        if parts[1][-1:] == trans[1]:
            return parts

        # Only allow ư, ơ or ươ sitting alone in the middle part
        if not parts[1] or \
                (parts[1].lower() == u'ư' and trans[1].lower() == u'ơ'):
            parts[1] += trans[1]
        # Quite a hack. To type gi[f = 'giờ', separate() produces
        # ['g', 'i', ''], so a middle part of 'i' after 'g' is allowed
        # too and the word is split again afterwards.
        elif parts[1].lower() == 'i' and parts[0].lower() == 'g':
            parts[1] += trans[1]
            parts = separate(utils.join(parts))

    if op == u'+':
        # Pick the part that receives the new character:
        #   transform([u'nn', '', ''],'+n') = [u'nnn', '', '']
        #   transform([u'c', '', ''],'+o') = [u'c', 'o', '']
        #   transform([u'c', 'o', ''],'+o') = [u'c', 'oo', '']
        #   transform([u'c', 'o', ''],'+n') = [u'c', 'o', 'n']
        if parts[1] == u'':
            slot = 1 if utils.is_vowel(trans[1]) else 0
        elif parts[2] == u'' and utils.is_vowel(trans[1]):
            slot = 1
        else:
            slot = 2
        parts[slot] += trans[1]

        # An existing accent is likely to be misplaced after the append,
        # so it is removed and applied again.
        found = accent.Accent.NONE
        for char in parts[1]:
            found = accent.get_accent_char(char)
            if found:
                break
        if found != accent.Accent.NONE:
            # NOTE(review): bare `Accent` is used here while the rest of
            # the function goes through `accent.Accent` - confirm both
            # names are in scope.
            parts = accent.add_accent(parts, Accent.NONE)
            parts = accent.add_accent(parts, found)
        return parts

    action, factor = get_action(trans)
    if action == Action.ADD_ACCENT:
        return accent.add_accent(parts, factor)
    if action == Action.ADD_MARK and mark.is_valid_mark(parts, trans):
        return mark.add_mark(parts, factor)
    return parts
Ejemplo n.º 6
0
def transform(comps, trans):
    """
    Apply the transformation `trans` to a copy of `comps` and return it.

    `comps` is a 3-item sequence [start, middle, end]; it is copied first
    and never modified in place. `trans[0]` selects the operation:

    * u'<': conditionally append `trans[1]` to the middle part (only when
      the end part is empty), then continue with the generic handling
      below. If the middle part already ends with `trans[1]` the copy is
      returned untouched (undo).
    * u'+': append `trans[1]` as a plain character to the appropriate
      part, re-apply any accent found in the middle part, and return.
    * anything else (and u'<' after its special handling): the action is
      resolved with get_action(trans) and applied as an accent or a mark.
    """
    parts = list(comps)
    op = trans[0]

    if op == u'<' and not parts[2]:
        # Undo operation
        if parts[1][-1:] == trans[1]:
            return parts

        # Only allow ư, ơ or ươ sitting alone in the middle part
        if not parts[1] or \
                (parts[1].lower() == u'ư' and trans[1].lower() == u'ơ'):
            parts[1] += trans[1]
        # Quite a hack. To type gi[f = 'giờ', separate() produces
        # ['g', 'i', ''], so a middle part of 'i' after 'g' is allowed
        # too and the word is split again afterwards.
        elif parts[1].lower() == 'i' and parts[0].lower() == 'g':
            parts[1] += trans[1]
            parts = separate(utils.join(parts))

    if op == u'+':
        # Pick the part that receives the new character:
        #   transform([u'nn', '', ''],'+n') = [u'nnn', '', '']
        #   transform([u'c', '', ''],'+o') = [u'c', 'o', '']
        #   transform([u'c', 'o', ''],'+o') = [u'c', 'oo', '']
        #   transform([u'c', 'o', ''],'+n') = [u'c', 'o', 'n']
        if parts[1] == u'':
            slot = 1 if utils.is_vowel(trans[1]) else 0
        elif parts[2] == u'' and utils.is_vowel(trans[1]):
            slot = 1
        else:
            slot = 2
        parts[slot] += trans[1]

        # An existing accent is likely to be misplaced after the append,
        # so it is removed and applied again.
        found = accent.Accent.NONE
        for char in parts[1]:
            found = accent.get_accent_char(char)
            if found:
                break
        if found != accent.Accent.NONE:
            # NOTE(review): bare `Accent` is used here while the rest of
            # the function goes through `accent.Accent` - confirm both
            # names are in scope.
            parts = accent.add_accent(parts, Accent.NONE)
            parts = accent.add_accent(parts, found)
        return parts

    action, factor = get_action(trans)
    if action == Action.ADD_ACCENT:
        return accent.add_accent(parts, factor)
    if action == Action.ADD_MARK and mark.is_valid_mark(parts, trans):
        return mark.add_mark(parts, factor)
    return parts
Ejemplo n.º 7
0
 def atomic_separate(string, last_chars, last_is_vowel):
     """
     Peel a run of same-kind characters off the end of `string`.

     Characters are taken from the end of `string` for as long as
     is_vowel(char) equals `last_is_vowel`; each one is prepended to
     `last_chars`. Returns the tuple (remaining string, accumulated
     chars). When `string` is empty it is returned together with
     `last_chars` unchanged.

     Implemented as a loop rather than recursion so that long inputs
     cannot exceed the interpreter's recursion limit.
     """
     while string != "" and last_is_vowel == is_vowel(string[-1]):
         last_chars = string[-1] + last_chars
         string = string[:-1]
     return (string, last_chars)