Example #1
0
 def split_syllables(letters=None):
     """ Returns the syllables in a given word as a list.

     Generic algorithm: each vowel and each combination starts its own
     syllable; consonants and aytham are appended to the end of the
     previous syllable.

     letters -- an iterable of letters (strings) making up one word.
                Defaults to an empty word.

     Returns a list of strings, one per syllable.

     Raises ValueError if a letter is not recognised as a vowel,
     combination, consonant or aytham. Anything raised by
     TamilWord.validate_word() propagates unchanged.
     """

     # avoid a shared mutable default argument
     if letters is None:
         letters = []

     # ensure that the word is a valid word
     TamilWord.validate_word(''.join(letters))

     syllables = []

     for letter in letters:

         # a vowel or combination gets its own syllable
         if TamilLetter.is_combination(letter) or TamilLetter.is_vowel(letter):
             syllables.append(letter)

         # a consonant or aytham is attached to the previous syllable
         elif TamilLetter.is_consonant(letter) or TamilLetter.is_aytham(letter):

             if syllables:
                 syllables[-1] += letter

             # nothing to attach to yet (word starts with a consonant,
             # probably because it is a loanword): start a syllable with it
             else:
                 syllables.append(letter)

         # the letter matched none of the known letter categories
         else:
             raise ValueError("Unknown error: The letter \'%s\' in word %s is neither a vowel, consonant, combination or aytham" %(letter, ''.join(letters)))

     # the original built the list but never handed it back to the caller
     return syllables
Example #2
0
 def get_class(word = u''):
     """ Returns the noun class (an integer from 1 to 9) for a given word.

     word -- the noun as a unicode string; it is wrapped in a TamilWord
             before being inspected.

     The class is decided from the last letter of the word, the word's
     length and, for some classes, its first or penultimate letter.
     """

     ### See flowchart in docs/material/sendhil/noun_classes.png for more details

     word = TamilWord(word)

     ## For nouns ending in a consonant:
     ## Ending in ம் -> class 1
     ## Ending in ல், ன், ய், ள், ண் AND has exactly two graphemes AND first letter is kuril: class 2
     ## All other consonant-ending nouns -> class 3

     if TamilLetter.is_consonant(word[-1]):

         if word[-1] == u'ம்':
             return 1

         elif word[-1] in (u'ல்', u'ன்', u'ய்', u'ள்', u'ண்') and len(word)==2 and TamilLetter.is_kuril(word[0]):
             return 2

         else:
             return 3

     ## For nouns NOT ending in a consonant:
     ## TODO: What if a noun ends in Aytham?
     ## TODO: Has this been tested on one-letter long all-vowel nouns (e.g. ஈ)?
     ## Ends in இ, ஈ or ஐ -> class 4
     ## Ends in ஆ, ஏ, ஊ or ஓ -> class 5
     ## Has exactly two graphemes AND first letter is kuril -> class 6
     ## Ends in று and penultimate grapheme is not a consonant -> class 7
     ## Ends in டு and penultimate grapheme is not a consonant -> class 8
     ## TODO: The above two rules assume the word is at least two graphemes long. Valid assumption?
     ## All other cases: class 9

     else:

         # only the vowel part of the final letter matters for classes 4 and 5
         _, ending_vowel = TamilLetter.split_combination(word[-1])

         if ending_vowel in (u'இ', u'ஈ', u'ஐ'):
             return 4

         elif ending_vowel in (u'ஆ', u'ஏ', u'ஊ', u'ஓ'):
             return 5

         # look at the first letter of this word (the original subscripted
         # the TamilWord class itself, which is not the word being classified)
         elif len(word)==2 and TamilLetter.is_kuril(word[0]):
             return 6

         # NOTE(review): word[-2] assumes the word has at least two letters
         elif word[-1]==u'று' and not TamilLetter.is_consonant(word[-2]):
             return 7

         elif word[-1]==u'டு' and not TamilLetter.is_consonant(word[-2]):
             return 8

         else:
             return 9
Example #3
0
 def validate(self):
     """ Runs a basic validity check over this word's text.

     Every element of self.text is handed to
     TamilLetter.validate_letter(); its return value is ignored, so a
     failure can only surface as an exception raised by that call.

     Returns True once every element has been checked.
     """

     # simple test: each character must pass the per-letter validation
     for character in self.text:
         TamilLetter.validate_letter(character)

     #TODO: implement this method more thoroughly
     #TODO: check for pulli or combination_ending at the beginning of a word

     return True
Example #4
0
    def direct_object(self):
        """Build and return the direct-object form of this noun.

        A separate TamilNoun is created from self.word, a connector is
        chosen according to the noun class reported by
        TamilNoun.get_class(), and the connector combined with the
        suffix 'ஐ' is appended (after dropping the last letter for
        classes 1, 3, 7, 8 and 9).

        Returns the resulting word (the ``word`` attribute of the new noun).

        Raises ValueError if the noun class has no connector defined.
        """

        # the direct-object suffix
        suffix = u"ஐ"

        # work on a separate noun object built from this noun's word
        noun = TamilNoun(self.word)
        last_letter = noun[-1]

        # connector used by each noun class
        connectors = {
            1: u"த்த்",
            2: last_letter,
            3: last_letter,
            4: u"ய்",
            5: u"வ்",
            6: u"வ்",
            7: u"ற்ற்",
            8: u"ட்ட்",
            9: TamilLetter.split_combination(last_letter)[0],
        }

        noun_class = TamilNoun.get_class(self.word)

        if noun_class not in connectors:
            raise ValueError("""%s is an invalid noun class for word %s.
                Must be between 1 and 9""" % (noun_class, self.word))

        connector = TamilWord(connectors[noun_class])

        # classes 1, 7, 8 (two-letter connector) and 3, 9 (one-letter
        # connector) replace the noun's last letter; the rest keep it
        if noun_class in (1, 3, 7, 8, 9):
            del noun[-1]

        if noun_class in (1, 7, 8):
            # first connector letter as-is, second one merged with the suffix
            ending = (connector[0] +
                      TamilLetter.get_combination(connector[-1], suffix))
        else:
            # single connector letter merged with the suffix
            ending = TamilLetter.get_combination(connector[0], suffix)

        noun.word += ending

        return noun.word
Example #5
0
    def direct_object(self):
        """Return this noun in its direct-object form.

        The noun class from TamilNoun.get_class() selects a connector;
        the connector is merged with the suffix 'ஐ' and appended to a
        copy of the noun. For classes 1, 3, 7, 8 and 9 the noun's last
        letter is removed before appending.

        Raises ValueError when the noun class is not one of the keys of
        the connector table.
        """

        suffix = u"ஐ"  # suffix used by direct objects

        result = TamilNoun(self.word)
        final_letter = result[-1]

        # noun class -> connector placed between the noun and the suffix
        connector_for = {
            1: u"த்த்",
            2: final_letter,
            3: final_letter,
            4: u"ய்",
            5: u"வ்",
            6: u"வ்",
            7: u"ற்ற்",
            8: u"ட்ட்",
            9: TamilLetter.split_combination(final_letter)[0],
        }

        noun_class = TamilNoun.get_class(self.word)

        if noun_class not in connector_for:
            raise ValueError("""%s is an invalid noun class for word %s.
                Must be between 1 and 9""" % (noun_class, self.word))

        connector = TamilWord(connector_for[noun_class])

        two_letter_connector = noun_class in (1, 7, 8)

        # every class except 2, 4, 5 and 6 drops the last letter first
        if two_letter_connector or noun_class in (3, 9):
            del result[-1]

        if two_letter_connector:
            # keep the first connector letter, combine the last with the suffix
            tail = (connector[0] +
                    TamilLetter.get_combination(connector[-1], suffix))
        else:
            tail = TamilLetter.get_combination(connector[0], suffix)

        result.word += tail

        return result.word
Example #6
0
 def split_letters(word=u''):
     """ Splits a word into its graphemes (the Tamil characters) and returns them as a list.

     word -- the word to split, iterated one codepoint at a time.

     A codepoint that is an அ combination, whitespace, a vowel or aytham
     starts a new grapheme. A combination ending or a pulli is appended
     to the grapheme before it.

     Raises ValueError if a combination ending or pulli comes first in
     the word, or if a codepoint fits none of the categories above.
     """

     # ensure that the word is a valid word
     TamilWord.validate_word(word)

     # all combination endings, and all அ combinations
     endings = TamilLetter.get_combination_endings()
     a_column = TamilLetter.get_combination_column(u'அ').values()

     # graphemes collected so far (returned to the caller)
     graphemes = []

     for char in word:

         # these codepoints each open a new grapheme
         opens_grapheme = (char in a_column
                           or TamilLetter.is_whitespace(char)
                           or TamilLetter.is_vowel(char)
                           or TamilLetter.is_aytham(char))
         if opens_grapheme:
             graphemes.append(char)
             continue

         # anything else must be a combination ending or a pulli ('்')
         if not (char in endings or char == TamilLetter.get_pulli()):
             raise ValueError("Unknown error: The codepoint \'%s\' in word %s is neither a vowel, consonant, combination or aytham" %(char, word))

         # a modifier needs a preceding grapheme to attach to;
         # validate_word() should catch this, however
         if not graphemes:
             raise ValueError("Unknown error: The combination ending %s cannot be the first character of a word" %(char))

         graphemes[-1] += char

     #TODO: Write extensive test cases for this

     return graphemes
Example #7
0
    def get_class(word=u''):
        """ Returns the noun class (an integer from 1 to 9) for a given noun.

        word -- the noun as a unicode string; it is converted to a
                TamilWord before being inspected.

        The class is decided from the last letter of the noun, the noun's
        length and, for some classes, its first or penultimate letter.
        """

        ### See flowchart (docs/material/sendhil/noun_classes.png) for details

        # Convert 'word' from a unicode object to a TamilWord object
        word = TamilWord(word)

        ## For nouns ending in a consonant:
        ## Ending in ம் -> class 1
        ## Ending in ல், ன், ய், ள், ண் AND has exactly two graphemes AND
        ##    first letter is kuril: class 2
        ## All other consonant-ending nouns -> class 3

        if TamilLetter.is_consonant(word[-1]):

            if word[-1] == u'ம்':
                return 1

            elif (word[-1] in (u'ல்', u'ன்', u'ய்', u'ள்', u'ண்')
                  and len(word) == 2 and TamilLetter.is_kuril(word[0])):
                return 2

            else:
                return 3

        ## For nouns NOT ending in a consonant:
        ## TODO: What if a noun ends in Aytham? Is this possible?
        ## TODO: Ensure this is tested on single letter nouns (e.g. ஈ)
        ## Ends in இ, ஈ or ஐ -> class 4
        ## Ends in ஆ, ஏ, ஊ or ஓ -> class 5
        ## Has exactly two graphemes AND first letter is kuril -> class 6
        ## Ends in று and penultimate grapheme is not a consonant -> class 7
        ## Ends in டு and penultimate grapheme is not a consonant -> class 8
        ## TODO: Is it ok that rules assume the word is >= two graphemes long?
        ## All other cases: class 9

        else:

            # only the vowel part of the final letter matters for classes 4/5
            _, ending_vowel = TamilLetter.split_combination(word[-1])

            if ending_vowel in (u'இ', u'ஈ', u'ஐ'):
                return 4

            elif ending_vowel in (u'ஆ', u'ஏ', u'ஊ', u'ஓ'):
                return 5

            # inspect the first letter of this noun (the original indexed
            # the TamilWord class itself instead of the 'word' instance)
            elif len(word) == 2 and TamilLetter.is_kuril(word[0]):
                return 6

            # NOTE(review): word[-2] assumes the noun has >= two letters
            elif word[-1] == u'று' and not TamilLetter.is_consonant(word[-2]):
                return 7

            elif word[-1] == u'டு' and not TamilLetter.is_consonant(word[-2]):
                return 8

            else:
                return 9
Example #8
0
    def get_class(word=u''):
        """ Classifies a noun and returns its noun class (an integer 1-9).

        word -- the noun as a unicode string; it is wrapped in a
                TamilWord before any letter is examined.

        See the flowchart (docs/material/sendhil/noun_classes.png) for
        the full decision tree.
        """

        # Wrap the unicode input in a TamilWord object
        word = TamilWord(word)
        last = word[-1]

        ## Nouns ending in a consonant:
        ##   ends in ம்                                        -> class 1
        ##   ends in ல், ன், ய், ள், ண், has exactly two graphemes
        ##   and the first letter is kuril                      -> class 2
        ##   every other consonant-ending noun                  -> class 3
        if TamilLetter.is_consonant(last):
            if last == u'ம்':
                return 1
            if (last in (u'ல்', u'ன்', u'ய்', u'ள்', u'ண்')
                    and len(word) == 2
                    and TamilLetter.is_kuril(word[0])):
                return 2
            return 3

        ## Nouns not ending in a consonant:
        ## TODO: What if a noun ends in Aytham? Is this possible?
        ## TODO: Ensure this is tested on single letter nouns (e.g. ஈ)
        ## TODO: Is it ok that rules assume the word is >= two graphemes long?
        _, final_vowel = TamilLetter.split_combination(last)

        # ends in இ, ஈ or ஐ
        if final_vowel in (u'இ', u'ஈ', u'ஐ'):
            return 4

        # ends in ஆ, ஏ, ஊ or ஓ
        if final_vowel in (u'ஆ', u'ஏ', u'ஊ', u'ஓ'):
            return 5

        # exactly two graphemes and the first letter is kuril
        if len(word) == 2 and TamilLetter.is_kuril(word[0]):
            return 6

        # ends in று and the penultimate grapheme is not a consonant
        if last == u'று' and not TamilLetter.is_consonant(word[-2]):
            return 7

        # ends in டு and the penultimate grapheme is not a consonant
        if last == u'டு' and not TamilLetter.is_consonant(word[-2]):
            return 8

        # all other cases
        return 9