def split_syllables(letters=None):
    """ Returns the syllables in a given word as a list

    Generic algorithm: each vowel and each combination starts its own
    syllable. Consonants and aytham get added to the end of the previous
    syllable.

    :param letters: the letters (graphemes) of one word, in order.
        ``None`` (the default) is treated as an empty word.
    :returns: list of syllables, each one the concatenation of the
        letters that belong to it
    :raises ValueError: if a letter is neither a vowel, a combination,
        a consonant nor aytham. Anything raised by
        ``TamilWord.validate_word`` propagates unchanged.
    """
    # A None default avoids sharing one mutable list object between calls
    if letters is None:
        letters = []

    # the joined word is needed for validation and for the error message
    word = ''.join(letters)

    # ensure that the word is a valid word
    TamilWord.validate_word(word)

    syllables = []

    for letter in letters:

        # a vowel or combination gets its own syllable
        if TamilLetter.is_combination(letter) or TamilLetter.is_vowel(letter):
            syllables.append(letter)

        # a consonant or aytham is attached to the previous syllable
        elif TamilLetter.is_consonant(letter) or TamilLetter.is_aytham(letter):
            if syllables:
                syllables[-1] = syllables[-1] + letter
            else:
                # the word starts with a consonant (probably because it
                # is a loanword), so the consonant opens the first
                # syllable itself
                syllables.append(letter)

        # none of the four known letter kinds: an unexpected error.
        # ValueError matches what split_letters raises for the same
        # situation and is still caught by ``except Exception``.
        else:
            raise ValueError("Unknown error: The letter \'%s\' in word %s is neither a vowel, consonant, combination or aytham" % (letter, word))

    # the original fell off the end here and returned None
    return syllables
def get_class(word=u''):
    """ Returns the noun class for a given word

    See the flowchart in docs/material/sendhil/noun_classes.png for
    more details.

    :param word: the noun to classify; it is wrapped in a TamilWord
        before any rule is applied
    :returns: the noun class, an integer from 1 to 9
    """
    word = TamilWord(word)

    ## For nouns ending in a consonant:
    ##   Ending in ம் -> class 1
    ##   Ending in ல், ன், ய், ள், ண் AND has exactly two graphemes AND
    ##   first letter is kuril -> class 2
    ##   All other consonant-ending nouns -> class 3
    if TamilLetter.is_consonant(word[-1]):
        if word[-1] == u'ம்':
            return 1
        elif word[-1] in (u'ல்', u'ன்', u'ய்', u'ள்', u'ண்') and len(word) == 2 and TamilLetter.is_kuril(word[0]):
            return 2
        else:
            return 3

    ## For nouns not ending in a consonant:
    ##   TODO: What if a noun ends in Aytham?
    ##   TODO: Has this been tested on one-letter long all-vowel nouns
    ##         (e.g. ஈ)?
    ##   Ends in இ, ஈ or ஐ -> class 4
    ##   Ends in ஆ, ஏ, ஊ or ஓ -> class 5
    ##   Has exactly two graphemes AND first letter is kuril -> class 6
    ##   Ends in று and penultimate grapheme is not a consonant -> class 7
    ##   Ends in டு and penultimate grapheme is not a consonant -> class 8
    ##   TODO: The above two rules assume the word is at least two
    ##         graphemes long. Valid assumption?
    ##   All other cases: class 9
    else:
        # only the vowel part of the last grapheme matters for classes 4 and 5
        _, ending_vowel = TamilLetter.split_combination(word[-1])

        if ending_vowel in (u'இ', u'ஈ', u'ஐ'):
            return 4
        elif ending_vowel in (u'ஆ', u'ஏ', u'ஊ', u'ஓ'):
            return 5
        # the kuril test must look at the first letter of this word;
        # the original subscripted the TamilWord class itself
        elif len(word) == 2 and TamilLetter.is_kuril(word[0]):
            return 6
        elif word[-1] == u'று' and not TamilLetter.is_consonant(word[-2]):
            return 7
        elif word[-1] == u'டு' and not TamilLetter.is_consonant(word[-2]):
            return 8
        else:
            return 9
def validate(self):
    """ Checks whether the given word is valid

    Simple test only: every element of ``self.text`` is handed to
    ``TamilLetter.validate_letter``. The value that call returns is
    ignored, and anything it raises propagates to the caller.

    :returns: True once every element has been passed through the check
    """
    # TODO: implement this method more thoroughly
    # TODO: check for pulli or combination_ending at the beginning of a word
    for character in self.text:
        TamilLetter.validate_letter(character)

    return True
def direct_object(self):
    """Returns the direct object of a given noun.

    A copy of the noun is built from ``self.word``. Depending on the
    noun class its last letter may be dropped, then a class-specific
    connector joined with the suffix 'ஐ' is appended.

    :returns: the ``word`` attribute of the modified copy
    :raises ValueError: if the noun class is not one of 1 to 9
    """
    noun = TamilNoun(self.word)

    # direct objects use the suffix 'ஐ'
    object_suffix = u"ஐ"

    # Connector for each noun class. The whole table is built up front,
    # so every entry is evaluated whatever class the noun turns out to be.
    connectors = {
        1: u"த்த்",
        2: noun[-1],
        3: noun[-1],
        4: u"ய்",
        5: u"வ்",
        6: u"வ்",
        7: u"ற்ற்",
        8: u"ட்ட்",
        9: TamilLetter.split_combination(noun[-1])[0],
    }

    noun_class = TamilNoun.get_class(self.word)

    if noun_class not in connectors:
        raise ValueError("""%s is an invalid noun class for word %s. Must be between 1 and 9""" % (noun_class, self.word))

    connector = TamilWord(connectors[noun_class])

    if noun_class in (1, 7, 8):
        # remove last letter + add two-letter connector + suffix
        del noun[-1]
        noun.word += (connector[0] +
                      TamilLetter.get_combination(connector[-1],
                                                  object_suffix))
    else:
        # classes 3 and 9 lose their last letter first; classes 2, 4, 5
        # and 6 keep it. All six then take one-letter connector + suffix.
        if noun_class in (3, 9):
            del noun[-1]
        noun.word += TamilLetter.get_combination(connector[0],
                                                 object_suffix)

    return noun.word
def direct_object(self):
    """Returns the direct object of a given noun.

    Works on a fresh TamilNoun built from ``self.word``: the noun class
    selects a connector, decides whether the last letter is removed,
    and the connector combined with the suffix 'ஐ' is appended.

    :returns: the ``word`` attribute of the resulting noun
    :raises ValueError: if the noun class is not one of 1 to 9
    """
    result = TamilNoun(self.word)
    final_letter = result[-1]

    # direct objects use the suffix 'ஐ'
    suffix = u"ஐ"

    # map noun class to its particular connector (all entries are
    # computed here, before the noun class is known)
    connector_for = {
        1: u"த்த்",
        2: final_letter,
        3: final_letter,
        4: u"ய்",
        5: u"வ்",
        6: u"வ்",
        7: u"ற்ற்",
        8: u"ட்ட்",
        9: TamilLetter.split_combination(final_letter)[0],
    }

    noun_class = TamilNoun.get_class(self.word)
    if noun_class not in connector_for:
        raise ValueError("""%s is an invalid noun class for word %s. Must be between 1 and 9""" % (noun_class, self.word))

    connector = TamilWord(connector_for[noun_class])

    # classes 1, 7, 8 use a two-letter connector; classes 1, 3, 7, 8, 9
    # drop the noun's last letter before anything is appended
    two_letter_connector = noun_class in (1, 7, 8)
    drops_last_letter = two_letter_connector or noun_class in (3, 9)

    if drops_last_letter:
        del result[-1]

    if two_letter_connector:
        ending = connector[0] + TamilLetter.get_combination(connector[-1],
                                                            suffix)
    else:
        ending = TamilLetter.get_combination(connector[0], suffix)

    result.word += ending
    return result.word
def split_letters(word=u''):
    """ Returns the graphemes (i.e. the Tamil characters) in a given word as a list

    The word is walked codepoint by codepoint. A codepoint either opens
    a new grapheme or is glued onto the grapheme before it.

    :param word: the word to split
    :returns: list of graphemes in word order
    :raises ValueError: if a combination ending or pulli comes first in
        the word, or if a codepoint fits none of the known categories.
        Anything raised by ``TamilWord.validate_word`` propagates
        unchanged.
    """
    # ensure that the word is a valid word
    TamilWord.validate_word(word)

    # all combination endings, and all அ combinations
    endings = TamilLetter.get_combination_endings()
    a_column = TamilLetter.get_combination_column(u'அ').values()

    # list which will be returned to the user
    graphemes = []

    for codepoint in word:

        # an அ combination, whitespace, a vowel or aytham opens a new grapheme
        opens_grapheme = (codepoint in a_column
                          or TamilLetter.is_whitespace(codepoint)
                          or TamilLetter.is_vowel(codepoint)
                          or TamilLetter.is_aytham(codepoint))
        if opens_grapheme:
            graphemes.append(codepoint)
            continue

        # a combination ending or a pulli extends the previous grapheme
        extends_previous = (codepoint in endings
                            or codepoint == TamilLetter.get_pulli())

        # anything else is unexpected
        if not extends_previous:
            raise ValueError("Unknown error: The codepoint \'%s\' in word %s is neither a vowel, consonant, combination or aytham" % (codepoint, word))

        # there must be a grapheme to extend; validate_word() should
        # catch this case, however
        if not graphemes:
            raise ValueError("Unknown error: The combination ending %s cannot be the first character of a word" % codepoint)

        graphemes[-1] = graphemes[-1] + codepoint

    # TODO: Write extensive test cases for this
    return graphemes
def get_class(word=u''):
    """ Returns the noun class for a given noun

    See flowchart (docs/material/sendhil/noun_classes.png) for details.

    :param word: the noun to classify (converted to a TamilWord here)
    :returns: the noun class, an integer from 1 to 9
    """
    # Convert 'word' from a unicode object to a TamilWord object
    word = TamilWord(word)

    ## For nouns ending in a consonant:
    ##   Ending in ம் -> class 1
    ##   Ending in ல், ன், ய், ள், ண் AND has exactly two graphemes AND
    ##   first letter is kuril: class 2
    ##   All other consonant-ending nouns -> class 3
    if TamilLetter.is_consonant(word[-1]):
        if word[-1] == u'ம்':
            return 1
        elif (word[-1] in (u'ல்', u'ன்', u'ய்', u'ள்', u'ண்') and
              len(word) == 2 and
              TamilLetter.is_kuril(word[0])):
            return 2
        else:
            return 3

    ## For nouns not ending in a consonant:
    ##   TODO: What if a noun ends in Aytham? Is this possible?
    ##   TODO: Ensure this is tested on single letter nouns (e.g. ஈ)
    ##   Ends in இ, ஈ or ஐ -> class 4
    ##   Ends in ஆ, ஏ, ஊ or ஓ -> class 5
    ##   Has exactly two graphemes AND first letter is kuril -> class 6
    ##   Ends in று and penultimate grapheme is not a consonant -> class 7
    ##   Ends in டு and penultimate grapheme is not a consonant -> class 8
    ##   TODO: Is it ok that rules assume the word is >= two graphemes long?
    ##   All other cases: class 9
    else:
        # classes 4 and 5 depend only on the vowel part of the last grapheme
        _, ending_vowel = TamilLetter.split_combination(word[-1])

        if ending_vowel in (u'இ', u'ஈ', u'ஐ'):
            return 4
        elif ending_vowel in (u'ஆ', u'ஏ', u'ஊ', u'ஓ'):
            return 5
        # index the word instance: the original wrote TamilWord[0],
        # which subscripts the class rather than this word
        elif len(word) == 2 and TamilLetter.is_kuril(word[0]):
            return 6
        elif word[-1] == u'று' and not TamilLetter.is_consonant(word[-2]):
            return 7
        elif word[-1] == u'டு' and not TamilLetter.is_consonant(word[-2]):
            return 8
        else:
            return 9
def get_class(word=u''):
    """ Returns the noun class for a given noun

    The rules follow the flowchart in
    docs/material/sendhil/noun_classes.png. Nouns ending in a consonant
    fall into classes 1 to 3, all other nouns into classes 4 to 9.

    :param word: the noun to classify (wrapped in a TamilWord here)
    :returns: the noun class, an integer from 1 to 9
    """
    noun = TamilWord(word)
    last = noun[-1]

    ## Nouns ending in a consonant
    if TamilLetter.is_consonant(last):
        # ending in ம் -> class 1
        if last == u'ம்':
            return 1
        # ending in ல், ன், ய், ள், ண் AND exactly two graphemes AND
        # first letter is kuril -> class 2
        if (last in (u'ல்', u'ன்', u'ய்', u'ள்', u'ண்')
                and len(noun) == 2
                and TamilLetter.is_kuril(noun[0])):
            return 2
        # all other consonant-ending nouns -> class 3
        return 3

    ## Nouns not ending in a consonant
    ## TODO: What if a noun ends in Aytham? Is this possible?
    ## TODO: Ensure this is tested on single letter nouns (e.g. ஈ)
    _, ending_vowel = TamilLetter.split_combination(last)

    # ends in இ, ஈ or ஐ -> class 4
    if ending_vowel in (u'இ', u'ஈ', u'ஐ'):
        return 4

    # ends in ஆ, ஏ, ஊ or ஓ -> class 5
    if ending_vowel in (u'ஆ', u'ஏ', u'ஊ', u'ஓ'):
        return 5

    # exactly two graphemes AND first letter is kuril -> class 6
    if len(noun) == 2 and TamilLetter.is_kuril(noun[0]):
        return 6

    ## TODO: Is it ok that rules assume the word is >= two graphemes long?
    # ends in று and penultimate grapheme is not a consonant -> class 7
    if last == u'று' and not TamilLetter.is_consonant(noun[-2]):
        return 7

    # ends in டு and penultimate grapheme is not a consonant -> class 8
    if last == u'டு' and not TamilLetter.is_consonant(noun[-2]):
        return 8

    # all other cases -> class 9
    return 9