예제 #1
0
    def add_ending(self, ending=u""):
        """ Add an ending to the word, combining vowel-consonant
        pairs if necessary.

        If the ending begins with a vowel, the last element of the word
        must be a consonant: the two are merged through
        TamilLetter.get_combination and only the remainder of the ending
        is appended. Any other ending is appended unchanged, and an empty
        ending leaves the word untouched.

        Raises ValueError if the ending begins with a vowel but the word
        does not end with a consonant. """

        # Nothing to add; this also keeps ending[0] below from raising
        # an IndexError when the method is called with its default
        if not ending:
            return

        # An ending that does not start with a vowel is simply appended
        if not TamilLetter.is_vowel(ending[0]):
            self.word += ending
            return

        # Unless word ends with a consonant, ending cannot start with a vowel
        if not TamilLetter.is_consonant(self.word[-1]):
            raise ValueError(
                """Invalid word-ending pair - ending
                    can only start with a vowel if word ends with consonant"""
            )

        # replace the last letter of the word with the consonant-vowel
        # combination, then copy over the rest of the ending
        # NOTE(review): item assignment needs self.word to be a mutable
        # sequence; on a plain str this line raises TypeError - confirm
        # how self.word is stored
        self.word[-1] = TamilLetter.get_combination(self.word[-1], ending[0])

        # only the remainder is appended: the first letter of the ending
        # is already part of the combination, so adding the full ending
        # here as well would duplicate it
        self.word += ending[1:]
예제 #2
0
    def validate(word=u""):
        """ Check that every character of a word is a valid Tamil letter.

        Accepts either a plain string or a TamilWord, in which case its
        word attribute is checked. Returns True once every character has
        passed TamilLetter.assert_valid_letter; whatever that helper
        raises for a bad character is passed through to the caller. """

        # unwrap TamilWord objects so that only the raw word is inspected
        text = word.word if isinstance(word, TamilWord) else word

        # each character is checked on its own; nothing else is verified yet
        for char in text:
            TamilLetter.assert_valid_letter(char)

        # TODO: implement this method more thoroughly
        # TODO: check for pulli or combination_ending at beginning of word

        return True
예제 #3
0
    def validate(word=''):
        """ Confirm that a word consists solely of valid Tamil letters.

        word may be a string or a TamilWord; for a TamilWord the check is
        run on its word attribute. The result is always True when the
        function returns - an invalid character is reported by
        TamilLetter.assert_valid_letter, not by the return value. """

        # a TamilWord carries the actual text in its word attribute
        is_wrapped = isinstance(word, TamilWord)
        characters = word.word if is_wrapped else word

        # simple test: hand every character to the letter-level check
        for symbol in characters:
            TamilLetter.assert_valid_letter(symbol)

        # TODO: implement this method more thoroughly
        # TODO: check for pulli or combination_ending at beginning of word

        return True
예제 #4
0
    def split_syllables(letters=None):
        """ Returns the syllables in a given word as a list

        letters -- the letters of the word, in order; omitting it (or
                   passing an empty sequence) yields an empty list

        Each vowel and combination starts a syllable of its own, while
        consonants and aytham are appended to the preceding syllable. A
        leading consonant or aytham starts the first syllable itself.

        Raises ValueError if a letter is neither a vowel, consonant,
        combination nor aytham. TamilWord.validate is run on the joined
        letters before any splitting takes place. """

        # None replaces the former mutable [] default; an empty word has
        # no syllables, so there is nothing to validate or split
        if not letters:
            return []

        # ensure that the word is a valid word
        TamilWord.validate("".join(letters))

        syllables = []

        for letter in letters:

            # if letter is a vowel or combination, it gets its own syllable
            if TamilLetter.is_combination(letter) or TamilLetter.is_vowel(letter):
                syllables.append(letter)

            # if letter is a consonant or aytham, add it to the end
            # of the previously-added syllable
            elif TamilLetter.is_consonant(letter) or TamilLetter.is_aytham(letter):

                if syllables:
                    syllables[-1] += letter

                # if the first letter is a consonant (probably b/c it's a
                # loanword), it opens the first syllable on its own
                else:
                    syllables.append(letter)

            # if letter was neither a vowel, a consonant, a combination
            # nor aytham, an unexpected error has occurred
            else:
                # ValueError matches what split_letters raises and is still
                # caught by handlers written for the previous bare Exception
                raise ValueError(
                    """Unknown error: \'%s\' in word %s is neither
                 a vowel, consonant, combination or aytham"""
                    % (letter, "".join(letters))
                )

        return syllables
예제 #5
0
    def split_syllables(letters=None):
        """ Returns the syllables in a given word as a list

        letters -- the letters of the word, in order; omitting it (or
                   passing an empty sequence) yields an empty list

        Each vowel and combination starts a syllable of its own, while
        consonants and aytham are appended to the preceding syllable. A
        leading consonant or aytham starts the first syllable itself.

        Raises ValueError if a letter is neither a vowel, consonant,
        combination nor aytham. TamilWord.validate is run on the joined
        letters before any splitting takes place. """

        # None replaces the former mutable [] default; an empty word has
        # no syllables, so there is nothing to validate or split
        if not letters:
            return []

        # ensure that the word is a valid word
        TamilWord.validate(''.join(letters))

        syllables = []

        for letter in letters:

            # if letter is a vowel or combination, it gets its own syllable
            if TamilLetter.is_combination(letter) or \
                TamilLetter.is_vowel(letter):
                syllables.append(letter)

            # if letter is a consonant or aytham, add it to the end
            # of the previously-added syllable
            elif TamilLetter.is_consonant(letter) or \
                TamilLetter.is_aytham(letter):

                if syllables:
                    syllables[-1] += letter

                # if the first letter is a consonant (probably b/c it's a
                # loanword), it opens the first syllable on its own
                else:
                    syllables.append(letter)

            # if letter was neither a vowel, a consonant, a combination
            # nor aytham, an unexpected error has occurred
            else:
                # ValueError matches what split_letters raises and is still
                # caught by handlers written for the previous bare Exception
                raise ValueError("""Unknown error: \'%s\' in word %s is neither
                 a vowel, consonant, combination or aytham""" %
                                 (letter, ''.join(letters)))

        # hand the collected syllables back to the caller; without this
        # the function would always return None
        return syllables
예제 #6
0
    def split_letters(word=u""):
        """ Break a word into its graphemes (i.e. the Tamil characters)
        and return them as a list.

        The word is first passed to TamilWord.validate. Codepoints that
        are அ combinations, vowels, aytham or whitespace each open a new
        grapheme; combination endings and the pulli are attached to the
        grapheme before them.

        Raises ValueError if a combination ending or pulli comes first in
        the word, or if a codepoint falls into none of the categories
        above. """

        # ensure that the word is a valid word
        TamilWord.validate(word)

        # graphemes collected so far (this is what the caller receives)
        graphemes = []

        # lookup data: all combination endings and all அ combinations
        endings = TamilLetter.get_combination_endings()
        a_forms = TamilLetter.get_combination_column(u"அ").values()

        for char in word:

            # an அ combination, whitespace, a vowel or aytham stands on
            # its own and therefore opens a new grapheme
            opens_grapheme = (
                char in a_forms
                or TamilLetter.is_whitespace(char)
                or TamilLetter.is_vowel(char)
                or TamilLetter.is_aytham(char)
            )
            if opens_grapheme:
                graphemes.append(char)
                continue

            # anything that is not a combination ending or a pulli ('்')
            # at this point is unexpected
            if not (char in endings or char == TamilLetter.get_pulli()):
                raise ValueError(
                    """Unknown error: Codepoint \'%s\' in word %s
                    is neither a vowel, consonant, combination or aytham"""
                    % (char, word)
                )

            # a modifier needs something to attach to; validate()
            # should normally have caught this case already
            if not graphemes:
                raise ValueError(
                    """Unknown error: Combination ending %s
                    cannot be first character of a word"""
                    % (char)
                )

            # attach the modifier to the grapheme added last
            graphemes[-1] += char

        # TODO: Write extensive test cases for this

        return graphemes
예제 #7
0
    def add_ending(self, ending=''):
        """ Add an ending to the word, combining vowel-consonant
        pairs if necessary.

        If the ending begins with a vowel, the last element of the word
        must be a consonant: the two are merged through
        TamilLetter.get_combination and only the remainder of the ending
        is appended. Any other ending is appended unchanged, and an empty
        ending leaves the word untouched.

        Raises ValueError if the ending begins with a vowel but the word
        does not end with a consonant. """

        # Nothing to add; this also keeps ending[0] below from raising
        # an IndexError when the method is called with its default
        if not ending:
            return

        # An ending that does not start with a vowel is simply appended
        if not TamilLetter.is_vowel(ending[0]):
            self.word += ending
            return

        # Unless word ends with a consonant, ending cannot start with a vowel
        if not TamilLetter.is_consonant(self.word[-1]):
            raise ValueError("""Invalid word-ending pair - ending
                    can only start with a vowel if word ends with consonant""")

        # replace the last letter of the word with the consonant-vowel
        # combination, then copy over the rest of the ending
        # NOTE(review): item assignment needs self.word to be a mutable
        # sequence; on a plain str this line raises TypeError - confirm
        # how self.word is stored
        self.word[-1] = TamilLetter.get_combination(self.word[-1],
                                                    ending[0])

        # only the remainder is appended: the first letter of the ending
        # is already part of the combination, so adding the full ending
        # here as well would duplicate it
        self.word += ending[1:]
예제 #8
0
    def split_letters(word=''):
        """ Return the graphemes (i.e. the Tamil characters) of a word
        as a list.

        After TamilWord.validate has accepted the word, its codepoints
        are walked in order: அ combinations, whitespace, vowels and
        aytham become list entries of their own, whereas combination
        endings and the pulli are glued onto the entry before them.

        Raises ValueError when a combination ending or pulli has no
        preceding entry, or when a codepoint matches none of the known
        categories. """

        # ensure that the word is a valid word
        TamilWord.validate(word)

        # result list handed back to the caller
        pieces = []

        # all combination endings, and all அ combinations as a list
        ending_marks = TamilLetter.get_combination_endings()
        a_column = TamilLetter.get_combination_column('அ')
        a_marks = list(a_column.values())

        for mark in word:

            # codepoints that stand alone: அ combination, space, vowel,
            # aytham - each becomes a fresh entry
            if mark in a_marks or \
                TamilLetter.is_whitespace(mark) or \
                TamilLetter.is_vowel(mark) or \
                TamilLetter.is_aytham(mark):
                pieces.append(mark)
                continue

            # from here on only combination endings and the pulli ('்')
            # are acceptable; everything else is an unexpected codepoint
            is_modifier = mark in ending_marks or \
                mark == TamilLetter.get_pulli()
            if not is_modifier:
                raise ValueError("""Unknown error: Codepoint \'%s\' in word %s
                    is neither a vowel, consonant, combination or aytham""" %
                                 (mark, word))

            # a modifier must follow an existing entry; validate()
            # should normally have caught a violation already
            if not pieces:
                raise ValueError("""Unknown error: Combination ending %s
                    cannot be first character of a word""" % (mark))

            # glue the modifier onto the most recent entry
            pieces[-1] += mark

        # TODO: Write extensive test cases for this

        return pieces