Exemple #1
0
 def _compute_char_brkpoints(self):
     """
     Record the character (extended grapheme cluster) boundaries of
     ``self.unsegmented`` in ``self.char_brkpoints``.

     An ICU character BreakIterator for the root locale is run over the text.
     The resulting list always starts with 0, followed by every boundary the
     iterator yields, in order.
     """
     # NOTE(review): ICU presumably reports these boundaries as UTF-16
     # code-unit offsets; confirm before using them to index a Python str.
     grapheme_iter = BreakIterator.createCharacterInstance(Locale.getRoot())
     grapheme_iter.setText(self.unsegmented)
     # Seed with the start-of-text boundary, which iteration does not yield.
     self.char_brkpoints = [0]
     self.char_brkpoints.extend(grapheme_iter)
    def character_tokenize(self, word):
        """ Returns the tokenization in character level.

        The word is split at the character (grapheme cluster) boundaries
        reported by an ICU character BreakIterator for the default Locale.

        Arguments:
            word {string} -- word to be tokenized in character level.

        Returns:
            [list] -- list of characters, in order of appearance; empty when
            the iterator reports no boundaries (e.g. for an empty word).
        """
        boundary_iter = BreakIterator.createCharacterInstance(Locale())
        boundary_iter.setText(word)

        # ICU reports boundaries as UTF-16 code-unit offsets, whereas a Python
        # str is indexed by code point. Slicing the str directly goes wrong as
        # soon as the word contains a character outside the BMP (emoji, some
        # CJK ideographs), so slice a UTF-16 view instead: 2 bytes per unit.
        # "surrogatepass" keeps lone surrogates from raising during the
        # round trip.
        utf16 = word.encode("utf-16-le", "surrogatepass")

        chars = []
        start = 0
        for end in boundary_iter:
            piece = utf16[2 * start:2 * end]
            chars.append(piece.decode("utf-16-le", "surrogatepass"))
            start = end

        return chars