Exemple #1
0
 def _compute_char_brkpoints(self):
     """
     Store the extended grapheme cluster boundaries of `self.unsegmented`.

     An ICU character BreakIterator (root locale) is run over `self.unsegmented`. The result is saved in
     `self.char_brkpoints` as a list that starts with 0 and continues with every boundary the iterator yields,
     in the order they are produced.
     """
     grapheme_iterator = BreakIterator.createCharacterInstance(Locale.getRoot())
     grapheme_iterator.setText(self.unsegmented)
     # Seed the list with 0, then take every boundary produced by the iterator.
     self.char_brkpoints = [0, *grapheme_iterator]
Exemple #2
0
 def _compute_icu_segmented(self):
     """
     Compute the ICU word segmentation of the line from its unsegmented version.

     `self.unsegmented` must already have been computed before calling this. Two attributes are set:

     * `self.icu_word_brkpoints`: a list starting with 0 followed by every boundary yielded by an ICU word
       BreakIterator (root locale) run over `self.unsegmented`.
     * `self.icu_segmented`: a leading "|" followed by each piece of `self.unsegmented` between consecutive
       breakpoints, every piece terminated by "|" (e.g. "|piece1|piece2|"). When the iterator yields no
       boundaries the result is just "|".
     """
     words_break_iterator = BreakIterator.createWordInstance(Locale.getRoot())
     words_break_iterator.setText(self.unsegmented)
     brkpoints = [0, *words_break_iterator]
     self.icu_word_brkpoints = brkpoints

     # NOTE(review): ICU presumably reports boundaries as UTF-16 code unit offsets, whereas Python slices by code
     # point; the two agree only for BMP-only text -- confirm before using inputs with supplementary characters.
     # Each piece carries its own trailing "|" so that an empty breakpoint list still produces exactly "|".
     pieces = [self.unsegmented[start:end] + "|" for start, end in zip(brkpoints, brkpoints[1:])]
     # Join once instead of growing the string with += inside a loop.
     self.icu_segmented = "|" + "".join(pieces)