def _compute_char_brkpoints(self):
    """
    Record the extended grapheme cluster boundaries of the unsegmented line.

    An ICU character BreakIterator for the root locale is run over
    `self.unsegmented`. The result is stored in `self.char_brkpoints` as a
    list that starts with 0 and is followed by every boundary the iterator
    yields, in iteration order.
    """
    root_locale = Locale.getRoot()
    grapheme_iterator = BreakIterator.createCharacterInstance(root_locale)
    grapheme_iterator.setText(self.unsegmented)

    # Offset 0 is inserted by hand; the remaining boundaries come from
    # draining the iterator.
    boundaries = [0]
    boundaries.extend(grapheme_iterator)
    self.char_brkpoints = boundaries
def _compute_icu_segmented(self):
    """
    Compute the ICU word segmentation of the unsegmented line.

    `self.unsegmented` must already be set before this is called. An ICU
    word BreakIterator for the root locale is run over it and two
    attributes are written:

    * `self.icu_word_brkpoints` -- a list starting with 0 followed by every
      boundary the iterator yields, in iteration order.
    * `self.icu_segmented` -- the text with a "|" before the first segment
      and after every segment, e.g. "|seg1|seg2|". When the iterator
      yields no boundaries this is just "|".
    """
    words_break_iterator = BreakIterator.createWordInstance(Locale.getRoot())
    words_break_iterator.setText(self.unsegmented)

    brkpoints = [0]
    brkpoints.extend(words_break_iterator)
    self.icu_word_brkpoints = brkpoints

    # NOTE(review): ICU presumably reports boundaries as UTF-16 code unit
    # offsets, while the slices below index Python code points. The two
    # agree for BMP-only text; confirm that inputs never contain
    # supplementary-plane characters.
    text = self.unsegmented
    # Pair each breakpoint with its successor to get the segment bounds, and
    # assemble the result with a single join rather than repeated `+=`.
    pieces = [
        text[start:end] + "|"
        for start, end in zip(brkpoints, brkpoints[1:])
    ]
    self.icu_segmented = "|" + "".join(pieces)