def apply_break_iterator(break_iterator: BreakIterator, text: str) -> List[str]:
    """Split ``text`` at the boundaries reported by an ICU break iterator.

    Args:
        break_iterator: Break iterator to run over ``text``; its current
            text is replaced through ``setText``.
        text: The string to segment.

    Returns:
        The segments between consecutive boundaries, each stripped of
        surrounding whitespace, with empty segments dropped.
    """
    break_iterator.setText(text)

    # ICU reports boundaries as UTF-16 code-unit offsets, whereas a Python
    # str is indexed by code point.  Slicing the UTF-16 encoding keeps the
    # segments aligned when the text contains characters outside the BMP
    # (e.g. emoji), which occupy two code units but a single str index.
    # "surrogatepass" lets lone surrogates round-trip instead of raising.
    units = text.encode("utf-16-le", "surrogatepass")

    parts = []
    start = 0
    for end in break_iterator:
        part = units[2 * start:2 * end].decode("utf-16-le", "surrogatepass").strip()
        if part:
            parts.append(part)
        start = end
    return parts
def gen_khm_words(text: str) -> str:
    """Yield the pieces of ``text`` between ICU word boundaries ("km" locale).

    This is a generator: every item is the slice of ``text`` between one
    boundary and the next, starting from the position returned by
    ``first()``.
    """
    # NOTE(review): the return annotation says ``str`` although the function
    # yields strings one at a time.
    # NOTE(review): boundaries are used directly as str indices; if ICU
    # reports UTF-16 offsets this mis-slices text with non-BMP characters --
    # confirm.
    breaker = BreakIterator.createWordInstance(Locale("km"))
    breaker.setText(text)

    left = breaker.first()
    for right in breaker:
        piece, left = text[left:right], right
        yield piece
def _gen_words(text: str) -> str:
    """Generate the slices of ``text`` delimited by ICU word boundaries.

    A word break iterator for the "th" locale is run over ``text``; each
    yielded item spans from the previous boundary (initially ``first()``)
    to the next one.
    """
    # NOTE(review): annotated as returning ``str`` but this is a generator.
    # NOTE(review): boundaries index the str directly; verify behaviour for
    # text containing non-BMP characters if ICU offsets are UTF-16 based.
    segmenter = BreakIterator.createWordInstance(Locale("th"))
    segmenter.setText(text)

    begin = segmenter.first()
    for stop in segmenter:
        yield text[begin:stop]
        begin = stop
def __init__(self, locale="en"):
    """Create the ICU locale and sentence break iterator for ``locale``.

    Args:
        locale: Locale identifier passed to ``icu.Locale``; defaults to
            "en".  For "en", "de", "es", "it" and "pt" the suffix
            "@ss=standard" is appended before the locale is built.
    """
    from icu import Locale, BreakIterator

    # For some languages ICU ships lists of common abbreviations that can be
    # used to ignore the false sentence boundaries they would otherwise cause
    # (http://userguide.icu-project.org/boundaryanalysis).
    has_abbreviation_list = locale in {"en", "de", "es", "it", "pt"}
    locale_id = locale + "@ss=standard" if has_abbreviation_list else locale

    self.locale = Locale(locale_id)
    self.breaker = BreakIterator.createSentenceInstance(self.locale)
def _compute_char_brkpoints(self):
    """
    Record the character break positions of ``self.unsegmented``.

    An ICU character BreakIterator (root locale) is run over the unsegmented
    text to identify extended grapheme clusters; ``self.char_brkpoints`` is
    set to 0 followed by every boundary the iterator yields.
    """
    # NOTE(review): positions are stored exactly as ICU reports them; confirm
    # consumers handle non-BMP text if these are UTF-16 offsets.
    grapheme_iter = BreakIterator.createCharacterInstance(Locale.getRoot())
    grapheme_iter.setText(self.unsegmented)

    self.char_brkpoints = [0]
    self.char_brkpoints.extend(grapheme_iter)
def divideIntoWords(txt, locale):
    """Return the word boundaries ICU finds in ``txt``.

    Args:
        txt: Text to analyse.
        locale: Locale name handed to ``Locale.createFromName``.

    Returns:
        A list with every boundary produced by iterating the word break
        iterator over ``txt``, in order.
    """
    loc = Locale.createFromName(locale)
    bi = BreakIterator.createWordInstance(loc)
    bi.setText(txt)
    # Exhausting the iterator is equivalent to calling next() until
    # StopIteration, but does not depend on the Python 2 ``.next()`` method
    # name, so it also works on Python 3.
    return list(bi)
def main():
    """Run the break-iteration demo on a fixed sample string.

    Prints the sentence boundaries (forward and backward) and then the word
    boundaries (forward, first, last, and the one at position 10) of the
    sample text, delegating the actual printing to the ``print*`` helpers.

    Every ``print`` call takes exactly one string so the output is the same
    whether ``print`` is the Python 2 statement or the Python 3 function.
    """
    print("ICU Break Iterator Sample Program")
    print("C++ Break Iteration in Python")

    stringToExamine = u"Aaa bbb ccc. Ddd eee fff."
    # Two spaces after the colon on purpose: the original print statement
    # emitted the label's trailing space plus the item separator.
    print("Examining:  " + stringToExamine)

    # print each sentence in forward and reverse order
    boundary = BreakIterator.createSentenceInstance(Locale.getUS())
    boundary.setText(stringToExamine)

    print("")
    print("Sentence Boundaries... ")
    print("----- forward: -----------")
    printEachForward(boundary)
    print("----- backward: ----------")
    printEachBackward(boundary)

    # print each word in order
    print("")
    print("Word Boundaries...")
    boundary = BreakIterator.createWordInstance(Locale.getUS())
    boundary.setText(stringToExamine)
    print("----- forward: -----------")
    printEachForward(boundary)

    # print first element
    print("----- first: -------------")
    printFirst(boundary)

    # print last element
    print("----- last: --------------")
    printLast(boundary)

    # print word at charpos 10
    print("----- at pos 10: ---------")
    printAt(boundary, 10)

    print("")
    print("End C++ Break Iteration in Python")
def _compute_icu_segmented(self):
    """
    Compute the ICU-segmented version of the line.

    Runs an ICU word BreakIterator (root locale) over ``self.unsegmented``,
    which must therefore already be set.  Stores the boundaries, preceded by
    0, in ``self.icu_word_brkpoints`` and the text with a "|" before, between
    and after the resulting pieces in ``self.icu_segmented``.
    """
    # NOTE(review): boundaries are used directly as str indices; confirm this
    # is correct for non-BMP text if ICU reports UTF-16 offsets.
    words_break_iterator = BreakIterator.createWordInstance(Locale.getRoot())
    words_break_iterator.setText(self.unsegmented)

    self.icu_word_brkpoints = [0]
    self.icu_word_brkpoints.extend(words_break_iterator)

    points = self.icu_word_brkpoints
    pieces = [self.unsegmented[a:b] for a, b in zip(points, points[1:])]
    # The empty strings at both ends make join emit the leading and trailing
    # "|"; with no pieces at all the result is a single "|".
    self.icu_segmented = "|".join([""] + pieces + [""])
def endElement(self, name):
    """Forward an end-element event, flushing buffered "Unicode" text first.

    When ``name`` is "Unicode", the buffered text is split at ICU word
    boundaries, each piece is passed through ``processToken``, and the
    concatenated result is sent downstream as character data.  The
    end-element event itself is then forwarded downstream.
    """
    if name == u"Unicode":
        self.__isUni = False
        # NOTE(review): "utf-8" is an encoding name rather than a locale
        # identifier -- confirm which locale is intended here.
        loc = Locale.createFromName("utf-8")
        bi = BreakIterator.createWordInstance(loc)
        bi.setText(self.__uniText)

        # NOTE(review): boundaries index the text directly; verify for
        # non-BMP characters if ICU offsets are UTF-16 based.
        processed = []
        prev = 0
        # Plain iteration replaces the manual next()/StopIteration loop, so
        # the code no longer depends on the Python 2 ``.next()`` method.
        for ind in bi:
            processed.append(processToken(self.__uniText[prev:ind]))
            prev = ind

        self.__downstream.characters(u"".join(processed))
    self.__downstream.endElement(name)
def character_tokenize(self, word):
    """
    Returns the tokenization in character level.

    Arguments:
        word {string} -- word to be tokenized in character level.

    Returns:
        [list] -- list of characters, one entry per span between
        consecutive boundaries of an ICU character break iterator.
    """
    breaker = BreakIterator.createCharacterInstance(Locale())
    breaker.setText(word)

    # ICU reports boundaries as UTF-16 code-unit offsets, whereas a Python
    # str is indexed by code point.  Slicing the UTF-16 encoding keeps the
    # characters intact when the word contains code points outside the BMP
    # (e.g. emoji), which take two code units but one str index.
    # "surrogatepass" lets lone surrogates round-trip instead of raising.
    units = word.encode("utf-16-le", "surrogatepass")

    chars = []
    start = 0
    for end in breaker:
        chars.append(units[2 * start:2 * end].decode("utf-16-le", "surrogatepass"))
        start = end
    return chars
def __init__(self):
    """Create the ICU word break iterator for the 'ar' locale.

    The iterator is stored on the instance as ``self.BreakIterator``.
    """
    ar_locale = Locale.createFromName('ar')
    self.BreakIterator = BreakIterator.createWordInstance(ar_locale)
def __init__(self):
    """Set up the "tr" locale and its ICU word break iterator.

    Stores the locale as ``self.locale`` and the iterator as
    ``self.breakor``.
    """
    tr_locale = Locale("tr")
    self.locale = tr_locale
    self.breakor = BreakIterator.createWordInstance(tr_locale)
def __init__(self, lang: str = 'en'):
    """Initialise the sentence splitter for ``lang``.

    Args:
        lang: Locale identifier passed to ``Locale``; defaults to 'en'.
            It is kept as ``self.lang``; the locale built from it and the
            ICU sentence break iterator for that locale are stored as
            ``self.locale`` and ``self.break_iterator``.
    """
    icu_locale = Locale(lang)
    self.lang = lang
    self.locale = icu_locale
    self.break_iterator = BreakIterator.createSentenceInstance(icu_locale)
def _get_breaker(self, locale):
    """Return a new ICU word break iterator for ``locale``."""
    word_breaker = BreakIterator.createWordInstance(locale)
    return word_breaker
def __init__(self, locale='en'):
    """Initialise the base class, then install a word break iterator.

    Args:
        locale: Locale identifier forwarded to the parent initialiser;
            defaults to 'en'.
    """
    super(WordTokenizer, self).__init__(locale)
    # Built from self.locale as it stands after the parent initialiser ran.
    word_breaker = BreakIterator.createWordInstance(self.locale)
    self.breaker = word_breaker
def __init__(self, locale='en'):
    """Initialise the base class, then install a sentence break iterator.

    Args:
        locale: Locale identifier forwarded to the parent initialiser;
            defaults to 'en'.
    """
    super(SentenceTokenizer, self).__init__(locale)
    # Built from self.locale as it stands after the parent initialiser ran.
    sentence_breaker = BreakIterator.createSentenceInstance(self.locale)
    self.breaker = sentence_breaker
def _get_breaker(self, locale):
    """Return a new ICU sentence break iterator for ``locale``."""
    sentence_breaker = BreakIterator.createSentenceInstance(locale)
    return sentence_breaker