Example #1
0
def apply_break_iterator(break_iterator: BreakIterator,
                         text: str) -> List[str]:
    """Split *text* at the boundaries produced by *break_iterator*.

    The iterator is pointed at *text* with ``setText`` and then iterated;
    every value it yields is used as the end offset of one slice, the first
    slice starting at offset 0. Each slice is stripped of surrounding
    whitespace and slices that become empty are dropped.

    Returns:
        The non-empty, stripped slices in their original order.
    """
    break_iterator.setText(text)
    # Offset 0 is prepended so consecutive pairs delimit every slice.
    boundaries = [0]
    boundaries.extend(break_iterator)
    stripped = (text[start:end].strip()
                for start, end in zip(boundaries, boundaries[1:]))
    return [piece for piece in stripped if piece]
def gen_khm_words(text: str) -> str:
    """Yield the pieces of *text* lying between consecutive word boundaries.

    A word-level ``BreakIterator`` for the ``"km"`` locale is run over
    *text*. Each yielded item is the slice from the previous boundary
    (initially the value of ``first()``) up to the next boundary the
    iterator reports.

    NOTE(review): this is a generator, so callers receive an iterator of
    ``str`` pieces rather than the single ``str`` the annotation suggests.
    """
    breaker = BreakIterator.createWordInstance(Locale("km"))
    breaker.setText(text)
    left = breaker.first()
    for right in breaker:
        yield text[left:right]
        left = right
Example #3
0
def _gen_words(text: str) -> str:
    """Yield successive slices of *text* cut at ICU word boundaries.

    Uses a word-level ``BreakIterator`` created for the ``"th"`` locale.
    The first slice starts at the offset returned by ``first()``; every
    boundary the iterator yields closes one slice and opens the next.

    NOTE(review): the function is a generator; the ``-> str`` annotation
    describes each yielded item, not the returned object.
    """
    word_breaker = BreakIterator.createWordInstance(Locale("th"))
    word_breaker.setText(text)
    begin = word_breaker.first()
    for stop in word_breaker:
        yield text[begin:stop]
        begin = stop
Example #4
0
def _gen_words(text: str) -> str:
    """Generate the segments of *text* delimited by word boundaries.

    A ``"th"``-locale word ``BreakIterator`` is attached to *text*. Starting
    from ``first()``, each boundary reported by the iterator ends the
    current segment, which is yielded, and becomes the start of the next.

    NOTE(review): although annotated ``-> str``, this is a generator that
    yields ``str`` segments one at a time.
    """
    segmenter = BreakIterator.createWordInstance(Locale("th"))
    segmenter.setText(text)
    seg_start = segmenter.first()
    for seg_end in segmenter:
        yield text[seg_start:seg_end]
        seg_start = seg_end
    def __init__(self, locale="en"):
        """Create the ICU locale and sentence break iterator for *locale*.

        Sets ``self.locale`` to the ``icu.Locale`` built from the (possibly
        extended) locale identifier and ``self.breaker`` to a sentence
        ``BreakIterator`` for that locale.
        """
        from icu import BreakIterator, Locale

        # For these languages ICU ships lists of common abbreviations. The
        # "@ss=standard" keyword is appended so the sentence breaker can use
        # them to ignore false sentence boundaries after abbreviations.
        # (http://userguide.icu-project.org/boundaryanalysis)
        languages_with_suppressions = {"en", "de", "es", "it", "pt"}
        if locale in languages_with_suppressions:
            locale_id = locale + "@ss=standard"
        else:
            locale_id = locale
        self.locale = Locale(locale_id)
        self.breaker = BreakIterator.createSentenceInstance(self.locale)
Example #6
0
 def _compute_char_brkpoints(self):
     """
     Store the character-level break offsets of ``self.unsegmented``.

     A root-locale character BreakIterator is run over ``self.unsegmented``
     (per the original description, its boundaries delimit extended grapheme
     clusters). ``self.char_brkpoints`` becomes a list that starts with 0 and
     continues with every offset the iterator yields, in order.
     """
     grapheme_breaker = BreakIterator.createCharacterInstance(Locale.getRoot())
     grapheme_breaker.setText(self.unsegmented)
     # Offset 0 is stored explicitly; the iterator supplies the rest.
     self.char_brkpoints = [0]
     self.char_brkpoints.extend(grapheme_breaker)
Example #7
0
def divideIntoWords(txt, locale):
    """Return the word-boundary offsets of *txt* for the named *locale*.

    A word ``BreakIterator`` is created for ``Locale.createFromName(locale)``
    and attached to *txt*. The result is the list of values returned by
    successive ``next()`` calls, collected until the iterator raises
    ``StopIteration``.
    """
    breaker = BreakIterator.createWordInstance(Locale.createFromName(locale))
    breaker.setText(txt)
    boundaries = []
    # Exhaustion is signalled by StopIteration, so one handler around the
    # whole loop is sufficient.
    try:
        while True:
            boundaries.append(breaker.next())
    except StopIteration:
        pass
    return boundaries
Example #8
0
def main():
    """Run the break-iterator sample and print its report to stdout.

    A fixed sample string is examined first with a sentence
    ``BreakIterator`` and then with a word ``BreakIterator``, both for the
    US locale. The actual printing of boundaries is delegated to the
    ``printEachForward``, ``printEachBackward``, ``printFirst``,
    ``printLast`` and ``printAt`` helpers.

    Every ``print`` call below takes exactly one string argument. That form
    parses and prints identically on Python 2 (where the parentheses are
    plain grouping) and on Python 3 (where ``print`` is a function), so the
    output matches the former Python 2-only print statements.
    """
    print("ICU Break Iterator Sample Program")
    print("C++ Break Iteration in Python")

    stringToExamine = u"Aaa bbb ccc. Ddd eee fff."
    # Two spaces after the colon reproduce the separator that the old
    # `print "Examining: ", stringToExamine` statement inserted.
    print("Examining:  " + stringToExamine)

    # print each sentence in forward and reverse order
    boundary = BreakIterator.createSentenceInstance(Locale.getUS())
    boundary.setText(stringToExamine)

    print("")
    print("Sentence Boundaries... ")
    print("----- forward: -----------")
    printEachForward(boundary)
    print("----- backward: ----------")
    printEachBackward(boundary)

    # print each word in order
    print("")
    print("Word Boundaries...")
    boundary = BreakIterator.createWordInstance(Locale.getUS())
    boundary.setText(stringToExamine)
    print("----- forward: -----------")
    printEachForward(boundary)
    # print first element
    print("----- first: -------------")
    printFirst(boundary)
    # print last element
    print("----- last: --------------")
    printLast(boundary)
    # print word at charpos 10
    print("----- at pos 10: ---------")
    printAt(boundary, 10)

    print("")
    print("End C++ Break Iteration in Python")
Example #9
0
 def _compute_icu_segmented(self):
     """
     Build the ICU word segmentation of ``self.unsegmented``.

     ``self.unsegmented`` must already be set. A root-locale word
     BreakIterator is run over it; ``self.icu_word_brkpoints`` becomes the
     list of offsets it yields, prefixed with 0. ``self.icu_segmented`` is
     the text with a "|" before the first segment and after every segment.
     """
     word_breaker = BreakIterator.createWordInstance(Locale.getRoot())
     word_breaker.setText(self.unsegmented)
     self.icu_word_brkpoints = [0]
     self.icu_word_brkpoints.extend(word_breaker)
     # Consecutive breakpoints delimit one segment each.
     spans = zip(self.icu_word_brkpoints, self.icu_word_brkpoints[1:])
     self.icu_segmented = "|" + "".join(
         [self.unsegmented[start:end] + "|" for start, end in spans])
Example #10
0
	def endElement(self, name):
		"""Handle the end of an element and forward the event downstream.

		When the closing element is ``Unicode``, the text held in
		``self.__uniText`` is cut at word boundaries, every piece is passed
		through ``processToken``, and the concatenated result is sent to the
		downstream handler's ``characters`` before ``endElement`` is
		forwarded. Any other element name is forwarded unchanged.
		"""
		if name == u"Unicode":
			self.__isUni = False
			# NOTE(review): "utf-8" looks like an encoding name rather than a
			# locale identifier; confirm which locale is intended here.
			word_breaker = BreakIterator.createWordInstance(
				Locale.createFromName("utf-8"))
			word_breaker.setText(self.__uniText)
			pieces = []
			start = 0
			# next() raises StopIteration once the boundaries are exhausted,
			# which is what terminates the scan.
			try:
				while True:
					end = word_breaker.next()
					pieces.append(self.__uniText[start:end])
					start = end
			except StopIteration:
				pass
			converted = u"".join([processToken(piece) for piece in pieces])
			self.__downstream.characters(converted)
		self.__downstream.endElement(name)
    def character_tokenize(self, word):
        """ Splits a word into its character-level pieces.
        
        Arguments:
            word {string} -- word to be tokenized in character level.
        
        Returns:
            [list] -- slices of ``word`` between consecutive boundaries
            reported by a character BreakIterator, in order.
        """

        char_breaker = BreakIterator.createCharacterInstance(Locale())
        char_breaker.setText(word)
        # Offset 0 opens the first slice; each yielded boundary closes one.
        boundaries = [0]
        boundaries.extend(char_breaker)
        return [word[start:end]
                for start, end in zip(boundaries, boundaries[1:])]
Example #12
0
 def endElement(self, name):
     """Process the end of an element, then pass the event downstream.

     For a closing ``Unicode`` element the text in ``self.__uniText`` is
     split at word boundaries, each token is run through ``processToken``
     and the joined output is delivered to the downstream handler via
     ``characters``. ``endElement`` is forwarded downstream in every case.
     """
     if name == u"Unicode":
         self.__isUni = False
         # NOTE(review): "utf-8" looks like an encoding name rather than a
         # locale identifier; confirm which locale is intended here.
         segmenter = BreakIterator.createWordInstance(
             Locale.createFromName("utf-8"))
         segmenter.setText(self.__uniText)
         tokens = []
         token_start = 0
         # The scan ends when next() raises StopIteration.
         try:
             while True:
                 token_end = segmenter.next()
                 tokens.append(self.__uniText[token_start:token_end])
                 token_start = token_end
         except StopIteration:
             pass
         processed = u"".join([processToken(token) for token in tokens])
         self.__downstream.characters(processed)
     self.__downstream.endElement(name)
Example #13
0
 def __init__(self):
     """Create a word-level break iterator for the 'ar' locale.

     The iterator is stored on the instance as ``self.BreakIterator``.
     """
     arabic_locale = Locale.createFromName('ar')
     self.BreakIterator = BreakIterator.createWordInstance(arabic_locale)
Example #14
0
 def __init__(self):
     """Set up the "tr" locale and a word break iterator built from it.

     Stores the locale as ``self.locale`` and the iterator as
     ``self.breakor``.
     """
     turkish_locale = Locale("tr")
     self.locale = turkish_locale
     self.breakor = BreakIterator.createWordInstance(turkish_locale)
 def __init__(self, lang: str = 'en'):
     """Prepare a sentence splitter for the language code *lang*.

     Stores the code as ``self.lang``, the ICU ``Locale`` built from it as
     ``self.locale`` and a sentence-level ``BreakIterator`` for that locale
     as ``self.break_iterator``.
     """
     self.lang = lang
     sentence_locale = Locale(lang)
     self.locale = sentence_locale
     self.break_iterator = BreakIterator.createSentenceInstance(
         sentence_locale)
Example #16
0
 def _get_breaker(self, locale):
     """Return a new word-level ``BreakIterator`` for *locale*."""
     word_breaker = BreakIterator.createWordInstance(locale)
     return word_breaker
Example #17
0
 def __init__(self):
     """Attach a word break iterator for the 'ar' locale to the instance.

     The iterator is kept in the ``self.BreakIterator`` attribute.
     """
     locale_ar = Locale.createFromName('ar')
     self.BreakIterator = BreakIterator.createWordInstance(locale_ar)
Example #18
0
 def __init__(self, locale='en'):
     """Initialise the base class with *locale* and create a word breaker.

     ``self.breaker`` is a word-level ``BreakIterator`` built from
     ``self.locale`` (presumably set by the base initialiser, which is not
     visible here).
     """
     super(WordTokenizer, self).__init__(locale)
     word_breaker = BreakIterator.createWordInstance(self.locale)
     self.breaker = word_breaker
Example #19
0
 def __init__(self, locale='en'):
     """Initialise the base class with *locale* and create a sentence breaker.

     ``self.breaker`` is a sentence-level ``BreakIterator`` built from
     ``self.locale`` (presumably set by the base initialiser, which is not
     visible here).
     """
     super(SentenceTokenizer, self).__init__(locale)
     sentence_breaker = BreakIterator.createSentenceInstance(self.locale)
     self.breaker = sentence_breaker
Example #20
0
 def __init__(self, locale='en'):
   """Run the base initialiser for *locale*, then build a sentence breaker.

   The sentence-level ``BreakIterator`` is created from ``self.locale``
   (presumably assigned by the base class; not visible here) and stored as
   ``self.breaker``.
   """
   super(SentenceTokenizer, self).__init__(locale)
   sentence_breaker = BreakIterator.createSentenceInstance(self.locale)
   self.breaker = sentence_breaker
Example #21
0
 def __init__(self, locale='en'):
   """Run the base initialiser for *locale*, then build a word breaker.

   The word-level ``BreakIterator`` is created from ``self.locale``
   (presumably assigned by the base class; not visible here) and stored as
   ``self.breaker``.
   """
   super(WordTokenizer, self).__init__(locale)
   word_breaker = BreakIterator.createWordInstance(self.locale)
   self.breaker = word_breaker
Example #22
0
 def _get_breaker(self, locale):
     """Return a new sentence-level ``BreakIterator`` for *locale*."""
     sentence_breaker = BreakIterator.createSentenceInstance(locale)
     return sentence_breaker