def test_break_iterator_iter_line02(self):
    # http://userguide.icu-project.org/boundaryanalysis
    #
    # Iterating a UBRK_LINE iterator (locale 'fr_FR') is expected to
    # yield the text split into three pieces: after the hyphen, after
    # the first space, and the remainder including " ?".
    sample = 'Parlez-vous français ?'
    line_breaker = UBreakIterator(UBreakIterator.UBRK_LINE, 'fr_FR')
    line_breaker.set_text(sample)

    pieces = [piece for piece in line_breaker]

    self.assertEqual(['Parlez-', 'vous ', 'français ?'], pieces)
def test_break_iterator_iter_word03(self):
    # https://stackoverflow.com/questions/44507838/breakiterator-not-working-correctly-with-chinese-text
    #
    # Mixed English/Chinese input with a UBRK_WORD iterator (locale
    # 'zh_CN'): every word, space and punctuation mark is expected as
    # its own item, and the Chinese run is expected to be split into
    # multi-character words rather than single characters.
    sample = 'I like to eat apples. 我喜欢吃苹果。'
    english_part = [
        'I', ' ', 'like', ' ', 'to', ' ', 'eat', ' ', 'apples', '.', ' ',
    ]
    chinese_part = ['我', '喜欢', '吃', '苹果', '。']

    word_breaker = UBreakIterator(UBreakIterator.UBRK_WORD, 'zh_CN')
    word_breaker.set_text(sample)
    tokens = [token for token in word_breaker]

    self.assertEqual(english_part + chinese_part, tokens)
def test_break_iterator_get_available(self):
    # Checks count_available() / get_available(index) together:
    # at least one entry must be reported, the first and last valid
    # indices must return something, and the index one past the end
    # must return None.
    count = UBreakIterator.count_available()
    # assertGreater reports the actual count on failure, unlike
    # assertTrue(count > 0) which only says "False is not true".
    self.assertGreater(count, 0)

    # Lowest and highest valid indices.
    self.assertIsNotNone(UBreakIterator.get_available(0))
    self.assertIsNotNone(UBreakIterator.get_available(count - 1))

    # First out-of-range index.
    self.assertIsNone(UBreakIterator.get_available(count))
def test_break_iterator_iter_word01(self):
    # http://userguide.icu-project.org/boundaryanalysis
    # http://unicode.org/cldr/utility/bidi.jsp
    #
    # UBRK_WORD iteration (locale 'en_US') over a sentence containing
    # a currency amount: '1,234.56' is expected to stay one item while
    # '$' and each dot of the trailing '...' are separate items.
    sample = 'Your balance is $1,234.56... I think.'
    word_breaker = UBreakIterator(UBreakIterator.UBRK_WORD, 'en_US')
    word_breaker.set_text(sample)

    tokens = [token for token in word_breaker]

    self.assertEqual(
        [
            'Your', ' ', 'balance', ' ', 'is', ' ',
            '$', '1,234.56', '.', '.', '.', ' ',
            'I', ' ', 'think', '.',
        ],
        tokens,
    )
def test_break_iterator_iter_sentence01(self):
    # http://www.unicode.org/reports/tr29/#Sentence_Boundaries
    # http://unicode.org/cldr/utility/bidi.jsp
    #
    # UBRK_SENTENCE iteration (locale 'en_US') over text with quoted
    # questions: four sentences are expected, each keeping its
    # trailing space except the last.
    sample = (
        "He said, “Are you going?” John shook his head."
        " “Are you going?” John asked."
    )
    sentence_breaker = UBreakIterator(UBreakIterator.UBRK_SENTENCE, 'en_US')
    sentence_breaker.set_text(sample)

    sentences = [sentence for sentence in sentence_breaker]

    self.assertEqual(
        [
            'He said, “Are you going?” ',
            'John shook his head. ',
            '“Are you going?” ',
            'John asked.',
        ],
        sentences,
    )
def test_break_iterator_word01(self):
    # Navigation on an iterator constructed without any text:
    # first() and last() are both expected to report offset 0, while
    # next() and previous() are expected to report -1.
    # NOTE(review): -1 presumably corresponds to ICU's UBRK_DONE
    # sentinel -- confirm against the wrapper.
    bi = UBreakIterator(UBreakIterator.UBRK_WORD, 'en_US')

    # Call order matters: each call moves the iterator's position.
    # All assertions use (expected, actual) order so failure messages
    # are labelled consistently with the rest of the tests.
    self.assertEqual(0, bi.first())
    self.assertEqual(-1, bi.next())
    self.assertEqual(0, bi.last())
    self.assertEqual(-1, bi.previous())
def test_break_iterator_word02(self):
    # Navigation on a UBRK_WORD iterator whose text is passed directly
    # to the constructor. Boundaries and the last digit of each offset:
    #
    # |Make| |haste| |slowly|.|
    # |0....4.5.....0.1......7.8
    word_breaker = UBreakIterator(
        UBreakIterator.UBRK_WORD, 'en_US', 'Make haste slowly.')

    # Call order matters: each call moves the iterator's position.
    position = word_breaker.first()
    self.assertEqual(0, position)

    position = word_breaker.next()
    self.assertEqual(4, position)

    position = word_breaker.last()
    self.assertEqual(18, position)

    position = word_breaker.previous()
    self.assertEqual(17, position)
def test_break_iterator_iter_character01(self):
    # UBRK_CHARACTER iteration (locale 'ja_JP') over hiragana,
    # katakana, ASCII and a code point outside the BMP (U+20000):
    # each is expected to come back as exactly one item.
    expected_chars = ['あ', 'ア', 'a', '\U00020000', '!']
    char_breaker = UBreakIterator(
        UBreakIterator.UBRK_CHARACTER, 'ja_JP', 'あアa\U00020000!')

    chars = [char for char in char_breaker]

    self.assertEqual(expected_chars, chars)