from sumy.nlp.tokenizers import Tokenizer  # assumed import path for the Tokenizer under test


def test_tokenize_japanese_paragraph():
    tokenizer = Tokenizer('japanese')
    expected = (
        '1つ目の文章です。',
        'その次は何が来ますか?',
        '「2つ目の文章」です。'
    )
    paragraph = '1つ目の文章です。その次は何が来ますか? 「2つ目の文章」です。'

    assert expected == tokenizer.to_sentences(paragraph)
def test_tokenize_korean_paragraph():
    # The Korean sentences carry no terminal punctuation, yet the tokenizer is
    # expected to split them; each expected sentence is a verbatim substring of
    # the input paragraph.
    tokenizer = Tokenizer('korean')
    expected = (
        '회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요',
        '다만, 강남 토끼정이 강남 쉑쉑버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다',
        '강남역 맛집 토끼정의 외부 모습.'
    )
    paragraph = '회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요 다만, 강남 토끼정이 강남 쉑쉑버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다 강남역 맛집 토끼정의 외부 모습.'

    assert expected == tokenizer.to_sentences(paragraph)
def test_tokenize_chinese_paragraph():
    tokenizer = Tokenizer('chinese')
    expected = (
        '我正在为这个软件添加中文支持。',
        '这个软件是用于文档摘要!',
        '这个软件支持网页和文本两种输入格式?'
    )
    paragraph = '我正在为这个软件添加中文支持。这个软件是用于文档摘要!这个软件支持网页和文本两种输入格式?'

    assert expected == tokenizer.to_sentences(paragraph)
def test_tokenize_sentences_with_abbreviations():
    # The abbreviation "e.g." must not be treated as a sentence boundary.
    tokenizer = Tokenizer("english")
    sentences = tokenizer.to_sentences(
        "There are people who are weird, e.g. normal people. These people know you."
    )

    expected = (
        "There are people who are weird, e.g. normal people.",
        "These people know you.",
    )
    assert expected == sentences
def test_tokenize_paragraph():
    tokenizer = Tokenizer("english")
    sentences = tokenizer.to_sentences("""
        I am a very nice sentence with comma, but..
        This is next sentence. "I'm bored", said Pepek.
        Ou jee, duffman is here.
    """)

    expected = (
        "I am a very nice sentence with comma, but..",
        "This is next sentence.",
        '"I\'m bored", said Pepek.',
        "Ou jee, duffman is here.",
    )
    assert expected == sentences
def test_ensure_czech_tokenizer_available():
    tokenizer = Tokenizer("czech")
    assert "czech" == tokenizer.language

    sentences = tokenizer.to_sentences("""
        Měl jsem sen, že toto je sen. Bylo to také zvláštní.
        Jakoby jsem plaval v moři rekurze.
    """)

    expected = (
        "Měl jsem sen, že toto je sen.",
        "Bylo to také zvláštní.",
        "Jakoby jsem plaval v moři rekurze.",
    )
    assert expected == sentences
def test_slovak_alias_into_czech_tokenizer():
    # "slovak" is accepted as an alias of the Czech tokenizer and is still
    # reported back as the configured language.
    tokenizer = Tokenizer("slovak")
    assert tokenizer.language == "slovak"

    sentences = tokenizer.to_sentences("""
        Je to veľmi fajn. Bodaj by nie.
        Ale na druhej strane čo je to oproti inému?
        To nechám na čitateľa.
    """)

    expected = (
        "Je to veľmi fajn.",
        "Bodaj by nie.",
        "Ale na druhej strane čo je to oproti inému?",
        "To nechám na čitateľa.",
    )
    assert expected == sentences
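# Hypothetical addition (not part of the original suite): the language-specific
# paragraph tests above all follow the same build-tokenizer / split / compare
# pattern, so they could be collapsed into one parametrized test. This is only
# a sketch reusing data from the tests above; it assumes nothing beyond
# Tokenizer(language).to_sentences().
import pytest


@pytest.mark.parametrize("language, paragraph, expected", [
    (
        "english",
        "There are people who are weird, e.g. normal people. These people know you.",
        (
            "There are people who are weird, e.g. normal people.",
            "These people know you.",
        ),
    ),
    (
        "chinese",
        "我正在为这个软件添加中文支持。这个软件是用于文档摘要!这个软件支持网页和文本两种输入格式?",
        (
            "我正在为这个软件添加中文支持。",
            "这个软件是用于文档摘要!",
            "这个软件支持网页和文本两种输入格式?",
        ),
    ),
])
def test_to_sentences_parametrized(language, paragraph, expected):
    tokenizer = Tokenizer(language)
    assert expected == tokenizer.to_sentences(paragraph)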