# Tokenizer test examples; the Tokenizer here is assumed to be sumy's
# sumy.nlp.tokenizers.Tokenizer, which exposes to_words() and to_sentences().
import unittest

from sumy.nlp.tokenizers import Tokenizer


def test_tokenize_japanese_sentence():
    tokenizer = Tokenizer('japanese')
    assert tokenizer.language == 'japanese'

    sentence = 'この文章を、正しくトークン化したい。'
    expected = ('この', '文章', 'を', '正しく', 'トークン', '化', 'し', 'たい')
    assert expected == tokenizer.to_words(sentence)
def test_tokenize_chinese_sentence():
    tokenizer = Tokenizer('chinese')
    assert tokenizer.language == 'chinese'

    sentence = '好用的文档自动化摘要程序。'
    expected = ('好用', '的', '文档', '自动化', '摘要', '程序')
    assert expected == tokenizer.to_words(sentence)
def test_tokenize_korean_sentence():
    tokenizer = Tokenizer('korean')
    assert tokenizer.language == 'korean'

    sentence = '대학에서 DB, 통계학, 이산수학 등을 배웠지만...'
    expected = ('대학', '통계학', '이산', '이산수학', '수학', '등')
    assert expected == tokenizer.to_words(sentence)
# unittest-style variant of the Japanese test above, wrapped in a TestCase here so it
# runs on its own (in its source project it belongs to a larger test class).
class TestJapaneseTokenizer(unittest.TestCase):
    def test_tokenize_japanese_sentence(self):
        tokenizer = Tokenizer('japanese')
        self.assertEqual(tokenizer.language, 'japanese')

        sentence = 'この文章を、正しくトークン化したい。'
        expected = ('この', '文章', 'を', '正しく', 'トークン', '化', 'し', 'たい')
        self.assertEqual(expected, tokenizer.to_words(sentence))
def test_tokenize_sentence_to_words(language, sentence, expected_words):
    tokenizer = Tokenizer(language)

    words = tokenizer.to_words(sentence)

    assert words == expected_words
    assert tokenizer.language == language
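# The test above relies on pytest injecting language/sentence/expected_words. A minimal
# sketch of how it could be parametrized; the cases below are copied from other examples
# in this file, not necessarily the original suite's list (the Chinese case also needs
# sumy's optional jieba dependency installed).
import pytest

@pytest.mark.parametrize("language, sentence, expected_words", [
    ("english",
     "I am a very nice sentence with comma, but..",
     ("I", "am", "a", "very", "nice", "sentence", "with", "comma")),
    ("chinese",
     "好用的文档自动化摘要程序。",
     ("好用", "的", "文档", "自动化", "摘要", "程序")),
])
def test_tokenize_sentence_to_words_parametrized(language, sentence, expected_words):
    tokenizer = Tokenizer(language)

    words = tokenizer.to_words(sentence)

    assert words == expected_words
    assert tokenizer.language == language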
# unittest-style variant of the Chinese test above, likewise wrapped in a TestCase.
class TestChineseTokenizer(unittest.TestCase):
    def test_tokenize_chinese_sentence(self):
        tokenizer = Tokenizer('chinese')
        self.assertEqual(tokenizer.language, 'chinese')

        sentence = '好用的文档自动化摘要程序。'
        expected = ('好用', '的', '文档', '自动化', '摘要', '程序')
        self.assertEqual(expected, tokenizer.to_words(sentence))
def test_tokenize_sentence():
    tokenizer = Tokenizer("english")
    words = tokenizer.to_words("I am a very nice sentence with comma, but..")

    expected = (
        "I", "am", "a", "very", "nice", "sentence",
        "with", "comma",
    )
    assert expected == words
# tokenizer = Tokenizer('japanese')
# expected = (
#     '1つ目の文章です。',
#     'その次は何が来ますか?',
#     '「2つ目の文章」です。'
# )
# paragraph = '1つ目の文章です。その次は何が来ますか? 「2つ目の文章」です。'
# # print(tokenizer.to_sentences(paragraph))
# for t in tokenizer.to_sentences(paragraph):
#     print(t)
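# A runnable version of the sentence-splitting example commented out above. This is a
# sketch assuming sumy's Japanese support (e.g. the optional tinysegmenter dependency)
# is installed; the commented-out `expected` tuple shows the intended output.
ja_tokenizer = Tokenizer('japanese')
ja_paragraph = '1つ目の文章です。その次は何が来ますか? 「2つ目の文章」です。'
for s in ja_tokenizer.to_sentences(ja_paragraph):
    print(s)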


# tokenizer = Tokenizer("vietnamese")
tokenizer = Tokenizer("english")
sentences = tokenizer.to_words("""he walked to the store yesterday""")
# words = "Anh_ấy xin chào mọi người... Ha ha vui quá? Một con mèo tên shi."
# words = tokenizer.to_words("Anh_ấy xin chào mọi người. Ha ha vui quá.")
# sentences = tokenizer.to_sentences(words)
# print words
for t in sentences:
	print t
# filename = "doc/document.txt"

# file = io.open(filename, "r", encoding="utf-8")
# text = file.read()

# tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
# test = tokenizer.tokenize(text)

# for t in test:
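# A runnable sketch of the commented-out file/regex flow above, assuming the
# RegexpTokenizer is NLTK's (nltk.tokenize.RegexpTokenizer) and that
# "doc/document.txt" exists; the raw-string pattern keeps the backslashes literal.
import io
from nltk.tokenize import RegexpTokenizer

with io.open("doc/document.txt", "r", encoding="utf-8") as doc_file:
    doc_text = doc_file.read()

regexp_tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
for token in regexp_tokenizer.tokenize(doc_text):
    print(token)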