import pytest

from sumy.nlp.tokenizers import Tokenizer  # assumed import path (sumy-style tokenizer); adjust to this project's layout


def test_tokenize_japanese_sentence():
    tokenizer = Tokenizer('japanese')
    assert tokenizer.language == 'japanese'

    sentence = 'この文章を、正しくトークン化したい。'
    expected = ('この', '文章', 'を', '正しく', 'トークン', '化', 'し', 'たい')
    assert expected == tokenizer.to_words(sentence)

def test_tokenize_chinese_sentence():
    tokenizer = Tokenizer('chinese')
    assert tokenizer.language == 'chinese'

    sentence = '好用的文档自动化摘要程序。'
    expected = ('好用', '的', '文档', '自动化', '摘要', '程序')
    assert expected == tokenizer.to_words(sentence)

def test_tokenize_korean_sentence():
    tokenizer = Tokenizer('korean')
    assert tokenizer.language == 'korean'

    sentence = '대학에서 DB, 통계학, 이산수학 등을 배웠지만...'
    expected = ('대학', '통계학', '이산', '이산수학', '수학', '등')
    assert expected == tokenizer.to_words(sentence)

# The parametrize data below is assumed for illustration (it reuses the
# English example from test_tokenize_sentence) so this test runs as written.
@pytest.mark.parametrize("language, sentence, expected_words", [
    ("english", "I am a very nice sentence with comma, but..",
     ("I", "am", "a", "very", "nice", "sentence", "with", "comma")),
])
def test_tokenize_sentence_to_words(language, sentence, expected_words):
    tokenizer = Tokenizer(language)
    words = tokenizer.to_words(sentence)

    assert words == expected_words
    assert tokenizer.language == language

def test_tokenize_sentence():
    tokenizer = Tokenizer("english")
    words = tokenizer.to_words("I am a very nice sentence with comma, but..")

    expected = (
        "I", "am", "a", "very", "nice", "sentence", "with", "comma",
    )
    assert expected == words

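# Hedged sketch, not part of the original tests: the scratch code below also
# exercises Tokenizer.to_sentences(), so a minimal sentence-splitting check
# could look like this. Only the sentence count is asserted, since the exact
# returned strings depend on the underlying sentence splitter.
def test_tokenize_paragraph_to_sentences():
    tokenizer = Tokenizer("english")
    sentences = tokenizer.to_sentences("I am a sentence. I am another sentence.")
    assert len(sentences) == 2
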
# Scratch code for manual tokenizer experiments; guarded so it does not run
# when pytest imports this module.
if __name__ == "__main__":
    # tokenizer = Tokenizer('japanese')
    # expected = (
    #     '1つ目の文章です。',
    #     'その次は何が来ますか?',
    #     '「2つ目の文章」です。'
    # )
    # paragraph = '1つ目の文章です。その次は何が来ますか? 「2つ目の文章」です。'
    # print(tokenizer.to_sentences(paragraph))
    # for t in tokenizer.to_sentences(paragraph):
    #     print(t)

    # tokenizer = Tokenizer("vietnamese")
    tokenizer = Tokenizer("english")
    words = tokenizer.to_words("he walked to the store yesterday")
    # words = "Anh_ấy xin chào mọi người... Ha ha vui quá? Một con mèo tên shi."
    # words = tokenizer.to_words("Anh_ấy xin chào mọi người. Ha ha vui quá.")
    # sentences = tokenizer.to_sentences(words)
    # print(words)

    for t in words:
        print(t)

    # filename = "doc/document.txt"
    # file = io.open(filename, "r", encoding="utf-8")
    # text = file.read()
    # tokenizer = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')
    # test = tokenizer.tokenize(text)
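
# Hedged sketch reconstructing the commented-out file-tokenization experiment
# above with NLTK's RegexpTokenizer. The path "doc/document.txt" and the regex
# pattern come from that comment; the path may not exist in this repository.
import io
from nltk.tokenize import RegexpTokenizer


def tokenize_document(filename="doc/document.txt"):
    """Read a UTF-8 document and split it into word-like tokens."""
    with io.open(filename, "r", encoding="utf-8") as fp:
        text = fp.read()
    return RegexpTokenizer(r"\w+|\$[\d\.]+|\S+").tokenize(text)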