class TestAlphabet(unittest.TestCase):
    """Normalization of alphabetic tokens: case folding plus digit/symbol filtering.

    Uses ``self.assertEqual`` instead of bare ``assert`` so failures report
    the differing lists instead of a bare AssertionError.
    """

    def setUp(self):
        # number=False drops all-digit tokens; case="lower" folds letters;
        # min_len=2 removes one-character tokens.
        self.preprocessesor = JpTokenPreprocessing(number=False, case="lower", min_len=2)

    def test_case_lower(self):
        token = ["ab", "ABC", "AbC"]
        expect = ["ab", "abc", "abc"]
        got = list(self.preprocessesor.preprocessing(token))
        self.assertEqual(got, expect)

    def test_case_upper(self):
        self.preprocessesor.case = "upper"
        token = ["ab", "ABC", "AbC"]
        expect = ["AB", "ABC", "ABC"]
        got = list(self.preprocessesor.preprocessing(token))
        self.assertEqual(got, expect)

    def test_case_capitalize(self):
        self.preprocessesor.case = "capitalize"
        token = ["ab", "ABC", "AbC"]
        expect = ["Ab", "Abc", "Abc"]
        got = list(self.preprocessesor.preprocessing(token))
        self.assertEqual(got, expect)

    def test_number(self):
        # All-digit tokens ("12") are removed; mixed tokens survive and are lowered.
        token = ["12", "A1", "1v1"]
        expect = ["a1", "1v1"]
        got = list(self.preprocessesor.preprocessing(token))
        self.assertEqual(got, expect)

    def test_symbol(self):
        # Symbol-only tokens ("!?") are removed; tokens containing letters survive.
        token = ["!?", "A!", "wow!"]
        expect = ["a!", "wow!"]
        got = list(self.preprocessesor.preprocessing(token))
        self.assertEqual(got, expect)
def setUp(self):
    """Build a length-filtering preprocessor and sanity-check min_len behaviour.

    NOTE(review): assertions inside setUp only execute when a test_* method
    exists on the enclosing TestCase — confirm this fragment's class has one.
    """
    self.preprocessesor = JpTokenPreprocessing(min_len=2)
    words = ['a', 'ab']
    # min_len=2 filters out the one-character token.
    assert list(self.preprocessesor.preprocessing(words)) == ['ab']
    # Relaxing min_len to 1 lets every token through.
    self.preprocessesor.min_len = 1
    assert list(self.preprocessesor.preprocessing(words)) == ['a', 'ab']
class TestLength(unittest.TestCase):
    """Filtering of tokens shorter than ``min_len``.

    Bug fix: the original put every assertion inside ``setUp`` and defined no
    ``test_*`` method, so unittest never executed any of it. The checks now
    live in a real test method.
    """

    def setUp(self):
        self.preprocessesor = JpTokenPreprocessing(min_len=2)

    def test_min_len(self):
        # min_len=2 drops single-character tokens.
        token = ['a', 'ab']
        expect = ['ab']
        got = list(self.preprocessesor.preprocessing(token))
        self.assertEqual(got, expect)
        # min_len=1 keeps every token.
        self.preprocessesor.min_len = 1
        token = ['a', 'ab']
        expect = ['a', 'ab']
        got = list(self.preprocessesor.preprocessing(token))
        self.assertEqual(got, expect)
class TestLength(unittest.TestCase):
    """Filtering of tokens shorter than ``min_len``.

    Bug fix: the original held all assertions in ``setUp`` with no ``test_*``
    method, so unittest silently ran nothing. The assertions are moved into a
    proper test method.

    NOTE(review): this class name duplicates an earlier ``TestLength``
    definition in the same file — the later definition shadows the earlier
    one; consider deleting one copy.
    """

    def setUp(self):
        self.preprocessesor = JpTokenPreprocessing(min_len=2)

    def test_min_len(self):
        # min_len=2 drops single-character tokens.
        token = ["a", "ab"]
        expect = ["ab"]
        got = list(self.preprocessesor.preprocessing(token))
        self.assertEqual(got, expect)
        # min_len=1 keeps every token.
        self.preprocessesor.min_len = 1
        token = ["a", "ab"]
        expect = ["a", "ab"]
        got = list(self.preprocessesor.preprocessing(token))
        self.assertEqual(got, expect)
class TestNumber(unittest.TestCase):
    """Numeric-token handling with ``number=True`` (digit-only tokens kept)."""

    def setUp(self):
        self.preprocessesor = JpTokenPreprocessing(number=True, min_len=2)

    def test_number(self):
        # number=True preserves all-digit tokens; letters come out lowercased
        # (apparently the default case handling — no case= was passed).
        token = ['12', 'ab', 'A1', '1v1']
        expect = ['12', 'ab', 'a1', '1v1']
        got = list(self.preprocessesor.preprocessing(token))
        self.assertEqual(got, expect)

    def test_symbol(self):
        # Symbol-only tokens are dropped; digit+symbol mixes survive.
        token = ['!?', '12!', '[1]']
        expect = ['12!', '[1]']
        got = list(self.preprocessesor.preprocessing(token))
        self.assertEqual(got, expect)
class TestNumber(unittest.TestCase):
    """Numeric-token handling with ``number=True`` (digit-only tokens kept).

    NOTE(review): this duplicates an earlier ``TestNumber`` class in the same
    file; the later definition shadows the earlier one.
    """

    def setUp(self):
        self.preprocessesor = JpTokenPreprocessing(number=True, min_len=2)

    def test_number(self):
        # number=True preserves all-digit tokens; letters come out lowercased.
        token = ["12", "ab", "A1", "1v1"]
        expect = ["12", "ab", "a1", "1v1"]
        got = list(self.preprocessesor.preprocessing(token))
        self.assertEqual(got, expect)

    def test_symbol(self):
        # Symbol-only tokens are dropped; digit+symbol mixes survive.
        token = ["!?", "12!", "[1]"]
        expect = ["12!", "[1]"]
        got = list(self.preprocessesor.preprocessing(token))
        self.assertEqual(got, expect)
class TestUnicodeNormalize(unittest.TestCase):
    """NFKC unicode normalization combined with lowercasing.

    The input tokens are presumably full-width forms that NFKC folds to
    ASCII before case folding — confirm against the fixture strings.
    """

    def setUp(self):
        self.preprocessesor = JpTokenPreprocessing(number=True, case='lower', unicode='NFKC', min_len=2)

    def test_alphabet_num(self):
        token = ['123', 'ABC', 'AB1']
        expect = ['123', 'abc', 'ab1']
        got = list(self.preprocessesor.preprocessing(token))
        self.assertEqual(got, expect)

    def test_symbol(self):
        # With symbol=True, symbol-only tokens are kept after normalization.
        self.preprocessesor.symbol = True
        token = ['!?', 'ABC?']
        expect = ['!?', 'abc?']
        got = list(self.preprocessesor.preprocessing(token))
        self.assertEqual(got, expect)
def tokenize(text):
    """Yield the surface form of every noun (名詞) found in *text* by MeCab.

    Bug fix: the original tested ``'名詞' in node.feature``, a substring match
    over the whole comma-separated feature string. That also matches non-noun
    entries whose later fields contain 名詞 (e.g. prefixes tagged
    ``接頭詞,名詞接続``). Comparing only the first CSV field — the part of
    speech — restricts the match to actual nouns.
    """
    tagger = MeCab.Tagger()
    node = tagger.parseToNode(text)
    while node:
        # MeCab's feature is CSV; field 0 is the part-of-speech tag.
        if node.feature.split(',')[0] == '名詞':
            yield node.surface
        node = node.next


if __name__ == '__main__':
    text = """
これは自然言語処理に必須な前処理のためのモジュールです。
形態素解析や、n-gramでトークン化した後のフィルタリング、正規化を補助します。
一語だけのトークンや'1234'のような数字だけのトークン、'!!'のような記号だけのトークンのフィルタリング、
全角文字'PYTHON'の半角化、英単語'Word'の小文字化といった正規化も行えます。
さらに必ず除外したいトークンをストップワードに設定することもできます。
"""
    stopwords = ['これ', 'こと']

    # First pass: raw noun tokens, printed for comparison.
    tokens = tokenize(text)
    print(list(tokens))

    # Second pass: the same tokens run through the full preprocessing pipeline
    # (drop digits/symbols, lowercase, NFKC-normalize, min length 2, stopwords).
    tokens = tokenize(text)
    preprocessor = JpTokenPreprocessing(number=False, symbol=False, case='lower', unicode='NFKC', min_len=2, stopwords=stopwords)
    tokens = preprocessor.preprocessing(tokens)
    print(list(tokens))
def setUp(self):
    """Construct the length-filter preprocessor and verify min_len semantics.

    NOTE(review): assertions in setUp only run if the enclosing TestCase
    defines at least one test_* method — verify that it does.
    """
    self.preprocessesor = JpTokenPreprocessing(min_len=2)
    words = ["a", "ab"]
    # With min_len=2 the one-character token is filtered out.
    assert list(self.preprocessesor.preprocessing(words)) == ["ab"]
    # Lowering min_len to 1 keeps everything.
    self.preprocessesor.min_len = 1
    assert list(self.preprocessesor.preprocessing(words)) == ["a", "ab"]
def setUp(self):
    """Preprocessor that keeps digits, lowercases, and NFKC-normalizes.

    NOTE(review): this variant passes the normalization form as ``str=``
    while sibling versions in this file use ``unicode=`` — confirm which
    keyword the JpTokenPreprocessing constructor actually accepts.
    """
    options = {'number': True, 'case': 'lower', 'str': 'NFKC', 'min_len': 2}
    self.preprocessesor = JpTokenPreprocessing(**options)
# Sample run: raw tokenizer output, then the filtered/normalized output.
# NOTE(review): the triple-quoted strings below are bare no-op string
# statements that transcribe a previous run's output — they are NOT
# executable doctests (no function/module docstring context).
tokens = tokenize(text)
"""
>>> print(list(tokens))
['', '', '言語', '処理', '必須', '前', '処理', 'ため', 'モジュール', '形態素', '解析', 'n', '-', 'gram', 'トー', 'クン', '化', '後', 'フィルタ', 'リング', '正規', '化', '補助', '一語', 'トーク', 'ン', "'", '1234', "'", 'よう', '数字','トー', 'クン', "'!!'", 'よう', '記号', 'トー', 'クン', 'フィルタ', 'リング', '全角', '文字', "'", 'PYTHON', "'", '半角', '化', '英単語', "'", 'Word',"'", '小文字', '化', '正規', '化', '除外', 'トーク', 'ン', 'ストップ', 'ワード', '設定', 'こと']
"""
# The generator above was consumed; tokenize again before preprocessing.
tokens = tokenize(text)
# NOTE(review): this version passes the normalization form as ``str=`` while
# another fragment in this file uses ``unicode=`` — confirm the real keyword.
preprocessor = JpTokenPreprocessing(number=False, symbol=False, case='lower', str='NFKC', min_len=2, stopwords=stopwords)
tokens = preprocessor.preprocessing(tokens)
# Return iterator of tokens. Using list() for print sample.
"""
>>> print(list(tokens))
['言語', '処理', '必須', '処理', 'ため', 'モジュール', '形態素', '解析', 'gram', 'トー', 'クン', 'フィルタ', 'リング', '正規', '補助', '一語', 'トーク', 'よう', '数字', 'トー', 'クン', 'よう', '記号', 'トー', 'クン', 'フィルタ', 'リング', '全角', '文字', 'python', '半角', '英単語', 'word', '小文字', '正規', '除外', 'トーク', 'ストップ', 'ワード', '設定']
"""
def setUp(self):
    """Build a preprocessor that keeps digits, lowercases, and applies NFKC."""
    options = {'number': True, 'case': 'lower', 'unicode': 'NFKC', 'min_len': 2}
    self.preprocessesor = JpTokenPreprocessing(**options)
def setUp(self):
    """Preprocessor that keeps numeric tokens and drops one-character tokens."""
    options = {'number': True, 'min_len': 2}
    self.preprocessesor = JpTokenPreprocessing(**options)
def setUp(self):
    """Preprocessor that drops digit-only tokens, lowercases, min length 2."""
    options = {'number': False, 'case': 'lower', 'min_len': 2}
    self.preprocessesor = JpTokenPreprocessing(**options)
def setUp(self):
    """Preprocessor that drops digit-only tokens, lowercases, min length 2."""
    options = {"number": False, "case": "lower", "min_len": 2}
    self.preprocessesor = JpTokenPreprocessing(**options)
# Sample run: raw tokenizer output, then the filtered/normalized output.
# NOTE(review): the triple-quoted strings below are bare no-op string
# statements that transcribe a previous run's output — they are NOT
# executable doctests.
tokens = tokenize(text)
"""
>>> print(list(tokens))
['', '', '言語', '処理', '必須', '前', '処理', 'ため', 'モジュール', '形態素', '解析', 'n', '-', 'gram', 'トー', 'クン', '化', '後', 'フィルタ', 'リング', '正規', '化', '補助', '一語', 'トーク', 'ン', "'", '1234', "'", 'よう', '数字','トー', 'クン', "'!!'", 'よう', '記号', 'トー', 'クン', 'フィルタ', 'リング', '全角', '文字', "'", 'PYTHON', "'", '半角', '化', '英単語', "'", 'Word',"'", '小文字', '化', '正規', '化', '除外', 'トーク', 'ン', 'ストップ', 'ワード', '設定', 'こと']
"""
# The generator above was consumed; tokenize again before preprocessing.
tokens = tokenize(text)
preprocessor = JpTokenPreprocessing(number=False, symbol=False, case='lower', unicode='NFKC', min_len=2, stopwords=stopwords)
tokens = preprocessor.preprocessing(tokens)
# Return iterator of tokens. Using list() for print sample.
"""
>>> print(list(tokens))
['言語', '処理', '必須', '処理', 'ため', 'モジュール', '形態素', '解析', 'gram', 'トー', 'クン', 'フィルタ', 'リング', '正規', '補助', '一語', 'トーク', 'よう', '数字', 'トー', 'クン', 'よう', '記号', 'トー', 'クン', 'フィルタ', 'リング', '全角', '文字', 'python', '半角', '英単語', 'word', '小文字', '正規', '除外', 'トーク', 'ストップ', 'ワード', '設定']
"""