# Assumed context for this excerpt: these tests are unittest.TestCase methods,
# with Tokenizer, Vocabulary, and the transformer module (aliased as `tfm`)
# imported at module level.
def test_baseform_normalizer(self):
    tokenizer = Tokenizer(lang="en")
    text = "goes playing"
    tokens = tokenizer.transform([text])
    normalized = tfm.BaseFormNormalizer().transform(tokens)[0]
    # assertTrue(a, b) treats b as a failure message and passes for any truthy a;
    # assertEqual is what these checks intend.
    self.assertEqual("go", normalized[0])
    self.assertEqual("play", normalized[1])

def test_ja_tokenize(self):
    tokenizer = Tokenizer(lang="ja")
    text = "日本語の形態素解析、マジ卍。"
    tokens = tokenizer.transform([text])[0]
    for t in tokens:
        self.assertFalse(t.is_spacy)
        self.assertTrue(t.is_ja)
        self.assertTrue(t.surface)
        self.assertTrue(t.base_form)
        self.assertTrue(t.pos)
        self.assertTrue(t.tag)

def test_split_tokenize(self):
    # lang=None falls back to simple split tokenization (per the test name)
    tokenizer = Tokenizer(lang=None)
    text = "Tom goes to a park that Mary is playing."
    tokens = tokenizer.transform([text])[0]
    for t in tokens:
        self.assertFalse(t.is_spacy)
        self.assertFalse(t.is_ja)
        self.assertTrue(t.surface)
        self.assertTrue(t.base_form)
        self.assertTrue(t.pos)
        self.assertTrue(t.tag)

def test_setter_token(self):
    vocab = Vocabulary()
    text = "you are making the vocabulary"
    words = Tokenizer(lang="en").transform([text])[0]
    vocab.set(words)
    # the vocabulary holds every observed word plus the four reserved tokens
    vocab_size = len(words) + len(["_pad_", "_unk_", "_bos_", "_eos_"])
    self.assertEqual(len(vocab.get()), vocab_size)

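# Sketch of the expected contents after vocab.set (assumption: the reserved
# tokens are prepended ahead of the observed words; the test above only
# checks the total count, not the order):
#
#   vocab.get() -> ["_pad_", "_unk_", "_bos_", "_eos_",
#                   "you", "are", "making", "the", "vocabulary"]
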
def __init__(self, tokenizer=None, text_transformers=(),
             token_transformers=(), vocabulary=None, other_transformers=()):
    self.tokenizer = tokenizer
    # a language code string is shorthand for a Tokenizer of that language
    if isinstance(self.tokenizer, str):
        self.tokenizer = Tokenizer(self.tokenizer)
    self.text_transformers = list(text_transformers)
    self.token_transformers = list(token_transformers)
    self.vocabulary = vocabulary
    self.other_transformers = list(other_transformers)

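# A minimal usage sketch for the constructor above. The class name
# `Preprocessor` is an assumption (the class definition is not part of this
# excerpt); the transformers shown are the ones exercised by the tests here.
#
#   preprocessor = Preprocessor(
#       tokenizer="en",  # expanded to Tokenizer("en") by the isinstance check
#       token_transformers=[tfm.StopwordFilter(lang="en"),
#                           tfm.NumberNormalizer()],
#       vocabulary=Vocabulary())
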
def test_number_normalizer(self):
    tokenizer = Tokenizer(lang="en")
    text = "five players of 3 state on 1,000 location"
    tokens = tokenizer.transform([text])
    normalized = tfm.NumberNormalizer().transform(tokens)[0]
    self.assertEqual(2, len([t for t in normalized if t.surface == "0"]))

    tokenizer = Tokenizer(lang="ja")
    text = "百に一つの場所に2人の人がいる"
    tokens = tokenizer.transform([text])
    normalized = tfm.NumberNormalizer().transform(tokens)[0]
    self.assertEqual(2, len([t for t in normalized if t.surface == "0"]))

def test_stopword_ja(self):
    tokenizer = Tokenizer(lang="ja")
    text = "わたしの形態素解析、マジ卍。"
    tokens = tokenizer.transform([text])
    filtered = tfm.StopwordFilter(lang="ja").transform(tokens)
    self.assertEqual(1, len(tokens[0]) - len(filtered[0]))

def test_stopword_en(self):
    tokenizer = Tokenizer(lang="en")
    text = "Tom goes to a park that Mary is playing."
    tokens = tokenizer.transform([text])
    filtered = tfm.StopwordFilter(lang="en").transform(tokens)
    self.assertEqual(4, len(tokens[0]) - len(filtered[0]))

def test_word_frequency_filter_en(self):
    tokenizer = Tokenizer(lang="en")
    text = "Tom goes to a park that Mary is playing. Tom and Mary is playing tennis in the park."
    tokens = tokenizer.transform([text])
    filtered = tfm.WordFrequencyFilter(n=3, min_freq=1).fit_transform(tokens)[0]
    self.assertEqual(6, len(filtered))

def test_word_frequency_filter_ja(self):
    tokenizer = Tokenizer(lang="ja")
    text = "わたしの形態素解析は楽しい。わたしはまだまだ。"
    tokens = tokenizer.transform([text])
    filtered = tfm.WordFrequencyFilter(n=1, min_freq=1).fit_transform(tokens)[0]
    self.assertEqual(4, len(filtered))