Code Example #1
    def test_baseform_normalizer(self):
        tokenizer = Tokenizer(lang="en")
        text = "goes playing"
        tokens = tokenizer.transform([text])
        normalized = tfm.BaseFormNormalizer().transform(tokens)[0]
        # assertTrue(a, b) only checks that a is truthy (b is the failure
        # message); the intended check here is equality.
        self.assertEqual("go", normalized[0])
        self.assertEqual("play", normalized[1])
Code Example #2
    def test_ja_tokenize(self):
        tokenizer = Tokenizer(lang="ja")
        text = "日本語の形態素解析、マジ卍。"
        tokens = tokenizer.transform(text)

        for t in tokens:
            self.assertFalse(t.is_spacy)  # Japanese goes through the non-spaCy backend
            self.assertTrue(t.is_ja)
            # every token carries a surface form, base form, POS and detailed tag
            self.assertTrue(t.surface)
            self.assertTrue(t.base_form)
            self.assertTrue(t.pos)
            self.assertTrue(t.tag)
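
The attribute set checked above (`surface`, `base_form`, `pos`, `tag`) is the same regardless of which backend produced the token, so downstream code can stay backend-agnostic. A quick inspection sketch under the same assumptions; exact output depends on the installed morphological dictionary:

    tokenizer = Tokenizer(lang="ja")
    for t in tokenizer.transform("日本語の形態素解析、マジ卍。"):
        print(t.surface, t.base_form, t.pos, t.tag)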
Code Example #3
    def test_split_tokenize(self):
        tokenizer = Tokenizer(lang=None)
        text = "Tom goes to a park that Mary is playing."
        tokens = tokenizer.transform(text)

        for t in tokens:
            self.assertFalse(t.is_spacy)  # no language given: neither backend is engaged
            self.assertFalse(t.is_ja)
            self.assertTrue(t.surface)
            self.assertTrue(t.base_form)
            self.assertTrue(t.pos)
            self.assertTrue(t.tag)
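
With `lang=None`, both `is_spacy` and `is_ja` are false, and the test name suggests the tokenizer falls back to simple whitespace splitting. A sketch of that assumption (the split behavior itself is not confirmed by the snippet):

    tokens = Tokenizer(lang=None).transform("Tom goes to a park")
    print([t.surface for t in tokens])  # assumed: one token per whitespace-separated chunk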
Code Example #4
    def test_setter_token(self):
        vocab = Vocabulary()
        text = "you are making the vocabulary"
        words = Tokenizer(lang="en").transform([text])[0]
        vocab.set(words)

        # 5 distinct words + 4 reserved tokens (padding, unknown, begin/end of sequence)
        vocab_size = len(words) + len(["_pad_", "_unk_", "_bos_", "_eos_"])
        self.assertEqual(len(vocab.get()), vocab_size)
        print(vocab.get())
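
The arithmetic holds only because all five surfaces in the sentence are distinct; `Vocabulary` adds four reserved entries on top of the observed words. The same check spelled out, with the reserved-token names taken from the test itself:

    words = Tokenizer(lang="en").transform(["you are making the vocabulary"])[0]
    reserved = ["_pad_", "_unk_", "_bos_", "_eos_"]

    vocab = Vocabulary()
    vocab.set(words)
    # holds only while every word in the input is unique
    assert len(vocab.get()) == len(words) + len(reserved)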
Code Example #5
File: preprocessor.py  Project: thinkzink/chariot
    def __init__(self,
                 tokenizer=None,
                 text_transformers=(),
                 token_transformers=(),
                 vocabulary=None,
                 other_transformers=()):
        self.tokenizer = tokenizer
        # A language code such as "en" may be passed instead of a Tokenizer instance.
        if isinstance(self.tokenizer, str):
            self.tokenizer = Tokenizer(self.tokenizer)
        self.text_transformers = list(text_transformers)
        self.token_transformers = list(token_transformers)
        self.vocabulary = vocabulary
        self.other_transformers = list(other_transformers)
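
The `isinstance` branch means a plain language code can stand in for a fully constructed tokenizer. A construction sketch reusing only the names that appear in the snippets on this page; treat it as illustrative rather than the library's documented API:

    # These two preprocessors end up with an equivalent tokenizer:
    p1 = Preprocessor(tokenizer="en")                   # coerced to Tokenizer("en")
    p2 = Preprocessor(tokenizer=Tokenizer(lang="en"))   # passed through unchanged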
Code Example #6
    def test_number_normalizer(self):
        tokenizer = Tokenizer(lang="en")
        text = "five players of 3 state on 1,000 location"
        tokens = tokenizer.transform([text])
        normalized = tfm.NumberNormalizer().transform(tokens)[0]
        # "3" and "1,000" are rewritten to "0"; the spelled-out "five" is not
        self.assertEqual(2, len([t for t in normalized if t.surface == "0"]))

        tokenizer = Tokenizer(lang="ja")
        text = "百に一つの場所に2人の人がいる"
        tokens = tokenizer.transform([text])
        normalized = tfm.NumberNormalizer().transform(tokens)[0]
        self.assertEqual(2, len([t for t in normalized if t.surface == "0"]))
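
The English assertion expects exactly two "0" surfaces because only the digit-style tokens are rewritten there; the spelled-out "five" passes through. A compact check under the same assumptions as the tests above:

    tokens = Tokenizer(lang="en").transform(["five cats, 3 dogs, 1,000 birds"])
    normalized = tfm.NumberNormalizer().transform(tokens)[0]
    print([t.surface for t in normalized])  # "3" and "1,000" become "0"; "five" is unchanged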
Code Example #7
    def test_stopword_ja(self):
        tokenizer = Tokenizer(lang="ja")
        text = "わたしの形態素解析、マジ卍。"
        tokens = tokenizer.transform([text])
        filtered = tfm.StopwordFilter(lang="ja").transform(tokens)
        # compare the token counts of the first (only) text, not the batch lengths,
        # and use assertEqual: assertTrue(1, ...) always passes
        self.assertEqual(1, len(tokens[0]) - len(filtered[0]))
Code Example #8
    def test_stopword_en(self):
        tokenizer = Tokenizer(lang="en")
        text = "Tom goes to a park that Mary is playing."
        tokens = tokenizer.transform([text])
        filtered = tfm.StopwordFilter(lang="en").transform(tokens)
        # "to", "a", "that" and "is" are filtered out
        self.assertEqual(4, len(tokens[0]) - len(filtered[0]))
Code Example #9
    def test_word_frequency_filter_en(self):
        tokenizer = Tokenizer(lang="en")
        text = "Tom goes to a park that Mary is playing. Tom and Mary is playing tennis in the park."
        tokens = tokenizer.transform([text])
        filtered = tfm.WordFrequencyFilter(n=3, min_freq=1).fit_transform(tokens)[0]
        self.assertEqual(6, len(filtered))
Code Example #10
    def test_word_frequency_filter_ja(self):
        tokenizer = Tokenizer(lang="ja")
        text = "わたしの形態素解析は楽しい。わたしはまだまだ。"
        tokens = tokenizer.transform([text])
        filtered = tfm.WordFrequencyFilter(n=1, min_freq=1).fit_transform(tokens)[0]
        self.assertEqual(4, len(filtered))
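
Taken together, the transformers compose in the obvious way: each one accepts and returns a batch of token lists. A hypothetical end-to-end pass built only from the pieces shown above (`tfm` again assumed to alias `chariot.transformer`; the surviving tokens depend on the installed spaCy model and stopword list):

    texts = ["Tom goes to a park that Mary is playing.",
             "Tom and Mary is playing tennis in the park."]
    tokens = Tokenizer(lang="en").transform(texts)
    no_stop = tfm.StopwordFilter(lang="en").transform(tokens)
    frequent = tfm.WordFrequencyFilter(n=3, min_freq=1).fit_transform(no_stop)
    for doc in frequent:
        print([t.surface for t in doc])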