Code example #1
 def test_ngrams(self):
     """Tokenizing with ngrams=2 yields consecutive word pairs (bigrams)."""
     bigrams = pytextparser.word_tokenize(text="foo bar bomb blar", ngrams=2)
     expected = [("foo", "bar"), ("bar", "bomb"), ("bomb", "blar")]
     assert list(bigrams) == expected
Code example #2
 def test_ignores_numeric(self):
     """Purely numeric tokens ("3") are dropped from the token stream."""
     tokens = pytextparser.word_tokenize(text="one two 3 four")
     assert list(tokens) == [("one",), ("two",), ("four",)]
Code example #3
 def test_min_length(self):
     """Words shorter than min_length characters are filtered out."""
     result = list(
         pytextparser.word_tokenize(text="one for the money two for the go", min_length=4)
     )
     # Only "money" has 4+ characters in the input sentence.
     assert result == [("money",)]
Code example #4
 def test_ignores_stopwords(self):
     """Tokens present in the stopwords set are excluded from the output.

     NOTE(review): "The" in the text is matched by "the" in the set, which
     suggests tokens are lowercased before comparison — confirm in tokenizer.
     """
     stop = {"the", "of", "is"}
     tokens = pytextparser.word_tokenize(
         text="The first rule of python is", stopwords=stop, min_length=1
     )
     assert list(tokens) == [("first",), ("rule",), ("python",)]
Code example #5
 def test_splits_punctuation(self):
     """Punctuation acts as a delimiter and is not kept inside tokens."""
     tokens = pytextparser.word_tokenize(text="first. second")
     expected = [("first",), ("second",)]
     assert list(tokens) == expected
Code example #6
 def test_sentence(self):
     """A plain sentence tokenizes into one single-word tuple per word."""
     expected = [("hello",), ("cruel",), ("world",)]
     assert list(pytextparser.word_tokenize(text="hello cruel world")) == expected