def test_ngrams(self):
    """With ngrams=2, tokenization yields each adjacent word pair."""
    expected = [
        ("foo", "bar"),
        ("bar", "bomb"),
        ("bomb", "blar"),
    ]
    tokens = pytextparser.word_tokenize(text="foo bar bomb blar", ngrams=2)
    assert list(tokens) == expected
def test_ignores_numeric(self):
    """Purely numeric tokens are dropped from the output."""
    tokens = pytextparser.word_tokenize(text="one two 3 four")
    assert list(tokens) == [("one",), ("two",), ("four",)]
def test_min_length(self):
    """Only words meeting min_length survive tokenization."""
    tokens = pytextparser.word_tokenize(
        text="one for the money two for the go",
        min_length=4,
    )
    assert list(tokens) == [("money",)]
def test_ignores_stopwords(self):
    """Words in the stopwords set are excluded, case-insensitively."""
    stop = {"the", "of", "is"}
    tokens = pytextparser.word_tokenize(
        text="The first rule of python is",
        stopwords=stop,
        min_length=1,
    )
    assert list(tokens) == [("first",), ("rule",), ("python",)]
def test_splits_punctuation(self):
    """Punctuation is stripped and does not join adjacent words."""
    tokens = pytextparser.word_tokenize(text="first. second")
    assert list(tokens) == [("first",), ("second",)]
def test_sentence(self):
    """A plain sentence tokenizes into one unigram tuple per word."""
    expected = [("hello",), ("cruel",), ("world",)]
    tokens = pytextparser.word_tokenize(text="hello cruel world")
    assert list(tokens) == expected