def test_tokenize():
    """Check tokenize() output and the tokenize/untokenize round trip.

    text1 is checked both for its exact token list and for round-tripping
    through untokenize(). The text2 round trip (quotation marks and
    apostrophes) is only asserted when NLTK's major version is 3 or later.
    """
    # A snippet from Hitchhiker's Guide that just happens to have
    # most of the examples of punctuation we're looking for.
    #
    # TODO: test wacky behavior with "n't" and "cannot" and stuff.
    text1 = "Time is an illusion. Lunchtime, doubly so."
    text2 = (
        '"Very deep," said Arthur, "you should send that in to the '
        "Reader's Digest. They've got a page for people like you.\""
    )
    eq_(
        tokenize(text1),
        ["Time", "is", "an", "illusion", ".", "Lunchtime", ",", "doubly", "so", "."],
    )
    eq_(untokenize(tokenize(text1)), text1)

    # Compare the major version as an integer. Comparing the raw version
    # strings is lexicographic, so e.g. "10.0" >= "3" would be False and
    # the check below would be silently skipped.
    nltk_major = int(nltk.__version__.split(".")[0])
    if nltk_major >= 3:
        eq_(untokenize(tokenize(text2)), text2)
def test_tokenize():
    """Verify tokenization of a sample sentence and untokenize round trips.

    The first sample must tokenize to an exact list and survive a
    tokenize/untokenize round trip. The second sample, which contains
    quotation marks and apostrophes, is round-tripped only when the
    installed NLTK has a major version of at least 3.
    """
    # A snippet from Hitchhiker's Guide that just happens to have
    # most of the examples of punctuation we're looking for.
    #
    # TODO: test wacky behavior with "n't" and "cannot" and stuff.
    text1 = "Time is an illusion. Lunchtime, doubly so."
    text2 = ('"Very deep," said Arthur, "you should send that in to the '
             'Reader\'s Digest. They\'ve got a page for people like you."')
    expected_tokens = ['Time', 'is', 'an', 'illusion', '.',
                       'Lunchtime', ',', 'doubly', 'so', '.']
    eq_(tokenize(text1), expected_tokens)
    eq_(untokenize(tokenize(text1)), text1)

    # The version gate must be numeric: a plain string comparison orders
    # '10.0' before '3', which would skip this check on such a release.
    major_version = int(nltk.__version__.split('.')[0])
    if major_version >= 3:
        eq_(untokenize(tokenize(text2)), text2)
def test_tokenize():
    """Exercise tokenize() and untokenize() on two punctuation-heavy samples.

    Asserts the exact tokens produced for text1, that text1 round-trips
    through tokenize/untokenize, and -- when NLTK's major version is 3 or
    later -- that text2 round-trips as well.
    """
    # A snippet from Hitchhiker's Guide that just happens to have
    # most of the examples of punctuation we're looking for.
    #
    # TODO: test wacky behavior with "n't" and "cannot" and stuff.
    text1 = "Time is an illusion. Lunchtime, doubly so."
    text2 = ('"Very deep," said Arthur, "you should send that in to the '
             'Reader\'s Digest. They\'ve got a page for people like you."')
    eq_(tokenize(text1), [
        'Time', 'is', 'an', 'illusion', '.',
        'Lunchtime', ',', 'doubly', 'so', '.',
    ])
    eq_(untokenize(tokenize(text1)), text1)

    # Parse the leading version component so the gate is a numeric
    # comparison; comparing strings would treat '10.0' as less than '3'.
    leading_component = nltk.__version__.split('.')[0]
    if int(leading_component) >= 3:
        eq_(untokenize(tokenize(text2)), text2)
def normalize(text):
    """
    Build a single string from the non-stopword word stems of `text`.

    The stems come from normalize_list(); they are then joined back
    together with untokenize(). See normalize_list() for details.
    """
    stems = normalize_list(text)
    return untokenize(stems)