import nltk
from nose.tools import eq_

# tokenize, untokenize, morphy_stem, and good_lemma are assumed to be
# defined earlier in this module.


def test_tokenize():
    # a snippet from Hitchhiker's Guide that just happens to have
    # most of the examples of punctuation we're looking for.
    #
    # TODO: test wacky behavior with "n't" and "cannot" and stuff.
    text1 = "Time is an illusion. Lunchtime, doubly so."
    text2 = (
        '"Very deep," said Arthur, "you should send that in to the '
        "Reader's Digest. They've got a page for people like you.\""
    )
    eq_(tokenize(text1),
        ["Time", "is", "an", "illusion", ".", "Lunchtime", ",",
         "doubly", "so", "."])
    eq_(untokenize(tokenize(text1)), text1)
    # The quoted text only survives a round trip through untokenize()
    # on NLTK 3 and later.
    if nltk.__version__ >= "3":
        eq_(untokenize(tokenize(text2)), text2)


def tag_and_stem(text):
    """
    Returns a list of (stem, tag, token) triples:

    - stem: the word's uninflected form
    - tag: the word's part of speech
    - token: the original word, so we can reconstruct it later
    """
    tokens = tokenize(text)
    tagged = nltk.pos_tag(tokens)
    out = []
    for token, tag in tagged:
        stem = morphy_stem(token, tag)
        out.append((stem, tag, token))
    return out
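
# Illustrative usage (a sketch, not from the original module): the exact
# POS tags depend on which NLTK tagger is installed, so the output below
# is a plausible example rather than a guarantee.
#
#     >>> tag_and_stem('dogs ran')  # doctest: +SKIP
#     [('dog', 'NNS', 'dogs'), ('run', 'VBD', 'ran')]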


def normalize_list(text):
    """
    Get a list of word stems that appear in the text. Stopwords and an
    initial 'to' will be stripped, unless this leaves nothing in the list.

    >>> normalize_list('the dog')
    ['dog']
    >>> normalize_list('big dogs')
    ['big', 'dog']
    >>> normalize_list('the')
    ['the']
    """
    pieces = [morphy_stem(word) for word in tokenize(text)]
    pieces = [piece for piece in pieces if good_lemma(piece)]
    if not pieces:
        return [text]
    if pieces[0] == 'to':
        pieces = pieces[1:]
    return pieces
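
# A hypothetical example of the initial-'to' rule (not among the original
# doctests): the pieces[0] == 'to' check implies an infinitive marker is
# dropped, assuming good_lemma() lets 'to' through the stopword filter.
#
#     >>> normalize_list('to teach')  # doctest: +SKIP
#     ['teach']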