コード例 #1
0
ファイル: test_tokens.py プロジェクト: pombredanne/metanl
def test_tokenize():
    """Check tokenize() on a known sentence and that untokenize() inverts it."""
    # a snippet from Hitchhiker's Guide that just happens to have
    # most of the examples of punctuation we're looking for.
    #
    # TODO: test wacky behavior with "n't" and "cannot" and stuff.
    text1 = "Time is an illusion. Lunchtime, doubly so."
    text2 = (
        '"Very deep," said Arthur, "you should send that in to the '
        "Reader's Digest. They've got a page for people like you.\""
    )
    expected_tokens = [
        "Time", "is", "an", "illusion", ".",
        "Lunchtime", ",", "doubly", "so", ".",
    ]
    eq_(tokenize(text1), expected_tokens)
    eq_(untokenize(tokenize(text1)), text1)

    # Compare the major version as an integer: comparing the raw version
    # strings is lexicographic, so "10.0" >= "3" would be False and the
    # round-trip check below would be silently skipped.
    # A version whose first component is not numeric skips the check.
    major = nltk.__version__.split(".", 1)[0]
    if major.isdigit() and int(major) >= 3:
        eq_(untokenize(tokenize(text2)), text2)
コード例 #2
0
ファイル: allPythonContent.py プロジェクト: Mondego/pyreco
def test_tokenize():
    """Check tokenize() on a known sentence and that untokenize() inverts it."""
    # a snippet from Hitchhiker's Guide that just happens to have
    # most of the examples of punctuation we're looking for.
    #
    # TODO: test wacky behavior with "n't" and "cannot" and stuff.
    text1 = "Time is an illusion. Lunchtime, doubly so."
    text2 = ('"Very deep," said Arthur, "you should send that in to the '
             'Reader\'s Digest. They\'ve got a page for people like you."')
    eq_(tokenize(text1),
        ['Time', 'is', 'an', 'illusion', '.', 'Lunchtime', ',',
         'doubly', 'so', '.']
    )
    eq_(untokenize(tokenize(text1)), text1)

    # Compare the major version as an integer: comparing the raw version
    # strings is lexicographic, so '10.0' >= '3' would be False and the
    # round-trip check below would be silently skipped.
    # A version whose first component is not numeric skips the check.
    major = nltk.__version__.split('.', 1)[0]
    if major.isdigit() and int(major) >= 3:
        eq_(untokenize(tokenize(text2)), text2)
コード例 #3
0
ファイル: allPythonContent.py プロジェクト: Mondego/pyreco
def tag_and_stem(text):
    """
    Tokenize `text`, tag the tokens with nltk.pos_tag, and return a list
    with one (stem, tag, token) triple per token, in order:

    - stem: the result of morphy_stem(token, tag)
    - tag: the part-of-speech tag assigned to the token
    - token: the original token, kept so the text can be rebuilt later
    """
    return [
        (morphy_stem(word, pos), pos, word)
        for word, pos in nltk.pos_tag(tokenize(text))
    ]
コード例 #4
0
ファイル: allPythonContent.py プロジェクト: Mondego/pyreco
def normalize_list(text):
    """
    Get a list of word stems that appear in the text. Stopwords and an initial
    'to' will be stripped, unless this leaves nothing in the stem.

    If no stem passes good_lemma(), the original text is returned as the
    only element. A leading 'to' is dropped only when other stems follow it,
    so the result is never an empty list.

    >>> normalize_list('the dog')
    ['dog']
    >>> normalize_list('big dogs')
    ['big', 'dog']
    >>> normalize_list('the')
    ['the']
    """
    stems = (morphy_stem(word) for word in tokenize(text))
    pieces = [stem for stem in stems if good_lemma(stem)]
    if not pieces:
        # Everything was filtered out; fall back to the raw input.
        return [text]
    # Only strip the leading 'to' when something remains afterwards;
    # an unconditional pieces[1:] would return [] for a lone 'to'.
    if len(pieces) > 1 and pieces[0] == 'to':
        return pieces[1:]
    return pieces