Beispiel #1
0
def calcTfidf(word):
    files = nltk.corpus.movie_reviews.abspaths()
    texts = []
    for f in files: 
        texts.append(nltk.Text(nltk.word_tokenize(raw(f))))
    rank = nltk_funcs.tfidf(word, texts)
    print rank[:5]
Beispiel #2
0
def test1 (word): 
    print "Retrieving docs from online"
    t1 = time.time()
    holmes = nltk_funcs.cleanText('http://www.gutenberg.org/files/1661/1661-h/1661-h.htm')
    # austen = nltk_funcs.cleanText('http://www.gutenberg.org/files/1342/1342-h/1342-h.htm')
    carroll = nltk_funcs.cleanText('http://www.gutenberg.org/files/11/11-h/11-h.htm')
    t2 = time.time()
    print "Done retrieving. Time taken (s): ", t2-t1
    
    print "Ranking..."
    t3 = time.time()
    rank = nltk_funcs.tfidf(word, texts=[holmes, carroll])
    t4 = time.time()
    print "Done ranking. Time taken (s): ", t4-t3
    
    sortedTexts = []
    for tup in rank: 
        r, t = tup
        contexts = nltk_funcs.concordance(t, word)
        for c in contexts: 
            print " Found: ... ", c