예제 #1
0
def svgTextFromCnnUrl(cnnurl, N=8, nGram=2, width=740, height=500):
    """Build an image of the most common n-grams in an article's comments.

    Fetches the posts for ``cnnurl``, computes their most common n-grams,
    and hands the token strings for the selected n-gram size to
    ``svggen.commonTokensListToSVGImage``.  Two timing lines (fetch time
    and parse time) are printed to stdout along the way.

    Args:
        cnnurl: URL passed to ``posts.getPostsFromUrl``.
        N: Passed through to ``stats.mostCommonNGramsFromPosts``
            (presumably the number of n-grams to keep -- confirm there).
        nGram: 1-based selector into the result of
            ``stats.mostCommonNGramsFromPosts``; the default of 2 picks
            index 1 (presumably the bigrams).  An out-of-range value is
            not checked here.
        width: Passed to ``svggen.commonTokensListToSVGImage``.
        height: Passed to ``svggen.commonTokensListToSVGImage``.

    Returns:
        Whatever ``svggen.commonTokensListToSVGImage`` returns
        (presumably SVG markup, judging by the name).
    """
    # print(...) with a single argument behaves identically on Python 2
    # and Python 3, unlike the bare print statement.
    t = time()
    p = posts.getPostsFromUrl(cnnurl)
    print(str(time()-t) + " seconds to fetch " + str(len(p)) + " unfiltered posts from disqus")

    t = time()
    # Only the n-gram size chosen by nGram is rendered (bigrams by default).
    common_ngrams = stats.mostCommonNGramsFromPosts(p, N)
    common_ngrams = stats.nGramsListToTokenStringsList(common_ngrams[nGram - 1])
    print(str(time()-t) + " seconds to parse all the comments")

    return svggen.commonTokensListToSVGImage(common_ngrams, width, height)
예제 #2
0
def commonNGrams(cnnurl, p, N=30, bn=12):
    """Return token strings for comment n-grams that are not common in the article.

    The article's own most common n-grams are used as a stop list: comment
    unigrams and bigrams whose n-gram (element 0 of each entry) appears in
    the corresponding article list are dropped.  Up to ``bn`` bigrams and
    ``N - bn`` unigrams are kept, unigrams first.

    Args:
        cnnurl: URL passed to ``art.articleTextFromURL`` and, when ``p`` is
            None, to ``posts.getPostsFromUrl``.
        p: Posts to analyse, or None to fetch them from ``cnnurl``.
        N: Passed to ``stats.mostCommonNGramsFromPosts``; also the combined
            cap on returned unigrams plus bigrams.
        bn: Maximum number of bigrams kept.  If ``N < bn`` the unigram
            slice bound is negative and trims from the end of the list
            rather than yielding an empty list.

    Returns:
        The result of ``stats.nGramsListToTokenStringsList`` applied to the
        kept unigrams followed by the reweighted bigrams.
    """
    # Constant factor applied to bigram counts (the "fake reweighting").
    bigram_weight = 3

    article_ngrams = stats.mostCommonNGramsFromString(art.articleTextFromURL(cnnurl), 10)
    if p is None:
        p = posts.getPostsFromUrl(cnnurl)
    common_ngrams = stats.mostCommonNGramsFromPosts(p, N)

    # List comprehensions instead of filter()/map(): the results are sliced
    # and concatenated below, which requires real lists on Python 3 as well.
    # NOTE(review): if the article lists hold (ngram, count) pairs like the
    # comment lists appear to, `x[0] not in article_ngrams[i]` never matches
    # and nothing is filtered out -- confirm the shape returned by
    # stats.mostCommonNGramsFromString.
    bigrams = [x for x in common_ngrams[1] if x[0] not in article_ngrams[1]]
    unigrams = [x for x in common_ngrams[0] if x[0] not in article_ngrams[0]]

    # Fake reweighting: multiply the bigram counts by a constant.
    bigrams = [(x[0], x[1] * bigram_weight) for x in bigrams[0:bn]]
    unigrams = unigrams[0:(N - bn)]

    return stats.nGramsListToTokenStringsList(unigrams + bigrams)
예제 #3
0
 def getPostsContaining(token, posts):
     """Return the result of ``thread.findPostsContaining`` for ``token`` in ``posts``.

     Thin adapter that swaps the argument order: this function takes
     ``(token, posts)`` while ``thread.findPostsContaining`` is called with
     ``(posts, token)``.
     """
     matching_posts = thread.findPostsContaining(posts, token)
     return matching_posts