Code example #1
File: topics.py — project: latuji/news-media-topics
def get_topics_non_dictionary_overlapping(num_news, k, url='http://cnn.com'):

    texts = get_news(url, num_news)

    gb = SimpleGraphBuilder(text_processing.only_non_dictionary_words, stem_words=False)
    gb.load_texts(texts) 
    G = gb.create_graph()
    print "Graph built"

    words_by_part = graph_cluster.get_overlap_clusters(G, k, 1)

    #print_topics_from_partitions(G, words_by_part, 10)

    return G
Code example #2
File: topics.py — project: latuji/news-media-topics
def get_topics_by_standard_words(num_news, draw=False, url='http://cnn.com'):

    texts = get_news(url, num_news)

    gb = SimpleGraphBuilder(text_processing.clean_punctuation_and_stopwords)
    gb.load_texts(texts)
    G = gb.create_graph()
    print "Graph built"

    partition = community.best_partition(G)
    words_by_part = get_words_by_partition(partition)

    mod = community.modularity(partition,G)
    print("modularity:", mod)

    print_topics_from_partitions(G, words_by_part, 10)
    if draw:
        values = [partition.get(node) for node in G.nodes()]
        nx.draw_spring(G, cmap = plt.get_cmap('jet'), node_color = values, node_size=30, with_labels=False)
        plt.show()

    return G
Code example #3
# for text in texts:
#     print text
#     print "##################################################"
#
# print "##################################################"
# print  "TOKENIZED SENTENCES"
# print "##################################################"

# Show tokenized sentences
for text in gb.text_sentences[:1]:
    print "##################################################"
    for sentence in text:
        print sentence

# Building graph
G = gb.create_graph()
t2 = time.time()
print "Graph built in %.2f sec" %(t2-t1)

# Clustering
# ex = 2
# r = 2
# tol = 1e-3
# threshold = 1e-5
# M = graph_cluster.MCL_cluster(G,ex,r,tol,threshold)
# t3 = time.time()
# print "Graph clustered in %.2f sec" %(t3-t2)

# LOUVAIN
partition = community.best_partition(G)
words_by_part = get_words_by_partition(partition)
Code example #4
# for text in texts:
#     print text
#     print "##################################################"
#
# print "##################################################"
# print  "TOKENIZED SENTENCES"
# print "##################################################"

# Show tokenized sentences
for text in gb.text_sentences[:1]:
    print "##################################################"
    for sentence in text:
        print sentence

# Building graph
G = gb.create_graph()
t2 = time.time()
print "Graph built in %.2f sec" % (t2 - t1)

# Clustering
# ex = 2
# r = 2
# tol = 1e-3
# threshold = 1e-5
# M = graph_cluster.MCL_cluster(G,ex,r,tol,threshold)
# t3 = time.time()
# print "Graph clustered in %.2f sec" %(t3-t2)

# LOUVAIN
partition = community.best_partition(G)
words_by_part = get_words_by_partition(partition)