def get_topics_non_dictionary_overlapping(num_news, k, url='http://cnn.com'):
    """Build a word graph from non-dictionary words of news articles and
    compute k overlapping clusters.

    Args:
        num_news: number of news articles to fetch from ``url``.
        k: number of overlapping clusters requested from
            ``graph_cluster.get_overlap_clusters``.
        url: news site to scrape articles from (default CNN).

    Returns:
        The graph produced by ``SimpleGraphBuilder.create_graph()``.
    """
    texts = get_news(url, num_news)
    # Keep only words NOT found in the dictionary; no stemming so the raw
    # out-of-vocabulary tokens (names, slang, ...) are preserved.
    gb = SimpleGraphBuilder(text_processing.only_non_dictionary_words, stem_words=False)
    gb.load_texts(texts)
    G = gb.create_graph()
    # print() call form works identically under Python 2 and 3 for a single
    # argument, and matches the call style already used elsewhere in this file.
    print("Graph built")
    # Clusters are computed for their side value; the report below is disabled.
    words_by_part = graph_cluster.get_overlap_clusters(G, k, 1)
    # print_topics_from_partitions(G, words_by_part, 10)
    return G
def get_topics_by_standard_words(num_news, draw=False, url='http://cnn.com'):
    """Build a word graph from cleaned news text, partition it with Louvain,
    and print the top topic words per community.

    Args:
        num_news: number of news articles to fetch from ``url``.
        draw: when True, also render the graph colored by community.
        url: news site to scrape articles from (default CNN).

    Returns:
        The graph produced by ``SimpleGraphBuilder.create_graph()``.
    """
    texts = get_news(url, num_news)
    # Strip punctuation and stopwords before graph construction.
    gb = SimpleGraphBuilder(text_processing.clean_punctuation_and_stopwords)
    gb.load_texts(texts)
    G = gb.create_graph()
    # Normalized to the print() call form: the original mixed a Py2-only
    # print statement with print(...) calls in the same function.
    print("Graph built")
    # Louvain community detection (python-louvain).
    partition = community.best_partition(G)
    words_by_part = get_words_by_partition(partition)
    mod = community.modularity(partition, G)
    print("modularity:", mod)
    print_topics_from_partitions(G, words_by_part, 10)
    if draw:
        # Color each node by its community id.
        values = [partition.get(node) for node in G.nodes()]
        nx.draw_spring(G, cmap=plt.get_cmap('jet'), node_color=values,
                       node_size=30, with_labels=False)
        plt.show()
    return G
# for text in texts: # print text # print "##################################################" # # print "##################################################" # print "TOKENIZED SENTENCES" # print "##################################################" # Show tokenized sentences for text in gb.text_sentences[:1]: print "##################################################" for sentence in text: print sentence # Building graph G = gb.create_graph() t2 = time.time() print "Graph built in %.2f sec" %(t2-t1) # Clustering # ex = 2 # r = 2 # tol = 1e-3 # threshold = 1e-5 # M = graph_cluster.MCL_cluster(G,ex,r,tol,threshold) # t3 = time.time() # print "Graph clustered in %.2f sec" %(t3-t2) # LOUVAIN partition = community.best_partition(G) words_by_part = get_words_by_partition(partition)
# for text in texts: # print text # print "##################################################" # # print "##################################################" # print "TOKENIZED SENTENCES" # print "##################################################" # Show tokenized sentences for text in gb.text_sentences[:1]: print "##################################################" for sentence in text: print sentence # Building graph G = gb.create_graph() t2 = time.time() print "Graph built in %.2f sec" % (t2 - t1) # Clustering # ex = 2 # r = 2 # tol = 1e-3 # threshold = 1e-5 # M = graph_cluster.MCL_cluster(G,ex,r,tol,threshold) # t3 = time.time() # print "Graph clustered in %.2f sec" %(t3-t2) # LOUVAIN partition = community.best_partition(G) words_by_part = get_words_by_partition(partition)