def evaluate():
    """Build a word co-occurrence graph from the corpus, cluster it into
    overlapping groups, derive topics, and print the score against the
    ground-truth topics.
    """
    corpus = get_texts()
    builder = words_graph.SimpleGraphBuilder(
        text_processing.clean_punctuation_and_stopwords, stem_words=False)
    # Alternative builders kept for experimentation:
    #   words_graph.SimpleGraphBuilder(text_processing.only_non_dictionary_words, stem_words=False)
    #   words_graph.WindowGraphBuilder(text_processing.clean_punctuation_and_stopwords, stem_words=False)
    #   words_graph.NounPhraseGraphBuilder(text_processing.only_non_dictionary_words, stem_words=False)
    builder.load_texts(corpus)
    graph = builder.create_graph()

    # Louvain partition is computed here but superseded by the overlapping
    # clustering below; kept to preserve the original execution order.
    partition = community.best_partition(graph)
    #words_by_part = topics.get_words_by_partition(partition)
    clusters = graph_cluster.get_overlap_clusters(graph, 9, 1)
    computed_topics = topics.get_topics_from_partitions(graph, clusters)

    # Word splitter (disabled): would explode multi-word phrases into
    # individual words before scoring.
    # split_topics = []
    # for topic in computed_topics:
    #     flattened = []
    #     for phrase in topic:
    #         flattened.extend(phrase.split(' '))
    #     split_topics.append(flattened)

    print(compute_score(computed_topics, true_topics))
def evaluate():
    """Score computed topics against the ground truth.

    Pipeline: fetch texts -> build word graph -> overlapping clustering ->
    topic extraction -> print score.
    """
    texts = get_texts()
    gb = words_graph.SimpleGraphBuilder(
        text_processing.clean_punctuation_and_stopwords, stem_words=False)
    # Other graph-builder variants tried during experiments:
    #   SimpleGraphBuilder(only_non_dictionary_words), WindowGraphBuilder,
    #   NounPhraseGraphBuilder(only_non_dictionary_words) -- all stem_words=False.
    gb.load_texts(texts)
    G = gb.create_graph()

    # NOTE(review): Louvain result is unused -- the overlapping clusters
    # below replace it. The call is kept so behavior is unchanged.
    partition = community.best_partition(G)
    #words_by_part = topics.get_words_by_partition(partition)
    words_by_part = graph_cluster.get_overlap_clusters(G, 9, 1)
    computed_topics = topics.get_topics_from_partitions(G, words_by_part)

    # Disabled word splitter: breaks each phrase into single words.
    # computed_topics2 = [
    #     [word for phrase in topic for word in phrase.split(' ')]
    #     for topic in computed_topics
    # ]

    print(compute_score(computed_topics, true_topics))
def get_topics_non_dictionary_overlapping(num_news, k, url='http://cnn.com'):
    """Fetch ``num_news`` articles from ``url``, build a graph over
    non-dictionary words only, and run overlapping clustering with ``k``
    clusters.

    Returns the built graph. NOTE(review): the computed clusters are
    currently discarded -- only the graph is returned.
    """
    articles = get_news(url, num_news)
    builder = SimpleGraphBuilder(text_processing.only_non_dictionary_words,
                                 stem_words=False)
    builder.load_texts(articles)
    graph = builder.create_graph()
    print("Graph built")
    words_by_part = graph_cluster.get_overlap_clusters(graph, k, 1)
    #print_topics_from_partitions(graph, words_by_part, 10)
    return graph
def get_topics_noun_phrases_overlapping(num_news, k, url='http://cnn.com'):
    """Fetch ``num_news`` articles from ``url``, build a noun-phrase graph,
    cluster it into ``k`` overlapping groups, and extract topics.

    Returns a ``(graph, topics)`` pair.
    """
    articles = get_news(url, num_news)
    builder = NounPhraseGraphBuilder(
        text_processing.clean_punctuation_and_stopwords)
    builder.load_texts(articles)
    graph = builder.create_graph()
    print("Graph built")
    words_by_part = graph_cluster.get_overlap_clusters(graph, k, 1)
    #print_topics_from_partitions(graph, words_by_part, 10)
    # Local renamed from `topics` to avoid shadowing the module-style name
    # used elsewhere in this file.
    found_topics = get_topics_from_partitions(graph, words_by_part, 10)
    return graph, found_topics
print("Graph built in %.2f sec" % (t2 - t1))

# -- Clustering -----------------------------------------------------------
# MCL variant (disabled):
# ex, r = 2, 2
# tol, threshold = 1e-3, 1e-5
# M = graph_cluster.MCL_cluster(G, ex, r, tol, threshold)
# t3 = time.time()
# print("Graph clustered in %.2f sec" % (t3 - t2))

# Louvain community detection.
partition = community.best_partition(G)
words_by_part = get_words_by_partition(partition)

# Overlapping clustering -- NOTE(review): this deliberately overwrites the
# Louvain grouping computed just above.
words_by_part = graph_cluster.get_overlap_clusters(G, 2, 1)

# To take partitions at a chosen dendrogram level (higher level => smaller
# communities), although usually only ~2 levels exist:
#dendogram = community.generate_dendogram(G)
#partition = community.partition_at_level(dendogram, 0)
#partition = community.partition_at_level(dendogram, 1)

# -- example using noun phrases
print("Graph built in %.2f sec" % (t2 - t1))

# -- Clustering -----------------------------------------------------------
# MCL variant (disabled):
# ex, r = 2, 2
# tol, threshold = 1e-3, 1e-5
# M = graph_cluster.MCL_cluster(G, ex, r, tol, threshold)
# t3 = time.time()
# print("Graph clustered in %.2f sec" % (t3 - t2))

# Louvain community detection.
partition = community.best_partition(G)
words_by_part = get_words_by_partition(partition)

# Overlapping clustering -- NOTE(review): this deliberately overwrites the
# Louvain grouping computed just above.
words_by_part = graph_cluster.get_overlap_clusters(G, 2, 1)

# To take partitions at a chosen dendrogram level (higher level => smaller
# communities), although usually only ~2 levels exist:
#dendogram = community.generate_dendogram(G)
#partition = community.partition_at_level(dendogram, 0)
#partition = community.partition_at_level(dendogram, 1)

# -- example using noun phrases
#
# gb = NounPhraseGraphBuilder(text_processing.clean_punctuation_and_stopwords)
# texts = (article['text'] for article in news.polished())
# gb.load_texts(texts)
# G = gb.create_graph(graphtype='occurence')
#
# partition = community.best_partition(G)