def evaluate():

    texts = get_texts()
    gb = words_graph.SimpleGraphBuilder(text_processing.clean_punctuation_and_stopwords, stem_words=False)
    #gb = words_graph.SimpleGraphBuilder(text_processing.only_non_dictionary_words, stem_words=False)
    #gb = words_graph.WindowGraphBuilder(text_processing.clean_punctuation_and_stopwords, stem_words=False)
    #gb = words_graph.NounPhraseGraphBuilder(text_processing.only_non_dictionary_words, stem_words=False)

    gb.load_texts(texts)
    G = gb.create_graph()

    partition = community.best_partition(G)
    #words_by_part = topics.get_words_by_partition(partition)
    words_by_part = graph_cluster.get_overlap_clusters(G, 9, 1)

    computed_topics = topics.get_topics_from_partitions(G, words_by_part)

    #Word splitter
    # computed_topics2 = []
    # for topic in computed_topics:
    #     new_topic = []
    #     for phrase in topic:
    #         new_topic.extend(phrase.split(' '))
    #     computed_topics2.append(new_topic)

    print compute_score(computed_topics, true_topics)
# Example #2
# 0
def evaluate():
    """Build a word co-occurrence graph from the corpus, cluster it into
    overlapping word groups, extract topics from the clusters, and print
    their score against the reference topics.

    NOTE(review): ``true_topics`` is not defined in this function — presumably
    a module-level constant; confirm it exists where this runs.
    """
    texts = get_texts()
    gb = words_graph.SimpleGraphBuilder(
        text_processing.clean_punctuation_and_stopwords, stem_words=False)
    # Alternative graph builders kept for experimentation:
    #gb = words_graph.SimpleGraphBuilder(text_processing.only_non_dictionary_words, stem_words=False)
    #gb = words_graph.WindowGraphBuilder(text_processing.clean_punctuation_and_stopwords, stem_words=False)
    #gb = words_graph.NounPhraseGraphBuilder(text_processing.only_non_dictionary_words, stem_words=False)

    gb.load_texts(texts)
    G = gb.create_graph()

    # Louvain partition is computed but its result is superseded below by the
    # overlapping clustering; kept as in the original experiment.
    partition = community.best_partition(G)
    #words_by_part = topics.get_words_by_partition(partition)
    words_by_part = graph_cluster.get_overlap_clusters(G, 9, 1)

    computed_topics = topics.get_topics_from_partitions(G, words_by_part)

    # Word splitter (disabled): break multi-word phrases into single words.
    # computed_topics2 = []
    # for topic in computed_topics:
    #     new_topic = []
    #     for phrase in topic:
    #         new_topic.extend(phrase.split(' '))
    #     computed_topics2.append(new_topic)

    print compute_score(computed_topics, true_topics)
# Example #3
# 0
def get_topics_non_dictionary_overlapping(num_news, k, url='http://cnn.com'):
    """Fetch ``num_news`` articles from ``url``, build a graph over
    non-dictionary words only, and compute ``k`` overlapping clusters.

    Returns the graph ``G``.

    NOTE(review): ``words_by_part`` is computed but never returned or
    printed — presumably inspected interactively or a leftover; confirm
    whether the clustering call can be dropped.
    """
    texts = get_news(url, num_news)

    gb = SimpleGraphBuilder(text_processing.only_non_dictionary_words, stem_words=False)
    gb.load_texts(texts) 
    G = gb.create_graph()
    print "Graph built"

    words_by_part = graph_cluster.get_overlap_clusters(G, k, 1)

    #print_topics_from_partitions(G, words_by_part, 10)

    return G
# Example #4
# 0
def get_topics_noun_phrases_overlapping(num_news, k, url='http://cnn.com'):

    texts = get_news(url, num_news)

    gb = NounPhraseGraphBuilder(text_processing.clean_punctuation_and_stopwords)
    gb.load_texts(texts)
    G = gb.create_graph()
    print "Graph built"

    words_by_part = graph_cluster.get_overlap_clusters(G, k, 1)

    #print_topics_from_partitions(G, words_by_part, 10)

    topics = get_topics_from_partitions(G, words_by_part, 10)

    return G, topics
# NOTE(review): top-level script fragment — ``G``, ``t1`` and ``t2`` are
# assumed to be defined on earlier, unseen lines (graph build with timing).
print "Graph built in %.2f sec" %(t2-t1)

# Markov Cluster (MCL) experiment, disabled:
# ex = 2
# r = 2
# tol = 1e-3
# threshold = 1e-5
# M = graph_cluster.MCL_cluster(G,ex,r,tol,threshold)
# t3 = time.time()
# print "Graph clustered in %.2f sec" %(t3-t2)

# LOUVAIN community detection
partition = community.best_partition(G)
words_by_part = get_words_by_partition(partition)
# OVERLAPPING clustering — NOTE(review): this immediately overwrites the
# Louvain result above, so only the overlapping clusters are used downstream;
# confirm the Louvain pass is intentional.
words_by_part = graph_cluster.get_overlap_clusters(G, 2, 1)

# In order to get partitions in a given level of the dendogram (bigger level, smaller communities)
# although it seems that there are only usually 2 levels...
#dendogram = community.generate_dendogram(G)
#partition = community.partition_at_level(dendogram, 0)
#partition = community.partition_at_level(dendogram, 1)




# -- example using noun phrases
# Example #6
# 0
# NOTE(review): top-level script fragment — ``G``, ``t1`` and ``t2`` come from
# earlier, unseen lines (graph construction with timing).
print "Graph built in %.2f sec" % (t2 - t1)

# Markov Cluster (MCL) experiment, disabled:
# ex = 2
# r = 2
# tol = 1e-3
# threshold = 1e-5
# M = graph_cluster.MCL_cluster(G,ex,r,tol,threshold)
# t3 = time.time()
# print "Graph clustered in %.2f sec" %(t3-t2)

# LOUVAIN community detection
partition = community.best_partition(G)
words_by_part = get_words_by_partition(partition)
# OVERLAPPING clustering — NOTE(review): overwrites the Louvain result above,
# so only the overlapping clusters are used from here on; confirm intent.
words_by_part = graph_cluster.get_overlap_clusters(G, 2, 1)

# In order to get partitions in a given level of the dendogram (bigger level, smaller communities)
# although it seems that there are only usually 2 levels...
#dendogram = community.generate_dendogram(G)
#partition = community.partition_at_level(dendogram, 0)
#partition = community.partition_at_level(dendogram, 1)

# -- example using noun phrases (disabled):
#
# gb = NounPhraseGraphBuilder(text_processing.clean_punctuation_and_stopwords)
# texts = (article['text'] for article in news.polished())
# gb.load_texts(texts)
# G = gb.create_graph(graphtype='occurence')
#
# partition = community.best_partition(G)