Example no. 1
def get_topics_non_dictionary_overlapping(num_news, k, url='http://cnn.com'):

    texts = get_news(url, num_news)

    gb = SimpleGraphBuilder(text_processing.only_non_dictionary_words, stem_words=False)
    gb.load_texts(texts) 
    G = gb.create_graph()
    print "Graph built"

    # Overlapping clustering: a word may belong to more than one topic
    words_by_part = graph_cluster.get_overlap_clusters(G, k, 1)

    #print_topics_from_partitions(G, words_by_part, 10)

    return G
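A minimal usage sketch with hypothetical parameters; `get_news`, `SimpleGraphBuilder`, `text_processing`, and `graph_cluster` are the project-local helpers used above, and `create_graph` is assumed to return a networkx graph (as the drawing code in Example no. 2 suggests):

G = get_topics_non_dictionary_overlapping(num_news=20, k=5)
print("%d non-dictionary words in the co-occurrence graph" % G.number_of_nodes())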
Example no. 2
import community  # python-louvain, provides best_partition / modularity
import matplotlib.pyplot as plt
import networkx as nx

def get_topics_by_standard_words(num_news, draw=False, url='http://cnn.com'):

    texts = get_news(url, num_news)

    gb = SimpleGraphBuilder(text_processing.clean_punctuation_and_stopwords)
    gb.load_texts(texts)
    G = gb.create_graph()
    print "Graph built"

    partition = community.best_partition(G)
    words_by_part = get_words_by_partition(partition)

    mod = community.modularity(partition, G)
    print("modularity:", mod)

    print_topics_from_partitions(G, words_by_part, 10)
    if draw:
        values = [partition.get(node) for node in G.nodes()]
        nx.draw_spring(G, cmap=plt.get_cmap('jet'), node_color=values, node_size=30, with_labels=False)
        plt.show()

    return G
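For reference, a hypothetical call; the modularity score and the top topic words are printed as a side effect, and `draw=True` additionally plots the communities with matplotlib:

G = get_topics_by_standard_words(num_news=20, draw=False)
print("%d nodes, %d edges" % (G.number_of_nodes(), G.number_of_edges()))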
Example no. 3
def get_words_by_partition(partition):
    # Invert a node -> community id mapping (as returned by
    # community.best_partition) into community id -> [words]
    words_by_part = {}
    for elem in partition:
        if partition[elem] in words_by_part:
            words_by_part[partition[elem]].append(elem)
        else:
            words_by_part[partition[elem]] = [elem]

    return words_by_part
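
# A toy check of the helper above (hypothetical input: partition maps
# word -> community id, as returned by community.best_partition):
#
#   get_words_by_partition({'apple': 0, 'pear': 0, 'cnn': 1})
#   -> {0: ['apple', 'pear'], 1: ['cnn']}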

import time

t0 = time.time()

news = NewsScraper('http://cnn.com', nthreads=10)
news.pull()
news.scrape(10)
texts = (article['text'] for article in news.polished())

t1 = time.time()
print "Data retrieved in %.2f sec" %(t1-t0)

# Create a graph builder
gb = SimpleGraphBuilder(text_processing.clean_punctuation_and_stopwords)

gb.load_texts(texts)

# Show texts in the builder
# (note: `texts` is a one-shot generator, already consumed by load_texts above;
#  materialize it with list() first if you want to iterate it again)
# for text in texts:
#     print(text)
#     print("##################################################")
#
# print("##################################################")
# print("TOKENIZED SENTENCES")
# print("##################################################")

# Show tokenized sentences (assuming each entry of gb.text_sentences is the
# list of tokenized sentences for one text)
for text in gb.text_sentences[:1]:
    print("##################################################")
    for sentence in text:
        print(sentence)