def parse_articles(field): model = load_idf_model(field) print "Extracting..." #articles = 0 for article in articles.find(_limit=400): print "\n\nDOCUMENT", article['article_url'] most = article_terms(model, article) pprint(most[:5])
def load_articles(limit):
    """Yield stored articles as dicts with url, raw text, bigrams, and tokens.

    Articles hosted under spiegel.de/international are skipped.
    limit -- maximum number of rows to pull from the articles table.
    """
    for row in articles.find(_limit=limit):
        url = row['article_url']
        # Skip the English-language Spiegel International section.
        if 'spiegel.de/international' in url:
            continue
        body = row['body_text']
        yield {
            'url': url,
            'text': body,
            'bigrams': list(make_bigrams(body)),
            'tokens': list(tokenize(body)),
        }
def topic_graph(): print "Loading IDF model..." model = load_idf_model('tokens') print "Making a graph..." #articles = 0 edges = defaultdict(int) for article in articles.find(_limit=10000): if 'spiegel.de/international' in article['article_url']: continue terms = article_terms(model, article)[:15] for (term1, score1), (term2, score2) in combinations(terms, 2): key = max(term1, term2), min(term1, term2) edges[key] += score1 * score2 G = nx.Graph() #for (s, d), w in edges.items(): for (s, d), w in sorted(edges.items(), key=lambda (a, b): b, reverse=True)[:20000]: G.add_edge(s, d, weight=w) nx.write_gexf(G, data_path('topic_graph_abridged.gexf'))