def make_corpus():
    """Build and persist the interest corpus.

    Initializes utils, collects every known interest, then drives the
    MyCorpus pipeline: map interests to their articles, build the
    dictionary, and write the corpus out.
    """
    utils.init()
    all_interests = list(utils.get_all_interests())
    corpus = MyCorpus(all_interests)
    corpus.build_interests_to_articles()
    corpus.build_dict()
    corpus.write_corpus()
def main():
    """Entry point: build the article adjacency matrix and persist it
    along with the id-to-index mapping."""
    utils.init()
    # interests = set(list(utils.get_all_interests())[:50])  # debug subset
    all_interests = utils.get_all_interests()
    adjacency = build_article_adjacencies(all_interests)
    write_matrix(adjacency)
    write_ids_to_indexes()
def main():
    """Compute article adjacencies over all interests and write the
    matrix plus the ids-to-indexes mapping to disk."""
    utils.init()
    # interests = set(list(utils.get_all_interests())[:50])  # debug subset
    interest_set = utils.get_all_interests()
    result_matrix = build_article_adjacencies(interest_set)
    write_matrix(result_matrix)
    write_ids_to_indexes()
def print_interest_subclusters(): for i in utils.get_all_interests(): print i g = make_interest_graph(i) for j in g['map']: print '\t\t%s:' % j for k in g['map'][j]: print '\t\t\t%s' % k print
def test_sample_interest_graph(): for i in random.sample(utils.get_all_interests(), 100): print "=" * 80 print print "results for ", i make_full_interest_graph(i) print print print
def test_sample_interest_graph(): for i in random.sample(utils.get_all_interests(), 100): print '=' * 80 print print 'results for ', i make_full_interest_graph(i) print print print
def describe_lda():
    """Load the trained LDA model and, for a random sample of 50
    interests, print the topics the model assigns to each interest's
    document along with each topic's top-10 terms.

    Fixes vs. original: the inner term loop no longer shadows the
    builtin `id`; the unused `score` local is gone; the sort + reverse
    pair is a single descending sort; dead commented-out code removed.
    """
    utils.init()
    model = gensim.models.ldamodel.LdaModel.load('svd/lda.txt')

    def article_name(article_id):
        # Titles may be missing or contain non-ASCII; degrade gracefully.
        name = utils.get_article_name(article_id)
        return name.encode('ascii', 'ignore') if name else 'unknown'

    dictionary = model.id2word
    interests = list(utils.get_all_interests())
    for interest in random.sample(interests, 50):
        article_id1 = utils.get_article_id_for_interest(interest)
        if not article_id1:
            continue  # interest has no backing article; skip it
        doc = make_doc(interest, dictionary)
        doc_lda = model[doc]
        # Highest-scoring topics first (single sort instead of sort+reverse).
        doc_lda.sort(key=lambda pair: pair[1], reverse=True)
        sys.stdout.write('topics for %s (article %s):\n' % (interest.text, article_name(article_id1)))
        for (topic_id, topic_score) in doc_lda:
            sys.stdout.write('\t%.6f topic %d:' % (topic_score, topic_id))
            topic = model.state.get_lambda()[topic_id]
            topic = topic / topic.sum()  # normalize to probability dist
            # Top 10 terms of this topic by probability.
            for term_id in numpy.argsort(topic)[::-1][:10]:
                term_article_id = model.id2word[term_id]
                sys.stdout.write(', ' + article_name(term_article_id))
            sys.stdout.write('\n')
import collections import logging import math import pymongo import random import re import sys import users import utils logging.basicConfig(level=logging.INFO) utils.init() #interests = set(list(utils.get_all_interests())[:250]) interests = utils.get_all_interests() sims = utils.get_correlation_matrix5(interests) for i1, i1_sims in sims.items(): for i2, sim in i1_sims.items(): if sim >= 0.003: print '%s=%s %s=%s %s' % (i1.id, i1.text, i2.id, i2.text, sim)