def make_doc(interest, dictionary):
    """Build a sparse (token id, weight) document for an interest.

    The interest is resolved to an article through
    ``utils.get_article_id_for_interest``. Every article returned by
    ``utils.get_article_similarity_ranks`` that is also a key of
    ``dictionary.token2id`` contributes one entry, weighted by
    ``1 / log2(rank + 5)`` so that smaller rank values weigh more.

    Args:
        interest: the interest to describe.
        dictionary: object exposing a ``token2id`` mapping from article id
            to token id.

    Returns:
        A list of ``(token_id, score)`` tuples, or ``None`` when the
        interest has no associated article.
    """
    source_article = utils.get_article_id_for_interest(interest)
    if not source_article:
        return None

    # NOTE(review): 2000 is presumably the maximum number of similar
    # articles requested -- confirm against utils.
    similar = utils.get_article_similarity_ranks(source_article, 2000)
    token_ids = dictionary.token2id
    log_of_two = math.log(2)
    return [
        (token_ids[other_article], 1.0 / (math.log(rank + 5) / log_of_two))
        for (other_article, rank) in similar.items()
        if other_article in token_ids
    ]
def build_article_adjacencies(interests):
    """Collect, per interest article, its similar articles ordered by rank.

    Each interest is resolved to an article id; interests without one are
    skipped. The article id is converted with ``id_to_index`` and used as
    the key. The value is the list of similar article ids (raw ids, not
    indices) in ascending order of their rank value.

    Args:
        interests: iterable of interests to process.

    Returns:
        A ``collections.defaultdict(list)`` mapping article index to a list
        of similar article ids. Articles with no similar articles get no
        entry. Results for interests that share an article index are
        concatenated.
    """
    article_sims = collections.defaultdict(list)
    for i in interests:
        article_id = utils.get_article_id_for_interest(i)
        if not article_id:
            continue
        index1 = id_to_index(article_id)
        # NOTE(review): 2000 is presumably the maximum number of similar
        # articles requested -- confirm against utils.
        ranks = utils.get_article_similarity_ranks(article_id, 2000)
        # sorted() accepts any iterable, whereas calling .sort() on the
        # result of items() only works where items() returns a list
        # (Python 2); on Python 3 it is a view without a sort method.
        ordered = sorted(ranks.items(), key=lambda pair: pair[1])
        for (article_id2, _rank) in ordered:
            article_sims[index1].append(article_id2)
    return article_sims
def describe_lda():
    """Print the topics of a random sample of interests to stdout.

    Loads the LDA model stored at 'svd/lda.txt', samples up to 50
    interests and, for each one that maps to an article, writes:

    * a header line with the interest text and its article name;
    * one line per topic of the interest's document, highest score first,
      listing the names of the 10 highest-weighted articles of that topic.
    """
    utils.init()
    model = gensim.models.ldamodel.LdaModel.load('svd/lda.txt')

    def article_name(article_id):
        """Return the article's name reduced to ASCII, or 'unknown'."""
        name = utils.get_article_name(article_id)
        if not name:
            return 'unknown'
        # Round-trip through ASCII to drop other characters while still
        # returning text: on Python 3 bare bytes would be rendered as
        # b'...' by '%s' and cannot be concatenated to a str.
        return name.encode('ascii', 'ignore').decode('ascii')

    # NOTE: disabled debugging dump of 50 random topics, kept for reference.
    # print 'information about topics:'
    # for i in random.sample(range(model.num_topics), 50):
    #     print 'topic %d:' % i
    #     topic = model.state.get_lambda()[i]
    #     topic = topic / topic.sum() # normalize to probability dist
    #     for id in numpy.argsort(topic)[::-1][:10]:
    #         score = topic[id]
    #         article_id = model.id2word[id]
    #         print '\t%.6f: %s' % (score, article_name(article_id))

    dictionary = model.id2word
    interests = list(utils.get_all_interests())

    # The model is only queried below, never trained, so the topic/term
    # weight matrix is fetched once instead of once per printed topic.
    topic_weights = model.state.get_lambda()

    # random.sample raises ValueError when asked for more items than the
    # population holds, so clamp the sample size.
    sample_size = min(50, len(interests))
    for i in random.sample(interests, sample_size):
        article_id1 = utils.get_article_id_for_interest(i)
        if not article_id1:
            continue
        doc = make_doc(i, dictionary)
        if doc is None:
            # make_doc found no article for this interest.
            continue
        doc_lda = model[doc]
        # Ascending sort followed by reverse gives highest score first.
        doc_lda.sort(key=lambda pair: pair[1])
        doc_lda.reverse()
        sys.stdout.write('topics for %s (article %s):\n' % (i.text, article_name(article_id1)))
        for (topic_id, topic_score) in doc_lda:
            sys.stdout.write('\t%.6f topic %d:' % (topic_score, topic_id))
            topic = topic_weights[topic_id]
            topic = topic / topic.sum()   # normalize to probability dist
            # argsort is ascending; reverse it and keep the 10 largest.
            for term_id in numpy.argsort(topic)[::-1][:10]:
                sys.stdout.write(', ' + article_name(dictionary[term_id]))
            sys.stdout.write('\n')
def build_interests_to_articles(self):
    """Record every interest that resolves to an article.

    Walks ``self.interests`` in order and, for each interest for which
    ``utils.get_article_id_for_interest`` returns a truthy article id,
    appends the ``(interest, article_id)`` pair to
    ``self.mapped_interests``. Interests without an article are skipped.
    """
    for interest in self.interests:
        resolved_id = utils.get_article_id_for_interest(interest)
        if not resolved_id:
            continue
        pair = (interest, resolved_id)
        self.mapped_interests.append(pair)