def __iter__(self):
    """Yield one sparse document per entry of ``self.mapped_interests``.

    Each entry is unpacked as a pair; only its second element (an article
    id) is used. The similarity ranks of that article are fetched through
    ``utils.get_article_similarity_ranks`` and every related article that
    is present in ``self.dictionary.token2id`` contributes one
    ``(token_id, score)`` tuple, where ``score = 1 / log2(rank + 5)``.

    Yields:
        list: ``(token_id, score)`` tuples for one source article; the
        list is empty when no related article is known to the dictionary.
    """
    for _, source_article in self.mapped_interests:
        # NOTE(review): 2000 is presumably a cap on the number of related
        # articles returned -- confirm against utils.
        neighbours = utils.get_article_similarity_ranks(source_article, 2000)
        yield [
            (
                self.dictionary.token2id[neighbour],
                # Equivalent to 1 / log2(rank + 5); written with natural
                # logs so the floating-point result stays unchanged.
                1.0 / (math.log(rank + 5) / math.log(2)),
            )
            for neighbour, rank in neighbours.items()
            if neighbour in self.dictionary.token2id
        ]
def make_doc(interest, dictionary):
    """Build the sparse document for a single interest.

    The interest is resolved to an article id with
    ``utils.get_article_id_for_interest``. Every article related to it
    (per ``utils.get_article_similarity_ranks``) that is present in
    ``dictionary.token2id`` contributes one ``(token_id, score)`` tuple
    with ``score = 1 / log2(rank + 5)``.

    Args:
        interest: value handed to ``utils.get_article_id_for_interest``.
        dictionary: object exposing a ``token2id`` mapping keyed by
            article id.

    Returns:
        list of ``(token_id, score)`` tuples, or ``None`` when the
        interest does not resolve to a truthy article id.
    """
    source_article = utils.get_article_id_for_interest(interest)
    if not source_article:
        return None

    # NOTE(review): 2000 is presumably a cap on the number of related
    # articles returned -- confirm against utils.
    neighbours = utils.get_article_similarity_ranks(source_article, 2000)

    weighted = []
    for neighbour, rank in neighbours.items():
        if neighbour not in dictionary.token2id:
            # Articles unknown to the dictionary carry no token id.
            continue
        token = dictionary.token2id[neighbour]
        # Equivalent to 1 / log2(rank + 5); kept in natural-log form so the
        # floating-point result stays unchanged.
        weight = 1.0 / (math.log(rank + 5) / math.log(2))
        weighted.append((token, weight))
    return weighted
def build_dict(self):
    """Populate ``self.dictionary`` from similarity data and save it.

    For every entry of ``self.mapped_interests`` (unpacked as a pair; only
    the second element, an article id, is used) the ids of its related
    articles are passed to ``self.dictionary.doc2bow`` as one document.
    Afterwards ``filter_extremes()`` is applied and the dictionary is
    written to ``svd/dictionary.txt``.
    """
    # NOTE: an earlier, disabled experiment passed the interest articles
    # themselves to doc2bow five times to force them into the result set.
    for _, source_article in self.mapped_interests:
        similar = utils.get_article_similarity_ranks(source_article, 2000)
        related_ids = list(similar.keys())
        # NOTE(review): if this is a gensim Dictionary, the positional True
        # is `allow_update` -- confirm.
        self.dictionary.doc2bow(related_ids, True)

    self.dictionary.filter_extremes()
    self.dictionary.save_as_text('svd/dictionary.txt')
def build_article_adjacencies(interests):
    """Map each interest's article index to its related articles by rank.

    Every interest is resolved to an article id with
    ``utils.get_article_id_for_interest``; interests that do not resolve
    to a truthy id are skipped. The article id is converted with
    ``id_to_index`` (defined elsewhere in this module) and its related
    articles, as returned by ``utils.get_article_similarity_ranks``, are
    appended in ascending order of their rank value.

    Args:
        interests: iterable of interests to resolve.

    Returns:
        collections.defaultdict(list): article index -> list of related
        article ids ordered by ascending rank. An index appears as a key
        only if at least one related article was appended for it.
    """
    article_sims = collections.defaultdict(list)
    for interest in interests:
        article_id = utils.get_article_id_for_interest(interest)
        if not article_id:
            continue
        index1 = id_to_index(article_id)
        # NOTE(review): 2000 is presumably a cap on the number of related
        # articles returned -- confirm against utils.
        ranks = utils.get_article_similarity_ranks(article_id, 2000)
        # sorted() accepts any iterable, so this works on Python 3 where
        # dict.items() returns a view without .sort(); like list.sort() it
        # is stable, so ties keep their original relative order.
        by_rank = sorted(ranks.items(), key=lambda pair: pair[1])
        for article_id2, _ in by_rank:
            article_sims[index1].append(article_id2)
    return article_sims