def tfidf_profile(self, items_repository, size, content_filter,
                  time_context=0):
    """
    Build a tag profile for the user's packages from tf-idf weights.

    The packages listed in ``self.pkg_profile`` are looked up in
    ``items_repository``; their tags are weighted (sublinear tf-idf) by
    ``data.tfidf_weighting`` and the resulting terms are passed through
    ``self._eliminate_duplicated`` together with ``size``.

    ``content_filter`` and ``time_context`` are forwarded unchanged to
    ``data.tfidf_weighting``.

    Returns whatever ``self._eliminate_duplicated`` produces, presumably
    at most ``size`` distinct terms (confirm against that helper).
    """
    package_docs = data.axi_search_pkgs(items_repository, self.pkg_profile)

    # NOTE: data.tfidf_plus(items_repository, docs, content_filter) is an
    # alternative weighting that was left disabled in favour of the call
    # below.
    weighted_terms = data.tfidf_weighting(items_repository,
                                          package_docs,
                                          content_filter,
                                          time_context=time_context)

    # Only the first field of each weight entry (the term) is needed;
    # stemmed duplicates are then eliminated by the helper.
    candidate_terms = [entry[0] for entry in weighted_terms]
    return self._eliminate_duplicated(candidate_terms, size)
def eset_profile(self, items_repository, size, content_filter):
    """
    Build a tag profile for the user's packages from a Xapian ESet.

    The documents matching ``self.pkg_profile`` are marked as relevant
    (RSet), and the enquire object is asked for expansion terms derived
    from that relevant set. ``content_filter`` is handed to ``get_eset``
    as its last argument (the expand decider in Xapian's signature).

    Twice ``size`` candidate terms are requested before
    ``self._eliminate_duplicated`` is applied with ``size``; the result
    of that helper is returned.
    """
    enquire = xapian.Enquire(items_repository)
    package_docs = data.axi_search_pkgs(items_repository, self.pkg_profile)

    # Collect every package document into the relevant set.
    relevant_set = xapian.RSet()
    for package_doc in package_docs:
        relevant_set.add_document(package_doc.docid)

    # Expanded query terms (statistically good differentiators).
    # Over-fetch (size * 2) so the de-duplication step still has enough
    # candidates to work with.
    expansion_flags = xapian.Enquire.INCLUDE_QUERY_TERMS
    expansion_set = enquire.get_eset(size * 2,
                                     relevant_set,
                                     expansion_flags,
                                     1,
                                     content_filter)

    # Eliminate duplicated stemmed terms.
    candidate_terms = [item.term for item in expansion_set]
    return self._eliminate_duplicated(candidate_terms, size)