def local_search(self, cid, term_unstemmed, recent):
    """Find posts related to a term via a classifier trained on exemplar posts.

    The term is stemmed with ``self.stem`` and its exemplar post ids are
    fetched with ``self.get_term_exemplars``. With fewer than four exemplars
    the call is delegated to ``self.fulltext``. Otherwise a ``Bayes``
    classifier is trained with the exemplars as the "relevant" pool and an
    equal number of random posts as the "random" pool. The tokens that are
    most over-represented in the relevant pool become an OR query against
    the Xapian index, and the hits are re-scored with the classifier.

    Args:
        cid: Identifier forwarded unchanged to ``get_term_exemplars`` and
            ``fulltext`` (presumably a community/collection id -- confirm
            against callers).
        term_unstemmed: The search term as entered, before stemming.
        recent: When truthy, each score is additionally multiplied by
            ``_post_age_score(post)``.

    Returns:
        Whatever ``self.fulltext`` returns when there are too few exemplars;
        otherwise a list of at most ten ``SearchResult`` objects ordered by
        descending score.
    """
    term = self.stem(term_unstemmed)
    exemplar_pids = self.get_term_exemplars(cid, term)
    # Not enough exemplars to train on: defer to the plain full-text search.
    if len(exemplar_pids) < 4:
        return self.fulltext(cid, term, recent)
    log_tmp("SEARCH: %s exemplars" % len(exemplar_pids))

    guesser = Bayes()
    for ex_pid in exemplar_pids:
        # Tokenize once and reuse the result for both logging and training.
        tokens = state.the.get_post(ex_pid, content=True).tokens()
        # The 1-tuple keeps %-formatting correct even if tokens is a tuple.
        log_tmp("SEARCH: exemplar tokens: [%s]" % (tokens,))
        guesser.train("relevant", tokens)
        # TODO Toss in other factors, if possible.
    # Negative examples: as many random posts as there are exemplars.
    # Probably cacheable, if we use a bigger pool.
    for neg_ex_pid in state.the.get_random_pids(len(exemplar_pids)):
        guesser.train(
            "random",
            state.the.get_post(neg_ex_pid, content=True).tokens(),
        )
    log_tmp("SEARCH: trained")

    # Add-one smoothed ratio of each token's count in the relevant pool to
    # its count in the random pool. Weak and irrelevant tokens (ratio <= 2)
    # are knocked out before sorting.
    proportions = []
    for tok, count in guesser.poolData("relevant"):
        prop = (count + 1) / (1.0 * guesser.pools["random"].get(tok, 0) + 1)
        if prop > 2:
            proportions.append((tok, prop))

    fulltext_fallback = len(proportions) < 3
    if fulltext_fallback:
        # Too few distinctive tokens: query for the stemmed term itself.
        query = xapian.Query(xapian.Query.OP_AND, [term])
    else:
        proportions.sort(key=operator.itemgetter(1), reverse=True)
        log_tmp("SEARCH: proportions: " + str(proportions))
        # Search for the twelve best words.
        query = xapian.Query(
            xapian.Query.OP_OR,
            [tok for (tok, prop) in proportions[:12]],
        )
    log_tmp("SEARCH: query: " + str(query))

    # NOTE(review): the attribute is spelled `mainabase` in the original;
    # confirm that this is the intended name of the index handle.
    enq = xapian.Enquire(self.mainabase)
    # TODO AND the query with something scoring for broad support and, if
    # appropriate, recency, so that Xapian does that ranking itself.
    enq.set_query(query)
    mset = enq.get_mset(0, 25)

    results = []
    for m in mset:
        doc = m.get_document()
        post = state.the.get_post(int(doc.get_data()), True)
        # Reset for every hit: guess() may not report a "relevant" entry, and
        # the value must never leak over from the previous post (or be
        # undefined on the first one).
        rel_prob = 0.0
        for (pool, prob) in guesser.guess(post.tokens()):
            if pool == "relevant":
                rel_prob = prob
        score = rel_prob
        score *= post.broad_support
        if recent:
            score *= _post_age_score(post)
        results.append(SearchResult(post, term, score))

    # A key function replaces the cmp-function form, which Python 3 removed.
    # Ordering and stability are unchanged.
    results.sort(key=operator.attrgetter("score"), reverse=True)
    return results[:10]