def query_by_body(body, top_k=5): ''' query top_k most probable replies based on raw email body ''' doc = body2doc(body) print '[doc]', doc vec = avg_word_vec(doc) # retrieve top-k past inquery emails. candidates = query_by_vec_online('inbox', vec, top_k) # get past replies. db = EmailDB('outbox') replies = [] for (email, score) in candidates: reply = db.select( where='message_id=:message_id', data=dict(message_id=email.message_id) ) reply = list(reply)[0].body replies.append((reply, score)) return replies
def query_by_vec_online(table, vec, top_k=5): init_word2vec() db = EmailDB(table) scores = [] for email in db.select(): if email.message_id in CACHE: candidate_vec = CACHE[email.message_id] else: doc = body2doc(email.body) print "[cache doc]", doc candidate_vec = avg_word_vec(doc) CACHE[email.message_id] = candidate_vec score = np.dot(vec, candidate_vec) / npla.norm(vec) / npla.norm(candidate_vec) # cosine similarity. score = float(score) scores.append((email, score)) scores = [pair for pair in scores if pair[1] == pair[1]] scores = sorted(scores, key=lambda pair: pair[1], reverse=True) print "[matched]", scores[:top_k] return scores[:top_k]