def add(self, text, doc_id):
    """Add a document to the index.

    Args:
        text: Raw document text to index.
        doc_id: Identifier stored with the document and its words.
    """
    # Map each cleaned word to its term frequency.
    wf = word_freq(clean_text(text))
    # Document length is the total count of term occurrences.
    # sum() replaces the original reduce(lambda x, y: x + y, ...) and,
    # unlike reduce with no initializer, returns 0 for an empty
    # frequency map instead of raising TypeError.
    doc_length = sum(wf.values())
    self.db.insert(create_document_words(wf, doc_id))
    self.db.insert([Document(id=doc_id, length=doc_length)])
def search(self, query):
    """Search the index for the query.

    Returns a list of dictionaries containing keys "document_id" and
    "score", sorted in descending order of score.
    """
    terms = word_tokenize(clean_text(query))
    per_word_results = self.index_reader.search(terms)

    # Fold every word's per-document score into one total per document.
    totals = defaultdict(float)
    for doc_scores in per_word_results.values():
        for doc_id, score in doc_scores.items():
            totals[doc_id] += score

    # Rank documents by total score, highest first.
    ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    return [{"document_id": doc_id, "score": score} for doc_id, score in ranked]