def __get_doc_scores(self, posting_list, query: Query):
    """Score every candidate document in *posting_list* against *query*.

    Builds the tf-idf vector of the query, then scores each document as
    the dot product of the query vector with the document's term weights
    (tw-idf, piv+ or bm25+, depending on ``self.weighting_model``),
    normalised by the product of both vector norms (cosine-style).

    Returns a dict mapping doc_id -> score.
    """
    click.secho("[Search Engine] Computing search scores ...", fg="bright_blue")

    # tf-idf weights of the query terms, plus the query-vector norm.
    query_tf_idf = {}
    norm_query_vector = 0
    query_vocabulary = query.get_vocabulary()
    for token in query_vocabulary:
        tf_idf = query.get_tf(token) * self.collection.get_idf(token)
        query_tf_idf[token] = tf_idf
        norm_query_vector += tf_idf**2
    norm_query_vector = sqrt(norm_query_vector)

    # The weighting model is fixed for the whole call: select the
    # document-weight function ONCE instead of re-testing the model name
    # for every (document, token) pair inside the nested loops.
    if self.weighting_model == "tw-idf":
        def doc_weight(token, doc_id):
            return self.collection.get_tw_idf(
                target_term=token, target_doc_id=doc_id, b=0.003)
    elif self.weighting_model == "tf-idf":
        # NOTE(review): the "tf-idf" model maps to pivoted normalization
        # (piv+), not plain tf-idf — confirm this mapping is intentional.
        def doc_weight(token, doc_id):
            return self.collection.get_piv_plus(
                target_term=token, target_doc_id=doc_id, b=0.2)
    else:
        def doc_weight(token, doc_id):
            return self.collection.get_bm25_plus(
                target_term=token, target_doc_id=doc_id, b=0.75, k1=1.2)

    doc_scores = {}
    for doc_id in posting_list:
        score = 0
        for token in query_vocabulary:
            score += query_tf_idf[token] * doc_weight(token, doc_id)
        # Normalise by both norms (raises ZeroDivisionError for an empty
        # query or a zero document norm, same as the original behavior).
        score /= self.collection.documents_norms[doc_id] * norm_query_vector
        doc_scores[doc_id] = score
    return doc_scores
def __get_posting_list(self, query: Query):
    """Return the OR-merged posting list over all tokens of *query*.

    Each token's posting list is fetched from the collection; successive
    lists are combined with ``merge_or_postings_list`` (set union of
    document ids).
    """
    merged = []
    for token in query.get_vocabulary():
        token_postings = self.collection.get_posting_list(token)
        if merged:
            merged = merge_or_postings_list(merged, token_postings)
        else:
            # First non-empty list seeds the accumulator directly.
            merged = token_postings
    return merged
def get_list_of_documents(self, query: Query):
    """Return documents where the words of the query appear.

    Iterates over the query vocabulary and accumulates, for each word,
    the ids of the documents containing it; the lists are merged with a
    set union and kept sorted.  Progress is reported on stdout.
    """
    target_documents_list = []
    # Get words of query
    vocabulary = query.get_vocabulary()
    for word in vocabulary:
        if not target_documents_list:
            target_documents_list = self.collection.get_documents_containing_term(word)
            print(
                f"[Search Engine] the word {word} is present in {len(target_documents_list)} items"
            )
        else:
            documents_list = self.collection.get_documents_containing_term(word)
            print(
                f"[Search Engine] the word {word} is present in {len(documents_list)} items"
            )
            print("Merge ...")
            # Merge the two lists and order the final list.
            # (sorted() accepts the set directly — the intermediate
            # list() copy was unnecessary.)
            target_documents_list = sorted(set(target_documents_list) | set(documents_list))
    return target_documents_list
def compute_scores(self, list_of_docs, query: Query):
    """Score each document, depending of the tokens it contains.

    Each document's score is the dot product of the query's tf-idf
    vector with the document's tf-idf vector (log-normalised tf).

    Returns a dict mapping doc_id -> score.
    """
    print("Search Engine is computing search scores ...")
    query_tf_idf = {}
    idf_by_word = {}  # idf is per-word, not per-document: compute it once
    vocab_query = query.get_vocabulary()
    # Get the tf-idf for words in the query.
    for word in vocab_query:
        idf = self.collection.compute_idf(word)
        idf_by_word[word] = idf
        query_tf_idf[word] = query.get_term_frequency(word) * idf
    # Score the documents which contain the words.
    doc_scores = {}
    for doc_id in list_of_docs:
        score = 0
        for word in vocab_query:
            normalized_tf = self.collection.log_normalization(
                term=word, id_document=doc_id
            )
            # When the word is absent, normalized_tf is 0 and the whole
            # product vanishes — no special-case branch needed.  Reusing
            # the cached idf avoids recomputing it per (doc, word) pair.
            score += query_tf_idf[word] * normalized_tf * idf_by_word[word]
        doc_scores[doc_id] = score
    return doc_scores