def rank_documents_bm25f(self, inverted_indexes_bm25f, doc_length_info_bm25f, parameters, path_topics, path_results_dir=r"../trec_eval-master/our_data/", results_file_name="results_BM25F"): # Retrieve the queries queries = self.extract_queries(path_topics) # The path to the output file output_file_path = path_results_dir + results_file_name + ".txt" # Clear the contents of the output file open(output_file_path, "w").close() query_nr = 1 # Used to keep track of which query is being processed for query in queries: # For each query ... print(f"Processing query {query_nr}: '{query}'") # Transform the query terms to the desired form (i.e. tokenized, stemmed, ...) query_terms = processQuery(query) # Compute the BM25 score for each document for the current query doc_scores = self.compute_doc_scores_BM25F(query_terms, inverted_indexes_bm25f, doc_length_info_bm25f, parameters) # Write the top 1000 document scores for this query to a .txt file self.write_output_file(query_nr, doc_scores, output_file_path) # Increment the query number for the next iteration query_nr += 1
def rank_documents(self, inverted_indexes, doc_lengths, parameters, path_topics, path_results_dir=r"../trec_eval-master/our_data/", results_file_name="results"): """ Scores and ranks each document for each query. Given a list of queries this function determines for each query the BM25 score for each document. For each query the best 10,000 documents are then ranked based on their score (higher is better) and the results are written to a .txt file in such a form that it can be used as input to the TREC evaluation tool. """ # Retrieve the queries queries = self.extract_queries(path_topics) # The path to the output file output_file_path = path_results_dir + results_file_name + ".txt" # Clear the contents of the output file open(output_file_path, "w").close() query_nr = 1 # Used to keep track of which query is being processed for query in queries: # For each query ... print(f"Processing query {query_nr}: '{query}'") # Transform the query terms to the desired form (i.e. tokenized, stemmed, ...) query_terms = processQuery(query) # Compute the BM25 score for each document for the current query doc_scores = self.compute_doc_scores(query_terms, inverted_indexes, doc_lengths, parameters) # Write the top 1000 document scores for this query to a .txt file self.write_output_file(query_nr, doc_scores, output_file_path) # Increment the query number for the next iteration query_nr += 1
def rank_documents_rocchio(self, inverted_indexes, doc_lengths, documents, path_topics, path_results_dir=r"../trec_eval-master/our_data/", results_file_name="results"): """ Scores and ranks each document for each query, then expand the query and rank it again. Given a list of queries this function determines for each query the BM25 score for each document. For each query the best 10,000 documents are then ranked based on their score (higher is better) and the results are written to a .txt file in such a form that it can be used as input to the TREC evaluation tool. """ # Retrieve the queries queries = self.extract_queries(path_topics) # The path to the output file output_file_path = path_results_dir + results_file_name + ".txt" output_file_path_2 = path_results_dir + results_file_name + "_2.txt" # Clear the contents of the output file open(output_file_path, "w").close() open(output_file_path_2, "w").close() query_nr = 1 # Used to keep track of which query is being processed for query in queries: # For each query ... print(f"Processing query {query_nr}: '{query}'") # Transform the query terms to the desired form (i.e. tokenized, stemmed, ...) query_terms = processQuery(query) # Compute the BM25 score for each document for the current query doc_scores = self.compute_doc_scores(query_terms, inverted_indexes, doc_lengths) # Set of relevant documents top_k = 20 rel_docs = dict() for rank in dict(list(doc_scores.items())[:top_k]): rel_docs[rank] = doc_scores[rank] expansion = rocchio(query_terms, rel_docs, inverted_indexes, documents) expanded_query = list(expansion.keys()) # Compute the BM25 score for each document for the expanded query doc_scores_2 = self.compute_doc_scores(expanded_query, inverted_indexes, doc_lengths) ''' top_n = dict(sorted(doc_scores.items(), key = itemgetter(1), reverse = True)[:10]) top_n_2 = dict(sorted(doc_scores_2.items(), key = itemgetter(1), reverse = True)[:10]) for t in top_n: print("{}: {}, {}, doc len: {}".format(t, doc_scores[t], doc_scores_2[t], doc_lengths[t])) print("----------------------") for t in top_n_2: print("{}: {}, {}".format(t, doc_scores[t], doc_scores_2[t])) print(query_terms) print("***********************") print(expanded_query) ''' # Write the top 1000 document scores for this query to a .txt file self.write_output_file(query_nr, doc_scores, output_file_path) self.write_output_file(query_nr, doc_scores_2, output_file_path_2) # Increment the query number for the next iteration query_nr += 1 if query_nr > 50: break
def rank_BM25F_with_reranker(self, inverted_indexes_bm25f, doc_length_info_bm25f, documents_dict, parameters, path_topics, path_results_dir=r"../trec_eval-master/our_data/", results_file_name="results__BM25F_rerank"): """ Scores and ranks each document for each query with BM24F, then reranks them. This function first scores each document with BM25F, then it reranks the highest scored 1000 of them using a BERT model trained on MS MARCO (real user queries from Bing seach engine). """ def rerank(query, documents_dict, doc_scores, model): query_embedding = model.encode(query) for cord_uid in doc_scores.keys(): document = documents_dict[cord_uid] title = document.title abstract = document.abstract text_string = title + " " + abstract passage_embedding = model.encode(text_string) rerank_score = util.pytorch_cos_sim(query_embedding, passage_embedding)[0][0].item() doc_scores[cord_uid] = doc_scores[cord_uid] + Constants.rerank_multiplier * rerank_score model = SentenceTransformer('distilroberta-base-msmarco-v2') # Retrieve the queries queries = self.extract_queries(path_topics) # The path to the output file output_file_path = path_results_dir + results_file_name + ".txt" # Clear the contents of the output file open(output_file_path, "w").close() query_nr = 1 # Used to keep track of which query is being processed for query in queries: # For each query ... print(f"Processing query {query_nr}: '{query}'") # Transform the query terms to the desired form (i.e. tokenized, stemmed, ...) query_terms = processQuery(query) # Compute the BM25 score for each document for the current query doc_scores = self.compute_doc_scores_BM25F(query_terms, inverted_indexes_bm25f, doc_length_info_bm25f, parameters) # Sort by score and select the n highest scored documents doc_scores = dict(sorted(doc_scores.items(), key = itemgetter(1), reverse = True)[:1000]) rerank(query, documents_dict, doc_scores, model) # Sort by score and select the n highest scored documents doc_scores = dict(sorted(doc_scores.items(), key = itemgetter(1), reverse = True)) # Write the top 1000 document scores for this query to a .txt file self.write_output_file(query_nr, doc_scores, output_file_path) # Increment the query number for the next iteration query_nr += 1