Example #1
0
 def BM25_parameter_tests(self):
     """Run a BM25 ranking for every (k, b) combination under test.

     For each combination a ParametersBM25 is built and printed, and a
     new DocumentRanker is asked to rank the documents. The ranker is
     given the "BM25_parameter_tests/" sub-directory of
     Constants.path_results_dir and a results file name that encodes the
     k and b values.
     """
     # Shared input: the inverted indexes and the document lengths
     inverted_indexes = load_pickle(Constants.path_inverted_indexes)
     document_lengths = load_pickle(Constants.path_document_lengths)

     # Candidate values; 5.0 (k) and 0.8 (b) appeared to be more or less
     # the best ones
     k_values = [5.0]
     b_values = [0.8]

     # Every k is combined with every b, k varying slowest
     combinations = [(k, b) for k in k_values for b in b_values]
     for k, b in combinations:
         parameters = ParametersBM25(k=k, b=b)

         path_results_dir = (
             Constants.path_results_dir + r"BM25_parameter_tests/")
         results_file_name = f"results_BM25-k={k}-b={b}"

         parameters.print_parameters()

         # A fresh ranker is created for every parameter combination
         ranker = DocumentRanker()
         ranker.rank_documents(
             inverted_indexes, document_lengths, parameters,
             Constants.path_topics, path_results_dir, results_file_name)

     del inverted_indexes, document_lengths

     print("Done ranking documents.")
    def rank_documents_rocchio(self):
        """Rank the documents per query via the ranker's Rocchio + BM25F method.

        Loads the plain inverted indexes and document lengths, the final
        documents and the BM25F index data, then delegates to
        ``self.document_ranker.rank_documents_rocchio_with_bm25f``.
        """
        # Plain index data
        indexes = load_pickle(Constants.path_inverted_indexes)
        lengths = load_pickle(Constants.path_document_lengths)

        final_documents = load_pickle(Constants.path_final_documents)

        # BM25F index data
        # NOTE(review): the other BM25F methods in this file read
        # Constants.path_doc_length_info_bm25f -- confirm that
        # path_document_length_info_bm25f exists as well.
        lengths_bm25f = load_pickle(Constants.path_document_length_info_bm25f)
        indexes_bm25f = load_pickle(Constants.path_inverted_indexes_bm25f)

        # An earlier call to self.document_ranker.rank_documents_rocchio
        # (without the BM25F data) was disabled in favour of this one.
        rank = self.document_ranker.rank_documents_rocchio_with_bm25f
        rank(indexes, lengths, lengths_bm25f, indexes_bm25f,
             final_documents,
             Constants.path_topics,
             Constants.path_results_dir,
             Constants.results_file_name)

        print("Done ranking documents.")
        del indexes, lengths, final_documents
    def rank_documents_with_reranker(self):
        """Rank the documents for each query, then have them reranked.

        The pickled inverted indexes, document lengths and documents
        dictionary are handed, together with a default-constructed
        ParametersBM25, to ``self.document_ranker.rank_with_reranker``.
        """
        # Data required by the ranker
        indexes = load_pickle(Constants.path_inverted_indexes)
        lengths = load_pickle(Constants.path_document_lengths)
        docs_by_id = load_pickle(Constants.path_documents_dictionary)

        bm25_parameters = ParametersBM25()

        rank = self.document_ranker.rank_with_reranker
        rank(indexes, lengths, docs_by_id, bm25_parameters,
             Constants.path_topics,
             Constants.path_results_dir,
             Constants.results_rerank_file_name)

        # Drop the references to the loaded data
        del indexes, lengths, docs_by_id
    def rank_documents(self):
        """Rank the documents for each query using default BM25 parameters.

        The pickled inverted indexes and document lengths are loaded and
        handed, together with a default-constructed ParametersBM25, to
        ``self.document_ranker.rank_documents``.
        """
        indexes = load_pickle(Constants.path_inverted_indexes)
        lengths = load_pickle(Constants.path_document_lengths)

        bm25_parameters = ParametersBM25()

        rank = self.document_ranker.rank_documents
        rank(indexes, lengths, bm25_parameters,
             Constants.path_topics,
             Constants.path_results_dir,
             Constants.results_file_name)

        # Drop the references to the loaded data before reporting completion
        del indexes, lengths

        print("Done ranking documents.")
    def rank_documents_BM25F(self):
        """Rank the documents for each query using default BM25F parameters.

        Loads the BM25F inverted indexes and document length information
        and delegates to ``self.document_ranker.rank_documents_bm25f``,
        passing the fixed results file name "results_BM25F".
        """
        indexes_bm25f = load_pickle(Constants.path_inverted_indexes_bm25f)
        length_info_bm25f = load_pickle(Constants.path_doc_length_info_bm25f)

        bm25f_parameters = ParametersBM25F()

        rank = self.document_ranker.rank_documents_bm25f
        rank(indexes_bm25f, length_info_bm25f, bm25f_parameters,
             Constants.path_topics,
             Constants.path_results_dir,
             "results_BM25F")

        # Drop the references to the loaded data before reporting completion
        del indexes_bm25f, length_info_bm25f

        print("Done ranking documents.")
    def rank_documents_BM25F_with_reranker(self):
        """Rank the documents for each query with BM25F, then have them reranked.

        Loads the BM25F inverted indexes, the BM25F document length
        information and the documents dictionary, and delegates to
        ``self.document_ranker.rank_BM25F_with_reranker`` with
        default-constructed ParametersBM25F.
        """
        # Data required by the ranker
        indexes_bm25f = load_pickle(Constants.path_inverted_indexes_bm25f)
        length_info_bm25f = load_pickle(Constants.path_doc_length_info_bm25f)
        docs_by_id = load_pickle(Constants.path_documents_dictionary)

        bm25f_parameters = ParametersBM25F()

        rank = self.document_ranker.rank_BM25F_with_reranker
        rank(indexes_bm25f, length_info_bm25f, docs_by_id, bm25f_parameters,
             Constants.path_topics, Constants.path_results_dir,
             results_file_name="results_BM25F_rerank")

        # Drop the references to the loaded data
        del indexes_bm25f, length_info_bm25f, docs_by_id
    def create_inverted_indexes_bm25f(self):
        """Create and store the inverted indexes and length info used by BM25F.

        Loads the documents from Constants.path_documents, lets
        ``self.index_creator.create_BM25_inverted_indexes`` build the
        inverted indexes together with the document length information,
        and pickles both results.
        """
        # Load the documents
        documents = load_pickle(Constants.path_documents)

        # Create the inverted indexes, also retrieve information on number
        # of terms per field
        inverted_indexes, doc_length_info = (
            self.index_creator.create_BM25_inverted_indexes(documents))

        # Save to the same Constants paths the BM25F ranking methods load
        # from (instead of bare file names), so the stored data is the data
        # that is read back later.
        save_pickle(inverted_indexes, Constants.path_inverted_indexes_bm25f)
        save_pickle(doc_length_info, Constants.path_doc_length_info_bm25f)
        del inverted_indexes, doc_length_info

        print("Done creating inverted indexes for the bm25f algorithm.")
Example #8
0
    def process_documents(self, path_linked_documents, path_unlinked_documents,
                          path_parsed_documents, path_merged_documents,
                          path_final_documents):
        """(Deprecated!) Build the final list of documents and pickle it.

        The unlinked and parsed documents are merged based on their near
        matches; ``self.merge_documents`` is given path_merged_documents,
        from which the merged documents are then loaded. The final list is
        the linked documents followed by the merged documents, and is
        stored at path_final_documents.
        """
        # Read every input collection
        linked = load_pickle(path_linked_documents)
        unlinked = load_pickle(path_unlinked_documents)
        parsed = load_pickle(path_parsed_documents)

        # Pair up unlinked and parsed documents, then merge them
        matches = self.find_near_matches(unlinked, parsed)
        self.merge_documents(unlinked, parsed, matches, path_merged_documents)
        del matches, unlinked, parsed

        # Pick up the merge result from disk
        merged = load_pickle(path_merged_documents)

        # Linked documents first, merged documents after; store the result
        combined = linked + merged
        save_pickle(combined, path_final_documents)
        del linked, merged, combined
def filter_judged_documents(self, path_final_documents,
                            path_relevance_judgements, path_judged_documents):
    """Store the subset of the final documents that have a relevance judgement.

    Args:
        path_final_documents: Pickle file holding the final documents; each
            document must expose a ``cord_uid`` attribute.
        path_relevance_judgements: Text file with one judgement per line,
            whose third whitespace-separated field is the cord_uid.
        path_judged_documents: Where the filtered list of documents is
            pickled.
    """
    judged_cord_uids = set()
    with open(path_relevance_judgements, 'r') as f:
        for line in f:
            # Split on any run of whitespace so that tabs, repeated spaces
            # and the trailing newline cannot end up in the cord_uid.
            fields = line.split()
            if len(fields) < 3:
                # Blank or truncated line: there is no cord_uid to read
                continue
            judged_cord_uids.add(fields[2])
    print(f"Retrieved {len(judged_cord_uids)} cord_uids of judged documents.")

    final_documents = load_pickle(path_final_documents)
    judged_documents = [document for document in final_documents
                        if document.cord_uid in judged_cord_uids]
    print(f"Filtered {len(judged_documents)} judged documents.")

    save_pickle(judged_documents, path_judged_documents)
    def create_inverted_indexes(self):
        """
        Create and store the inverted indexes and the document lengths for
        the complete documents.

        The documents are loaded from Constants.path_documents and passed to
        ``self.index_creator.create_inverted_indexes``; both of its results
        are pickled to their Constants paths.
        """
        all_documents = load_pickle(Constants.path_documents)

        # The index creator returns the indexes and the lengths as a pair
        created = self.index_creator.create_inverted_indexes(all_documents)
        indexes, lengths = created

        # Persist both results, indexes first
        save_pickle(indexes, Constants.path_inverted_indexes)
        save_pickle(lengths, Constants.path_document_lengths)
        del indexes, lengths

        print("Done creating inverted indexes for the complete documents.")
#                                                  Constants.path_topics,
#                                                  results_file_name="results_judged")
#     del judged_inverted_indexes
#     del judged_doc_lengths
# =============================================================================

# =============================================================================
#     judged_documents = load_pickle(Constants.path_judged_documents)
#     judged_document_lengths = load_pickle(Constants.path_judged_document_lengths)
#     Util.compute_document_statistics(judged_documents, judged_document_lengths,
#                                      Constants.path_relevance_judgements)
#     del judged_documents
#     del judged_document_lengths
# =============================================================================
from Util import load_pickle
# Load the pickled data that is passed to the light reranking run below:
# the inverted indexes, the document lengths and the documents dictionary.
inverted_indexes = load_pickle(Constants.path_inverted_indexes)
doc_lengths = load_pickle(Constants.path_document_lengths)
documents_dict = load_pickle(Constants.path_doc_dict)

# Run the ranker's rank_with_rerank_light method; the results directory and
# results file name are hard-coded here instead of taken from Constants.
# NOTE(review): `search_system` and `Constants` are not defined in the
# visible part of this file -- confirm they are in scope at this point.
search_system.document_ranker.rank_with_rerank_light(
    inverted_indexes,
    doc_lengths,
    Constants.path_topics,
    documents_dict,
    path_results_dir=r"../trec_eval-master/our_data/",
    results_file_name="results_rerank")


def filter_judged_documents(self, path_final_documents,
                            path_relevance_judgements, path_judged_documents):
Example #12
0
 def BM25F_parameter_tests(self):
     
     # The (non-field-specific) 'b' parameter is included by mistake
     def run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                  k, b, 
                  weight_title, weight_author,
                  weight_abstract, weight_sections,
                  b_title, b_author, 
                  b_abstract, b_sections):
     
         parameters = ParametersBM25F(k=k,
                  weight_title=weight_title, weight_author=weight_author,
                  weight_abstract=weight_abstract, weight_sections=weight_sections,
                  b_title=b_title, b_author=b_author, 
                  b_abstract=b_abstract, b_sections=b_sections)
             
         parameters.print_parameters()
                                             
         path_results_dir = Constants.path_results_dir + r"BM25F_parameter_tests/"
         
         results_file_name = (f"results_BM25F_test_{test_id}")
         
         document_ranker = DocumentRanker()
         document_ranker.rank_documents_bm25f(inverted_indexes_bm25f,
                                                   doc_length_info_bm25f,
                                                   parameters,
                                                   Constants.path_topics,
                                                   path_results_dir,
                                                   results_file_name)
     
     inverted_indexes_bm25f = load_pickle(Constants.path_inverted_indexes_bm25f)
     doc_length_info_bm25f = load_pickle(Constants.path_doc_length_info_bm25f)
     
     round_nr = 11
     if round_nr == 0: # round 0 - getting a feel of impact field weights
         test_id = "00"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=1.0, weight_author=1.0,
                                      weight_abstract=1.0, weight_sections=1.0,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.1872
         
         test_id = "01"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=1.0, weight_author=0.2,
                                      weight_abstract=1.0, weight_sections=1.0,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.1872
         
         test_id = "02"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=1.0, weight_author=1.0,
                                      weight_abstract=1.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2331
         
         test_id = "03"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=1.0, weight_author=1.0,
                                      weight_abstract=1.0, weight_sections=0.5,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2093
         
         test_id = "04"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=1.0, weight_author=1.0,
                                      weight_abstract=1.0, weight_sections=1.5,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.1754
         
         # Conclusions:
         #   -author field can probably be disregarded (as expected)
         #   -section field should have relatively low weight
         
     elif round_nr == 1: # round 1 - getting a better feel of field weights
         test_id = "10"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=1.0, weight_author=0.0,
                                      weight_abstract=1.0, weight_sections=1.0,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.1872
         
         test_id = "11"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=1.0, weight_author=5.0,
                                      weight_abstract=1.0, weight_sections=1.0,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.1871
         
         test_id = "12"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=1.0, weight_author=0.0,
                                      weight_abstract=0.5, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2132
         
         test_id = "13"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=1.0, weight_author=0.0,
                                      weight_abstract=1.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2331
         
         test_id = "14"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=1.0, weight_author=0.0,
                                      weight_abstract=1.5, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2390
         
         test_id = "15"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=0.5, weight_author=0.0,
                                      weight_abstract=1.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2218
         
         test_id = "16"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=1.0, weight_author=0.0,
                                      weight_abstract=1.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2331
         
         test_id = "17"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=1.5, weight_author=0.0,
                                      weight_abstract=1.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2398
         
         # Conclusions:
         #   -author field can be disregarded (weight set to 0)
         #   -abstract field should have a relatively high weight (as expected)
         #   -title field should have a relatively high weight (as expected)
         
     elif round_nr == 2: # round 2 - focussing on abstract weights
         test_id = "20"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=1.0, weight_author=0.0,
                                      weight_abstract=1.0, weight_sections=0.01,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2016
         
         test_id = "21"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=1.0, weight_author=0.0,
                                      weight_abstract=1.0, weight_sections=0.05,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2210
         
         test_id = "22"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=1.0, weight_author=0.0,
                                      weight_abstract=1.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2331
         
         test_id = "23"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=1.0, weight_author=0.0,
                                      weight_abstract=1.0, weight_sections=0.3,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2248
         
         # Conclusions:
         #   -With the other weights as they are, sections weight of 0.2 is good
         
     elif round_nr == 3: # round 3
         test_id = "30"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=0.8, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2409
         
         test_id = "31"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=1.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2465
         
         test_id = "32"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=1.5, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2536
         
         test_id = "33"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2540
         
         test_id = "34"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=3.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2493
         
         test_id = "35"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=5.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2348
         
         # Conclusions:
         #   -With the other weights as they are, abstract weight of 2.0 is good
         
     elif round_nr == 4: # round 4 - focussing on title weights
         test_id = "40"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=0.8, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2357
         
         test_id = "41"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=1.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2390
         
         test_id = "42"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=1.5, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2459
         
         test_id = "43"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=2.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2499
         
         test_id = "44"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2536
         
         test_id = "45"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=5.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2514
         
         # Conclusions:
         #   -With the other weights as they are, title weight of 3.0 is good
         
     elif round_nr == 5: # round 5 - focussing on parameter 'k'
         test_id = "50"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=1.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2371
         
         test_id = "51"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=2.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2480
         
         test_id = "52"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2510
         
         test_id = "53"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=4.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         
         # map=0.2509
         
         test_id = "54"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=4.5, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2502
         
         test_id = "55"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2493
         
         test_id = "56"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=5.5, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2481
         
         test_id = "57"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=6.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2470
         
         test_id = "58"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=7.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2449
         
         # Conclusions:
         #   -With the other weights as they are, a 'k' of 3.0 is good
         
     elif round_nr == 6: # round 6 - focussing on parameter 'b' (which does not exist)
         test_id = "60"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.2, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2596
         
         test_id = "61"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.5, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2596
         
         test_id = "62"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.6, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2596
         
         test_id = "63"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.7, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2596
         
         test_id = "64"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2596
         
         test_id = "65"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.9, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2596
         
         # Conclusions:
         #   -There is no general b parameter...
         
     elif round_nr == 7: # round 7 - focussing on field 'b' parameters
         test_id = "70"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.7, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2596
         
         test_id = "71"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.9, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.8)
         # map=0.2595
         
         test_id = "72"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.7, b_sections=0.8)
         # map=0.2615
         
         test_id = "73"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.9, b_sections=0.8)
         # map=0.2563
         
         test_id = "74"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.7)
         # map=0.2595
         
         test_id = "75"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.8, b_sections=0.9)
         # map=0.2589
         
         test_id = "76"
         b_fields = 0.7
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=b_fields, b_author=b_fields, 
                                      b_abstract=b_fields, b_sections=b_fields)
         # map=0.2613
         
         test_id = "77"
         b_fields = 0.9
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=b_fields, b_author=b_fields, 
                                      b_abstract=b_fields, b_sections=b_fields)
         # map=0.2556
         
         # Conclusions:
         #   -Except for sections 0.7 appears to be a better 'b' than 0.8
         
     elif round_nr == 8: # round 8 - Focussing again on field 'b' parameters
         test_id = "80"
         b_fields = 0.4
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=b_fields, b_author=b_fields, 
                                      b_abstract=b_fields, b_sections=0.8)
         # map=0.2637
         
         test_id = "81"
         b_fields = 0.5
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=b_fields, b_author=b_fields, 
                                      b_abstract=b_fields, b_sections=0.8)
         # map=0.2634
         
         test_id = "82"
         b_fields = 0.6
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=b_fields, b_author=b_fields, 
                                      b_abstract=b_fields, b_sections=0.8)
         # map=0.2628
         
         test_id = "83"
         b_fields = 0.7
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=b_fields, b_author=b_fields, 
                                      b_abstract=b_fields, b_sections=0.8)
         # map=0.2615
         
         test_id = "84"
         b_fields = 0.8
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=b_fields, b_author=b_fields, 
                                      b_abstract=b_fields, b_sections=0.8)
         # map=0.2596
         
         test_id = "85"
         b_fields = 0.9
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=b_fields, b_author=b_fields, 
                                      b_abstract=b_fields, b_sections=0.8)
         # map=0.2562
         
         test_id = "86"
         b_fields = 0.7
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=b_fields, b_author=b_fields, 
                                      b_abstract=b_fields, b_sections=0.7)
         # map=0.2613
         
         # Conclusions:
         #   -For sections the 'b' parameter of 0.8 is good
         #   -For title and abstract the 'b' parameter of either or both
         #      may be best below 0.4
         
     elif round_nr == 9: # round 9 - Focussing again on field 'b' parameters
         test_id = "90"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.3, b_author=0.8, 
                                      b_abstract=0.4, b_sections=0.8)
         # map=0.2633
         
         test_id = "91"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.5, b_author=0.8, 
                                      b_abstract=0.4, b_sections=0.8)
         # map=0.2638
         
         test_id = "92"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.4, b_author=0.8, 
                                      b_abstract=0.3, b_sections=0.8)
         # map=0.2634
         
         test_id = "93"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.4, b_author=0.8, 
                                      b_abstract=0.5, b_sections=0.8)
         # map=0.2632
         
         test_id = "94"
         b_fields = 0.4
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=b_fields, b_author=b_fields, 
                                      b_abstract=b_fields, b_sections=0.8)
         # map=0.2637
         
         test_id = "95"
         b_fields = 0.4
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=b_fields, b_author=0.8, 
                                      b_abstract=b_fields, b_sections=0.8)
         # map=0.2637
         
         test_id = "96"
         b_fields = 0.3
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=b_fields, b_author=0.8, 
                                      b_abstract=b_fields, b_sections=0.8)
         # map=0.2632
         
         test_id = "97"
         b_fields = 0.2
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=b_fields, b_author=0.8, 
                                      b_abstract=b_fields, b_sections=0.8)
         # map=0.2617
         
         test_id = "98"
         b_fields = 0.1
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=b_fields, b_author=0.8, 
                                      b_abstract=b_fields, b_sections=0.8)
         # map=0.2587
         
         # Conclusions:
         #   -For the title a 'b' parameter of 0.5 or higher appears best
         #   -For the abstract a 'b' parameter of 0.4 appears best
     
         
     elif round_nr == 10: # round 10 - Final b test for title
         test_id = "100"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.3, b_author=0.8, 
                                      b_abstract=0.4, b_sections=0.8)
         # map=0.2633
         
         test_id = "101"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.4, b_author=0.8, 
                                      b_abstract=0.4, b_sections=0.8)
         # map=0.2637
         
         test_id = "102"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.5, b_author=0.8, 
                                      b_abstract=0.4, b_sections=0.8)
         # map=0.2638
         
         test_id = "103"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.6, b_author=0.8, 
                                      b_abstract=0.4, b_sections=0.8)
         # map=0.2640
         
         test_id = "104"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.7, b_author=0.8, 
                                      b_abstract=0.4, b_sections=0.8)
         # map=0.2642
         
         test_id = "105"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.8, b_author=0.8, 
                                      b_abstract=0.4, b_sections=0.8)
         # map=0.2641
         
         test_id = "106"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.2,
                                      b_title=0.9, b_author=0.8, 
                                      b_abstract=0.4, b_sections=0.8)
         # map=0.2641
         
         # Conclusions:
         #   -For the title a 'b' parameter of 0.7 appears best
         
     elif round_nr == 11: # round 11 - Some final tweaking
         test_id = "110"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.1,
                                      b_title=0.7, b_author=0.8, 
                                      b_abstract=0.4, b_sections=0.8)
         # map=0.2537
         
         test_id = "111"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.3,
                                      b_title=0.7, b_author=0.8, 
                                      b_abstract=0.4, b_sections=0.8)
         # map=0.2648
         
         test_id = "112"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.4,
                                      b_title=0.7, b_author=0.8, 
                                      b_abstract=0.4, b_sections=0.8)
         # map=0.2601
         
         test_id = "113"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.5,
                                      b_title=0.7, b_author=0.8, 
                                      b_abstract=0.4, b_sections=0.8)
         # map=0.2528
         
         test_id = "114"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.7,
                                      b_title=0.7, b_author=0.8, 
                                      b_abstract=0.4, b_sections=0.8)
         # map=0.2373
         
         test_id = "115"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=0.8,
                                      b_title=0.7, b_author=0.8, 
                                      b_abstract=0.4, b_sections=0.8)
         # map=0.2300
         
         test_id = "116"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=1.0,
                                      b_title=0.7, b_author=0.8, 
                                      b_abstract=0.4, b_sections=0.8)
         # map=0.2173
         
         test_id = "117"
         run_test(test_id, inverted_indexes_bm25f, doc_length_info_bm25f,
                                      k=3.0, b=0.8, 
                                      weight_title=3.0, weight_author=0.0,
                                      weight_abstract=2.0, weight_sections=1.5,
                                      b_title=0.7, b_author=0.8, 
                                      b_abstract=0.4, b_sections=0.8)
         # map=0.1937
         
         # Conclusions:
         #   - The following appear to be good parameters:
         #   -   k=3.0, b=0.8
         #   -   weight_title=3.0, weight_author=0.0, weight_abstract=2.0, weight_sections=0.3
         #   -   b_title=0.7, b_author=0.8, b_abstract=0.4, b_sections=0.8
 
     
     print("Done testing BM25F parameters.")
Example #13
0
    def create_complete_documents(self, path_merged_documents,
                                  path_linked_cord_uids, path_all_documents,
                                  path_documents):
        """
        Complete document information where necessary and possible.

        A number of documents have no direct reference to their full-text
        parse. Previous functions have linked these documents to full-text
        parses by matching their titles. This function integrates this
        information into a single 'complete' document set.

        Args:
            path_merged_documents: Path of the pickle holding the merged
                documents; each one supplies a cord_uid, title, abstract,
                authors and sections.
            path_linked_cord_uids: Path of the pickle holding the cord_uids
                that are skipped here (membership is tested with ``in``).
            path_all_documents: Path of the pickle holding all documents.
            path_documents: Path the completed document set is pickled to.

        The completed documents are stored first, followed by every document
        from the full set that was not completed by this function.
        """

        # Load the required data
        merged_documents = load_pickle(path_merged_documents)
        linked_cord_uids = load_pickle(path_linked_cord_uids)
        all_documents = load_pickle(path_all_documents)

        # This can be used to look up documents by cord_uid
        all_documents_dictionary = create_document_dictionary(all_documents)

        # These accumulate over ALL merged documents, so they have to be
        # created once, before the loop. Creating them inside the loop would
        # discard earlier results and leave them undefined for empty input.
        completed_documents = []
        completed_cord_uids = set()

        # i counts the processed documents; ti, au and ab count the titles,
        # authors and abstracts that were empty and have been overwritten
        i = ti = au = ab = 0
        for merged_doc in merged_documents:

            cord_uid = merged_doc.cord_uid

            # Only documents that are not in the linked cord_uids are handled
            if cord_uid in linked_cord_uids:
                continue

            unlinked_doc = all_documents_dictionary[cord_uid]

            # If the title is missing, retrieve title information
            if is_empty(unlinked_doc.title):
                unlinked_doc.title = merged_doc.title
                ti += 1

            # If the abstract is missing, retrieve abstract information
            if is_empty(unlinked_doc.abstract):
                unlinked_doc.abstract = merged_doc.abstract
                ab += 1

            # If the authors are missing, retrieve author information
            if unlinked_doc.authors is None:
                unlinked_author_string = ""
            else:
                # Falsy entries (e.g. None or "") are dropped before joining
                unlinked_author_string = " ".join(
                    filter(None, unlinked_doc.authors))
            if is_empty(unlinked_author_string):
                unlinked_doc.authors = merged_doc.authors
                au += 1

            # Retrieve the body text
            unlinked_doc.sections = merged_doc.sections

            # Store the unlinked and now completed document, but only once
            # per cord_uid so a repeated cord_uid cannot create duplicates
            if cord_uid not in completed_cord_uids:
                completed_cord_uids.add(cord_uid)
                completed_documents.append(unlinked_doc)

            i += 1
            if i % 1000 == 0:
                print(
                    f"iteration={i}, potentially retrieved information on:"
                    + f" {ti} titles, {ab} abstracts, and {au} authors.")
        print(f"Potentially retrieved information on: {ti} titles," +
              f" {ab} abstracts, and {au} authors.")

        # Add documents that were already complete to the now completed documents
        for document in all_documents:

            # Only the documents that were already completed will have to be added
            if document.cord_uid not in completed_cord_uids:
                completed_documents.append(document)

        # Save the completed documents
        save_pickle(completed_documents, path_documents)

        # Free memory
        del merged_documents, linked_cord_uids, all_documents
        del all_documents_dictionary, completed_documents, completed_cord_uids