def run_engine(corpus_path, output_path, stemming=False):
    """
    Builds the retrieval model.
    Preprocess, parse and index corpus.

    :param corpus_path: root directory searched recursively for ``*.parquet`` files
    :param output_path: passed to the Indexer and used as a plain string prefix
        for the saved "inverted_idx" object (no path separator is inserted)
    :param stemming: forwarded unchanged to ``Parse.parse_doc`` for every document
    :return: a tuple of number_of_documents in the corpus and average_document_length;
        average_document_length is 0.0 when no documents were found
    """
    number_of_documents = 0
    total_document_length = 0

    reader = ReadFile(corpus_path)
    parser = Parse()
    indexer = Indexer(output_path)

    # collect all parquet data files under the corpus directory
    files = glob(corpus_path + "/**/*.parquet", recursive=True)

    # read, parse and index documents in batches.
    # a batch is all the documents of up to two parquet files; two files are
    # read per batch to reduce disk seek time penalty.
    # each batch is handed to the indexer together with its running index.
    batch_index = 0
    for file_index in range(0, len(files), 2):
        documents_list = reader.read_file(files[file_index])
        if file_index + 1 < len(files):
            # a second file is available, merge it into the same batch
            documents_list = documents_list + reader.read_file(files[file_index + 1])

        # parse every document of the batch
        parsed_file = set()
        for document_as_list in documents_list:
            parsed_document = parser.parse_doc(document_as_list, stemming)
            parsed_file.add(parsed_document)
            total_document_length += parsed_document.doc_length
            number_of_documents += 1

        # index parsed documents
        indexer.index_batch(parsed_file, str(batch_index))
        batch_index += 1

    # calculate average document length; an empty corpus (no parquet files, or
    # only empty ones) would otherwise raise ZeroDivisionError
    if number_of_documents:
        average_document_length = float(total_document_length) / number_of_documents
    else:
        average_document_length = 0.0

    # after indexing all non-entity terms in the corpus, index legal entities
    indexer.index_entities()

    # save index dictionary to disk
    utils.save_obj(indexer.inverted_idx, output_path + "inverted_idx")

    # after indexing the whole corpus, consolidate all partial posting files
    indexer.consolidate_postings()

    return number_of_documents, average_document_length