def run_engine(corpus_path='', output_path='', stemming=False):
    """Parse every corpus file under *corpus_path* and build the index.

    The corpus directory is walked recursively; every file whose name ends
    with ``.parquet`` or ``.htm`` is read with ``ReadFile``, its documents
    are parsed through a ``Pool`` of ``CPUCOUNT`` workers, and each parsed
    document is handed to the ``Indexer``. Once all files are processed the
    index is finalized and the term and document dictionaries are saved
    with ``save_obj`` and then cleared.

    :param corpus_path: root directory that is walked for corpus files.
    :param output_path: passed to ``Indexer`` and used as the prefix of the
        saved ``inverted_idx`` and ``doc_dictionary`` objects.
    :param stemming: forwarded to ``Parse``.
    :return: None
    """
    config = ConfigClass()
    reader = ReadFile(corpus_path=corpus_path)
    parser = Parse(stemming)
    indexer = Indexer(config, output_path)

    # Collect (directory, file name) pairs for every corpus file.
    corpus_files = []
    for root, _dirs, files in os.walk(corpus_path):
        corpus_files.extend(
            (root, name)
            for name in files
            if name.endswith((".parquet", ".htm"))
        )

    for directory, file_name in corpus_files:
        # The reader resolves file names relative to its corpus_path, so it
        # is re-pointed at the directory that actually holds this file.
        reader.corpus_path = directory
        documents_list = reader.read_file(file_name=file_name)

        # One worker pool per corpus file; results are indexed in whatever
        # order the workers finish.
        with Pool(CPUCOUNT) as pool:
            for parsed_doc in pool.imap_unordered(parser.parse_doc,
                                                  documents_list):
                indexer.add_new_doc(parsed_doc)
            # Shut the pool down cleanly before the context manager exits.
            pool.close()
            pool.join()

    # NOTE(review): the original indentation was lost; clearing the parser's
    # entities once after all files are processed is assumed - confirm.
    parser.entities.clear()
    indexer.finish_index()

    # NOTE(review): with the default output_path='' these resolve to
    # '/inverted_idx' and '/doc_dictionary'; callers should pass a directory.
    save_obj(indexer.term_dict, output_path + '/' + "inverted_idx")
    save_obj(indexer.document_dict, output_path + '/' + "doc_dictionary")

    # Drop the in-memory dictionaries now that they have been saved.
    indexer.document_dict.clear()
    indexer.term_dict.clear()