def run_engine(corpus_path, stemming, output_path):
    """Build the inverted index for a corpus and persist it to disk.

    :param corpus_path: path to the corpus consumed by ReadFile.
    :param stemming: stemming flag passed to Parse; also selects which
        pre-built global table pickle is loaded (``global_table_<stemming>``).
    :param output_path: directory that receives the posting file.
    :return: None. Side effects: writes the posting file under
        ``output_path`` and pickles the inverted index via ``utils.save_obj``.
    """
    posting_path = os.path.join(output_path, PostingFile)

    # Fix: the output directory must exist before any file under it is
    # checked, removed, or opened (the original only created it afterwards).
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    # Start from a clean slate — stale posting/index files from a previous
    # run would otherwise be merged into the new index.
    if os.path.exists(posting_path):
        os.remove(posting_path)
    if os.path.exists(InvertedIndexFile + '.pkl'):
        os.remove(InvertedIndexFile + '.pkl')

    r = ReadFile(corpus_path)
    p = Parse(stemming)
    m = BinaryMemoryPosting(posting_path)
    indexer = Indexer()
    max_posting_size = 100000

    # Iterate over every document in the corpus.
    idx = 0
    for documents_list in r:
        # One progressbar tick equals one document within this batch.
        step = 1 / len(documents_list)
        for document in documents_list:
            parsed_list = p.parse_doc(document)
            # Index the parsed document; document[0] is presumably the
            # document id — TODO confirm against ReadFile's record layout.
            indexer.add_new_doc(parsed_list, idx, document[0])
            idx += 1
            # Flush the in-memory posting to disk every max_posting_size
            # documents to bound memory usage.
            if idx % max_posting_size == 0:
                m.Save(p.word_dict)
            r.progressbar.update(step)
    r.progressbar.close()

    # Flush whatever postings remain in memory after the last partial batch.
    m.Save(p.word_dict)

    global_table = utils.load_obj(f'global_table_{stemming}')
    inv_index = indexer.CreatInvertedIndex(p.word_dict, idx, global_table)
    m.Merge(inv_index)
    utils.save_obj(inv_index, InvertedIndexFile)