def run_engine(corpus_path="testData", output_path="posting", stemming=True, glove_dict=None):
    """
    Parse every tweet file found under the corpus root and feed it to the indexer.

    The corpus is expected to hold ``*.snappy.parquet`` files exactly one
    directory level below the root (``<root>/<sub-dir>/<file>.snappy.parquet``).
    Each file is read, every document in it is parsed and handed to the
    indexer, and once all files are consumed ``finish_indexing`` is called.

    :param corpus_path: root folder containing the raw tweet files
    :param output_path: destination passed to the configuration for the
        inverted index, posting files and tweets dictionary
    :param stemming: forwarded to the parser; if True the parser is asked to
        stem the terms
    :param glove_dict: GloVe word vectors, passed through unchanged to
        ``Indexer.add_new_doc`` for every document
    :return: None
    """
    # Local import keeps this block self-contained; `glob` is already used by
    # the original code and is expected to be imported at file level.
    import os

    config = ConfigClass(corpus_path, number_of_term_buckets=26, number_of_entities_buckets=2,
                         output_path=output_path)
    corpus_root = config.get_corpusPath()
    reader = ReadFile(corpus_path=corpus_root)
    parser = Parse(stemming)
    indexer = Indexer(config)

    # Build the pattern with the platform separator instead of a hard-coded
    # backslash so the corpus is also discovered on non-Windows systems.
    pattern = os.path.join(corpus_root, "*", "*.snappy.parquet")

    # The reader receives each file's path relative to the corpus root.
    # relpath stays correct when the root itself has several path components,
    # where cutting at the first separator would keep part of the root.
    # NOTE(review): assumes ReadFile.read_file resolves file_name against the
    # corpus root it was constructed with - confirm against ReadFile.
    relative_names = [os.path.relpath(path, corpus_root) for path in glob.glob(pattern)]

    for file_name in relative_names:
        # Parse and index every document in the current file.
        for document in reader.read_file(file_name=file_name):
            parsed_document = parser.parse_doc(document)
            indexer.add_new_doc(parsed_document, glove_dict)

    indexer.finish_indexing()