def reset(self):
    global cache_dictionary
    global final_dictionary
    global documents_dictionary
    cache_dictionary = None
    final_dictionary = None
    documents_dictionary = None
    Indexer.reset()
    Writer.reset()
    Stemmer.reset()
    Reader.reset()
def load_data(self, folder_path, stem_mode):
    global documents_dictionary
    global cache_dictionary
    global final_dictionary
    # Clear any previously loaded dictionaries before loading fresh ones.
    if final_dictionary is not None and cache_dictionary is not None and documents_dictionary is not None:
        final_dictionary.clear()
        cache_dictionary.clear()
        documents_dictionary.clear()
    documents_dictionary = Writer.load_documents(folder_path, stem_mode)
    cache_dictionary = Writer.load_cache(folder_path, stem_mode)
    final_dictionary = Writer.load_final_dictionary(folder_path, stem_mode)
def run(self):
    global cache_dictionary
    global final_dictionary
    global documents_dictionary
    global stem_mode
    start_time = time.time()
    cache_dictionary = {}
    final_dictionary = {}
    documents_dictionary = {}

    # Build a list of all sub-directory paths in the corpus; pop(0) removes the corpus root itself.
    sub_dirs = [x[0] for x in os.walk(corpus_path)]
    sub_dirs.pop(0)

    files_list = []          # Accumulates the parsed files of the current part
    file_index = 1           # Index of the file currently being processed
    iterate_over_parts = 1   # Index of the part currently being filled
    next_part = int(fileNum / parts) * iterate_over_parts  # Last file index of the first part

    if thread_mode == 'on':
        # Use a ThreadPool with the number of threads taken from the config file.
        executor = concurrent.futures.ThreadPoolExecutor(max_workers=number_of_threads)
        for subdir in sub_dirs:
            textList = Reader.separate(subdir)
            files_list.extend(textList)
            if file_index == next_part:
                executor.submit(handle_files, files_list, documents_dictionary)
                files_list = []  # Start a fresh batch
                if iterate_over_parts + 1 != parts:
                    iterate_over_parts += 1
                    # Update the last file index of the next part.
                    next_part = int(fileNum / parts) * iterate_over_parts
            if file_index == fileNum:
                # Last file of the last part: submit whatever remains and stop.
                executor.submit(handle_files, files_list, documents_dictionary)
                break
            file_index += 1
        # Shut down the ThreadPool, waiting for all submitted threads to finish.
        executor.shutdown(wait=True)
    else:
        for subdir in sub_dirs:
            textList = Reader.separate(subdir)
            files_list.extend(textList)
            if file_index == next_part:
                handle_files(files_list, documents_dictionary)
                files_list = []  # Start a fresh batch
                if iterate_over_parts + 1 != parts:
                    iterate_over_parts += 1
                    # Update the last file index of the next part.
                    next_part = int(fileNum / parts) * iterate_over_parts
            if file_index == fileNum:
                # Last file of the last part: process whatever remains and stop.
                handle_files(files_list, documents_dictionary)
                break
            file_index += 1

    sub_dirs = None
    files_list = None
    Stemmer.clean_cache()

    # Merge the temporary posting files and remove them.
    final_dictionary, cache_dictionary, posting_file_size = Indexer.merge_files(documents_dictionary)

    end_time = time.time()
    total_time = end_time - start_time
    # Stemmer.write_cache()
    print("Number of documents: " + str(len(documents_dictionary)))
    print("Number of terms: " + str(len(final_dictionary)))
    print("Time: " + "{:.2f}".format(total_time) + " seconds")
    print("Time: " + "{:.2f}".format(total_time / 60) + " minutes")
    final_dictionary_file_size = sys.getsizeof(final_dictionary)
    cache_file_size = sys.getsizeof(cache_dictionary)
    print("Posting file size: " + str(posting_file_size) + " Bytes")
    print("Dictionary file size: " + str(final_dictionary_file_size) + " Bytes")
    print("Cache file size: " + str(cache_file_size) + " Bytes")
    Writer.remove_temp_file()

    # Announce to the GUI that indexing has finished.
    self.view.finished_indexing(str(len(documents_dictionary)), str(final_dictionary_file_size),
                                str(cache_file_size), str(int(total_time)),
                                str(len(final_dictionary)), str(posting_file_size), stem_mode)
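# For reference, a minimal standalone sketch of the batching arithmetic used in
# run() above: split a list into `parts` batches of int(len(items) / parts)
# items each (the last batch also absorbs the integer-division remainder) and
# submit each batch to a ThreadPoolExecutor. The names and values here are
# illustrative only and are not part of the project.
def _dispatch_in_parts(items, parts, worker, max_workers=4):
    batch_size = int(len(items) / parts)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        for part in range(parts):
            start = part * batch_size
            # The final part runs to the end of the list, picking up any remainder.
            end = (part + 1) * batch_size if part + 1 < parts else len(items)
            executor.submit(worker, items[start:end])
    # Leaving the `with` block waits for all submitted batches to finish.

# Example: _dispatch_in_parts(list(range(10)), parts=3, worker=print)
# -> batches of sizes 3, 3, and 4, matching how run() leaves the remainder
#    of the integer division to the last part.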
def fn_save_query_file_results(self, results, save_results_path):
    # Calls the writer to save the file
    Writer.save_query_file_results(results, save_results_path)

def fn_save_results(self, results, save_results_path):
    # Calls the writer to save the file
    Writer.save_regular_result(results, save_results_path)
def setDictSave(self, folder_name):
    # Save the final dictionary to disk
    Writer.save_final_dictionary(final_dictionary, folder_name)

def setCacheSave(self, folder_name):
    # Save the cache to disk
    Writer.save_cache(cache_dictionary, folder_name)

def set_documents_save(self, folder_name):
    # Save the documents dictionary to disk
    Writer.save_documents(documents_dictionary, folder_name)
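# Hypothetical usage sketch of the methods above. It assumes they belong to a
# controller class (not shown in this excerpt) and that the config globals
# (corpus_path, fileNum, parts, thread_mode, number_of_threads) are already
# set; `folder` is an illustrative path, not one the project defines.
def _example_index_and_save(controller, folder='saved_index'):
    controller.reset()                       # Clear any previous index state
    controller.run()                         # Build the index from corpus_path
    controller.setDictSave(folder)           # Persist the final dictionary
    controller.setCacheSave(folder)          # Persist the cache
    controller.set_documents_save(folder)    # Persist the documents dictionary
    # Later, reload the saved data instead of re-indexing:
    controller.load_data(folder, stem_mode='on')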