Example #1
 def reset(self):
     # Drop all in-memory dictionaries and reset the helper modules.
     global cache_dictionary
     global final_dictionary
     global documents_dictionary
     cache_dictionary = None
     final_dictionary = None
     documents_dictionary = None
     Indexer.reset()
     Writer.reset()
     Stemmer.reset()
     Reader.reset()
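
All of the snippets on this page share module-level state. A minimal sketch of that state, inferred from the global declarations above (initializing the names to None is an assumption; only their existence at module scope is shown by the source):

    # Assumed module-level globals shared by the snippets below.
    cache_dictionary = None  # cache of frequent terms (contents assumed)
    final_dictionary = None  # term dictionary built by the indexer
    documents_dictionary = None  # per-document metadata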
Example #2
    def load_data(self, folder_path, stem_mode):
        global documents_dictionary
        global cache_dictionary
        global final_dictionary
        # Clear any previously loaded dictionaries before reloading from disk.
        if (final_dictionary is not None and cache_dictionary is not None
                and documents_dictionary is not None):
            final_dictionary.clear()
            cache_dictionary.clear()
            documents_dictionary.clear()

        documents_dictionary = Writer.load_documents(folder_path, stem_mode)
        cache_dictionary = Writer.load_cache(folder_path, stem_mode)
        final_dictionary = Writer.load_final_dictionary(folder_path, stem_mode)
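
A minimal usage sketch for load_data, assuming the method lives on a controller object like the other snippets here (the instance name, folder path and mode value are illustrative):

    # Hypothetical call: reload a previously saved index from disk.
    controller.load_data("saved_index", stem_mode="on")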
Example #3
    def run(self):
        global cache_dictionary
        global final_dictionary
        global documents_dictionary
        start_time = time.time()
        cache_dictionary = {}
        final_dictionary = {}
        documents_dictionary = {}

        # Collect every subdirectory path in the corpus; drop the first
        # entry, which is the corpus root itself.
        sub_dirs = [x[0] for x in os.walk(corpus_path)]
        sub_dirs.pop(0)

        files_list = []  # Accumulates the files of the part being built
        file_index = 1  # 1-based index of the current file
        iterate_over_parts = 1  # Number of the part currently being built

        # Last file index of the first part.
        next_part = int(fileNum / parts) * iterate_over_parts
        if thread_mode == 'on':  # Dispatch each part to a thread pool
            # Create the pool with the thread count from the config file.
            executor = concurrent.futures.ThreadPoolExecutor(
                max_workers=number_of_threads)
            for subdir in sub_dirs:
                textList = Reader.separate(subdir)
                files_list.extend(textList)
                if file_index == next_part:
                    executor.submit(handle_files, files_list,
                                    documents_dictionary)
                    files_list = []  # Reset the accumulator for the next part
                    if iterate_over_parts + 1 != parts:
                        iterate_over_parts += 1
                        # Update the last file index of the next part.
                        next_part = int(fileNum / parts) * iterate_over_parts
                if file_index == fileNum:  # Last file index of the last part
                    executor.submit(handle_files, files_list,
                                    documents_dictionary)
                    break  # Stop once every file has been dispatched
                file_index += 1
            # Shut down the pool, blocking until all submitted tasks finish.
            executor.shutdown(wait=True)
        else:  # Sequential mode: process each part on the main thread
            for subdir in sub_dirs:
                textList = Reader.separate(subdir)
                files_list.extend(textList)
                if file_index == next_part:
                    handle_files(files_list, documents_dictionary)
                    files_list = []  # Reset the accumulator for the next part
                    if iterate_over_parts + 1 != parts:
                        iterate_over_parts += 1
                        # Update the last file index of the next part.
                        next_part = int(fileNum / parts) * iterate_over_parts
                if file_index == fileNum:  # Last file index of the last part
                    handle_files(files_list, documents_dictionary)
                    break  # Stop once every file has been processed
                file_index += 1

        # Release the large intermediate lists and clear the stemmer cache.
        sub_dirs = None
        files_list = None
        Stemmer.clean_cache()
        # Merge the temporary posting files, then remove them.
        final_dictionary, cache_dictionary, posting_file_size = Indexer.merge_files(
            documents_dictionary)

        end_time = time.time()
        total_time = end_time - start_time

        # Stemmer.write_cache()
        print("Number of documents: " + str(len(documents_dictionary)))
        print("Number of terms: " + str(len(final_dictionary)))
        print("Time: " + str("{:.2f}".format(total_time)) + " seconds")
        print("Time: " + str("{:.2f}".format(total_time / 60)) + " minutes")

        # Note: sys.getsizeof reports the shallow in-memory size of each dict.
        final_dictionary_file_size = sys.getsizeof(final_dictionary)
        cache_file_size = sys.getsizeof(cache_dictionary)

        print("Posting file size: " + str(posting_file_size) + " Bytes")
        print("Dictionary file size: " + str(final_dictionary_file_size) +
              " Bytes")
        print("Cache file size: " + str(cache_file_size) + " Bytes")
        Writer.remove_temp_file()

        # Announce to the GUI that indexing has concluded.
        global stem_mode
        self.view.finished_indexing(str(len(documents_dictionary)),
                                    str(final_dictionary_file_size),
                                    str(cache_file_size), str(int(total_time)),
                                    str(len(final_dictionary)),
                                    str(posting_file_size), stem_mode)
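
run() assumes module-level imports (os, sys, time, concurrent.futures) and module globals (corpus_path, fileNum, parts, thread_mode, number_of_threads, handle_files) that are not shown here. Its batching arithmetic splits fileNum files into parts roughly equal batches; a standalone sketch of the same idea, with illustrative names not taken from the source:

    def batch_boundaries(file_num, parts):
        # Yield the 1-based file index at which each batch is flushed.
        # The final batch absorbs any remainder, mirroring the
        # file_index == fileNum check in run() above.
        size = file_num // parts
        for i in range(1, parts):
            yield size * i
        yield file_num

    # For example, 10 files split into 3 parts flush at indices 3, 6 and 10.
    assert list(batch_boundaries(10, 3)) == [3, 6, 10]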
Example #4
 def fn_save_query_file_results(self, results, save_results_path):
     # Calls the writer to save the file
     Writer.save_query_file_results(results, save_results_path)
Example #5
 def fn_save_results(self, results, save_results_path):
     # Calls the writer to save the file
     Writer.save_regular_result(results, save_results_path)
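
A usage sketch covering both result-saving wrappers (the instance, results objects and output path are illustrative):

    # Hypothetical: persist single-query results, then the results of a
    # whole query file, into the same output folder.
    controller.fn_save_results(results, "results_out")
    controller.fn_save_query_file_results(file_results, "results_out")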
Example #6
 def setDictSave(self, folder_name):
     # Save the final dictionary to disk
     Writer.save_final_dictionary(final_dictionary, folder_name)
Example #7
 def setCacheSave(self, folder_name):
     # Save the cache to disk
     Writer.save_cache(cache_dictionary, folder_name)
Example #8
 def set_documents_save(self, folder_name):
     # Save the documents dictionary to disk
     Writer.save_documents(documents_dictionary, folder_name)
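
The three save helpers above are thin wrappers around Writer. A sketch of persisting everything run() produced (the instance and folder name are illustrative):

    # Hypothetical: save the dictionary, cache and document table
    # into one folder after indexing completes.
    controller.setDictSave("saved_index")
    controller.setCacheSave("saved_index")
    controller.set_documents_save("saved_index")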