def get_gold_standard_categorization(self):
    """Build the reference (gold standard) categorization of the dataset.

    Every category directory name reported by
    ``__get_all_category_directory_names`` is converted to an index term
    with ``n_gram_handler.string_to_index_term`` and used as a key. The
    value stored under that key is whatever
    ``dataset_handler.get_names_of_files_in_directory`` returns for the
    category's path.

    Returns:
        dict: index term of a category -> file names found in the
        directory of that category.
    """
    gold_standard = {}
    for category_name in self.__get_all_category_directory_names():
        # List the directory first, then derive the key, so the helpers
        # are invoked in the same order for every category.
        document_names = dataset_handler.get_names_of_files_in_directory(
            self.__get_category_path(category_name)
        )
        index_term = n_gram_handler.string_to_index_term(category_name)
        # Categories that collapse to the same index term overwrite each
        # other; the last one processed wins.
        gold_standard[index_term] = document_names
    return gold_standard
def get_all_test_documents(self):
    """Load every document found one level below ``self.data_path``.

    Each sub-directory name reported by
    ``dataset_handler.get_all_subdirectory_names`` is joined onto
    ``self.data_path``; every file name listed in that sub-directory is
    then read with ``dataset_handler.get_document_as_string`` using
    ``self.encoding``.

    Returns:
        dict: file name -> document content as returned by
        ``dataset_handler.get_document_as_string``. The key is the bare
        file name, so a file name that occurs in more than one
        sub-directory keeps only the content read last.
    """
    documents_by_name = {}
    for directory_name in dataset_handler.get_all_subdirectory_names(self.data_path):
        directory_path = os.path.join(self.data_path, directory_name)
        file_names = dataset_handler.get_names_of_files_in_directory(directory_path)
        # Merge this directory's documents into the overall mapping.
        documents_by_name.update({
            file_name: dataset_handler.get_document_as_string(
                os.path.join(directory_path, file_name), self.encoding
            )
            for file_name in file_names
        })
    return documents_by_name