def load_docs(self):
    """
    Load the collection's processed documents into this instance.

    A pickle cache at ``pickle/<self.name>_docs.p`` (relative to the current
    working directory) is tried first. When that file does not exist, the
    corpus is read from the sub-directories ``0`` .. ``9`` of
    ``self.path_to_corpus``: one ``Document`` is built and processed per
    directory entry, appended to ``self.documents``, and the resulting list
    is then written to the cache for the next call.

    Reads ``self.name``, ``self.path_to_corpus``, ``self.stopwords`` and
    ``self.lemmatizer``; on the rebuild path ``self.documents`` must already
    be a list because new documents are appended to it.

    Sets ``self.documents`` and ``self.number_of_docs``. Returns ``None``.
    """
    pickle_path = f"pickle/{self.name}_docs.p"

    try:
        # NOTE(security): unpickling executes arbitrary code embedded in the
        # file; only point this at cache files produced by this program.
        with open(pickle_path, "rb") as cache_file:
            self.documents = load(cache_file)
    except FileNotFoundError:
        # No cache yet: fall through and rebuild it from the raw corpus.
        pass
    else:
        self.number_of_docs = len(self.documents)
        return

    number_document_loaded = 0
    for id_directory in range(10):
        print(f"Loading directory {id_directory}")
        # Plain concatenation is kept on purpose: path_to_corpus is expected
        # to end with a path separator, and the same prefix is handed to
        # Document.get_content below.
        path_directory = self.path_to_corpus + str(id_directory)
        for text_file in listdir(path_directory):
            # Document ids are assigned sequentially across all directories.
            document = Document(
                id_doc=number_document_loaded,
                id_folder=id_directory,
                address=text_file,
            )
            # Load the raw content, then process it (per the original note:
            # filter, remove stopwords and lemmatize).
            document.get_content(self.path_to_corpus)
            document.process_document(
                stopwords_list=self.stopwords,
                lemmatizer=self.lemmatizer,
            )
            self.documents.append(document)
            number_document_loaded += 1

    # Persist the processed documents; the context manager guarantees the
    # cache is flushed and closed before the method returns.
    makedirs(path.dirname(pickle_path), exist_ok=True)
    with open(pickle_path, "wb") as cache_file:
        dump(self.documents, cache_file)
    self.number_of_docs = number_document_loaded