def test_get_all_files_from_directory():
    """Check file discovery in the bundled test documents directory.

    The first call uses the default arguments and must report the two files
    that sit directly in the documents directory.  The second call passes
    True as the second argument and must additionally report the files that
    live in the nested directories.
    """
    root_dir = os.path.dirname(os.path.abspath(__file__))
    # Join every component separately so the expected paths use the
    # platform's own separator instead of a hard-coded '/'.
    path = os.path.join(root_dir, 'test_resources', 'documents')

    files = basic_tools.get_all_files_from_directory(path)
    assert os.path.join(path, 'test1.txt') in files
    assert os.path.join(path, 'test2.txt') in files

    path2 = os.path.join(path, 'test_directory')
    files2 = basic_tools.get_all_files_from_directory(path, True)
    assert os.path.join(path, 'test1.txt') in files2
    assert os.path.join(path, 'test2.txt') in files2
    assert os.path.join(path2, 'test3.txt') in files2
    assert os.path.join(path2, 'test_dir2', 'test4.txt') in files2
def __init__(self, dataset_directory, is_recursive=True):
    """
    Load the dataset metadata and index the dataset's documents.

    dataset_directory -- a relative or absolute file path to the directory
        that contains the 'documents' directory and the
        'dataset_metadata.txt' file.
    is_recursive -- find documents recursively in the documents directory;
        by default documents will be found recursively.

    Dataset and document metadata types are inferred from the metadata
    itself.  Any document whose metadata types cannot be collected is
    reported on stdout and dropped from the document list.
    """
    # create commonly used directory and file paths
    self._dataset_directory = dataset_directory
    self._abs_dataset_directory = os.path.abspath(dataset_directory)
    self._metadata_file = os.path.join(self._abs_dataset_directory,
                                       'dataset_metadata.txt')
    self._documents_directory = os.path.join(self._abs_dataset_directory,
                                             'documents')
    self.is_recursive = is_recursive
    self._filters = []
    self.metadata = {}

    # load the dataset metadata; only the metadata part of the file is kept
    with io.open(self._metadata_file, 'r', encoding='utf-8',
                 errors='ignore') as meta_file:
        content = meta_file.read()
        metadata, __ = basic_tools.seperate_metadata_and_content(content)
        self.metadata = basic_tools.metadata_to_dict(metadata)
        self.metadata_types = {}
        basic_tools.collect_types(self.metadata_types, self.metadata)

    # fall back to a name derived from the directory path when the
    # metadata does not provide 'readable_name'
    if 'readable_name' not in self.metadata:
        identifier = self._dataset_directory.replace('_', ' ').replace(
            '/', ' ').title()
    else:
        identifier = self.metadata['readable_name']
    self.name = basic_tools.remove_punctuation(identifier)

    # find and sort all file paths
    self._list_of_documents = basic_tools.get_all_files_from_directory(
        self._documents_directory, self.is_recursive)
    self._list_of_documents.sort()

    # find any bad documents and find document metadata
    self.document_metadata_types = {}
    bad_doc_indices = []
    for doc_index, doc in enumerate(self):
        try:
            basic_tools.collect_types(self.document_metadata_types,
                                      doc.metadata)
        except Exception:
            # best effort: a document that fails here is reported and
            # removed below rather than aborting the whole load
            print("Bad document: ", self._list_of_documents[doc_index])
            bad_doc_indices.append(doc_index)

    # delete from the highest index down so that the remaining recorded
    # indices stay valid after each removal
    for remove_index in reversed(bad_doc_indices):
        del self._list_of_documents[remove_index]
def __init__(self, dataset_directory, is_recursive=True):
    """
    Load the dataset metadata and index the dataset's documents.

    dataset_directory -- a relative or absolute file path to the directory
        that contains the 'documents' directory and the
        'dataset_metadata.txt' file.
    is_recursive -- find documents recursively in the documents directory;
        by default documents will be found recursively.

    Dataset and document metadata types are inferred from the metadata
    itself.  Any document whose metadata types cannot be collected is
    reported on stdout and dropped from the document list.
    """
    # create commonly used directory and file paths
    self._dataset_directory = dataset_directory
    self._abs_dataset_directory = os.path.abspath(dataset_directory)
    self._metadata_file = os.path.join(self._abs_dataset_directory,
                                       'dataset_metadata.txt')
    self._documents_directory = os.path.join(self._abs_dataset_directory,
                                             'documents')
    self.is_recursive = is_recursive
    self._filters = []
    self.metadata = {}

    # load the dataset metadata; only the metadata part of the file is kept
    with io.open(self._metadata_file, 'r', encoding='utf-8',
                 errors='ignore') as meta_file:
        content = meta_file.read()
        metadata, __ = basic_tools.seperate_metadata_and_content(content)
        self.metadata = basic_tools.metadata_to_dict(metadata)
        self.metadata_types = {}
        basic_tools.collect_types(self.metadata_types, self.metadata)

    # fall back to a name derived from the directory path when the
    # metadata does not provide 'readable_name'
    if 'readable_name' not in self.metadata:
        identifier = self._dataset_directory.replace('_', ' ').replace(
            '/', ' ').title()
    else:
        identifier = self.metadata['readable_name']
    self.name = basic_tools.remove_punctuation(identifier)

    # find and sort all file paths
    self._list_of_documents = basic_tools.get_all_files_from_directory(
        self._documents_directory, self.is_recursive)
    self._list_of_documents.sort()

    # find any bad documents and find document metadata
    self.document_metadata_types = {}
    bad_doc_indices = []
    for doc_index, doc in enumerate(self):
        try:
            basic_tools.collect_types(self.document_metadata_types,
                                      doc.metadata)
        except Exception:
            # best effort: a document that fails here is reported and
            # removed below rather than aborting the whole load
            print("Bad document: ", self._list_of_documents[doc_index])
            bad_doc_indices.append(doc_index)

    # delete from the highest index down so that the remaining recorded
    # indices stay valid after each removal
    for remove_index in reversed(bad_doc_indices):
        del self._list_of_documents[remove_index]