def load_corpus_from_directory(in_folder):
    """Build a line-per-document ``TextDirectoryCorpus`` and report the build time.

    Args:
        in_folder: Passed straight through as the input of
            ``textcorpus.TextDirectoryCorpus`` (presumably a directory
            path -- confirm against callers).

    Returns:
        The ``TextDirectoryCorpus`` created with ``lines_are_documents=True``.

    Side effects:
        Prints the elapsed construction time, truncated to whole seconds.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # (or go negative) if the system wall clock is adjusted mid-run.
    started = time.perf_counter()
    corpus = textcorpus.TextDirectoryCorpus(in_folder, lines_are_documents=True)
    elapsed = time.perf_counter() - started
    print(f"# seconds = {int(elapsed)}")
    return corpus
def test_one_level_directory(self):
    """Check that a corpus over the ``write_one_level`` fixture has two documents.

    Both the reported length and the number of documents actually yielded
    by iterating the corpus are verified.
    """
    expected_count = 2
    root = self.write_one_level()

    corpus = textcorpus.TextDirectoryCorpus(root)

    # Reported length first, then the number of documents really produced.
    self.assertEqual(len(corpus), expected_count)
    self.assertEqual(len(list(corpus)), expected_count)
def test_two_level_directory(self):
    """Check document counts for the two-level fixture under depth limits.

    The same pair of checks (reported length, number of documents yielded)
    is run for each set of constructor options below.
    """
    dirpath, _ = self.write_two_levels()

    # (extra constructor options, expected number of documents)
    cases = (
        ({}, 4),                # no depth restriction
        ({'min_depth': 1}, 2),  # skip the top level
        ({'max_depth': 0}, 2),  # top level only
    )

    for options, expected_count in cases:
        corpus = textcorpus.TextDirectoryCorpus(dirpath, **options)
        self.assertEqual(len(corpus), expected_count)
        documents = list(corpus)
        self.assertEqual(len(documents), expected_count)
def test_non_trivial_structure(self):
    """Test with non-trivial directory structure, shown below:
    .
    ├── 0.txt
    ├── a_folder
    │   └── 1.txt
    └── b_folder
        ├── 2.txt
        ├── 3.txt
        └── c_folder
            └── 4.txt

    The file listing is re-read from the corpus after every change to the
    depth limits or the filename pattern, so each assertion checks the
    effect of the setting that was just changed.
    """
    import shutil  # local import: the file's import block is not visible here

    dirpath = tempfile.mkdtemp()
    # Remove the temporary tree once the test finishes, pass or fail.
    self.addCleanup(shutil.rmtree, dirpath, ignore_errors=True)

    self.write_docs_to_directory(dirpath, '0.txt')

    a_folder = os.path.join(dirpath, 'a_folder')
    os.mkdir(a_folder)
    self.write_docs_to_directory(a_folder, '1.txt')

    b_folder = os.path.join(dirpath, 'b_folder')
    os.mkdir(b_folder)
    self.write_docs_to_directory(b_folder, '2.txt', '3.txt')

    c_folder = os.path.join(b_folder, 'c_folder')
    os.mkdir(c_folder)
    self.write_docs_to_directory(c_folder, '4.txt')

    corpus = textcorpus.TextDirectoryCorpus(dirpath)

    def listed_files():
        """Return the corpus' current file paths relative to dirpath, sorted."""
        return sorted(name[len(dirpath) + 1:] for name in corpus.iter_filepaths())

    expected = sorted([
        '0.txt',
        'a_folder/1.txt',
        'b_folder/2.txt',
        'b_folder/3.txt',
        'b_folder/c_folder/4.txt',
    ])
    expected = [os.path.normpath(path) for path in expected]
    self.assertEqual(expected, listed_files())

    # Depths 0 and 1: everything except the file nested two levels down.
    corpus.max_depth = 1
    self.assertEqual(expected[:-1], listed_files())

    # Depth 1 only: the top-level file is excluded as well.
    corpus.min_depth = 1
    self.assertEqual(expected[1:-1], listed_files())

    # Depth 0 only: just the top-level file.
    corpus.min_depth = 0
    corpus.max_depth = 0
    self.assertEqual(expected[:1], listed_files())

    # Open the depth range back up to the whole tree (deepest file is at
    # depth 2) so that only the filename pattern does the filtering.
    corpus.max_depth = 2
    corpus.pattern = "4.*"
    self.assertEqual(expected[-1:], listed_files())
def test_lines_are_documents(self):
    """Check that ``lines_are_documents`` switches between per-line and per-file documents.

    A single five-line file is written to a fresh temporary directory.
    With ``lines_are_documents=True`` the stream must yield each line as
    its own document; after the flag is set to ``False`` the stream must
    yield the whole file as one document.
    """
    import shutil  # local import: the file's import block is not visible here

    dirpath = tempfile.mkdtemp()
    # Remove the temporary directory once the test finishes, pass or fail.
    self.addCleanup(shutil.rmtree, dirpath, ignore_errors=True)

    lines = ['doc%d text' % i for i in range(5)]
    fpath = os.path.join(dirpath, 'test_file.txt')
    with open(fpath, 'w') as f:
        f.write('\n'.join(lines))

    corpus = textcorpus.TextDirectoryCorpus(dirpath, lines_are_documents=True)
    docs = list(corpus.getstream())
    self.assertEqual(len(lines), corpus.length)  # should have cached
    self.assertEqual(lines, docs)

    corpus.lines_are_documents = False
    docs = list(corpus.getstream())
    self.assertEqual(1, corpus.length)
    self.assertEqual('\n'.join(lines), docs[0])
def test_filename_filtering(self):
    """Check that ``pattern`` and ``exclude_pattern`` filter ``iter_filepaths``.

    Three configurations are exercised against the same four-file fixture:
    an include pattern given to the constructor, an include pattern
    assigned afterwards, and an exclude pattern with no include pattern.
    """
    dirpath = self.write_one_level('test1.log', 'test1.txt', 'test2.log', 'other1.log')

    # Raw strings keep the regex escapes intact without relying on Python
    # passing through the invalid "\." string escape (which emits a warning).
    corpus = textcorpus.TextDirectoryCorpus(dirpath, pattern=r"test.*\.log")
    filenames = list(corpus.iter_filepaths())
    expected = [os.path.join(dirpath, name) for name in ('test1.log', 'test2.log')]
    self.assertEqual(sorted(expected), sorted(filenames))

    # Dots are escaped so they match a literal "." before the extension.
    corpus.pattern = r".*\.txt"
    filenames = list(corpus.iter_filepaths())
    expected = [os.path.join(dirpath, 'test1.txt')]
    self.assertEqual(expected, filenames)

    # With no include pattern, excluding every .log file leaves the same
    # single .txt file.
    corpus.pattern = None
    corpus.exclude_pattern = r".*\.log"
    filenames = list(corpus.iter_filepaths())
    self.assertEqual(expected, filenames)