def execute_char_vocabulary(self, fin, fout, pout):
    """Exercise the character vocabulary steps of the extractor.

    Characters are loaded from ``fin``, their occurrences are plotted to
    ``pout``, the characters are filtered with ``topn=20`` and then saved
    to ``fout``. Both output files must exist afterwards and the saved
    vocabulary must contain exactly 20 lines.
    """
    expected_size = 20

    self.extractor.load_chars(fin)
    self.extractor.plot_char_occurrences(pout, mins=[1, 2, 3])
    self.assertTrue(os.path.exists(pout))

    self.extractor.filter_chars(topn=expected_size)
    self.extractor.save_chars(fout)
    self.assertTrue(os.path.exists(fout))
    self.assertEqual(io_utils.count_lines(fout), expected_size)
def execute_sentences(self, fin, fout):
    """Exercise the sentence extraction step of the extractor.

    ``save_sentences`` is run on ``fin`` with a fixed set of options and
    writes to ``fout``. The output must exist, contain 111 lines and be
    byte-for-byte equal to the reference corpus stored in ``self.data``.
    """
    expected_lines = 111

    self.extractor.save_sentences(
        fin,
        fout,
        'text',
        ignore_digits=True,
        apostrophe='fr',
        ignore_punctuation='noise-a',
        tostrip=False,
        keepalnum=True,
    )

    self.assertTrue(os.path.exists(fout))
    self.assertEqual(io_utils.count_lines(fout), expected_lines)

    # shallow=False forces a content comparison instead of an os.stat() one.
    same_content = filecmp.cmp(self.data, fout, shallow=False)
    self.assertTrue(
        same_content,
        'Generated corpus different from reference corpus')
def execute_extract(self, fin, fout):
    """Exercise the wiki content extraction step of the extractor.

    ``save_content`` is called with ``fin``, ``fout`` and a fixed list of
    command-line style options. The output file must exist and contain
    3 lines.
    """
    expected_lines = 3

    # NOTE(review): options that take a value are passed as one
    # space-joined string each; how save_content splits them is not
    # visible from here.
    options = [
        '--quiet',
        '--json',
        '--bytes 30G',
        '--processes 2',
        '--no-templates',
        '--filter_disambig_pages',
        '--min_text_length 50',
    ]

    self.extractor.save_content(fin, fout, options)

    self.assertTrue(os.path.exists(fout))
    self.assertEqual(io_utils.count_lines(fout), expected_lines)
def execute_word_vocabulary(self, fin, fout, pout):
    """Exercise the word vocabulary steps of the extractor.

    Words are loaded from ``fin``, their occurrences are plotted to
    ``pout``, the words are filtered with ``topn=500`` and then saved to
    ``fout``. Both output files must exist, the saved vocabulary must
    contain exactly 500 lines and it must be byte-for-byte equal to the
    reference vocabulary stored in ``self.vocab``.
    """
    expected_size = 500

    self.extractor.load_words(fin)
    self.extractor.plot_word_occurrences(pout, mins=[1, 2, 3])
    self.assertTrue(os.path.exists(pout))

    self.extractor.filter_words(topn=expected_size)
    self.extractor.save_words(fout)
    self.assertTrue(os.path.exists(fout))
    self.assertEqual(io_utils.count_lines(fout), expected_size)

    # shallow=False forces a content comparison instead of an os.stat() one.
    same_content = filecmp.cmp(self.vocab, fout, shallow=False)
    self.assertTrue(
        same_content,
        'Generated vocabulary different from reference vocabulary')
def test_file(self):
    """Check the path and file helpers exposed by ``io_utils``.

    Covers name decomposition (filename, extension, basename, dirname,
    path without extension), human-readable file size, line counting,
    extension checks and extension replacement. ``self.empty_file`` and
    ``self.archive`` are fixtures defined outside this method.
    """
    empty_file = self.empty_file
    sample_path = '/src/tests/utils/__init__.py'

    # Name decomposition on the fixture file.
    self.assertEqual('__init__.py', io_utils.filename(empty_file))
    self.assertEqual('.py', io_utils.extension(empty_file))
    self.assertEqual('__init__', io_utils.basename(empty_file))

    # Pure string manipulation on a fixed path.
    self.assertEqual('/src/tests/utils', io_utils.dirname(sample_path))
    self.assertEqual(
        '/src/tests/utils/__init__',
        io_utils.path_without_ext(sample_path))

    # Size and line count.
    self.assertEqual('0.0Bytes', io_utils.filesize(empty_file))
    self.assertEqual('11.9KB', io_utils.filesize(self.archive))
    self.assertEqual(0, io_utils.count_lines(empty_file))

    # Extension checks.
    self.assertTrue(io_utils.has_extension(empty_file, '.py'))
    self.assertFalse(io_utils.has_extension(empty_file, '.txt'))

    # Extension replacement, including the empty-path case.
    self.assertEqual(
        '/src/tests/utils/__init__.txt',
        io_utils.change_extension(sample_path, 'txt'))
    self.assertEqual('', io_utils.change_extension('', 'txt'))
def execute_decompress(self, fin, fout):
    """Exercise the decompression step of the extractor.

    ``save_xml`` is called with ``fin`` and ``fout``. The output file
    must exist and contain 366 lines.
    """
    expected_lines = 366

    self.extractor.save_xml(fin, fout)

    self.assertTrue(os.path.exists(fout))
    self.assertEqual(io_utils.count_lines(fout), expected_lines)