Example #1
0
    def execute_char_vocabulary(self, fin, fout, pout):
        """Check the character-vocabulary workflow of ``self.extractor``.

        Runs ``load_chars``, ``plot_char_occurrences``, ``filter_chars`` and
        ``save_chars`` in that order, asserting that the plot and the saved
        vocabulary exist on disk and that the vocabulary file has exactly
        ``topn`` lines.

        Args:
            fin: Input handed to ``load_chars``.
            fout: Path the vocabulary is saved to by ``save_chars``.
            pout: Path handed to ``plot_char_occurrences``.
        """
        kwargs = {'topn': 20}
        self.extractor.load_chars(fin)

        self.extractor.plot_char_occurrences(pout, mins=[1, 2, 3])
        self.assertTrue(os.path.exists(pout),
                        'Missing plot file: {}'.format(pout))

        self.extractor.filter_chars(**kwargs)
        self.extractor.save_chars(fout)
        self.assertTrue(os.path.exists(fout),
                        'Missing output file: {}'.format(fout))
        # Expected size is read from the filter argument itself so the two
        # values cannot drift apart when ``topn`` is edited.
        self.assertEqual(io_utils.count_lines(fout), kwargs['topn'])
Example #2
0
 def execute_sentences(self, fin, fout):
     """Check sentence extraction against the reference corpus.

     Calls ``self.extractor.save_sentences(fin, fout, 'text', **kwargs)``
     and then asserts that ``fout`` exists, holds 111 lines and has the
     same content as ``self.data``.

     Args:
         fin: Input handed to ``save_sentences``.
         fout: Path of the file ``save_sentences`` is expected to produce.
     """
     kwargs = {
         'ignore_digits': True,
         'apostrophe': 'fr',
         'ignore_punctuation': 'noise-a',
         'tostrip': False,
         'keepalnum': True,
     }
     self.extractor.save_sentences(fin, fout, 'text', **kwargs)
     self.assertTrue(os.path.exists(fout),
                     'Missing output file: {}'.format(fout))
     self.assertEqual(io_utils.count_lines(fout), 111)
     # shallow=False makes filecmp compare file contents rather than only
     # the os.stat() signatures.
     self.assertTrue(filecmp.cmp(self.data, fout, shallow=False),
                     'Generated corpus different from reference corpus')
Example #3
0
 def execute_extract(self, fin, fout):
     """Check the wiki extraction step of ``self.extractor``.

     Calls ``self.extractor.save_content(fin, fout, args)`` and then
     asserts that ``fout`` exists and holds 3 lines.

     Args:
         fin: Input handed to ``save_content``.
         fout: Path of the file ``save_content`` is expected to produce.
     """
     # NOTE(review): options that take a value ('--bytes 30G', ...) are
     # written as a single list item; this presumably relies on
     # ``save_content`` joining/splitting the items itself -- confirm.
     args = [
         '--quiet',
         '--json',
         '--bytes 30G',
         '--processes 2',
         '--no-templates',
         '--filter_disambig_pages',
         '--min_text_length 50',
     ]
     self.extractor.save_content(fin, fout, args)
     self.assertTrue(os.path.exists(fout),
                     'Missing output file: {}'.format(fout))
     self.assertEqual(io_utils.count_lines(fout), 3)
Example #4
0
    def execute_word_vocabulary(self, fin, fout, pout):
        """Check the word-vocabulary workflow of ``self.extractor``.

        Runs ``load_words``, ``plot_word_occurrences``, ``filter_words`` and
        ``save_words`` in that order, asserting that the plot and the saved
        vocabulary exist on disk, that the vocabulary file has exactly
        ``topn`` lines and that its content equals ``self.vocab``.

        Args:
            fin: Input handed to ``load_words``.
            fout: Path the vocabulary is saved to by ``save_words``.
            pout: Path handed to ``plot_word_occurrences``.
        """
        kwargs = {'topn': 500}
        self.extractor.load_words(fin)

        self.extractor.plot_word_occurrences(pout, mins=[1, 2, 3])
        self.assertTrue(os.path.exists(pout),
                        'Missing plot file: {}'.format(pout))

        self.extractor.filter_words(**kwargs)
        self.extractor.save_words(fout)
        self.assertTrue(os.path.exists(fout),
                        'Missing output file: {}'.format(fout))
        # Expected size is read from the filter argument itself so the two
        # values cannot drift apart when ``topn`` is edited.
        self.assertEqual(io_utils.count_lines(fout), kwargs['topn'])
        # shallow=False makes filecmp compare file contents rather than only
        # the os.stat() signatures.
        self.assertTrue(
            filecmp.cmp(self.vocab, fout, shallow=False),
            'Generated vocabulary different from reference vocabulary')
Example #5
0
    def test_file(self):
        """Exercise the path, size and extension helpers of ``io_utils``.

        Uses the ``self.empty_file`` and ``self.archive`` fixtures plus a
        literal path that is only manipulated as a string.
        """
        empty = self.empty_file
        init_py = '/src/tests/utils/__init__.py'

        # Name components of the empty fixture file.
        name = io_utils.filename(empty)
        self.assertEqual('__init__.py', name)
        ext = io_utils.extension(empty)
        self.assertEqual('.py', ext)
        stem = io_utils.basename(empty)
        self.assertEqual('__init__', stem)

        # Directory and extension-less forms of the literal path.
        parent = io_utils.dirname(init_py)
        self.assertEqual('/src/tests/utils', parent)
        bare = io_utils.path_without_ext(init_py)
        self.assertEqual('/src/tests/utils/__init__', bare)

        # Human-readable sizes and line count.
        empty_size = io_utils.filesize(empty)
        self.assertEqual('0.0Bytes', empty_size)
        archive_size = io_utils.filesize(self.archive)
        self.assertEqual('11.9KB', archive_size)
        line_total = io_utils.count_lines(empty)
        self.assertEqual(0, line_total)

        # Extension predicate, positive then negative case.
        is_py = io_utils.has_extension(empty, '.py')
        self.assertTrue(is_py)
        is_txt = io_utils.has_extension(empty, '.txt')
        self.assertFalse(is_txt)

        # Extension swap, including the empty-path case.
        swapped = io_utils.change_extension(init_py, 'txt')
        self.assertEqual('/src/tests/utils/__init__.txt', swapped)
        swapped_empty = io_utils.change_extension('', 'txt')
        self.assertEqual('', swapped_empty)
Example #6
0
 def execute_decompress(self, fin, fout):
     """Check the decompression step of ``self.extractor``.

     Calls ``self.extractor.save_xml(fin, fout)`` and then asserts that
     ``fout`` exists and holds 366 lines.

     Args:
         fin: Input handed to ``save_xml``.
         fout: Path of the file ``save_xml`` is expected to produce.
     """
     self.extractor.save_xml(fin, fout)
     self.assertTrue(os.path.exists(fout),
                     'Missing output file: {}'.format(fout))
     self.assertEqual(io_utils.count_lines(fout), 366)