Beispiel #1
0
 def test_non_html_ignored(self, workdir):
     # Only .html/.xhtml inputs are handled; for a .txt path the
     # processor has to hand back exactly the path it was given.
     txt_path = str(workdir / "src" / "sample.txt")
     cleaner = HTMLCleaner()
     returned_path, _metadata = cleaner.process(txt_path, {'error': False})
     assert returned_path == txt_path
Beispiel #2
0
 def test_option_fix_head_nums_false(self, samples_dir, workdir):
     # With `html-cleaner-fix-head-nums` switched off, the heading-number
     # markup must not show up in the processed document.
     samples_dir.join("sample3.html").copy(workdir / "src" / "sample.html")
     proc = HTMLCleaner(options={'html-cleaner-fix-head-nums': 'False'})
     resultpath, metadata = proc.process(
         str(workdir / "src" / "sample.html"), {'error': False})
     # Read via a context manager so the handle is closed deterministically
     # instead of being left to the garbage collector.
     with codecs.open(resultpath, 'r', 'utf-8') as result_file:
         contents = result_file.read()
     assert u'<span class="u-o-headnum">1</span>Häding1' not in contents
Beispiel #3
0
 def test_option_invalid(self):
     # Constructing the processor with a non-boolean value ('foo') for any
     # of these switches has to raise ArgumentParserError.
     # NOTE(review): the last key is spelled `fix-sdfields`, while other
     # tests use `fix-sd-fields` -- confirm which spelling is intended.
     option_names = (
         'html-cleaner-fix-head-nums',
         'html-cleaner-fix-img-links',
         'html-cleaner-fix-sdfields',
     )
     for option_name in option_names:
         with pytest.raises(ArgumentParserError):
             HTMLCleaner(options={option_name: 'foo'})
Beispiel #4
0
 def test_option_fix_sdfields_true(self, samples_dir, workdir):
     # Make sure we respect the `html-cleaner-fix-sd-fields` option if it
     # is enabled: no PAGE sdfield tag may be left in the output.
     samples_dir.join("sample3.html").copy(workdir / "src" / "sample.html")
     proc = HTMLCleaner(options={'html-cleaner-fix-sd-fields': '1'})
     resultpath, metadata = proc.process(
         str(workdir / "src" / "sample.html"), {'error': False})
     # Context manager guarantees the result file is closed after reading.
     with codecs.open(resultpath, 'r', 'utf-8') as result_file:
         contents = result_file.read()
     assert u'<sdfield type="PAGE">' not in contents
 def test_rename_img_files_src_is_dir(self, workdir):
     # A rename request whose source name is `src` (presumably the
     # directory provided by the `workdir` fixture -- confirm) must not
     # produce the target file.
     base_dir = str(workdir)
     renames = {'src': 'sample.jpg'}
     cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
     cleaner.rename_img_files(base_dir, renames)
     assert 'sample.jpg' not in os.listdir(base_dir)
 def test_non_html_ignored(self, workdir):
     # A file without .html/.xhtml extension is expected to be skipped:
     # the path returned by `process` equals the path passed in.
     given_path = str(workdir / "src" / "sample.txt")
     initial_metadata = {'error': False}
     cleaner = HTMLCleaner()
     result_path, _metadata = cleaner.process(given_path, initial_metadata)
     assert result_path == given_path
Beispiel #7
0
 def test_rename_img_files_no_src(self, samples_dir, workdir):
     # Asking to rename a file that is not in the directory must not
     # create the destination file.
     image_name = "image_sample_html_m20918026.gif"
     src_dir = workdir / "src"
     samples_dir.join(image_name).copy(src_dir / image_name)
     cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
     renames = {'not-existing-filename': 'sample_1.gif'}
     cleaner.rename_img_files(str(src_dir), renames)
     assert 'sample_1.gif' not in os.listdir(str(src_dir))
 def test_non_html_ignored(self):
     # Non .html/.xhtml files are ignored
     proc = HTMLCleaner()
     sample_path = os.path.join(self.workdir, 'sample.txt')
     # Write through a context manager so the content is flushed and the
     # handle closed before the processor is handed the file.
     with open(sample_path, 'w') as sample_file:
         sample_file.write('Sample file.')
     self.resultpath, metadata = proc.process(
         sample_path, {'error': False})
     # the returned path is the very path we passed in
     assert self.resultpath == sample_path
 def test_rename_img_files(self):
     # After renaming, only the new file name may be listed in the
     # directory. The source image is expected to exist in
     # `self.workdir2` from fixture setup not visible here.
     old_name = 'image_sample_html_m20918026.gif'
     new_name = 'sample_1.gif'
     cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
     cleaner.rename_img_files(self.workdir2, {old_name: new_name})
     remaining = os.listdir(self.workdir2)
     assert new_name in remaining
     assert old_name not in remaining
 def test_option_fix_sdfields_true(self):
     # Make sure we respect the `html-cleaner-fix-sd-fields` option if it
     # is enabled: no PAGE sdfield tag may be left in the output.
     proc = HTMLCleaner(
         options={
             'html-cleaner-fix-sd-fields': '1'})
     self.resultpath, metadata = proc.process(
         self.sample_path, {'error': False})
     # Context manager closes the handle right after reading.
     with open(self.resultpath, 'rb') as result_file:
         contents = result_file.read()
     # The file is read in binary mode, so compare against a bytes literal
     # (identical to a plain literal on Python 2, required on Python 3).
     snippet = b'<sdfield type="PAGE">'
     assert snippet not in contents
 def test_rename_img_files_no_src(self):
     # A rename whose source name is 'not-existing-filename' must not
     # leave the destination file behind.
     target_dir = self.workdir2
     renames = {'not-existing-filename': 'sample_1.gif'}
     cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
     cleaner.rename_img_files(target_dir, renames)
     assert 'sample_1.gif' not in os.listdir(target_dir)
 def test_option_fix_head_nums_false(self, samples_dir, workdir):
     # With `html-cleaner-fix-head-nums` switched off, the heading-number
     # markup must not show up in the processed document.
     samples_dir.join("sample3.html").copy(workdir / "src" / "sample.html")
     proc = HTMLCleaner(
         options={
             'html-cleaner-fix-head-nums': 'False'})
     resultpath, metadata = proc.process(
         str(workdir / "src" / "sample.html"), {'error': False})
     # Read via a context manager so the handle is closed deterministically.
     with codecs.open(resultpath, 'r', 'utf-8') as result_file:
         contents = result_file.read()
     assert u'<span class="u-o-headnum">1</span>Häding1' not in contents
 def test_option_fix_sdfields_true(self, samples_dir, workdir):
     # Make sure we respect the `html-cleaner-fix-sd-fields` option if it
     # is enabled: no PAGE sdfield tag may be left in the output.
     samples_dir.join("sample3.html").copy(workdir / "src" / "sample.html")
     proc = HTMLCleaner(
         options={
             'html-cleaner-fix-sd-fields': '1'})
     resultpath, metadata = proc.process(
         str(workdir / "src" / "sample.html"), {'error': False})
     # Context manager guarantees the result file is closed after reading.
     with codecs.open(resultpath, 'r', 'utf-8') as result_file:
         contents = result_file.read()
     assert u'<sdfield type="PAGE">' not in contents
 def test_cleaner(self, workdir, samples_dir):
     # Make sure erroneous headings are fixed by default, i.e. without
     # passing any options to the processor.
     samples_dir.join("sample3.html").copy(workdir / "src" / "sample.html")
     proc = HTMLCleaner()
     resultpath, metadata = proc.process(
         str(workdir / "src" / "sample.html"), {'error': False})
     # Context manager guarantees the result file is closed after reading.
     with codecs.open(resultpath, 'r', 'utf-8') as result_file:
         contents = result_file.read()
     # These membership checks must be real assertions; as bare expressions
     # they were evaluated and discarded, so the test could never fail.
     assert u'<span class="u-o-headnum">1</span>Häding1' in contents
     assert u'<span class="u-o-headnum">1.1</span>Heading1.1' in contents
     assert u'<span class="u-o-headnum">1.2.</span>Heading1.2.' in contents
Beispiel #15
0
 def test_cleaner(self, workdir, samples_dir):
     # Make sure erroneous headings are fixed by default, i.e. without
     # passing any options to the processor.
     samples_dir.join("sample3.html").copy(workdir / "src" / "sample.html")
     proc = HTMLCleaner()
     resultpath, metadata = proc.process(
         str(workdir / "src" / "sample.html"), {'error': False})
     # Context manager guarantees the result file is closed after reading.
     with codecs.open(resultpath, 'r', 'utf-8') as result_file:
         contents = result_file.read()
     # These membership checks must be real assertions; as bare expressions
     # they were evaluated and discarded, so the test could never fail.
     assert u'<span class="u-o-headnum">1</span>Häding1' in contents
     assert u'<span class="u-o-headnum">1.1</span>Heading1.1' in contents
     assert u'<span class="u-o-headnum">1.2.</span>Heading1.2.' in contents
 def test_rename_img_files_src_is_dir(self):
     # When the rename source is a directory rather than a file, the
     # destination name must not appear afterwards.
     cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
     target_dir = self.workdir2
     # Create the directory that is then passed off as an "image" source.
     os.mkdir(os.path.join(target_dir, 'some_dir'))
     cleaner.rename_img_files(target_dir, {'some_dir': 'sample.jpg'})
     assert 'sample.jpg' not in os.listdir(target_dir)
 def test_rename_img_files_dst_exists_already(self):
     # Source and destination name are the same here, so the destination
     # already exists; the file must still be listed afterwards.
     image_name = 'image_sample_html_m20918026.gif'
     cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
     cleaner.rename_img_files(self.workdir2, {image_name: image_name})
     assert image_name in os.listdir(self.workdir2)
Beispiel #18
0
 def test_rename_img_files_dst_exists_already(self, samples_dir, workdir):
     # Renaming a file onto its own name (destination exists already)
     # must leave the file in place.
     image_name = "image_sample_html_m20918026.gif"
     src_dir = workdir / "src"
     samples_dir.join(image_name).copy(src_dir / image_name)
     cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
     cleaner.rename_img_files(str(src_dir), {image_name: image_name})
     assert image_name in os.listdir(str(src_dir))
Beispiel #19
0
 def test_rename_img_files(self, samples_dir, workdir):
     # Copy a sample image into place, rename it, and check that only the
     # new name is listed afterwards.
     old_name = "image_sample_html_m20918026.gif"
     new_name = 'sample_1.gif'
     src_dir = workdir / "src"
     samples_dir.join(old_name).copy(src_dir / old_name)
     cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
     cleaner.rename_img_files(str(src_dir), {old_name: new_name})
     remaining = os.listdir(str(src_dir))
     assert new_name in remaining
     assert old_name not in remaining
 def test_rename_img_files_no_src(self, samples_dir, workdir):
     # The mapping names a source file that was never copied into the
     # directory; the destination must therefore not come into existence.
     existing_image = "image_sample_html_m20918026.gif"
     src_dir = workdir / "src"
     samples_dir.join(existing_image).copy(src_dir / existing_image)
     cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
     src_dir_path = str(src_dir)
     cleaner.rename_img_files(
         src_dir_path, {'not-existing-filename': 'sample_1.gif'})
     assert 'sample_1.gif' not in os.listdir(src_dir_path)
    def test_option_fix_head_nums_false(self):
        # With `html-cleaner-fix-head-nums` switched off, the numbered
        # heading markup must not show up in the processed document.
        proc = HTMLCleaner(
            options={
                'html-cleaner-fix-head-nums': 'False'})
        self.resultpath, metadata = proc.process(
            self.sample_path, {'error': False})
        # Context manager closes the handle right after reading.
        with open(self.resultpath, 'rb') as result_file:
            contents = result_file.read()

        # NOTE(review): the file is read in binary mode while the snippet
        # is a native string literal; the membership test only works where
        # both are the same type (Python 2) -- confirm target interpreter.
        snippet1 = (
            '<h1 class="foo"><span class="u-o-headnum">1</span>Häding1</h1>')
        assert snippet1 not in contents
 def test_rename_img_files(self, samples_dir, workdir):
     # An image placed in the source directory gets renamed: the new name
     # must be listed afterwards, the old one must be gone.
     original = "image_sample_html_m20918026.gif"
     renamed = 'sample_1.gif'
     image_dir = workdir / "src"
     samples_dir.join(original).copy(image_dir / original)
     cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
     image_dir_path = str(image_dir)
     cleaner.rename_img_files(image_dir_path, {original: renamed})
     entries = os.listdir(image_dir_path)
     assert renamed in entries
     assert original not in entries
 def test_option_fix_img_links_true(self):
     # Make sure we respect the `fix_img_links` option if true: the old
     # image reference and file name are gone, `sample_1.gif` exists.
     proc = HTMLCleaner(
         options={
             'html-cleaner-fix-img-links': '1'})
     self.resultpath, metadata = proc.process(
         self.img_sample_path, {'error': False})
     # Context manager closes the handle right after reading.
     with open(self.resultpath, 'rb') as result_file:
         contents = result_file.read()
     resultdir = os.path.dirname(self.resultpath)
     # The file is read in binary mode, so compare against a bytes literal
     # (identical to a plain literal on Python 2, required on Python 3).
     snippet = b'<IMG SRC="image_sample_html_m20918026.gif"'
     list_dir = os.listdir(resultdir)
     assert snippet not in contents
     assert 'image_sample_html_m20918026.gif' not in list_dir
     assert 'sample_1.gif' in list_dir
 def test_rename_img_files_dst_exists_already(self, samples_dir, workdir):
     # The rename maps the image onto its own name, i.e. the destination
     # is already present; the file must survive the call.
     gif_name = "image_sample_html_m20918026.gif"
     image_dir = workdir / "src"
     samples_dir.join(gif_name).copy(image_dir / gif_name)
     cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
     identity_rename = {gif_name: gif_name}
     cleaner.rename_img_files(str(image_dir), identity_rename)
     entries = os.listdir(str(image_dir))
     assert gif_name in entries
Beispiel #25
0
 def test_option_fix_img_links_true(self, samples_dir, workdir):
     # Make sure we respect the `fix_img_links` option if true: the old
     # image reference and file name are gone, `sample_1.gif` exists.
     samples_dir.join("image_sample.html").copy(workdir / "src" /
                                                "sample.html")
     samples_dir.join("image_sample_html_m20918026.gif").copy(
         workdir / "src" / "image_sample_html_m20918026.gif")
     proc = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
     resultpath, metadata = proc.process(
         str(workdir / "src" / "sample.html"), {'error': False})
     # Context manager guarantees the result file is closed after reading.
     with open(resultpath, 'r') as result_file:
         contents = result_file.read()
     resultdir = os.path.dirname(resultpath)
     snippet = '<IMG SRC="image_sample_html_m20918026.gif"'
     list_dir = os.listdir(resultdir)
     assert snippet not in contents
     assert 'image_sample_html_m20918026.gif' not in list_dir
     assert 'sample_1.gif' in list_dir
    def test_cleaner(self):
        # Make sure erroneous headings are fixed by default, i.e. without
        # passing any options to the processor.
        proc = HTMLCleaner()
        self.resultpath, metadata = proc.process(
            self.sample_path, {'error': False})
        # Context manager closes the handle right after reading.
        with open(self.resultpath, 'rb') as result_file:
            contents = result_file.read()

        # NOTE(review): the file is read in binary mode while the snippets
        # are native string literals; the membership tests only work where
        # both are the same type (Python 2) -- confirm target interpreter.
        snippet1 = '<span class="u-o-headnum">1</span>Häding1'
        snippet2 = '<span class="u-o-headnum">1.1</span>Heading1.1'
        snippet3 = '<span class="u-o-headnum">1.2.</span>Heading1.2.'
        assert snippet1 in contents
        assert snippet2 in contents
        assert snippet3 in contents
 def test_option_fix_img_links_true(self, samples_dir, workdir):
     # Make sure we respect the `fix_img_links` option if true: the old
     # image reference and file name are gone, `sample_1.gif` exists.
     samples_dir.join("image_sample.html").copy(
         workdir / "src" / "sample.html")
     samples_dir.join("image_sample_html_m20918026.gif").copy(
         workdir / "src" / "image_sample_html_m20918026.gif")
     proc = HTMLCleaner(
         options={
             'html-cleaner-fix-img-links': '1'})
     resultpath, metadata = proc.process(
         str(workdir / "src" / "sample.html"), {'error': False})
     # Context manager guarantees the result file is closed after reading.
     with open(resultpath, 'r') as result_file:
         contents = result_file.read()
     resultdir = os.path.dirname(resultpath)
     snippet = '<IMG SRC="image_sample_html_m20918026.gif"'
     list_dir = os.listdir(resultdir)
     assert snippet not in contents
     assert 'image_sample_html_m20918026.gif' not in list_dir
     assert 'sample_1.gif' in list_dir
Beispiel #28
0
 def test_rename_img_files_src_is_dir(self, workdir):
     # The rename source `src` is presumably the directory created by the
     # `workdir` fixture (confirm); either way the destination file must
     # not appear.
     root = str(workdir)
     cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
     rename_map = {'src': 'sample.jpg'}
     cleaner.rename_img_files(root, rename_map)
     entries = os.listdir(root)
     assert 'sample.jpg' not in entries