def test_cleanup_html_fix_img_links(self):
    """Check that `cleanup_html` reports four images for the sample file.

    The sample document is `input/image_sample.html`, located next to
    this test module, and is read as raw bytes.
    """
    html_input_path = os.path.join(
        os.path.dirname(__file__), 'input', 'image_sample.html')
    # Use a context manager so the file handle is closed deterministically
    # instead of being left to the garbage collector.
    with open(html_input_path, 'rb') as fd:
        html_input = fd.read()
    result, img_map = cleanup_html(
        html_input, 'sample.html', fix_img_links=True)
    assert len(img_map) == 4
def process(self, path, metadata):
    """Clean up the HTML file at `path` and return its new location.

    Files whose extension is not listed in `self.supported_extensions`
    are passed through untouched.

    For supported files the source is copied with
    `copy_to_secure_location`, the original is handed to
    `remove_file_dir`, and the copy is rewritten in place with the output
    of `cleanup_html` (configured from the `html_cleaner_*` entries of
    `self.options`). Finally `self.rename_img_files` is called with the
    image name map returned by `cleanup_html`.

    :param path: path of the file to process.
    :param metadata: passed through unchanged.
    :returns: tuple `(path_of_processed_file, metadata)`.
    """
    ext = os.path.splitext(path)[1]
    if ext not in self.supported_extensions:
        return path, metadata
    basename = os.path.basename(path)
    # NOTE(review): copy_to_secure_location presumably returns the
    # directory holding the copy -- confirm against its definition.
    src_path = os.path.join(copy_to_secure_location(path), basename)
    src_dir = os.path.dirname(src_path)
    remove_file_dir(path)
    # Read through a context manager so the handle is closed before the
    # very same file is reopened for writing below.
    with codecs.open(src_path, 'r', 'utf-8') as fd:
        html = fd.read()
    new_html, img_name_map = cleanup_html(
        html, basename,
        fix_head_nums=self.options['html_cleaner_fix_heading_numbers'],
        fix_img_links=self.options['html_cleaner_fix_image_links'],
        fix_sdfields=self.options['html_cleaner_fix_sd_fields'],
    )
    with codecs.open(src_path, 'wb', 'utf-8') as fd:
        fd.write(new_html)
    # Rename images
    self.rename_img_files(src_dir, img_name_map)
    return src_path, metadata
def process(self, path, metadata):
    """Run the HTML cleaner on `path` and return the cleaned file's path.

    A file whose extension is not in `self.supported_extensions` is
    returned as-is together with its metadata.

    Otherwise the file is copied via `copy_to_secure_location`, the
    original is passed to `remove_file_dir`, the copy is read as UTF-8,
    run through `cleanup_html` (options taken from the `html_cleaner_*`
    keys of `self.options`) and written back. The image name map
    produced by `cleanup_html` is forwarded to `self.rename_img_files`.

    :param path: path of the file to process.
    :param metadata: returned unchanged.
    :returns: tuple `(path_of_processed_file, metadata)`.
    """
    ext = os.path.splitext(path)[1]
    if ext not in self.supported_extensions:
        return path, metadata
    basename = os.path.basename(path)
    # NOTE(review): the return value of copy_to_secure_location is used
    # as a directory here -- verify against the helper.
    src_path = os.path.join(
        copy_to_secure_location(path), basename)
    src_dir = os.path.dirname(src_path)
    remove_file_dir(path)
    # Close the read handle explicitly (via `with`) before the file is
    # opened again for writing; the original left it open.
    with codecs.open(src_path, 'r', 'utf-8') as fd:
        html = fd.read()
    new_html, img_name_map = cleanup_html(
        html, basename,
        fix_head_nums=self.options['html_cleaner_fix_heading_numbers'],
        fix_img_links=self.options['html_cleaner_fix_image_links'],
        fix_sdfields=self.options['html_cleaner_fix_sd_fields'],
    )
    with codecs.open(src_path, 'wb', 'utf-8') as fd:
        fd.write(new_html)
    # Rename images
    self.rename_img_files(src_dir, img_name_map)
    return src_path, metadata
def test_cleanup_html_dont_fix_sdfields(self):
    """With `fix_sdfields=False` the `<sdfield>` markup must come back
    exactly as it went in."""
    markup = '<p>Blah<sdfield type="PAGE">8</sdfield></p>'
    cleaned, _img_map = cleanup_html(
        markup, 'sample.html', fix_sdfields=False)
    assert markup == cleaned
def test_cleanup_html_fix_sdfields(self):
    """Called without extra options, `cleanup_html` is expected to turn
    an `<sdfield>` tag into a `<span class="sdfield">`."""
    markup = '<p>Blah<sdfield type="PAGE">8</sdfield></p>'
    wanted = '<p>Blah<span class="sdfield" type="PAGE">8</span></p>'
    cleaned, _img_map = cleanup_html(markup, 'sample.html')
    assert cleaned == wanted
def test_cleanup_html_fix_head_nums_linebreaks(self):
    """A heading number preceded by a line break and whitespace is still
    expected to be wrapped in a `u-o-headnum` span."""
    markup = '<body><h1>\n 1.1.Heading</h1></body>'
    template = (
        '<body><h1>\n <span class="u-o-headnum">%s</span>'
        'Heading</h1></body>'
    )
    cleaned, _img_map = cleanup_html(markup, 'sample.html')
    assert cleaned == template % '1.1.'
def test_cleanup_html_fix_head_nums_tag_attrs(self):
    """Heading numbers are expected to be wrapped in a `u-o-headnum` span
    while attributes on the heading tag stay in place."""
    markup = '<body><h6 class="foo">1.1.Heading</h6></body>'
    template = (
        '<body><h6 class="foo"><span class="u-o-headnum">%s'
        '</span>Heading</h6></body>'
    )
    cleaned, _img_map = cleanup_html(markup, 'sample.html')
    assert cleaned == template % '1.1.'
def test_cleanup_html_fix_head_nums_no_nums(self):
    """A heading that carries no number is expected to pass through
    `cleanup_html` unmodified."""
    markup = '<body><h1>Heading</h1></body>'
    wanted = '<body><h1>Heading</h1></body>'
    cleaned, _img_map = cleanup_html(markup, 'sample.html')
    assert cleaned == wanted
def test_cleanup_html_no_minify_by_default(self):
    """Line breaks between tags must survive a default `cleanup_html`
    call, i.e. the markup is not minified unless asked for."""
    markup = '<span>\n<span>foo</span>\n</span>'
    cleaned, _img_map = cleanup_html(markup, 'sample.html')
    assert cleaned == markup
def test_cleanup_html_fix_img_links(self, samples_dir):
    """With `fix_img_links=True` the image map returned for
    `image_sample.html` is expected to hold four entries.

    `samples_dir` is a fixture object exposing `join(name).read()`.
    """
    sample_file = samples_dir.join("image_sample.html")
    markup = sample_file.read()
    _cleaned, image_names = cleanup_html(
        markup, 'sample.html', fix_img_links=True)
    assert len(image_names) == 4