def test_extract_css_complex_css(self, samples_dir):
    """The CSS extracted from ``sample2.html`` has the expected shape.

    Checks the length and the leading characters of the stylesheet
    returned by `extract_css` for the sample document.
    """
    sample = samples_dir.join("sample2.html")
    markup = sample.read_text('utf-8')
    _html, stylesheet = extract_css(markup, 'sample.html')
    assert len(stylesheet) == 210
    assert stylesheet.startswith('@page { size: 21cm')
def test_extract_css_utf8_unicode(self):
    """Umlauts in unicode strings are handled.

    The document carries no styles, so no CSS is returned and the HTML
    comes back unchanged.
    """
    markup = u"<html><body>ä</body></html>"
    html, stylesheet = extract_css(markup, 'sample.html')
    assert stylesheet is None
    assert html == u'<html><body>ä</body></html>'
def test_extract_css_complex_html(self, samples_dir):
    """Styles are purged from the HTML and replaced by a stylesheet link."""
    markup = samples_dir.join("sample2.html").read_text('utf-8')
    html, _css = extract_css(markup, 'sample.html')
    expected_link = (
        '<link href="sample.css" rel="stylesheet" type="text/css"/>')
    assert '<style' not in html
    assert expected_link in html
def test_extract_css_no_empty_comments(self):
    """Make sure no comment opener (``/*``) is left in the result.

    The sample document ``input/sample2.html`` (located next to this test
    module) is read as raw bytes and handed to `extract_css`.

    NOTE(review): the original remark spoke of empty comments "in CSS",
    but the assertion inspects the returned HTML, not the extracted CSS.
    Confirm which of the two was intended.
    """
    html_input_path = os.path.join(
        os.path.dirname(__file__), 'input', 'sample2.html')
    # Use a context manager so the file handle is closed right away
    # instead of being left to the garbage collector.
    with open(html_input_path, 'rb') as fd:
        html_input = fd.read()
    result, css = extract_css(html_input, 'sample.html')
    assert '/*' not in result
def test_extract_css_complex_css(self):
    """Make sure we get proper external stylesheets.

    The sample document ``input/sample2.html`` (located next to this test
    module) is read as raw bytes and handed to `extract_css`; the length
    and the leading characters of the returned CSS are checked.
    """
    html_input_path = os.path.join(
        os.path.dirname(__file__), 'input', 'sample2.html')
    # Use a context manager so the file handle is closed right away
    # instead of being left to the garbage collector.
    with open(html_input_path, 'rb') as fd:
        html_input = fd.read()
    result, css = extract_css(html_input, 'sample.html')
    assert len(css) == 156
    assert css.startswith('@page { size: 21cm')
def test_extract_css_prettify(self):
    """Prettified HTML is returned on request (although it might be broken).

    Passing ``prettify_html=True`` yields the line-broken form of the
    input markup.
    """
    expected = "<span>\n text\n <span>\n no\n </span>\n gap\n</span>"
    html, _css = extract_css(
        "<span>text<span>no</span>gap</span>",
        "sample.html",
        prettify_html=True,
    )
    assert html == expected
def test_extract_css_complex_html(self):
    """Make sure we have styles purged and replaced by a link.

    The sample document ``input/sample2.html`` (located next to this test
    module) is read as raw bytes and handed to `extract_css`; the returned
    HTML must not contain a ``<style`` tag and must contain a link to
    ``sample.css``.
    """
    html_input_path = os.path.join(
        os.path.dirname(__file__), 'input', 'sample2.html')
    # Use a context manager so the file handle is closed right away
    # instead of being left to the garbage collector.
    with open(html_input_path, 'rb') as fd:
        html_input = fd.read()
    result, css = extract_css(html_input, 'sample.html')
    assert '<style' not in result
    link = '<link href="sample.css" rel="stylesheet" type="text/css"/>'
    assert link in result
def test_extract_css_contains_all_styles_from_input(self, samples_dir):
    """The extracted CSS contains all styles from the input HTML."""
    expected_css = (
        "@page { size: 21cm 29.7cm; margin: 2cm }\n"
        "p { margin-bottom: 0.21cm }\n"
        "span.c2 { font-family: DejaVu Sans Mono, sans-serif }\n"
        "span.c3 { font-family: DejaVu Sans Mönö, sans-serif }\n"
        "p.c1 { margin-bottom: 0cm }\n \n "
    )
    sample = samples_dir.join("sample2.html")
    _html, stylesheet = extract_css(sample.read_text('utf-8'), "sample.html")
    assert stylesheet == expected_css
def process(self, path, metadata):
    """Move styles embedded in an HTML file into a separate CSS file.

    Files whose extension is not listed in ``self.supported_extensions``
    are passed through untouched.  For any other file, `path` is handed
    to `copy_to_secure_location` and afterwards to `remove_file_dir`; the
    copy is then read as UTF-8, run through `extract_css` and
    `cleanup_css`, and rewritten in place.  If any CSS remains it is
    written, UTF-8 encoded, next to the HTML file using the same base
    name and a ``.css`` extension.

    The options ``css_cleaner_prettify_html`` and ``css_cleaner_minified``
    from ``self.options`` are forwarded to `extract_css` and
    `cleanup_css` respectively.

    :param path: path of the file to process.
    :param metadata: passed through unchanged.
    :returns: tuple ``(path, metadata)``; `path` is the input path for
        unsupported files, otherwise the path of the rewritten copy.
    """
    ext = os.path.splitext(path)[1]
    if ext not in self.supported_extensions:
        return path, metadata
    basename = os.path.basename(path)
    # presumably `copy_to_secure_location` returns the directory holding
    # the copy -- its result is joined with the basename below.
    src_path = os.path.join(copy_to_secure_location(path), basename)
    remove_file_dir(path)
    # Read through a context manager so the handle is closed before the
    # very same file is reopened for writing further down.
    with open(src_path, 'rb') as fd:
        html = fd.read().decode('utf-8')
    new_html, css = extract_css(
        html, basename,
        prettify_html=self.options['css_cleaner_prettify_html'])
    # The error list returned by `cleanup_css` is not evaluated here.
    css, _errors = cleanup_css(
        css, minified=self.options['css_cleaner_minified'])
    if css is not None:
        css_file = os.path.splitext(src_path)[0] + '.css'
        with open(css_file, 'wb') as fd:
            fd.write(css.encode('utf-8'))
    with open(src_path, 'wb') as fd:
        fd.write(new_html.encode('utf-8'))
    return src_path, metadata
def process(self, path, metadata):
    """Move styles embedded in an HTML file into a separate CSS file.

    Files whose extension is not listed in ``self.supported_extensions``
    are passed through untouched.  For any other file, `path` is handed
    to `copy_to_secure_location` and afterwards to `remove_file_dir`; the
    copy is then read as UTF-8, run through `extract_css` and
    `cleanup_css`, and rewritten in place.  If any CSS remains it is
    written, UTF-8 encoded, next to the HTML file using the same base
    name and a ``.css`` extension.

    The options ``css_cleaner_prettify_html`` and ``css_cleaner_minified``
    from ``self.options`` are forwarded to `extract_css` and
    `cleanup_css` respectively.

    :param path: path of the file to process.
    :param metadata: passed through unchanged.
    :returns: tuple ``(path, metadata)``; `path` is the input path for
        unsupported files, otherwise the path of the rewritten copy.
    """
    ext = os.path.splitext(path)[1]
    if ext not in self.supported_extensions:
        return path, metadata
    basename = os.path.basename(path)
    # presumably `copy_to_secure_location` returns the directory holding
    # the copy -- its result is joined with the basename below.
    src_path = os.path.join(copy_to_secure_location(path), basename)
    remove_file_dir(path)
    # Read through a context manager so the handle is closed before the
    # very same file is reopened for writing further down.
    with open(src_path, 'rb') as fd:
        html = fd.read().decode('utf-8')
    new_html, css = extract_css(
        html, basename,
        prettify_html=self.options['css_cleaner_prettify_html'])
    # The error list returned by `cleanup_css` is not evaluated here.
    css, _errors = cleanup_css(
        css, minified=self.options['css_cleaner_minified'])
    if css is not None:
        css_file = os.path.splitext(src_path)[0] + '.css'
        with open(css_file, 'wb') as fd:
            fd.write(css.encode('utf-8'))
    with open(src_path, 'wb') as fd:
        fd.write(new_html.encode('utf-8'))
    return src_path, metadata
def test_extract_css_puts_links_into_html(self, samples_dir):
    """The returned HTML has the styles replaced with a link.

    The complete HTML produced for ``sample2.html`` is compared against
    the expected document.
    """
    expected_html = (
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\n'
        ' "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n'
        '\n'
        '<html xmlns="http://www.w3.org/1999/xhtml">\n'
        '<head>\n'
        '<meta content="HTML Tidy for Linux/x86 (vers 6 November 2007)'
        ', see www.w3.org" name="generator"/>\n'
        '<meta content="text/html; charset=utf-8" '
        'http-equiv="CONTENT-TYPE"/>\n'
        '<title></title>\n'
        '<meta content="OpenOffice.org 2.4 (Linux)" name="GENERATOR"/>\n'
        '<meta content="Uli Fouquet" name="AUTHOR"/>\n'
        '<meta content="20110517;485000" name="CREATED"/>\n'
        '<meta content="Uli Fouquet" name="CHANGEDBY"/>\n'
        '<meta content="20110517;524000" name="CHANGED"/>\n'
        '<link href="sample.css" rel="stylesheet" type="text/css"/>\n'
        '\n'
        '</head>\n'
        '<body dir="ltr" lang="de-DE" xml:lang="de-DE">\n'
        '<p class="c1">Some text</p>\n'
        '<p class="c1"><br/></p>\n'
        '<p class="c1">with <b>bold</b> and <i>italic</i> fonts.</p>\n'
        '<p class="c1"><br/></p>\n'
        '<p class="c1">Also a <span class="c2">complete different\n'
        ' font</span> is here. With umlaut: ä</p>\n'
        '<p class="c1">Finally, some\n'
        ' <span class="c2">seam</span><span>less text.</span>\n'
        '</p>\n'
        '<p class="c1"><br/></p>\n'
        '</body>\n'
        '</html>\n'
    )
    sample = samples_dir.join("sample2.html")
    markup, _css = extract_css(sample.read_text('utf-8'), "sample.html")
    assert markup == expected_html
def test_extract_css_utf8(self):
    """Umlauts do not trip up the extraction.

    The document carries no styles, so no CSS is returned and the HTML
    comes back unchanged.
    """
    markup = "<html><body>äö</body></html>"
    html, stylesheet = extract_css(markup, 'sample.html')
    assert stylesheet is None
    assert html == '<html><body>äö</body></html>'
def test_extract_css_no_empty_comments(self, samples_dir):
    """Make sure no comment opener (``/*``) is left in the result.

    NOTE(review): the original remark spoke of empty comments "in CSS",
    but the assertion inspects the returned HTML, not the extracted CSS.
    Confirm which of the two was intended.
    """
    sample = samples_dir.join("sample2.html")
    markup = sample.read_text('utf-8')
    html, _css = extract_css(markup, 'sample.html')
    assert '/*' not in html
def test_extract_css_utf8_unicode(self):
    """A document containing an umlaut yields no CSS and line-broken HTML."""
    expected = '<html>\n <body>\n ä\n </body>\n</html>'
    html, stylesheet = extract_css(
        "<html><body>ä</body></html>", 'sample.html')
    assert stylesheet is None
    assert html == expected
def test_extract_css_utf8(self):
    """A document containing umlauts yields no CSS and line-broken HTML."""
    expected = '<html>\n <body>\n äö\n </body>\n</html>'
    html, stylesheet = extract_css(
        "<html><body>äö</body></html>", 'sample.html')
    assert stylesheet is None
    assert html == expected
def test_extract_css_nested_styles(self):
    """Nested style tags: trash in, trash out.

    The contents of both style elements end up in the CSS, separated by
    a newline.
    """
    markup = "<html><style>a<style>b</style></style></html>"
    _html, stylesheet = extract_css(markup, 'sample.html')
    assert stylesheet == 'a\nb'
def test_extract_css_empty_styles2(self):
    """Trashy docs can be handled, too: a self-closing style tag.

    No CSS is returned and the style tag is gone from the HTML.
    """
    markup = "<html><style /></html>"
    html, stylesheet = extract_css(markup, 'sample.html')
    assert stylesheet is None
    assert html == "<html>\n</html>"
def test_extract_css_empty_styles1(self):
    """Trashy docs can be handled, too: a lone, empty style element.

    No CSS is returned and the remaining HTML is the empty string.
    """
    markup = "<style></style>"
    html, stylesheet = extract_css(markup, 'sample.html')
    assert stylesheet is None
    assert html == ""
def test_extract_css_simple(self):
    """A single style element becomes the CSS; a link takes its place."""
    expected_html = (
        '<link href="sample.css" rel="stylesheet" type="text/css"/>\n')
    html, stylesheet = extract_css("<style>a, b</style>", 'sample.html')
    assert stylesheet == 'a, b'
    assert html == expected_html
def test_extract_css_trash(self):
    """Trashy docs can be handled, too: the empty document.

    Empty input yields no CSS and empty HTML.
    """
    markup = ""
    html, stylesheet = extract_css(markup, 'sample.html')
    assert stylesheet is None
    assert html == ""
def test_extract_css_no_prettify_by_default(self):
    """By default the returned HTML is not prettified.

    Without the ``prettify_html`` argument the markup comes back exactly
    as it was passed in.
    """
    markup = "<span>text<span>no</span>gap</span>"
    html, _css = extract_css(markup, "sample.html")
    assert html == "<span>text<span>no</span>gap</span>"