def test_extract_css_complex_css(self, samples_dir):
     # The stylesheet extracted from the sample document must have the
     # expected length and begin with the `@page` rule.
     sample = samples_dir.join("sample2.html")
     markup = sample.read_text('utf-8')
     _, stylesheet = extract_css(markup, 'sample.html')
     assert len(stylesheet) == 210
     assert stylesheet.startswith('@page { size: 21cm')
 def test_extract_css_utf8_unicode(self):
     # A unicode string containing an umlaut comes back unchanged; as the
     # input carries no styles, no CSS is returned.
     source = u"<html><body>ä</body></html>"
     markup, stylesheet = extract_css(source, 'sample.html')
     assert stylesheet is None
     assert markup == u'<html><body>ä</body></html>'
 def test_extract_css_complex_html(self, samples_dir):
     # The returned HTML must not contain any <style> tag any more and
     # must reference the external stylesheet through a <link> instead.
     sample = samples_dir.join("sample2.html")
     markup, _ = extract_css(sample.read_text('utf-8'), 'sample.html')
     expected_link = (
         '<link href="sample.css" rel="stylesheet" type="text/css"/>')
     assert '<style' not in markup
     assert expected_link in markup
 def test_extract_css_no_empty_comments(self):
     # Make sure there are no empty comments in CSS
     # NOTE(review): the assertion below inspects the returned HTML, not
     # the extracted CSS -- confirm which of the two is intended.
     html_input_path = os.path.join(
         os.path.dirname(__file__), 'input', 'sample2.html')
     # Use a context manager so the file handle is closed right away
     # instead of being left to the garbage collector.
     with open(html_input_path, 'rb') as fd:
         html_input = fd.read()
     result, css = extract_css(html_input, 'sample.html')
     assert '/*' not in result
 def test_extract_css_complex_css(self):
     # Make sure we get proper external stylesheets: the CSS extracted
     # from the sample file has the expected length and starts with the
     # `@page` rule.
     html_input_path = os.path.join(
         os.path.dirname(__file__), 'input', 'sample2.html')
     # Use a context manager so the file handle is closed right away
     # instead of being left to the garbage collector.
     with open(html_input_path, 'rb') as fd:
         html_input = fd.read()
     result, css = extract_css(html_input, 'sample.html')
     assert len(css) == 156
     assert css.startswith('@page { size: 21cm')
 def test_extract_css_prettify(self):
     # With `prettify_html` switched on the markup comes back indented,
     # one node per line (which may alter how the HTML renders).
     source = "<span>text<span>no</span>gap</span>"
     markup, _ = extract_css(source, "sample.html", prettify_html=True)
     expected = "<span>\n text\n <span>\n  no\n </span>\n gap\n</span>"
     assert markup == expected
 def test_extract_css_complex_html(self):
     # Make sure we have styles purged and replaced by a link
     html_input_path = os.path.join(
         os.path.dirname(__file__), 'input', 'sample2.html')
     # Use a context manager so the file handle is closed right away
     # instead of being left to the garbage collector.
     with open(html_input_path, 'rb') as fd:
         html_input = fd.read()
     result, css = extract_css(html_input, 'sample.html')
     assert '<style' not in result
     link = '<link href="sample.css" rel="stylesheet" type="text/css"/>'
     assert link in result
 def test_extract_css_contains_all_styles_from_input(self, samples_dir):
     # Every style rule of the sample document must show up, verbatim,
     # in the extracted CSS.
     sample = samples_dir.join("sample2.html")
     _, stylesheet = extract_css(sample.read_text('utf-8'), "sample.html")
     expected = (
         "@page { size: 21cm 29.7cm; margin: 2cm }\n"
         "p { margin-bottom: 0.21cm }\n"
         "span.c2 { font-family: DejaVu Sans Mono, sans-serif }\n"
         "span.c3 { font-family: DejaVu Sans Mönö, sans-serif }\n"
         "p.c1 { margin-bottom: 0cm }\n  \n  "
     )
     assert stylesheet == expected
Exemple #9
0
    def process(self, path, metadata):
        """Move inline styles of an HTML file into a separate CSS file.

        Files whose extension is not in `self.supported_extensions` are
        passed through untouched. Otherwise the file is copied via
        `copy_to_secure_location`, the original is removed via
        `remove_file_dir`, and the copy is rewritten with the HTML
        returned by `extract_css`. If the cleaned-up CSS is not `None` it
        is written next to the HTML file, with the extension replaced by
        `.css`.

        :param path: path of the file to process.
        :param metadata: passed through unchanged.
        :returns: tuple `(path_of_resulting_file, metadata)`.
        """
        ext = os.path.splitext(path)[1]
        if ext not in self.supported_extensions:
            return path, metadata
        basename = os.path.basename(path)
        # presumably `copy_to_secure_location` returns the directory that
        # holds the copy -- confirm against its definition.
        src_path = os.path.join(copy_to_secure_location(path), basename)
        remove_file_dir(path)

        # Read through a context manager so the handle is closed before
        # the very same file is reopened for writing further down.
        with open(src_path, 'rb') as fd:
            html = fd.read().decode('utf-8')
        new_html, css = extract_css(
            html,
            basename,
            prettify_html=self.options['css_cleaner_prettify_html'])
        # Errors reported by the cleanup are deliberately not evaluated.
        css, _errors = cleanup_css(
            css, minified=self.options['css_cleaner_minified'])

        css_file = os.path.splitext(src_path)[0] + '.css'
        if css is not None:
            with open(css_file, 'wb') as fd:
                fd.write(css.encode('utf-8'))
        with open(src_path, 'wb') as fd:
            fd.write(new_html.encode('utf-8'))

        return src_path, metadata
Exemple #10
0
    def process(self, path, metadata):
        """Split an HTML file into markup and an external stylesheet.

        A file whose extension is not listed in
        `self.supported_extensions` is returned as-is. Any other file is
        copied with `copy_to_secure_location` and the original removed
        with `remove_file_dir`; the copy is then overwritten with the
        HTML produced by `extract_css`. The CSS, after `cleanup_css`, is
        stored beside it under the same name with a `.css` extension,
        unless it is `None`.

        :param path: path of the file to process.
        :param metadata: passed through unchanged.
        :returns: tuple `(path_of_resulting_file, metadata)`.
        """
        ext = os.path.splitext(path)[1]
        if ext not in self.supported_extensions:
            return path, metadata
        basename = os.path.basename(path)
        # presumably `copy_to_secure_location` returns the directory that
        # holds the copy -- confirm against its definition.
        src_path = os.path.join(
            copy_to_secure_location(path), basename)
        remove_file_dir(path)

        # Close the input handle explicitly (via `with`) before the same
        # path is opened for writing below.
        with open(src_path, 'rb') as fd:
            html = fd.read().decode('utf-8')
        new_html, css = extract_css(
            html, basename,
            prettify_html=self.options['css_cleaner_prettify_html'])
        # Errors reported by the cleanup are deliberately not evaluated.
        css, _errors = cleanup_css(
            css, minified=self.options['css_cleaner_minified'])

        css_file = os.path.splitext(src_path)[0] + '.css'
        if css is not None:
            with open(css_file, 'wb') as fd:
                fd.write(css.encode('utf-8'))
        with open(src_path, 'wb') as fd:
            fd.write(new_html.encode('utf-8'))

        return src_path, metadata
 def test_extract_css_puts_links_into_html(self, samples_dir):
     # The HTML handed back for the sample document must match the
     # expected markup exactly, including the stylesheet <link> in the
     # document head.
     sample = samples_dir.join("sample2.html")
     markup, _ = extract_css(sample.read_text('utf-8'), "sample.html")
     expected = (
         '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\n'
         '    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n'
         '\n'
         '<html xmlns="http://www.w3.org/1999/xhtml">\n'
         '<head>\n'
         '<meta content="HTML Tidy for Linux/x86 (vers 6 November 2007)'
         ', see www.w3.org" name="generator"/>\n'
         '<meta content="text/html; charset=utf-8" '
         'http-equiv="CONTENT-TYPE"/>\n'
         '<title></title>\n'
         '<meta content="OpenOffice.org 2.4 (Linux)" name="GENERATOR"/>\n'
         '<meta content="Uli Fouquet" name="AUTHOR"/>\n'
         '<meta content="20110517;485000" name="CREATED"/>\n'
         '<meta content="Uli Fouquet" name="CHANGEDBY"/>\n'
         '<meta content="20110517;524000" name="CHANGED"/>\n'
         '<link href="sample.css" rel="stylesheet" type="text/css"/>\n'
         '\n'
         '</head>\n'
         '<body dir="ltr" lang="de-DE" xml:lang="de-DE">\n'
         '<p class="c1">Some text</p>\n'
         '<p class="c1"><br/></p>\n'
         '<p class="c1">with <b>bold</b> and <i>italic</i> fonts.</p>\n'
         '<p class="c1"><br/></p>\n'
         '<p class="c1">Also a <span class="c2">complete different\n'
         '  font</span> is here. With umlaut: ä</p>\n'
         '<p class="c1">Finally, some\n'
         '    <span class="c2">seam</span><span>less text.</span>\n'
         '</p>\n'
         '<p class="c1"><br/></p>\n'
         '</body>\n'
         '</html>\n'
         )
     assert markup == expected
 def test_extract_css_utf8(self):
     # Umlauts in the input must not trip the extraction: the markup is
     # returned unchanged and, with no styles present, the CSS is None.
     source = "<html><body>äö</body></html>"
     markup, stylesheet = extract_css(source, 'sample.html')
     assert stylesheet is None
     assert markup == '<html><body>äö</body></html>'
 def test_extract_css_no_empty_comments(self, samples_dir):
     # No comment opener (`/*`) may be left over in the first value
     # returned by `extract_css` for the sample document.
     sample = samples_dir.join("sample2.html")
     markup, _ = extract_css(sample.read_text('utf-8'), 'sample.html')
     assert '/*' not in markup
 def test_extract_css_utf8_unicode(self):
     # Input with an umlaut yields no CSS; the HTML is expected back in
     # indented, one-node-per-line form.
     source = "<html><body>ä</body></html>"
     markup, stylesheet = extract_css(source, 'sample.html')
     assert stylesheet is None
     assert markup == '<html>\n <body>\n  ä\n </body>\n</html>'
 def test_extract_css_utf8(self):
     # Input with several umlauts yields no CSS; the HTML is expected
     # back in indented, one-node-per-line form.
     source = "<html><body>äö</body></html>"
     markup, stylesheet = extract_css(source, 'sample.html')
     assert stylesheet is None
     assert markup == '<html>\n <body>\n  äö\n </body>\n</html>'
 def test_extract_css_nested_styles(self):
     # Garbage in, garbage out: for nested <style> tags the contents of
     # both are collected, separated by a newline.
     source = "<html><style>a<style>b</style></style></html>"
     _, stylesheet = extract_css(source, 'sample.html')
     assert stylesheet == 'a\nb'
 def test_extract_css_empty_styles2(self):
     # A self-closing <style /> tag produces no CSS and is dropped from
     # the returned markup.
     source = "<html><style /></html>"
     markup, stylesheet = extract_css(source, 'sample.html')
     assert stylesheet is None
     assert markup == "<html>\n</html>"
 def test_extract_css_empty_styles1(self):
     # A document consisting of nothing but an empty <style> element
     # gives back empty markup and no CSS.
     source = "<style></style>"
     markup, stylesheet = extract_css(source, 'sample.html')
     assert stylesheet is None
     assert markup == ""
 def test_extract_css_simple(self):
     # A lone <style> element is turned into a stylesheet link, while
     # its content becomes the extracted CSS.
     markup, stylesheet = extract_css("<style>a, b</style>", 'sample.html')
     expected_markup = (
         '<link href="sample.css" rel="stylesheet" type="text/css"/>\n')
     assert stylesheet == 'a, b'
     assert markup == expected_markup
 def test_extract_css_trash(self):
     # Empty input is tolerated: empty markup back, no CSS.
     source = ""
     markup, stylesheet = extract_css(source, 'sample.html')
     assert stylesheet is None
     assert markup == ""
 def test_extract_css_no_prettify_by_default(self):
     # Without the `prettify_html` argument the markup is handed back
     # exactly as it was passed in.
     source = "<span>text<span>no</span>gap</span>"
     markup, _ = extract_css(source, "sample.html")
     assert markup == "<span>text<span>no</span>gap</span>"