Beispiel #1
0
    def transform(self, pretty_print=True, **kwargs):
        """Inline the CSS that applies to ``self.html`` as ``style`` attributes.

        Rules are gathered from ``<style>`` and ``<link rel=stylesheet>``
        elements of the document, from ``self.external_styles`` and from
        ``self.css_text``. They are then merged into the ``style`` attribute
        of every matching element. Afterwards, depending on instance flags,
        ``class`` attributes are removed, floated images get an ``align``
        attribute, and ``href``/``src`` values are joined with
        ``self.base_url``.

        Args:
            pretty_print: default for the ``pretty_print`` option handed to
                ``etree.tostring``; an explicit value in ``kwargs`` wins.
            **kwargs: forwarded to ``etree.tostring``. ``method``,
                ``pretty_print`` and ``encoding`` ('utf-8') are only filled
                in when the caller did not supply them.

        Returns:
            The tree obtained from ``self.html.getroottree()`` when
            ``self.html`` exposes ``getroottree``; otherwise the serialised
            document decoded to a string.

        Raises:
            ValueError: if ``self.base_url`` is set but has no URL scheme.
        """
        if hasattr(self.html, "getroottree"):
            # Input is already parsed: reuse its tree for all three roles
            # instead of parsing a string.
            root = self.html.getroottree()
            page = root
            tree = root
        else:
            if self.method == 'xml':
                parser = etree.XMLParser(
                    ns_clean=False,
                    resolve_entities=False
                )
            else:
                parser = etree.HTMLParser()
            stripped = self.html.strip()
            tree = etree.fromstring(stripped, parser).getroottree()
            page = tree.getroot()
            # lxml inserts a doctype if none exists, so only include it in
            # the root if it was in the original html.
            root = tree if stripped.startswith(tree.docinfo.doctype) else page

        assert page is not None

        # ``head`` is only handed to ``_process_css_text`` below; it stays
        # None when leftover CSS is disabled.
        # NOTE(review): presumably the helper writes non-inlinable rules
        # into it -- confirm in ``_process_css_text``.
        if self.disable_leftover_css:
            head = None
        else:
            head = get_or_create_head(tree)
        #
        # style selectors
        #

        # ``rules`` accumulates the parsed rules of every stylesheet;
        # ``index`` numbers the stylesheets in the order they are processed.
        rules = []
        index = 0

        for element in CSSSelector('style,link[rel~=stylesheet]')(page):
            # If we have a media attribute whose value is anything other than
            # 'all' or 'screen', ignore the ruleset.
            media = element.attrib.get('media')
            if media and media not in ('all', 'screen'):
                continue

            # An element marked with ``<attribute_name>="ignore"`` is left in
            # the document untouched, minus the marker attribute itself. Any
            # other value only triggers a warning and processing continues.
            data_attribute = element.attrib.get(self.attribute_name)
            if data_attribute:
                if data_attribute == 'ignore':
                    del element.attrib[self.attribute_name]
                    continue
                else:
                    warnings.warn(
                        'Unrecognized %s attribute (%r)' % (
                            self.attribute_name,
                            data_attribute,
                        )
                    )

            # <style> carries its CSS inline; <link> CSS is fetched through
            # ``_load_external`` using the href.
            is_style = element.tag == 'style'
            if is_style:
                css_body = element.text
            else:
                href = element.attrib.get('href')
                css_body = self._load_external(href)

            these_rules, these_leftover = self._parse_style_rules(
                css_body, index
            )
            index += 1
            rules.extend(these_rules)
            parent_of_element = element.getparent()
            if these_leftover or self.keep_style_tags:
                # Something must remain in the document: reuse the existing
                # <style>, or build a new one to stand in for the <link>.
                if is_style:
                    style = element
                else:
                    style = etree.Element('style')
                    style.attrib['type'] = 'text/css'
                if self.keep_style_tags:
                    style.text = css_body
                else:
                    style.text = self._css_rules_to_string(these_leftover)
                if self.method == 'xml':
                    style.text = etree.CDATA(style.text)

                if not is_style:
                    # Swap the <link> for the freshly built <style>.
                    element.addprevious(style)
                    parent_of_element.remove(element)

            elif not self.keep_style_tags or not is_style:
                # Nothing left over and tags are not kept: drop the element.
                parent_of_element.remove(element)

        # external style files
        if self.external_styles:
            for stylefile in self.external_styles:
                css_body = self._load_external(stylefile)
                self._process_css_text(css_body, index, rules, head)
                index += 1

        # css text
        if self.css_text:
            for css_body in self.css_text:
                self._process_css_text(css_body, index, rules, head)
                index += 1

        # each rule is a tuple of (specificity, selector, styles), where
        # specificity is a tuple ordered such that more specific
        # rules sort larger.
        rules.sort(key=operator.itemgetter(0))

        # collecting all elements that we need to apply rules on
        # id is unique for the lifetime of the object
        # and lxml should give us the same everytime during this run
        # item id -> {item: item, classes: [], style: []}
        elements = {}
        for _, selector, style in rules:
            # Split at the first ':' into a base selector and a pseudo part
            # (``class_``, kept with its leading ':').
            new_selector = selector
            class_ = ''
            if ':' in selector:
                new_selector, class_ = re.split(':', selector, 1)
                class_ = ':%s' % class_
            # Keep filter-type selectors untouched.
            if class_ in FILTER_PSEUDOSELECTORS:
                class_ = ''
            else:
                selector = new_selector

            sel = CSSSelector(selector)
            items = sel(page)
            if len(items):
                # the declarations are the same for every matched element,
                # so parse them once before the loop
                processed_style = csstext_to_pairs(style)

                for item in items:
                    item_id = id(item)
                    if item_id not in elements:
                        elements[item_id] = {
                            'item': item,
                            'classes': [],
                            'style': [],
                        }

                    # 'style' and 'classes' are parallel lists: one entry
                    # per rule that matched this element.
                    elements[item_id]['style'].append(processed_style)
                    elements[item_id]['classes'].append(class_)

        # Now apply inline style
        # merge style only once for each element
        # crucial when you have a lot of pseudo/classes
        # and a long list of elements
        for _, element in elements.items():
            final_style = merge_styles(
                element['item'].attrib.get('style', ''),
                element['style'],
                element['classes'],
                remove_unset_properties=self.remove_unset_properties,
            )
            if final_style:
                # final style could be empty string because of
                # remove_unset_properties
                element['item'].attrib['style'] = final_style
            # NOTE(review): presumably mirrors some style values into plain
            # HTML attributes -- confirm in the helper.
            self._style_to_basic_html_attributes(
                element['item'],
                final_style,
                force=True
            )

        if self.remove_classes:
            # now we can delete all 'class' attributes
            for item in page.xpath('//@class'):
                parent = item.getparent()
                del parent.attrib['class']

        # Add align attributes to images if they have a CSS float value of
        # right or left. Outlook (both on desktop and on the web) are bad at
        # understanding floats, but they do understand the HTML align attrib.
        if self.align_floating_images:
            for item in page.xpath('//img[@style]'):
                image_css = cssutils.parseStyle(item.attrib['style'])
                if image_css.float == 'right':
                    item.attrib['align'] = 'right'
                elif image_css.float == 'left':
                    item.attrib['align'] = 'left'

        #
        # URLs
        #
        if self.base_url:
            if not urlparse(self.base_url).scheme:
                raise ValueError('Base URL must have a scheme')
            for attr in ('href', 'src'):
                for item in page.xpath("//@%s" % attr):
                    parent = item.getparent()
                    url = parent.attrib[attr]
                    # In-page anchors stay as they are when requested.
                    if (
                        attr == 'href' and self.preserve_internal_links and
                        url.startswith('#')
                    ):
                        continue
                    # 'cid:' sources stay as they are when requested.
                    if (
                        attr == 'src' and self.preserve_inline_attachments and
                        url.startswith('cid:')
                    ):
                        continue
                    parent.attrib[attr] = urljoin(self.base_url, url)

        if hasattr(self.html, "getroottree"):
            # Parsed input in, tree out: no serialisation.
            return root
        else:
            kwargs.setdefault('method', self.method)
            kwargs.setdefault('pretty_print', pretty_print)
            kwargs.setdefault('encoding', 'utf-8')  # As Ken Thompson intended
            out = etree.tostring(root, **kwargs).decode(kwargs['encoding'])
            if self.method == 'xml':
                # Re-wrap what ``_cdata_regex`` captures so the CDATA
                # markers sit inside CSS comments.
                out = _cdata_regex.sub(
                    lambda m: '/*<![CDATA[*/%s/*]]>*/' % m.group(1),
                    out
                )
            if self.strip_important:
                # Removes every match of ``_importants`` from the output
                # (presumably the ``!important`` markers -- confirm against
                # the pattern definition).
                out = _importants.sub('', out)
            return out
Beispiel #2
0
    def transform(self, html=None, pretty_print=True, **kwargs):
        """Inline the CSS that applies to the document as ``style`` attributes.

        Rules are gathered from ``<style>`` elements (and, when
        ``self.allow_network`` is set, ``<link rel=stylesheet>`` elements and
        ``self.external_styles``) as well as from ``self.css_text``. They are
        merged into the ``style`` attribute of every matching element. Rules
        whose selector cannot be compiled are skipped. Afterwards, depending
        on instance flags, ``class`` attributes are removed, margin/float
        properties are capitalised, floated images get an ``align``
        attribute, and ``href``/``src`` values are joined with
        ``self.base_url``.

        Args:
            html: the document to transform, either markup text or an object
                exposing ``getroottree``. Must be given exactly once: here
                or as ``self.html``.
            pretty_print: default for the ``pretty_print`` option handed to
                ``etree.tostring``; an explicit value in ``kwargs`` wins.
            **kwargs: forwarded to ``etree.tostring``. ``method``,
                ``pretty_print`` and ``encoding`` ("utf-8") are only filled
                in when the caller did not supply them.

        Returns:
            The tree obtained from ``html.getroottree()`` when ``html``
            exposes ``getroottree``; otherwise the serialised document
            decoded to a string.

        Raises:
            TypeError: if ``html`` is supplied both as argument and as
                ``self.html``, or not supplied at all.
            ValueError: if ``self.base_url`` is used for link rewriting but
                has no URL scheme.
        """
        if html is not None and self.html is not None:
            raise TypeError("Can't pass html argument twice")
        elif html is None and self.html is None:
            raise TypeError("must pass html as first argument")
        elif html is None:
            html = self.html
        if hasattr(html, "getroottree"):
            # Input is already parsed: reuse its tree for all three roles
            # instead of parsing a string.
            root = html.getroottree()
            page = root
            tree = root
        else:
            if self.method == "xml":
                parser = etree.XMLParser(ns_clean=False,
                                         resolve_entities=False)
            else:
                parser = etree.HTMLParser()
            stripped = html.strip()
            tree = etree.fromstring(stripped, parser).getroottree()
            page = tree.getroot()
            # lxml inserts a doctype if none exists, so only include it in
            # the root if it was in the original html.
            root = tree if stripped.startswith(tree.docinfo.doctype) else page

        assert page is not None

        # ``head`` is only handed to ``_process_css_text`` below; it stays
        # None when leftover CSS is disabled.
        if self.disable_leftover_css:
            head = None
        else:
            head = get_or_create_head(tree)
        #
        # style selectors
        #

        # ``rules`` accumulates the parsed rules of every stylesheet;
        # ``index`` numbers the stylesheets in the order they are processed.
        rules = []
        index = 0

        # <link> stylesheets require fetching, so they are only considered
        # when network access is allowed.
        cssselector = ["style"]
        if self.allow_network:
            cssselector.append("link[rel~=stylesheet]")
        for element in _create_cssselector(",".join(cssselector))(page):
            # If we have a media attribute whose value is anything other than
            # 'all' or 'screen', ignore the ruleset.
            media = element.attrib.get("media")
            if media and media not in ("all", "screen"):
                continue

            # An element marked with ``<attribute_name>="ignore"`` is left in
            # the document untouched, minus the marker attribute itself. Any
            # other value only triggers a warning and processing continues.
            data_attribute = element.attrib.get(self.attribute_name)
            if data_attribute:
                if data_attribute == "ignore":
                    del element.attrib[self.attribute_name]
                    continue
                else:
                    warnings.warn("Unrecognized %s attribute (%r)" %
                                  (self.attribute_name, data_attribute))

            is_style = element.tag == "style"
            if is_style:
                css_body = element.text
            else:
                href = element.attrib.get("href")
                css_body = self._load_external(href)

            these_rules, these_leftover = self._parse_style_rules(
                css_body, index)

            index += 1
            rules.extend(these_rules)
            parent_of_element = element.getparent()
            if these_leftover or self.keep_style_tags:
                # Something must remain in the document: reuse the existing
                # <style>, or build a new one to stand in for the <link>.
                if is_style:
                    style = element
                else:
                    style = etree.Element("style")
                    style.attrib["type"] = "text/css"
                if self.keep_style_tags:
                    style.text = css_body
                else:
                    style.text = self._css_rules_to_string(these_leftover)
                if self.method == "xml":
                    style.text = etree.CDATA(style.text)

                if not is_style:
                    # Swap the <link> for the freshly built <style>.
                    element.addprevious(style)
                    parent_of_element.remove(element)

            elif not self.keep_style_tags or not is_style:
                # Nothing left over and tags are not kept: drop the element.
                parent_of_element.remove(element)

        # external style files
        if self.external_styles and self.allow_network:
            for stylefile in self.external_styles:
                css_body = self._load_external(stylefile)
                self._process_css_text(css_body, index, rules, head)
                index += 1

        # css text
        if self.css_text:
            for css_body in self.css_text:
                self._process_css_text(css_body, index, rules, head)
                index += 1

        # each rule is a tuple of (specificity, selector, styles), where
        # specificity is a tuple ordered such that more specific
        # rules sort larger.
        rules.sort(key=operator.itemgetter(0))

        # collecting all elements that we need to apply rules on
        # id is unique for the lifetime of the object
        # and lxml should give us the same everytime during this run
        # item id -> {item: item, classes: [], style: []}
        elements = {}
        for _, selector, style in rules:
            # Split at the first ':' into a base selector and a pseudo part
            # (``class_``, kept with its leading ':').
            new_selector = selector
            class_ = ""
            if ":" in selector:
                new_selector, class_ = selector.split(":", 1)
                class_ = ":%s" % class_
            # Keep filter-type selectors untouched.
            if class_ in FILTER_PSEUDOSELECTORS or class_.startswith(
                    ":nth-child"):
                class_ = ""
            else:
                selector = new_selector

            assert selector
            try:
                sel = _create_cssselector(selector)
            except SelectorSyntaxError:
                # TODO: this should be optional
                # Skip the rule entirely. Falling through would evaluate
                # ``sel`` while it is unbound, or still bound to the
                # previous rule's selector, and apply this rule's styles to
                # the wrong elements.
                continue
            items = sel(page)
            if len(items):
                # the declarations are the same for every matched element,
                # so parse them once before the loop
                processed_style = csstext_to_pairs(
                    style, validate=not self.disable_validation)

                for item in items:
                    item_id = id(item)
                    if item_id not in elements:
                        elements[item_id] = {
                            "item": item,
                            "classes": [],
                            "style": []
                        }

                    # "style" and "classes" are parallel lists: one entry
                    # per rule that matched this element.
                    elements[item_id]["style"].append(processed_style)
                    elements[item_id]["classes"].append(class_)

        # Now apply inline style
        # merge style only once for each element
        # crucial when you have a lot of pseudo/classes
        # and a long list of elements
        for element in elements.values():
            final_style = merge_styles(
                element["item"].attrib.get("style", ""),
                element["style"],
                element["classes"],
                remove_unset_properties=self.remove_unset_properties,
            )
            if final_style:
                # final style could be empty string because of
                # remove_unset_properties
                element["item"].attrib["style"] = final_style
            self._style_to_basic_html_attributes(element["item"],
                                                 final_style,
                                                 force=True)

        if self.remove_classes:
            # now we can delete all 'class' attributes
            for item in page.xpath("//@class"):
                parent = item.getparent()
                del parent.attrib["class"]

        # Capitalize Margin properties
        # To fix weird outlook bug
        # https://www.emailonacid.com/blog/article/email-development/outlook.com-does-support-margins
        if self.capitalize_float_margin:
            for item in page.xpath("//@style"):
                mangled = capitalize_float_margin(item)
                item.getparent().attrib["style"] = mangled

        # Add align attributes to images if they have a CSS float value of
        # right or left. Outlook (both on desktop and on the web) are bad at
        # understanding floats, but they do understand the HTML align attrib.
        if self.align_floating_images:
            for item in page.xpath("//img[@style]"):
                image_css = cssutils.parseStyle(item.attrib["style"])
                if image_css.float == "right":
                    item.attrib["align"] = "right"
                elif image_css.float == "left":
                    item.attrib["align"] = "left"

        #
        # URLs
        #
        if self.base_url and not self.disable_link_rewrites:
            if not urlparse(self.base_url).scheme:
                raise ValueError("Base URL must have a scheme")
            for attr in ("href", "src"):
                for item in page.xpath("//@%s" % attr):
                    parent = item.getparent()
                    url = parent.attrib[attr]
                    # In-page anchors stay as they are when requested.
                    if (attr == "href" and self.preserve_internal_links
                            and url.startswith("#")):
                        continue
                    # 'cid:' sources stay as they are when requested.
                    if (attr == "src" and self.preserve_inline_attachments
                            and url.startswith("cid:")):
                        continue
                    # 'tel:' links are never rewritten.
                    if attr == "href" and url.startswith("tel:"):
                        continue
                    parent.attrib[attr] = urljoin(self.base_url, url)

        if hasattr(html, "getroottree"):
            # Parsed input in, tree out: no serialisation.
            return root
        else:
            kwargs.setdefault("method", self.method)
            kwargs.setdefault("pretty_print", pretty_print)
            kwargs.setdefault("encoding", "utf-8")  # As Ken Thompson intended
            out = etree.tostring(root, **kwargs).decode(kwargs["encoding"])
            if self.method == "xml":
                # Re-wrap what ``_cdata_regex`` captures so the CDATA
                # markers sit inside CSS comments.
                out = _cdata_regex.sub(
                    lambda m: "/*<![CDATA[*/%s/*]]>*/" % m.group(1), out)
            if self.strip_important:
                out = _importants.sub("", out)
            return out
Beispiel #3
0
 def test_inline_invalid_syntax(self):
     """Merging a malformed inline style must complete without raising."""
     malformed_inline = '{color:pink} :hover{color:purple} :active{color:red}'
     no_styles, no_classes = [], []
     merge_styles(malformed_inline, no_styles, no_classes)
Beispiel #4
0
    def transform(self, pretty_print=True, **kwargs):
        """Inline the CSS that applies to ``self.html`` as ``style`` attributes.

        Rules are gathered from ``<style>`` and ``<link rel=stylesheet>``
        elements of the document, from ``self.external_styles`` and from
        ``self.css_text``. They are then merged into the ``style`` attribute
        of every matching element. Afterwards, depending on instance flags,
        ``class`` attributes are removed, margin/float properties are
        capitalised, floated images get an ``align`` attribute, and
        ``href``/``src`` values are joined with ``self.base_url``.

        Args:
            pretty_print: default for the ``pretty_print`` option handed to
                ``etree.tostring``; an explicit value in ``kwargs`` wins.
            **kwargs: forwarded to ``etree.tostring``. ``method``,
                ``pretty_print`` and ``encoding`` ('utf-8') are only filled
                in when the caller did not supply them.

        Returns:
            The tree obtained from ``self.html.getroottree()`` when
            ``self.html`` exposes ``getroottree``; otherwise the serialised
            document decoded to a string.

        Raises:
            ValueError: if ``self.base_url`` is set but has no URL scheme.
        """
        if hasattr(self.html, "getroottree"):
            # Input is already parsed: reuse its tree for all three roles
            # instead of parsing a string.
            root = self.html.getroottree()
            page = root
            tree = root
        else:
            if self.method == 'xml':
                parser = etree.XMLParser(ns_clean=False,
                                         resolve_entities=False)
            else:
                parser = etree.HTMLParser()
            stripped = self.html.strip()
            tree = etree.fromstring(stripped, parser).getroottree()
            page = tree.getroot()
            # lxml inserts a doctype if none exists, so only include it in
            # the root if it was in the original html.
            root = tree if stripped.startswith(tree.docinfo.doctype) else page

        assert page is not None

        # ``head`` is only handed to ``_process_css_text`` below; it stays
        # None when leftover CSS is disabled.
        # NOTE(review): presumably the helper writes non-inlinable rules
        # into it -- confirm in ``_process_css_text``.
        if self.disable_leftover_css:
            head = None
        else:
            head = get_or_create_head(tree)
        #
        # style selectors
        #

        # ``rules`` accumulates the parsed rules of every stylesheet;
        # ``index`` numbers the stylesheets in the order they are processed.
        rules = []
        index = 0

        for element in CSSSelector('style,link[rel~=stylesheet]')(page):
            # If we have a media attribute whose value is anything other than
            # 'all' or 'screen', ignore the ruleset.
            media = element.attrib.get('media')
            if media and media not in ('all', 'screen'):
                continue

            # An element marked with ``<attribute_name>="ignore"`` is left in
            # the document untouched, minus the marker attribute itself. Any
            # other value only triggers a warning and processing continues.
            data_attribute = element.attrib.get(self.attribute_name)
            if data_attribute:
                if data_attribute == 'ignore':
                    del element.attrib[self.attribute_name]
                    continue
                else:
                    warnings.warn('Unrecognized %s attribute (%r)' % (
                        self.attribute_name,
                        data_attribute,
                    ))

            # <style> carries its CSS inline; <link> CSS is fetched through
            # ``_load_external`` using the href.
            is_style = element.tag == 'style'
            if is_style:
                css_body = element.text
            else:
                href = element.attrib.get('href')
                css_body = self._load_external(href)

            these_rules, these_leftover = self._parse_style_rules(
                css_body, index)
            index += 1
            rules.extend(these_rules)
            parent_of_element = element.getparent()
            if these_leftover or self.keep_style_tags:
                # Something must remain in the document: reuse the existing
                # <style>, or build a new one to stand in for the <link>.
                if is_style:
                    style = element
                else:
                    style = etree.Element('style')
                    style.attrib['type'] = 'text/css'
                if self.keep_style_tags:
                    style.text = css_body
                else:
                    style.text = self._css_rules_to_string(these_leftover)
                if self.method == 'xml':
                    style.text = etree.CDATA(style.text)

                if not is_style:
                    # Swap the <link> for the freshly built <style>.
                    element.addprevious(style)
                    parent_of_element.remove(element)

            elif not self.keep_style_tags or not is_style:
                # Nothing left over and tags are not kept: drop the element.
                parent_of_element.remove(element)

        # external style files
        if self.external_styles:
            for stylefile in self.external_styles:
                css_body = self._load_external(stylefile)
                self._process_css_text(css_body, index, rules, head)
                index += 1

        # css text
        if self.css_text:
            for css_body in self.css_text:
                self._process_css_text(css_body, index, rules, head)
                index += 1

        # each rule is a tuple of (specificity, selector, styles), where
        # specificity is a tuple ordered such that more specific
        # rules sort larger.
        rules.sort(key=operator.itemgetter(0))

        # collecting all elements that we need to apply rules on
        # id is unique for the lifetime of the object
        # and lxml should give us the same everytime during this run
        # item id -> {item: item, classes: [], style: []}
        elements = {}
        for _, selector, style in rules:
            # Split at the first ':' into a base selector and a pseudo part
            # (``class_``, kept with its leading ':').
            new_selector = selector
            class_ = ''
            if ':' in selector:
                new_selector, class_ = re.split(':', selector, 1)
                class_ = ':%s' % class_
            # Keep filter-type selectors untouched.
            if class_ in FILTER_PSEUDOSELECTORS:
                class_ = ''
            else:
                selector = new_selector

            sel = CSSSelector(selector)
            items = sel(page)
            if len(items):
                # the declarations are the same for every matched element,
                # so parse them once before the loop
                processed_style = csstext_to_pairs(style)

                for item in items:
                    item_id = id(item)
                    if item_id not in elements:
                        elements[item_id] = {
                            'item': item,
                            'classes': [],
                            'style': [],
                        }

                    # 'style' and 'classes' are parallel lists: one entry
                    # per rule that matched this element.
                    elements[item_id]['style'].append(processed_style)
                    elements[item_id]['classes'].append(class_)

        # Now apply inline style
        # merge style only once for each element
        # crucial when you have a lot of pseudo/classes
        # and a long list of elements
        for _, element in elements.items():
            final_style = merge_styles(
                element['item'].attrib.get('style', ''),
                element['style'],
                element['classes'],
                remove_unset_properties=self.remove_unset_properties,
            )
            if final_style:
                # final style could be empty string because of
                # remove_unset_properties
                element['item'].attrib['style'] = final_style
            # NOTE(review): presumably mirrors some style values into plain
            # HTML attributes -- confirm in the helper.
            self._style_to_basic_html_attributes(element['item'],
                                                 final_style,
                                                 force=True)

        if self.remove_classes:
            # now we can delete all 'class' attributes
            for item in page.xpath('//@class'):
                parent = item.getparent()
                del parent.attrib['class']

        # Capitalize Margin properties
        # To fix weird outlook bug
        # https://www.emailonacid.com/blog/article/email-development/outlook.com-does-support-margins
        if self.capitalize_float_margin:
            for item in page.xpath('//@style'):
                mangled = capitalize_float_margin(item)
                item.getparent().attrib['style'] = mangled

        # Add align attributes to images if they have a CSS float value of
        # right or left. Outlook (both on desktop and on the web) are bad at
        # understanding floats, but they do understand the HTML align attrib.
        if self.align_floating_images:
            for item in page.xpath('//img[@style]'):
                image_css = cssutils.parseStyle(item.attrib['style'])
                if image_css.float == 'right':
                    item.attrib['align'] = 'right'
                elif image_css.float == 'left':
                    item.attrib['align'] = 'left'

        #
        # URLs
        #
        if self.base_url:
            if not urlparse(self.base_url).scheme:
                raise ValueError('Base URL must have a scheme')
            for attr in ('href', 'src'):
                for item in page.xpath("//@%s" % attr):
                    parent = item.getparent()
                    url = parent.attrib[attr]
                    # In-page anchors stay as they are when requested.
                    if (attr == 'href' and self.preserve_internal_links
                            and url.startswith('#')):
                        continue
                    # 'cid:' sources stay as they are when requested.
                    if (attr == 'src' and self.preserve_inline_attachments
                            and url.startswith('cid:')):
                        continue
                    # 'tel:' links are never rewritten.
                    if attr == 'href' and url.startswith('tel:'):
                        continue
                    parent.attrib[attr] = urljoin(self.base_url, url)

        if hasattr(self.html, "getroottree"):
            # Parsed input in, tree out: no serialisation.
            return root
        else:
            kwargs.setdefault('method', self.method)
            kwargs.setdefault('pretty_print', pretty_print)
            kwargs.setdefault('encoding', 'utf-8')  # As Ken Thompson intended
            out = etree.tostring(root, **kwargs).decode(kwargs['encoding'])
            if self.method == 'xml':
                # Re-wrap what ``_cdata_regex`` captures so the CDATA
                # markers sit inside CSS comments.
                out = _cdata_regex.sub(
                    lambda m: '/*<![CDATA[*/%s/*]]>*/' % m.group(1), out)
            if self.strip_important:
                # Removes every match of ``_importants`` from the output
                # (presumably the ``!important`` markers -- confirm against
                # the pattern definition).
                out = _importants.sub('', out)
            return out
Beispiel #5
0
 def test_inline_invalid_syntax(self):
     """Rule-style syntax inside an inline style is tolerated, not rejected.

     Inline styles are not expected to contain selector blocks, but the
     existing behaviour of accepting them without an exception is kept.
     """
     merge_styles(
         '{color:pink} :hover{color:purple} :active{color:red}',
         [],
         [],
     )
Beispiel #6
0
    def transform(self, html=None, pretty_print=True, **kwargs):
        """change the html and return it with CSS turned into style
        attributes.
        """
        if html is not None and self.html is not None:
            raise TypeError("Can't pass html argument twice")
        elif html is None and self.html is None:
            raise TypeError("must pass html as first argument")
        elif html is None:
            html = self.html
        if hasattr(html, "getroottree"):
            # skip the next bit
            root = html.getroottree()
            page = root
            tree = root
        else:
            if self.method == "xml":
                parser = etree.XMLParser(ns_clean=False, resolve_entities=False)
            else:
                parser = etree.HTMLParser()
            stripped = html.strip()
            tree = etree.fromstring(stripped, parser).getroottree()
            page = tree.getroot()
            # lxml inserts a doctype if none exists, so only include it in
            # the root if it was in the original html.
            root = tree if stripped.startswith(tree.docinfo.doctype) else page

        assert page is not None

        if self.disable_leftover_css:
            head = None
        else:
            head = get_or_create_head(tree)
        #
        # style selectors
        #

        rules = []
        index = 0

        cssselector = ["style"]
        if self.allow_network:
            cssselector.append("link[rel~=stylesheet]")
        for element in _create_cssselector(",".join(cssselector))(page):
            # If we have a media attribute whose value is anything other than
            # 'all' or 'screen', ignore the ruleset.
            media = element.attrib.get("media")
            if media and media not in ("all", "screen"):
                continue

            data_attribute = element.attrib.get(self.attribute_name)
            if data_attribute:
                if data_attribute == "ignore":
                    del element.attrib[self.attribute_name]
                    continue
                else:
                    warnings.warn(
                        "Unrecognized %s attribute (%r)"
                        % (self.attribute_name, data_attribute)
                    )

            is_style = element.tag == "style"
            if is_style:
                css_body = element.text
            else:
                href = element.attrib.get("href")
                css_body = self._load_external(href)

            these_rules, these_leftover = self._parse_style_rules(css_body, index)

            index += 1
            rules.extend(these_rules)
            parent_of_element = element.getparent()
            if these_leftover or self.keep_style_tags:
                if is_style:
                    style = element
                else:
                    style = etree.Element("style")
                    style.attrib["type"] = "text/css"
                if self.keep_style_tags:
                    style.text = css_body
                else:
                    style.text = self._css_rules_to_string(these_leftover)
                if self.method == "xml":
                    style.text = etree.CDATA(style.text)

                if not is_style:
                    element.addprevious(style)
                    parent_of_element.remove(element)

            elif not self.keep_style_tags or not is_style:
                parent_of_element.remove(element)

        # external style files
        if self.external_styles and self.allow_network:
            for stylefile in self.external_styles:
                css_body = self._load_external(stylefile)
                self._process_css_text(css_body, index, rules, head)
                index += 1

        # css text
        if self.css_text:
            for css_body in self.css_text:
                self._process_css_text(css_body, index, rules, head)
                index += 1

        # rules is a tuple of (specificity, selector, styles), where
        # specificity is a tuple ordered such that more specific
        # rules sort larger.
        rules.sort(key=operator.itemgetter(0))

        # collecting all elements that we need to apply rules on
        # id is unique for the lifetime of the object
        # and lxml should give us the same everytime during this run
        # item id -> {item: item, classes: [], style: []}
        elements = {}
        for _, selector, style in rules:
            new_selector = selector
            class_ = ""
            if ":" in selector:
                new_selector, class_ = re.split(":", selector, 1)
                class_ = ":%s" % class_
            # Keep filter-type selectors untouched.
            if class_ in FILTER_PSEUDOSELECTORS or class_.startswith(":nth-child"):
                class_ = ""
            else:
                selector = new_selector

            assert selector
            sel = _create_cssselector(selector)
            items = sel(page)
            if len(items):
                # same so process it first
                processed_style = csstext_to_pairs(style)

                for item in items:
                    item_id = id(item)
                    if item_id not in elements:
                        elements[item_id] = {"item": item, "classes": [], "style": []}

                    elements[item_id]["style"].append(processed_style)
                    elements[item_id]["classes"].append(class_)

        # Now apply inline style
        # merge style only once for each element
        # crucial when you have a lot of pseudo/classes
        # and a long list of elements
        for _, element in elements.items():
            final_style = merge_styles(
                element["item"].attrib.get("style", ""),
                element["style"],
                element["classes"],
                remove_unset_properties=self.remove_unset_properties,
            )
            if final_style:
                # final style could be empty string because of
                # remove_unset_properties
                element["item"].attrib["style"] = final_style
            self._style_to_basic_html_attributes(
                element["item"], final_style, force=True
            )

        if self.remove_classes:
            # now we can delete all 'class' attributes
            for item in page.xpath("//@class"):
                parent = item.getparent()
                del parent.attrib["class"]

        # Capitalize Margin properties
        # To fix weird outlook bug
        # https://www.emailonacid.com/blog/article/email-development/outlook.com-does-support-margins
        if self.capitalize_float_margin:
            for item in page.xpath("//@style"):
                mangled = capitalize_float_margin(item)
                item.getparent().attrib["style"] = mangled

        # Add align attributes to images if they have a CSS float value of
        # right or left. Outlook (both on desktop and on the web) is bad at
        # understanding floats, but it does understand the HTML align attrib.
        if self.align_floating_images:
            for item in page.xpath("//img[@style]"):
                image_css = cssutils.parseStyle(item.attrib["style"])
                if image_css.float == "right":
                    item.attrib["align"] = "right"
                elif image_css.float == "left":
                    item.attrib["align"] = "left"

        #
        # URLs
        #
        if self.base_url and not self.disable_link_rewrites:
            if not urlparse(self.base_url).scheme:
                raise ValueError("Base URL must have a scheme")
            for attr in ("href", "src"):
                for item in page.xpath("//@%s" % attr):
                    parent = item.getparent()
                    url = parent.attrib[attr]
                    if (
                        attr == "href"
                        and self.preserve_internal_links
                        and url.startswith("#")
                    ):
                        continue
                    if (
                        attr == "src"
                        and self.preserve_inline_attachments
                        and url.startswith("cid:")
                    ):
                        continue
                    if attr == "href" and url.startswith("tel:"):
                        continue
                    parent.attrib[attr] = urljoin(self.base_url, url)

        if hasattr(html, "getroottree"):
            return root
        else:
            kwargs.setdefault("method", self.method)
            kwargs.setdefault("pretty_print", pretty_print)
            kwargs.setdefault("encoding", "utf-8")  # As Ken Thompson intended
            out = etree.tostring(root, **kwargs).decode(kwargs["encoding"])
            if self.method == "xml":
                out = _cdata_regex.sub(
                    lambda m: "/*<![CDATA[*/%s/*]]>*/" % m.group(1), out
                )
            if self.strip_important:
                out = _importants.sub("", out)
            return out