Exemple #1
0
    def recompose(self,
                  output_xhtml5: bool,
                  extra_css_file: Path = None) -> str:
        """
        Iterate over the XHTML files in this epub and "recompose" them into a single (X)HTML string representing this ebook.

        INPUTS
        output_xhtml5: true to output XHTML5 instead of HTML5
        extra_css_file: optional extra CSS file; its path is joined onto the ebook's `css` directory and its contents are appended after `core.css`, `se.css` and `local.css`

        OUTPUTS
        A string of XHTML5 (if `output_xhtml5` is true) or HTML5 representing the entire recomposed ebook.
        """

        # Imported locally so that this method does not rely on a file-level import
        from xml.sax.saxutils import escape as xml_escape

        # Get some header data: title, core and local css
        title = self.metadata_dom.xpath("//dc:title/text()")[0]
        language = self.metadata_dom.xpath("//dc:language/text()")[0]
        css = ""
        namespaces: List[str] = []

        css_filenames = ["core.css", "se.css", "local.css"]

        if extra_css_file:
            css_filenames.append(str(extra_css_file))

        for filename in css_filenames:
            filepath = self.path / "src" / "epub" / "css" / filename
            file_css = self.get_file(filepath)

            # Collect the @namespace rules so that they can be hoisted to the top of the combined CSS
            namespaces = namespaces + regex.findall(r"@namespace.+?;",
                                                    file_css)

            file_css = regex.sub(r"\s*@(charset|namespace).+?;\s*", "\n",
                                 file_css).strip()

            css = css + f"\n\n\n/* {filepath.name} */\n" + file_css

        css = css.strip()

        # De-duplicate while keeping first-seen order. Going through a `set` would make
        # the order of the @namespace rules (and thus the output) vary between runs.
        namespaces = list(dict.fromkeys(namespaces))

        if namespaces:
            css = "\n".join(namespaces) + "\n\n" + css

        # Indent every line after the first; the first line is indented when the CSS is inserted below
        css = "\t\t\t".join(css.splitlines(True)) + "\n"

        # Remove min-height from CSS since it doesn't really apply to the single page format.
        # It occurs at least in se.css
        css = regex.sub(r"\s*min-height: [^;]+?;", "", css)

        # Remove -epub-* CSS as it's invalid in a browser context
        css = regex.sub(r"\s*\-epub\-[^;]+?;", "", css)

        # The XPath text() result is unescaped, so escape it before interpolating it into XML;
        # otherwise a title containing `&` or `<` would make the skeleton below malformed.
        escaped_title = xml_escape(title)

        output_xhtml = f"<?xml version=\"1.0\" encoding=\"utf-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\" epub:prefix=\"z3998: http://www.daisy.org/z3998/2012/vocab/structure/, se: https://standardebooks.org/vocab/1.0\" xml:lang=\"{language}\"><head><meta charset=\"utf-8\"/><title>{escaped_title}</title><style/></head><body></body></html>"
        output_dom = se.formatting.EasyXhtmlTree(output_xhtml)

        # Iterate over spine items in order and recompose them into our output
        for ref in self.metadata_dom.xpath("/package/spine/itemref/@idref"):
            filename = self.metadata_dom.xpath(
                f"/package/manifest/item[@id='{ref}']/@href")[0]

            dom = self.get_dom(self.path / "src" / "epub" / filename)

            for node in dom.xpath("/html/body/*"):
                try:
                    self._recompose_xhtml(node, output_dom)
                except se.SeException as ex:
                    # Re-raise with the offending file prepended to the message
                    raise se.SeException(
                        f"[path][link=file://{self.path / 'src/epub/' / filename}]{filename}[/][/]: {ex}"
                    ) from ex

        # Add the ToC after the titlepage
        toc_dom = self.get_dom(self.path / "src" / "epub" / "toc.xhtml")
        titlepage_node = output_dom.xpath(
            "//*[contains(concat(' ', @epub:type, ' '), ' titlepage ')]")[0]

        for node in toc_dom.xpath("//nav[1]"):
            titlepage_node.lxml_element.addnext(node.lxml_element)

        # Replace all <a href> links with internal links
        for link in output_dom.xpath(
                "//a[not(re:test(@href, '^https?://')) and contains(@href, '#')]"
        ):
            link.set_attr("href",
                          regex.sub(r".+(#.+)$", r"\1", link.get_attr("href")))

        # Replace all <a href> links to entire files
        for link in output_dom.xpath(
                "//a[not(re:test(@href, '^https?://')) and not(contains(@href, '#'))]"
        ):
            href = link.get_attr("href")
            href = regex.sub(r".+/([^/]+)$", r"#\1", href)
            href = regex.sub(r"\.xhtml$", "", href)
            link.set_attr("href", href)

        # Get the output XHTML as a string
        output_xhtml = output_dom.to_string()
        output_xhtml = regex.sub(r"\"(\.\./)?text/(.+?)\.xhtml\"", "\"#\\2\"",
                                 output_xhtml)
        output_xhtml = regex.sub(r"\"(\.\./)?text/.+?\.xhtml#(.+?)\"",
                                 "\"#\\2\"", output_xhtml)

        # All done, clean the output
        # Very large files like Ulysses S. Grant's memoirs or Through the Looking Glass will crash lxml due to their size.
        # The inlined SVGs get too big.
        # So, if the byte size of the XHTML string is larger than an arbitrary size, don't pretty print the output.
        # Pepys is about 20,000,000 bytes
        if getsizeof(output_xhtml) < 100000000:
            output_xhtml = se.formatting.format_xhtml(output_xhtml)

        # Insert our CSS. We do this after `clean` because `clean` will escape > in the CSS
        # Use a literal string replacement instead of a regex substitution: the CSS may contain
        # backslashes (for example `content: "\201C"`), which a regex replacement template
        # would interpret as group references or escape sequences.
        output_xhtml = output_xhtml.replace(
            "<style/>", "<style><![CDATA[\n\t\t\t" + css + "\t\t]]></style>")

        if output_xhtml5:
            output_xhtml = output_xhtml.replace(
                "\t\t<meta charset=\"utf-8\"/>\n", "")

            # Mirror every xml:lang attribute with a plain lang attribute
            output_xhtml = regex.sub(r'xml:lang="([^"]+?)"',
                                     r'xml:lang="\1" lang="\1"', output_xhtml)

            # Re-add a doctype
            output_xhtml = output_xhtml.replace(
                "<?xml version=\"1.0\" encoding=\"utf-8\"?>",
                "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<!DOCTYPE html>")
        else:
            # Remove xml declaration and re-add the doctype
            output_xhtml = regex.sub(r"<\?xml.+?\?>", "<!doctype html>",
                                     output_xhtml)
            output_xhtml = regex.sub(r" epub:prefix=\".+?\"", "", output_xhtml)

            # Remove CDATA
            output_xhtml = output_xhtml.replace("<![CDATA[", "")
            output_xhtml = output_xhtml.replace("]]>", "")

            # Make some replacements for HTML5 compatibility
            output_xhtml = output_xhtml.replace("epub:type", "data-epub-type")
            output_xhtml = output_xhtml.replace("epub|type", "data-epub-type")

            # Strip namespace declarations only. The pattern is limited to `xmlns` and
            # `xmlns:prefix` so that it cannot also swallow the attribute that follows
            # a bare `xmlns="..."`, e.g. on an inlined <svg>.
            output_xhtml = regex.sub(r" xmlns(?::[^\s=\"]+)?=\"[^\"]*\"", "",
                                     output_xhtml)
            output_xhtml = output_xhtml.replace("xml:lang", "lang")

        return output_xhtml
Exemple #2
0
	def recompose(self, output_xhtml5: bool, extra_css_file: Path = None) -> str:
		"""
		Iterate over the XHTML files in this epub and "recompose" them into a single (X)HTML string representing this ebook.

		INPUTS
		output_xhtml5: true to output XHTML5 instead of HTML5
		extra_css_file: optional extra CSS file; its path is joined onto the ebook's `css` directory and its contents are appended after `core.css`, `se.css` and `local.css`

		OUTPUTS
		A string of XHTML5 (if `output_xhtml5` is true) or HTML5 representing the entire recomposed ebook.
		"""

		# Imported locally so that this method does not rely on a file-level import
		from xml.sax.saxutils import escape as xml_escape

		# Get some header data: title, core and local css
		title = self.metadata_dom.xpath("/package/metadata/dc:title/text()")[0]
		language = self.metadata_dom.xpath("/package/metadata/dc:language/text()")[0]
		css = ""
		namespaces: List[str] = []

		css_filenames = ["core.css", "se.css", "local.css"]

		if extra_css_file:
			css_filenames.append(str(extra_css_file))

		for filename in css_filenames:
			filepath = self.content_path / "css" / filename
			file_css = self.get_file(filepath)

			# Collect the @namespace rules so that they can be hoisted to the top of the combined CSS
			namespaces = namespaces + regex.findall(r"@namespace.+?;", file_css)

			file_css = regex.sub(r"\s*@(charset|namespace).+?;\s*", "\n", file_css).strip()

			css = css + f"\n\n\n/* {filepath.name} */\n" + file_css

		css = css.strip()

		# De-duplicate while keeping first-seen order. Going through a `set` would make
		# the order of the @namespace rules (and thus the output) vary between runs.
		namespaces = list(dict.fromkeys(namespaces))

		if namespaces:
			css = "\n".join(namespaces) + "\n\n" + css

		# Indent every line after the first; the first line is indented when the CSS is inserted below
		css = "\t\t\t".join(css.splitlines(True)) + "\n"

		# Remove min-height from CSS since it doesn't really apply to the single page format.
		# It occurs at least in se.css
		css = regex.sub(r"\s*min-height: [^;]+?;", "", css)

		# Remove -epub-* CSS as it's invalid in a browser context
		css = regex.sub(r"\s*\-epub\-[^;]+?;", "", css)

		# The XPath text() result is unescaped, so escape it before interpolating it into XML;
		# otherwise a title containing `&` or `<` would make the skeleton below malformed.
		escaped_title = xml_escape(title)

		output_xhtml = f"<?xml version=\"1.0\" encoding=\"utf-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\" epub:prefix=\"z3998: http://www.daisy.org/z3998/2012/vocab/structure/, se: https://standardebooks.org/vocab/1.0\" xml:lang=\"{language}\"><head><meta charset=\"utf-8\"/><title>{escaped_title}</title><style/></head><body></body></html>"
		output_dom = se.formatting.EasyXmlTree(output_xhtml)
		output_dom.is_css_applied = True # We will apply CSS recursively to nodes that will be attached to output_dom, so set the bit here

		# Iterate over spine items in order and recompose them into our output
		needs_wrapper_css = False
		for file_path in self.spine_file_paths:
			dom = self.get_dom(file_path)

			# Apply the stylesheet to see if we have `position: absolute` on any items. If so, apply `position: relative` to its closest <section> ancestor
			# See https://standardebooks.org/ebooks/jean-toomer/cane for an example of this in action
			dom.apply_css(css)

			# Select deepest sections or articles with id attributes that have ONLY figure or img children, and one of those children has position: absolute
			# Note that every element-name test must call `name()`; a bare `name` would be a child-element test for <name> and would never match an <article>
			for node in dom.xpath("/html/body//*[@id and (name() = 'section' or name() = 'article') and not(.//*[(name() = 'section' or name() = 'article') and not(preceding-sibling::* or following-sibling::*)]) and count(./*[(name() = 'figure' or name() = 'img')]) = count(./*) and .//*[(name() = 'figure' or name() = 'img') and @data-css-position = 'absolute']]"):
				needs_wrapper_css = True

				# Wrap the sections in a div that we style later
				wrapper_element = etree.SubElement(node.lxml_element, "div")
				wrapper_element.set("class", "positioning-wrapper")
				for child in node.xpath("./*[(name() = 'figure' or name() = 'img')]"):
					wrapper_element.append(child.lxml_element) # .append() will *move* the element to the end of wrapper_element

			# Now, recompose the children
			for node in dom.xpath("/html/body/*"):
				try:
					self._recompose_xhtml(node, output_dom)
				except se.SeException as ex:
					# Re-raise with the offending file prepended to the message
					raise se.SeException(f"[path][link=file://{file_path}]{file_path}[/][/]: {ex}") from ex

		# Did we add wrappers? If so add the CSS
		# We also have to give the wrapper a height, because it may have siblings that were recomposed in from other files
		if needs_wrapper_css:
			css = css + "\n\t\t\t.positioning-wrapper{\n\t\t\t\tposition: relative; height: 100vh;\n\t\t\t}\n"

		# Add the ToC after the titlepage
		toc_dom = self.get_dom(self.toc_path)
		titlepage_node = output_dom.xpath("//*[contains(concat(' ', @epub:type, ' '), ' titlepage ')]")[0]

		for node in toc_dom.xpath("//nav[1]"):
			titlepage_node.lxml_element.addnext(node.lxml_element)

		# Replace all <a href> links with internal links
		for link in output_dom.xpath("//a[not(re:test(@href, '^https?://')) and contains(@href, '#')]"):
			link.set_attr("href", regex.sub(r".+(#.+)$", r"\1", link.get_attr("href")))

		# Replace all <a href> links to entire files
		for link in output_dom.xpath("//a[not(re:test(@href, '^https?://')) and not(contains(@href, '#'))]"):
			href = link.get_attr("href")
			href = regex.sub(r".+/([^/]+)$", r"#\1", href)
			href = regex.sub(r"\.xhtml$", "", href)
			link.set_attr("href", href)

		# Rewrite any remaining links into the text/ directory as internal fragment links
		for node in output_dom.xpath("/html/body//a[re:test(@href, '^(\\.\\./)?text/(.+?)\\.xhtml$')]"):
			node.set_attr("href", regex.sub(r"(\.\./)?text/(.+?)\.xhtml", r"#\2", node.get_attr("href")))

		for node in output_dom.xpath("/html/body//a[re:test(@href, '^(\\.\\./)?text/.+?\\.xhtml#(.+?)$')]"):
			node.set_attr("href", regex.sub(r"(\.\./)?text/.+?\.xhtml#(.+?)", r"#\2", node.get_attr("href")))

		# Make some compatibility adjustments
		if output_xhtml5:
			for node in output_dom.xpath("/html/head/meta[@charset]"):
				node.remove()

			# Mirror every xml:lang attribute with a plain lang attribute
			for node in output_dom.xpath("//*[@xml:lang]"):
				node.set_attr("lang", node.get_attr("xml:lang"))
		else:
			for node in output_dom.xpath("/html[@epub:prefix]"):
				node.remove_attr("epub:prefix")

			# Replace the namespaced attributes with plain equivalents
			for node in output_dom.xpath("//*[@xml:lang]"):
				node.set_attr("lang", node.get_attr("xml:lang"))
				node.remove_attr("xml:lang")

			for node in output_dom.xpath("//*[@epub:type]"):
				node.set_attr("data-epub-type", node.get_attr("epub:type"))
				node.remove_attr("epub:type")

		# Get the output XHTML as a string
		output_xhtml = output_dom.to_string()

		# All done, clean the output
		# Very large files like Ulysses S. Grant's memoirs or Through the Looking Glass will crash lxml due to their size.
		# The inlined SVGs get too big.
		# So, if the byte size of the XHTML string is larger than an arbitrary size, don't pretty print the output.
		# Pepys is about 20,000,000 bytes
		if getsizeof(output_xhtml) < 100000000:
			output_xhtml = se.formatting.format_xhtml(output_xhtml)

		# Insert our CSS. We do this after `clean` because `clean` will escape > in the CSS
		# Use a literal string replacement instead of a regex substitution: the CSS may contain
		# backslashes (for example `content: "\201C"`), which a regex replacement template
		# would interpret as group references or escape sequences.
		output_xhtml = output_xhtml.replace("<style/>", "<style><![CDATA[\n\t\t\t" + css + "\t\t]]></style>")

		if output_xhtml5:
			# Re-add a doctype
			output_xhtml = output_xhtml.replace("<?xml version=\"1.0\" encoding=\"utf-8\"?>", "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<!DOCTYPE html>")
		else:
			# Remove xml declaration and re-add the doctype
			output_xhtml = regex.sub(r"<\?xml.+?\?>", "<!doctype html>", output_xhtml)

			# Remove CDATA
			output_xhtml = output_xhtml.replace("<![CDATA[", "")
			output_xhtml = output_xhtml.replace("]]>", "")

			# Make some replacements for HTML5 compatibility
			output_xhtml = output_xhtml.replace("epub|type", "data-epub-type")

			# Strip namespace declarations only. The pattern is limited to `xmlns` and
			# `xmlns:prefix` so that it cannot also swallow the attribute that follows
			# a bare `xmlns="..."`, e.g. on an inlined <svg>.
			output_xhtml = regex.sub(r" xmlns(?::[^\s=\"]+)?=\"[^\"]*\"", "", output_xhtml)

		return output_xhtml