def recompose(self, output_xhtml5: bool, extra_css_file: Path = None) -> str:
	"""
	Iterate over the XHTML files in this epub and "recompose" them into a single XHTML string representing this ebook.

	INPUTS
	output_xhtml5: true to output XHTML5 instead of HTML5

	OUTPUTS
	A string of HTML5 representing the entire recomposed ebook.
	"""

	# Get some header data: title, core and local css
	title = self.metadata_dom.xpath("//dc:title/text()")[0]
	language = self.metadata_dom.xpath("//dc:language/text()")[0]

	css = ""
	namespaces: List[str] = []

	css_filenames = ["core.css", "se.css", "local.css"]

	if extra_css_file:
		css_filenames.append(str(extra_css_file))

	for filename in css_filenames:
		filepath = self.path / "src" / "epub" / "css" / filename
		file_css = self.get_file(filepath)

		namespaces = namespaces + regex.findall(r"@namespace.+?;", file_css)

		file_css = regex.sub(r"\s*@(charset|namespace).+?;\s*", "\n", file_css).strip()

		css = css + f"\n\n\n/* {filepath.name} */\n" + file_css

	css = css.strip()

	namespaces = list(set(namespaces))

	if namespaces:
		css = "\n" + css

		for namespace in namespaces:
			css = namespace + "\n" + css

	css = "\t\t\t".join(css.splitlines(True)) + "\n"

	# Remove min-height from CSS since it doesn't really apply to the single page format.
	# It occurs at least in se.css
	css = regex.sub(r"\s*min-height: [^;]+?;", "", css)

	# Remove -epub-* CSS as it's invalid in a browser context
	css = regex.sub(r"\s*\-epub\-[^;]+?;", "", css)

	output_xhtml = f"<?xml version=\"1.0\" encoding=\"utf-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\" epub:prefix=\"z3998: http://www.daisy.org/z3998/2012/vocab/structure/, se: https://standardebooks.org/vocab/1.0\" xml:lang=\"{language}\"><head><meta charset=\"utf-8\"/><title>{title}</title><style/></head><body></body></html>"
	output_dom = se.formatting.EasyXhtmlTree(output_xhtml)

	# Iterate over spine items in order and recompose them into our output
	for ref in self.metadata_dom.xpath("/package/spine/itemref/@idref"):
		filename = self.metadata_dom.xpath(f"/package/manifest/item[@id='{ref}']/@href")[0]

		dom = self.get_dom(self.path / "src" / "epub" / filename)

		for node in dom.xpath("/html/body/*"):
			try:
				self._recompose_xhtml(node, output_dom)
			except se.SeException as ex:
				raise se.SeException(f"[path][link=file://{self.path / 'src/epub/' / filename}]{filename}[/][/]: {ex}") from ex

	# Add the ToC after the titlepage
	toc_dom = self.get_dom(self.path / "src" / "epub" / "toc.xhtml")
	titlepage_node = output_dom.xpath("//*[contains(concat(' ', @epub:type, ' '), ' titlepage ')]")[0]

	for node in toc_dom.xpath("//nav[1]"):
		titlepage_node.lxml_element.addnext(node.lxml_element)

	# Replace all <a href> links with internal links
	for link in output_dom.xpath("//a[not(re:test(@href, '^https?://')) and contains(@href, '#')]"):
		link.set_attr("href", regex.sub(r".+(#.+)$", r"\1", link.get_attr("href")))

	# Replace all <a href> links to entire files
	for link in output_dom.xpath("//a[not(re:test(@href, '^https?://')) and not(contains(@href, '#'))]"):
		href = link.get_attr("href")
		href = regex.sub(r".+/([^/]+)$", r"#\1", href)
		href = regex.sub(r"\.xhtml$", "", href)
		link.set_attr("href", href)

	# Get the output XHTML as a string
	output_xhtml = output_dom.to_string()
	output_xhtml = regex.sub(r"\"(\.\./)?text/(.+?)\.xhtml\"", "\"#\\2\"", output_xhtml)
	output_xhtml = regex.sub(r"\"(\.\./)?text/.+?\.xhtml#(.+?)\"", "\"#\\2\"", output_xhtml)

	# All done, clean the output
	# Very large files like Ulysses S. Grant's memoirs or Through the Looking Glass will crash lxml due to their size.
	# The inlined SVGs get too big.
	# So, if the byte size of the XHTML string is larger than an arbitrary size, don't pretty print the output.
	# Pepys is about 20,000,000 bytes
	if getsizeof(output_xhtml) < 100000000:
		output_xhtml = se.formatting.format_xhtml(output_xhtml)

	# Insert our CSS. We do this after `clean` because `clean` will escape > in the CSS
	output_xhtml = regex.sub(r"<style/>", "<style><![CDATA[\n\t\t\t" + css + "\t\t]]></style>", output_xhtml)

	if output_xhtml5:
		output_xhtml = output_xhtml.replace("\t\t<meta charset=\"utf-8\"/>\n", "")
		output_xhtml = output_xhtml.replace("\t\t<style/>\n", "")
		output_xhtml = regex.sub(r'xml:lang="([^"]+?)"', r'xml:lang="\1" lang="\1"', output_xhtml)

		# Re-add a doctype
		output_xhtml = output_xhtml.replace("<?xml version=\"1.0\" encoding=\"utf-8\"?>", "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<!DOCTYPE html>")
	else:
		# Remove the XML declaration and re-add the doctype
		output_xhtml = regex.sub(r"<\?xml.+?\?>", "<!doctype html>", output_xhtml)
		output_xhtml = regex.sub(r" epub:prefix=\".+?\"", "", output_xhtml)

		# Remove CDATA
		output_xhtml = output_xhtml.replace("<![CDATA[", "")
		output_xhtml = output_xhtml.replace("]]>", "")

		# Make some replacements for HTML5 compatibility
		output_xhtml = output_xhtml.replace("epub:type", "data-epub-type")
		output_xhtml = output_xhtml.replace("epub|type", "data-epub-type")
		output_xhtml = regex.sub(r" xmlns.+?=\".+?\"", "", output_xhtml)
		output_xhtml = output_xhtml.replace("xml:lang", "lang")

	return output_xhtml
def recompose(self, output_xhtml5: bool, extra_css_file: Path = None) -> str:
	"""
	Iterate over the XHTML files in this epub and "recompose" them into a single XHTML string representing this ebook.

	INPUTS
	output_xhtml5: true to output XHTML5 instead of HTML5
	extra_css_file: a path to an additional CSS file to include after the ebook's default stylesheets

	OUTPUTS
	A string of HTML5 representing the entire recomposed ebook.
	"""

	# Get some header data: title, core and local css
	title = self.metadata_dom.xpath("/package/metadata/dc:title/text()")[0]
	language = self.metadata_dom.xpath("/package/metadata/dc:language/text()")[0]

	css = ""
	namespaces: List[str] = []

	css_filenames = ["core.css", "se.css", "local.css"]

	if extra_css_file:
		css_filenames.append(str(extra_css_file))

	for filename in css_filenames:
		filepath = self.content_path / "css" / filename
		file_css = self.get_file(filepath)

		namespaces = namespaces + regex.findall(r"@namespace.+?;", file_css)

		file_css = regex.sub(r"\s*@(charset|namespace).+?;\s*", "\n", file_css).strip()

		css = css + f"\n\n\n/* {filepath.name} */\n" + file_css

	css = css.strip()

	namespaces = list(set(namespaces))

	if namespaces:
		css = "\n" + css

		for namespace in namespaces:
			css = namespace + "\n" + css

	css = "\t\t\t".join(css.splitlines(True)) + "\n"

	# Remove min-height from CSS since it doesn't really apply to the single page format.
	# It occurs at least in se.css
	css = regex.sub(r"\s*min-height: [^;]+?;", "", css)

	# Remove -epub-* CSS as it's invalid in a browser context
	css = regex.sub(r"\s*\-epub\-[^;]+?;", "", css)

	output_xhtml = f"<?xml version=\"1.0\" encoding=\"utf-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\" xmlns:epub=\"http://www.idpf.org/2007/ops\" epub:prefix=\"z3998: http://www.daisy.org/z3998/2012/vocab/structure/, se: https://standardebooks.org/vocab/1.0\" xml:lang=\"{language}\"><head><meta charset=\"utf-8\"/><title>{title}</title><style/></head><body></body></html>"
	output_dom = se.formatting.EasyXmlTree(output_xhtml)
	output_dom.is_css_applied = True # We will apply CSS recursively to nodes that will be attached to output_dom, so set the bit here

	# Iterate over spine items in order and recompose them into our output
	needs_wrapper_css = False
	for file_path in self.spine_file_paths:
		dom = self.get_dom(file_path)

		# Apply the stylesheet to see if we have `position: absolute` on any items. If so, apply `position: relative` to its closest <section> ancestor.
		# See https://standardebooks.org/ebooks/jean-toomer/cane for an example of this in action
		dom.apply_css(css)

		# Select the deepest <section>s or <article>s with id attributes that have ONLY <figure> or <img> children, where one of those children has `position: absolute`
		for node in dom.xpath("/html/body//*[@id and (name() = 'section' or name() = 'article') and not(.//*[(name() = 'section' or name() = 'article') and not(preceding-sibling::* or following-sibling::*)]) and count(./*[(name() = 'figure' or name() = 'img')]) = count(./*) and .//*[(name() = 'figure' or name() = 'img') and @data-css-position = 'absolute']]"):
			needs_wrapper_css = True

			# Wrap the sections in a div that we style later
			wrapper_element = etree.SubElement(node.lxml_element, "div")
			wrapper_element.set("class", "positioning-wrapper")

			for child in node.xpath("./*[(name() = 'figure' or name() = 'img')]"):
				wrapper_element.append(child.lxml_element) # .append() will *move* the element to the end of wrapper_element

		# Now, recompose the children
		for node in dom.xpath("/html/body/*"):
			try:
				self._recompose_xhtml(node, output_dom)
			except se.SeException as ex:
				raise se.SeException(f"[path][link=file://{file_path}]{file_path}[/][/]: {ex}") from ex

	# Did we add wrappers? If so, add the CSS.
	# We also have to give the wrapper a height, because it may have siblings that were recomposed in from other files
	if needs_wrapper_css:
		css = css + "\n\t\t\t.positioning-wrapper{\n\t\t\t\tposition: relative; height: 100vh;\n\t\t\t}\n"

	# Add the ToC after the titlepage
	toc_dom = self.get_dom(self.toc_path)
	titlepage_node = output_dom.xpath("//*[contains(concat(' ', @epub:type, ' '), ' titlepage ')]")[0]

	for node in toc_dom.xpath("//nav[1]"):
		titlepage_node.lxml_element.addnext(node.lxml_element)

	# Replace all <a href> links with internal links
	for link in output_dom.xpath("//a[not(re:test(@href, '^https?://')) and contains(@href, '#')]"):
		link.set_attr("href", regex.sub(r".+(#.+)$", r"\1", link.get_attr("href")))

	# Replace all <a href> links to entire files
	for link in output_dom.xpath("//a[not(re:test(@href, '^https?://')) and not(contains(@href, '#'))]"):
		href = link.get_attr("href")
		href = regex.sub(r".+/([^/]+)$", r"#\1", href)
		href = regex.sub(r"\.xhtml$", "", href)
		link.set_attr("href", href)

	for node in output_dom.xpath("/html/body//a[re:test(@href, '^(\\.\\./)?text/(.+?)\\.xhtml$')]"):
		node.set_attr("href", regex.sub(r"(\.\./)?text/(.+?)\.xhtml", r"#\2", node.get_attr("href")))

	for node in output_dom.xpath("/html/body//a[re:test(@href, '^(\\.\\./)?text/.+?\\.xhtml#(.+?)$')]"):
		node.set_attr("href", regex.sub(r"(\.\./)?text/.+?\.xhtml#(.+?)", r"#\2", node.get_attr("href")))

	# Make some compatibility adjustments
	if output_xhtml5:
		for node in output_dom.xpath("/html/head/meta[@charset]"):
			node.remove()

		for node in output_dom.xpath("//*[@xml:lang]"):
			node.set_attr("lang", node.get_attr("xml:lang"))
	else:
		for node in output_dom.xpath("/html[@epub:prefix]"):
			node.remove_attr("epub:prefix")

		for node in output_dom.xpath("//*[@xml:lang]"):
			node.set_attr("lang", node.get_attr("xml:lang"))
			node.remove_attr("xml:lang")

		for node in output_dom.xpath("//*[@epub:type]"):
			node.set_attr("data-epub-type", node.get_attr("epub:type"))
			node.remove_attr("epub:type")

	# Get the output XHTML as a string
	output_xhtml = output_dom.to_string()

	# All done, clean the output
	# Very large files like Ulysses S. Grant's memoirs or Through the Looking Glass will crash lxml due to their size.
	# The inlined SVGs get too big.
	# So, if the byte size of the XHTML string is larger than an arbitrary size, don't pretty print the output.
	# Pepys is about 20,000,000 bytes
	if getsizeof(output_xhtml) < 100000000:
		output_xhtml = se.formatting.format_xhtml(output_xhtml)

	# Insert our CSS. We do this after `clean` because `clean` will escape > in the CSS
	output_xhtml = regex.sub(r"<style/>", "<style><![CDATA[\n\t\t\t" + css + "\t\t]]></style>", output_xhtml)

	if output_xhtml5:
		output_xhtml = output_xhtml.replace("\t\t<style/>\n", "")

		# Re-add a doctype
		output_xhtml = output_xhtml.replace("<?xml version=\"1.0\" encoding=\"utf-8\"?>", "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<!DOCTYPE html>")
	else:
		# Remove the XML declaration and re-add the doctype
		output_xhtml = regex.sub(r"<\?xml.+?\?>", "<!doctype html>", output_xhtml)

		# Remove CDATA
		output_xhtml = output_xhtml.replace("<![CDATA[", "")
		output_xhtml = output_xhtml.replace("]]>", "")

		# Make some replacements for HTML5 compatibility
		output_xhtml = output_xhtml.replace("epub|type", "data-epub-type")
		output_xhtml = regex.sub(r" xmlns.+?=\".+?\"", "", output_xhtml)

	return output_xhtml
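# Usage sketch (illustrative, not part of the original module): a minimal example of
# calling recompose(). It assumes this method lives on the SeEpub class in se.se_epub
# and that the constructor accepts the ebook's root directory; the import path, the
# constructor signature, and the file paths below are assumptions for illustration only.
if __name__ == "__main__":
	from pathlib import Path

	from se.se_epub import SeEpub  # assumed import path

	ebook = SeEpub("/path/to/ebook-root")  # hypothetical ebook root directory

	# Recompose into a single HTML5 page, layering an extra stylesheet on top of
	# core.css, se.css, and local.css.
	html5 = ebook.recompose(output_xhtml5=False, extra_css_file=Path("extra.css"))
	Path("ebook.html").write_text(html5, encoding="utf-8")

	# Recompose as XHTML5 instead, which keeps the XML declaration and xml:lang attributes.
	xhtml5 = ebook.recompose(output_xhtml5=True)
	Path("ebook.xhtml").write_text(xhtml5, encoding="utf-8")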