def get_dom(self, file_path: Path, remove_comments=False) -> se.easy_xml.EasyXmlTree:
	"""
	Get an EasyXmlTree DOM object for a given file.
	Contents are cached so that we don't hit the disk or re-parse DOMs repeatedly.

	INPUTS
	file_path: A Path pointing to the file

	OUTPUTS
	An EasyXmlTree DOM object representing the file contents.
	"""

	file_path_str = str(file_path) + "_" + str(remove_comments)

	if file_path_str not in self._dom_cache:
		file_contents = self.get_file(file_path)

		try:
			self._dom_cache[file_path_str] = se.easy_xml.EasyXmlTree(file_contents)

			# Remove comments
			if remove_comments:
				for node in self._dom_cache[file_path_str].xpath("//comment()"):
					node.remove()

		except etree.XMLSyntaxError as ex:
			raise se.InvalidXhtmlException(f"Couldn’t parse XML in [path][link=file://{file_path.resolve()}]{file_path}[/][/]. Exception: {ex}") from ex
		except FileNotFoundError:
			raise
		except se.InvalidXmlException as ex:
			raise se.InvalidXhtmlException(f"Couldn’t parse XML in [path][link=file://{file_path.resolve()}]{file_path}[/][/]. Exception: {ex.__cause__}") from ex
		except Exception as ex:
			raise se.InvalidXhtmlException(f"Couldn’t parse XML in [path][link=file://{file_path.resolve()}]{file_path}[/][/].") from ex

	return self._dom_cache[file_path_str]
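# Illustrative usage sketch (hypothetical paths; not part of the original source).
# Repeated calls with the same path and remove_comments flag return the cached tree:
#
#   dom = self.get_dom(self.content_path / "epub" / "text" / "chapter-1.xhtml")
#   same_dom = self.get_dom(self.content_path / "epub" / "text" / "chapter-1.xhtml")
#   assert dom is same_dom  # served from self._dom_cache
#
# Passing remove_comments=True produces a distinct cache key, so the comment-stripped
# tree is cached separately from the unstripped one.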
def _recompose_xhtml(self, section: Tag, output_soup: BeautifulSoup) -> None:
	"""
	Helper function used in self.recompose().
	Recursive function for recomposing a series of XHTML files into a single XHTML file.

	INPUTS
	section: A BS4 tag to inspect
	output_soup: A BS4 object representing the entire soup

	OUTPUTS
	None
	"""

	# Quick sanity check before we begin
	if "id" not in section.attrs or (section.parent.name.lower() != "body" and "id" not in section.parent.attrs):
		raise se.InvalidXhtmlException("Section without ID attribute.")

	# Try to find our parent tag in the output, by ID.
	# If it's not in the output, then append it to the tag's closest parent by ID (or <body>), then iterate over its children and do the same.
	existing_section = output_soup.select("#" + section["id"])
	if not existing_section:
		if section.parent.name.lower() == "body":
			output_soup.body.append(self._new_bs4_tag(section, output_soup))
		else:
			output_soup.select("#" + section.parent["id"])[0].append(self._new_bs4_tag(section, output_soup))

		existing_section = output_soup.select("#" + section["id"])

	for child in section.children:
		if not isinstance(child, str):
			tag_name = child.name.lower()
			if tag_name in ("section", "article"):
				self._recompose_xhtml(child, output_soup)
			else:
				existing_section[0].append(child)
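# Illustrative sketch of the recomposition above (hypothetical input, not from the
# original source). Given two files that both contain <section id="part-1">, the
# second file's child section is appended inside the already-recomposed parent, keyed by ID:
#
#   File 1: <body><section id="part-1">…</section></body>
#   File 2: <body><section id="part-1"><section id="chapter-1">…</section></section></body>
#
#   Output: <body><section id="part-1">…<section id="chapter-1">…</section></section></body>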
def _recompose_xhtml(self, section: se.easy_xml.EasyXmlElement, output_dom: se.easy_xml.EasyXmlTree) -> None:
	"""
	Helper function used in self.recompose().

	INPUTS
	section: An EasyXmlElement to inspect
	output_dom: An EasyXmlTree representing the entire output dom

	OUTPUTS
	None
	"""

	# Quick sanity check before we begin
	if not section.get_attr("id") or (section.parent.tag.lower() != "body" and not section.parent.get_attr("id")):
		raise se.InvalidXhtmlException(f"Section without [attr]id[/] attribute: [html]{section.to_tag_string()}[/]")

	# Top-level sections inherit the semantics of their parent <body>
	if section.parent.tag.lower() == "body" and not section.get_attr("data-parent"):
		section.set_attr("epub:type", f"{section.get_attr('epub:type')} {section.parent.get_attr('epub:type')}".strip())

	# Try to find our parent element in the current output dom, by ID.
	# If it's not in the output, then append this element to the element's closest parent by ID (or <body>), then iterate over its children and do the same.
	existing_section = output_dom.xpath(f"//*[@id='{section.get_attr('data-parent')}']")
	if existing_section:
		existing_section[0].append(section)
	else:
		output_dom.xpath("/html/body")[0].append(section)

	# Convert all <img> references to inline base64.
	# We even convert SVGs instead of inlining them, because CSS won't allow us to style inlined SVGs
	# (for example if we want to apply max-width or filter: invert())
	for img in section.xpath("//img[starts-with(@src, '../images/')]"):
		img.set_attr("src", se.images.get_data_url(self.content_path / img.get_attr("src").replace("../", "")))
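# Illustrative example of the epub:type merge above (hypothetical values): a top-level
# section inherits its <body>'s semantics, so
#   <body epub:type="bodymatter"><section epub:type="chapter">
# recomposes as
#   <section epub:type="chapter bodymatter">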
def _recompose_xhtml(self, section: se.easy_xml.EasyXmlElement, output_dom: se.easy_xml.EasyXmlTree) -> None:
	"""
	Helper function used in self.recompose().
	Recursive function for recomposing a series of XHTML files into a single XHTML file.

	INPUTS
	section: An EasyXmlElement to inspect
	output_dom: An EasyXmlTree representing the entire output dom

	OUTPUTS
	None
	"""

	# Quick sanity check before we begin
	if not section.get_attr("id") or (section.parent.tag.lower() != "body" and not section.parent.get_attr("id")):
		raise se.InvalidXhtmlException("Section without [attr]id[/] attribute.")

	# Top-level sections inherit the semantics of their parent <body>
	if section.parent.tag.lower() == "body":
		section.set_attr("epub:type", f"{section.get_attr('epub:type')} {section.parent.get_attr('epub:type')}".strip())

	# Try to find our parent tag in the output, by ID.
	# If it's not in the output, then append it to the tag's closest parent by ID (or <body>), then iterate over its children and do the same.
	existing_section = output_dom.xpath(f"//*[@id='{section.get_attr('id')}']")
	if not existing_section:
		if section.parent.tag.lower() == "body":
			output_dom.xpath("/html/body")[0].append(section)
		else:
			output_dom.xpath(f"//*[@id='{section.parent.get_attr('id')}']")[0].append(section)

		existing_section = output_dom.xpath(f"//*[@id='{section.get_attr('id')}']")

	# Convert all <img> references to inline base64.
	# We even convert SVGs instead of inlining them, because CSS won't allow us to style inlined SVGs
	# (for example if we want to apply max-width or filter: invert())
	for img in section.xpath("//img[starts-with(@src, '../images/')]"):
		src = img.get_attr("src").replace("../", "")

		with open(self.content_path / src, "rb") as binary_file:
			image_contents_base64 = base64.b64encode(binary_file.read()).decode()

		if src.endswith(".svg"):
			img.set_attr("src", f"data:image/svg+xml;base64, {image_contents_base64}")
		elif src.endswith(".jpg"):
			img.set_attr("src", f"data:image/jpeg;base64, {image_contents_base64}")
		elif src.endswith(".png"):
			img.set_attr("src", f"data:image/png;base64, {image_contents_base64}")

	for child in section.xpath("./*"):
		if child.tag in ("section", "article"):
			self._recompose_xhtml(child, output_dom)
		else:
			existing_section[0].append(child)
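# A minimal standalone sketch of the data-URL conversion above (hypothetical helper
# name; the earlier revision delegates this to se.images.get_data_url instead):
def _image_to_data_url(image_path: Path) -> str:
	"""Return a base64 `data:` URL for an SVG/JPG/PNG file, mirroring the loop above."""
	mime_types = {".svg": "image/svg+xml", ".jpg": "image/jpeg", ".png": "image/png"}

	with open(image_path, "rb") as binary_file:
		encoded = base64.b64encode(binary_file.read()).decode()

	return f"data:{mime_types[image_path.suffix]};base64, {encoded}"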
def format_xhtml(xhtml: str) -> str:
	"""
	Pretty-print well-formed XHTML.

	INPUTS
	xhtml: A string of well-formed XHTML

	OUTPUTS
	A string of pretty-printed XHTML.
	"""

	# Epub3 doesn't allow named entities, so convert them to their unicode equivalents.
	# But don't unescape the content.opf long-description accidentally.
	xhtml = regex.sub(r"&#?\w+;", _replace_character_references, xhtml)

	# Remove unnecessary doctypes which can cause xmllint to hang
	xhtml = regex.sub(r"<!DOCTYPE[^>]+?>", "", xhtml, flags=regex.DOTALL)

	# Remove white space between opening/closing tag and text nodes.
	# We do this first so that we can still format line breaks after <br/>.
	# Exclude comments.
	xhtml = regex.sub(r"(<(?:[^!/][^>]*?[^/]|[a-z])>)\s+([^\s<])", r"\1\2", xhtml, flags=regex.IGNORECASE)
	xhtml = regex.sub(r"([^\s>])\s+(</[^>]+?>)", r"\1\2", xhtml, flags=regex.IGNORECASE)

	try:
		tree = _format_xml_str(xhtml)
	except Exception as ex:
		raise se.InvalidXhtmlException(f"Couldn’t parse XHTML file. Exception: {ex}") from ex

	# Lowercase attribute names
	for node in tree.xpath("//*[attribute::*[re:test(local-name(), '[A-Z]')]]", namespaces=se.XHTML_NAMESPACES):
		for key, value in node.items(): # Iterate over attributes
			node.attrib.pop(key) # Remove the attribute
			node.attrib[key.lower()] = value # Re-add the attribute, lowercased

	# Lowercase tag names
	for node in tree.xpath("//*[re:test(local-name(), '[A-Z]')]", namespaces=se.XHTML_NAMESPACES):
		node.tag = node.tag.lower()

	# Format <style> elements
	_format_style_elements(tree)

	# Remove white space between non-tags and <br/>
	xhtml = regex.sub(r"([^>\s])\s+<br/>", r"\1<br/>", _xml_tree_to_string(tree))

	return xhtml
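# Illustrative before/after for the lowercasing passes above (hypothetical input):
#   in:  <p CLASS="noindent"><SPAN>text</SPAN></p>
#   out: <p class="noindent"><span>text</span></p>
# The re:test() xpath extension comes from the EXSLT regular-expressions namespace
# (presumably mapped to the `re` prefix in se.XHTML_NAMESPACES), which is what lets us
# select only nodes whose names contain uppercase characters.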
def format_xhtml(xhtml: str, is_metadata_file: bool = False, is_endnotes_file: bool = False, is_colophon_file: bool = False) -> str: # pylint: disable=unused-argument
	"""
	Pretty-print well-formed XHTML.

	INPUTS
	xhtml: A string of well-formed XHTML
	is_metadata_file: True if the passed XHTML is an SE content.opf metadata file
	is_endnotes_file: True if the passed XHTML is an SE endnotes file
	is_colophon_file: True if the passed XHTML is an SE colophon file

	OUTPUTS
	A string of pretty-printed XHTML.
	"""

	if is_metadata_file:
		# Replace html entities in the long description so we can clean it too.
		# We re-establish them later.
		xhtml = xhtml.replace("&lt;", "<")
		xhtml = xhtml.replace("&gt;", ">")
	else:
		# Epub3 doesn't allow named entities, so convert them to their unicode equivalents.
		# But don't unescape the content.opf long-description accidentally.
		xhtml = regex.sub(r"&#?\w+;", _replace_character_references, xhtml)

	# Remove unnecessary doctypes which can cause xmllint to hang
	xhtml = regex.sub(r"<!DOCTYPE[^>]+?>", "", xhtml, flags=regex.DOTALL)

	# Canonicalize and format XHTML
	try:
		xhtml = pretty_print_xml(xhtml)
	except Exception as ex:
		raise se.InvalidXhtmlException(f"Couldn’t parse file. Files must be in XHTML format, which is not the same as HTML. Exception: {ex}") from ex

	# Normalize unicode characters
	xhtml = unicodedata.normalize("NFC", xhtml)

	# Attempt to pretty-print CSS, if we have any (like in cover.svg or titlepage.svg)
	css = regex.findall(r"<style type=\"text/css\">([^<]+?)</style>", xhtml, flags=regex.DOTALL)
	if css:
		# Note that we can't (yet) use a generic format_css function, because we rely
		# on the output to be structured in a particular way.
		# This algorithm can't handle arbitrarily-formatted CSS.
		css = css[0]
		css = css.replace("{ ", "{\n")
		css = css.replace(" }", "\n}\n\n")
		css = css.replace("; ", ";\n")
		css = regex.sub(r"^\s*(.+?){", "\t\t\\1{", css, flags=regex.MULTILINE)
		css = regex.sub(r"^\s*}", "\t\t}\n", css, flags=regex.MULTILINE)
		css = regex.sub(r"^([^{}]+?)$", "\t\t\t\\1", css, flags=regex.MULTILINE)
		css = f"\t\t{css.strip()}"

		xhtml = regex.sub(r"<style type=\"text/css\">([^<]+?)</style>", f"<style type=\"text/css\">\n{css}\n\t</style>", xhtml, flags=regex.DOTALL)

	# Attempt to pretty-print the long description, which has special formatting
	if "<p>" in xhtml:
		xhtml = xhtml.replace(" <p>", "\n\t\t\t<p>")
		xhtml = xhtml.replace("</p> </meta>", "</p>\n\t\t</meta>")

	# Clean the long description, if we can find it
	if is_metadata_file:
		long_description = regex.findall(r"<meta id=\"long-description\" property=\"se:long-description\" refines=\"#description\">(.+?)</meta>", xhtml, flags=regex.DOTALL)
		if long_description:
			escaped_long_description = long_description[0].replace("<", "&lt;")
			escaped_long_description = escaped_long_description.replace(">", "&gt;")
			xhtml = xhtml.replace(long_description[0], escaped_long_description)

	# Almost done. Let's clean CSS in <style> elements, like we find in SVG files.
	matches = regex.findall(r"^(\s*)(<style[^>]*?>)(.+?)(</style>)", xhtml, flags=regex.DOTALL | regex.MULTILINE)
	for match in matches:
		css = format_css(match[2])

		# Indent the CSS one level deeper than the <style> element
		css = ''.join(match[0] + "\t" + line + "\n" for line in css.splitlines())
		css = css.strip("\n")
		css = regex.sub(r"^\s+$", "", css, flags=regex.MULTILINE) # Remove indents from lines that are just white space

		xhtml = xhtml.replace(f"{match[0]}{match[1]}{match[2]}{match[3]}", f"{match[0]}{match[1]}\n{css}\n{match[0]}{match[3]}")

	return xhtml
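# Illustrative round trip for the long-description handling above (hypothetical metadata):
# the escaped markup arrives as
#   <meta id="long-description" property="se:long-description" refines="#description">&lt;p&gt;A novel.&lt;/p&gt;</meta>
# and is unescaped to real <p> tags so it can be pretty-printed alongside the rest of
# the file, then re-escaped before the formatted metadata is returned.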
def format_xhtml(xhtml: str, single_lines: bool = False, is_metadata_file: bool = False, is_endnotes_file: bool = False, is_colophon_file: bool = False) -> str:
	"""
	Pretty-print well-formed XHTML.

	INPUTS
	xhtml: A string of well-formed XHTML
	single_lines: True to collapse hard-wrapped line breaks, like those found at Project Gutenberg, to single lines
	is_metadata_file: True if the passed XHTML is an SE content.opf metadata file
	is_endnotes_file: True if the passed XHTML is an SE endnotes file
	is_colophon_file: True if the passed XHTML is an SE colophon file

	OUTPUTS
	A string of pretty-printed XHTML.
	"""

	try:
		xmllint_path = Path(shutil.which("xmllint"))
	except Exception as ex:
		raise se.MissingDependencyException("Couldn’t locate xmllint. Is it installed?") from ex

	env = os.environ.copy()
	env["XMLLINT_INDENT"] = "\t"

	if single_lines:
		xhtml = xhtml.replace("\n", " ")
		# Use this instead of \s+, because \s+ will replace special white space (like hair space or nbsp) with a regular space
		xhtml = regex.sub(r"\s{2,}", " ", xhtml)

	# Epub3 doesn't allow named entities, so convert them to their unicode equivalents.
	# But don't unescape the content.opf long-description accidentally.
	if not is_metadata_file:
		xhtml = regex.sub(r"&#?\w+;", _replace_character_references, xhtml)

	# Remove unnecessary doctypes which can cause xmllint to hang
	xhtml = regex.sub(r"<!DOCTYPE[^>]+?>", "", xhtml, flags=regex.DOTALL)

	# Remove spaces and newlines before <br/>. We do this before using xmllint, because
	# in some cases (like poetry) we want the <br/>s on separate lines; but in other cases (like
	# line breaks in a flow-level element) we don't.
	xhtml = regex.sub(r"\s*<br/?>(\s*<br/>)?", "<br/>", xhtml, flags=regex.DOTALL)

	# Canonicalize XHTML.
	# Path arguments must be cast to string for Windows compatibility.
	result = subprocess.run([str(xmllint_path), "--c14n", "-"], input=xhtml.encode(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
	xhtml = result.stdout.decode()

	try:
		error = result.stderr.decode().strip()
	except UnicodeDecodeError as ex:
		raise se.InvalidEncodingException("Invalid encoding; UTF-8 expected: {}".format(ex)) from ex
	except Exception as ex:
		raise se.InvalidXhtmlException("Couldn't parse file; files must be in XHTML format, which is not the same as HTML: {}".format(ex)) from ex

	if error:
		raise se.InvalidXhtmlException("Couldn't parse file; files must be in XHTML format, which is not the same as HTML. xmllint says:\n{}".format(error.replace("-:", "Line ")))

	# Add the XML header that xmllint stripped during c14n
	xhtml = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + xhtml
	xhtml = xhtml.replace("encoding=\"UTF-8\"", "encoding=\"utf-8\"")
	xhtml = unicodedata.normalize("NFC", xhtml)

	# Pretty-print XML.
	# Path arguments must be cast to string for Windows compatibility.
	xhtml = subprocess.run([str(xmllint_path), "--format", "-"], input=xhtml.encode(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env).stdout.decode()

	# Remove white space between some tags
	xhtml = regex.sub(r"<p([^>]*?)>\s+([^<\s])", "<p\\1>\\2", xhtml, flags=regex.DOTALL)
	xhtml = regex.sub(r"([^>\s])\s+</p>", "\\1</p>", xhtml, flags=regex.DOTALL)

	# xmllint has problems with removing spacing between some inline HTML5 elements. Try to fix those problems here.
	xhtml = regex.sub(r"</(abbr|cite|i|span|em)><(abbr|cite|i|span|em)", "</\\1> <\\2", xhtml)

	# Try to fix inline elements directly followed by an <a> tag, unless that <a> tag is a noteref
	xhtml = regex.sub(r"</(abbr|cite|i|span)><(a(?! href=\"[^\"]+?\" id=\"noteref\-))", "</\\1> <\\2", xhtml)

	# Two sequential inline elements, when they are the only children of a block, are indented.
	# But this messes up spacing if the 2nd element is a noteref.
	xhtml = regex.sub(r"</(abbr|cite|i|span)>\s+<(a href=\"[^\"]+?\" id=\"noteref\-)", "</\\1><\\2", xhtml, flags=regex.DOTALL)

	# Try to fix <cite> tags running next to referrer <a> tags
	if is_endnotes_file:
		xhtml = regex.sub(r"</cite>(<a href=\"[^\"]+?\" epub:type=\"backlink\")", "</cite> \\1", xhtml)

	if is_colophon_file:
		xhtml = regex.sub(r"\s*<br/>\s*", "<br/>\n\t\t\t", xhtml, flags=regex.DOTALL)

		section_xhtml = regex.findall(r"</header>(.+?)</section>", xhtml, flags=regex.DOTALL)
		if section_xhtml:
			section_xhtml = regex.sub(r"^\s*", "\t\t\t", section_xhtml[0], flags=regex.MULTILINE).strip()
			xhtml = regex.sub(r"</header>(.+?)</section>", "</header>\n\t\t\t{}\n\t\t</section>".format(section_xhtml), xhtml, flags=regex.DOTALL)

	if single_lines:
		# Attempt to pretty-print CSS, if we have any (like in cover.svg or titlepage.svg)
		css = regex.findall(r"<style type=\"text/css\">([^<]+?)</style>", xhtml, flags=regex.DOTALL)
		if css:
			# Note that we can't (yet) use a generic format_css function, because we rely
			# on the output of the single_lines flag to be structured in a particular way.
			# This algorithm can't handle arbitrarily-formatted CSS.
			css = css[0]
			css = css.replace("{ ", "{\n")
			css = css.replace(" }", "\n}\n\n")
			css = css.replace("; ", ";\n")
			css = regex.sub(r"^\s*(.+?){", "\t\t\\1{", css, flags=regex.MULTILINE)
			css = regex.sub(r"^\s*}", "\t\t}\n", css, flags=regex.MULTILINE)
			css = regex.sub(r"^([^{}]+?)$", "\t\t\t\\1", css, flags=regex.MULTILINE)
			css = "\t\t" + css.strip()

			xhtml = regex.sub(r"<style type=\"text/css\">([^<]+?)</style>", "<style type=\"text/css\">\n{}\n\t</style>".format(css), xhtml, flags=regex.DOTALL)

	# Attempt to pretty-print the long description, which has special formatting
	if "<p>" in xhtml:
		xhtml = xhtml.replace(" <p>", "\n\t\t\t<p>")
		xhtml = xhtml.replace("</p> </meta>", "</p>\n\t\t</meta>")

	return xhtml
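# Illustrative usage (hypothetical input; the exact output depends on the installed xmllint):
#
#   xhtml = '<html xmlns="http://www.w3.org/1999/xhtml"><body><p>Some   text&mdash;here</p></body></html>'
#   xhtml = format_xhtml(xhtml)
#
# The named entity &mdash; is converted to its unicode equivalent, the markup is
# canonicalized and tab-indented by xmllint, and the result is NFC-normalized.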
def _create_draft(args: Namespace):
	"""
	Implementation for `se create-draft`
	"""

	# Put together some variables for later use
	authors = []
	translators = []
	illustrators = []
	pg_producers = []
	title = args.title.replace("'", "’")

	for author in args.author:
		authors.append({"name": author.replace("'", "’"), "wiki_url": None, "nacoaf_url": None})

	if args.translator:
		for translator in args.translator:
			translators.append({"name": translator.replace("'", "’"), "wiki_url": None, "nacoaf_url": None})

	if args.illustrator:
		for illustrator in args.illustrator:
			illustrators.append({"name": illustrator.replace("'", "’"), "wiki_url": None, "nacoaf_url": None})

	title_string = title
	if authors and authors[0]["name"].lower() != "anonymous":
		title_string += ", by " + _generate_contributor_string(authors, False)

	identifier = ""
	for author in authors:
		identifier += se.formatting.make_url_safe(author["name"]) + "_"

	identifier = identifier.rstrip("_") + "/" + se.formatting.make_url_safe(title)

	sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", title)

	if translators:
		title_string = title_string + ". Translated by " + _generate_contributor_string(translators, False)

		identifier = identifier + "/"
		for translator in translators:
			identifier += se.formatting.make_url_safe(translator["name"]) + "_"

		identifier = identifier.rstrip("_")

	if illustrators:
		title_string = title_string + ". Illustrated by " + _generate_contributor_string(illustrators, False)

		identifier = identifier + "/"
		for illustrator in illustrators:
			identifier += se.formatting.make_url_safe(illustrator["name"]) + "_"

		identifier = identifier.rstrip("_")

	repo_name = identifier.replace("/", "_")
	repo_path = Path(repo_name).resolve()

	if repo_path.is_dir():
		raise se.InvalidInputException(f"Directory already exists: [path][link=file://{repo_path}]{repo_path}[/][/].")

	# Get data on authors
	for author in authors:
		if not args.offline and author["name"].lower() != "anonymous":
			author["wiki_url"], author["nacoaf_url"] = _get_wikipedia_url(author["name"], True)

	# Get data on translators
	for translator in translators:
		if not args.offline and translator["name"].lower() != "anonymous":
			translator["wiki_url"], translator["nacoaf_url"] = _get_wikipedia_url(translator["name"], True)

	# Get data on illustrators
	for illustrator in illustrators:
		if not args.offline and illustrator["name"].lower() != "anonymous":
			illustrator["wiki_url"], illustrator["nacoaf_url"] = _get_wikipedia_url(illustrator["name"], True)

	# Download PG HTML and do some fixups
	if args.pg_url:
		if args.offline:
			raise se.RemoteCommandErrorException("Cannot download Project Gutenberg ebook when offline option is enabled.")

		args.pg_url = args.pg_url.replace("http://", "https://")

		# Get the ebook metadata
		try:
			response = requests.get(args.pg_url)
			pg_metadata_html = response.text
		except Exception as ex:
			raise se.RemoteCommandErrorException(f"Couldn’t download Project Gutenberg ebook metadata page. Exception: {ex}") from ex

		parser = etree.HTMLParser()
		dom = etree.parse(StringIO(pg_metadata_html), parser)

		# Get the ebook HTML URL from the metadata
		pg_ebook_url = None
		for node in dom.xpath("/html/body//a[contains(@type, 'text/html')]"):
			pg_ebook_url = regex.sub(r"^//", "https://", node.get("href"))
			pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/", pg_ebook_url)

		if not pg_ebook_url:
			raise se.RemoteCommandErrorException("Downloaded ebook metadata, but couldn’t find URL for the ebook HTML.")
		# Get the ebook LCSH categories
		pg_subjects = []
		for node in dom.xpath("/html/body//td[contains(@property, 'dcterms:subject')]"):
			if node.get("datatype") == "dcterms:LCSH":
				for subject_link in node.xpath("./a"):
					pg_subjects.append(subject_link.text.strip())

		# Get the PG publication date
		pg_publication_year = None
		for node in dom.xpath("//td[@itemprop='datePublished']"):
			pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", node.text)

		# Get the actual ebook URL
		try:
			response = requests.get(pg_ebook_url)
			pg_ebook_html = response.text
		except Exception as ex:
			raise se.RemoteCommandErrorException(f"Couldn’t download Project Gutenberg ebook HTML. Exception: {ex}") from ex

		try:
			fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
			pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
		except Exception as ex:
			raise se.InvalidEncodingException(f"Couldn’t determine text encoding of Project Gutenberg HTML file. Exception: {ex}") from ex

		# Try to guess the ebook language
		pg_language = "en-US"
		if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
			pg_language = "en-GB"

	# Create necessary directories
	(repo_path / "images").mkdir(parents=True)
	(repo_path / "src" / "epub" / "css").mkdir(parents=True)
	(repo_path / "src" / "epub" / "images").mkdir(parents=True)
	(repo_path / "src" / "epub" / "text").mkdir(parents=True)
	(repo_path / "src" / "META-INF").mkdir(parents=True)

	is_pg_html_parsed = True

	# Write PG data if we have it
	if args.pg_url and pg_ebook_html:
		try:
			dom = etree.parse(StringIO(regex.sub(r"encoding=\".+?\"", "", pg_ebook_html)), parser)
			namespaces = {"re": "http://exslt.org/regular-expressions"}

			for node in dom.xpath("//*[re:test(text(), '\\*\\*\\*\\s*Produced by.+')]", namespaces=namespaces):
				producers_text = regex.sub(r"^<[^>]+?>", "", etree.tostring(node, encoding=str, with_tail=False))
				producers_text = regex.sub(r"<[^>]+?>$", "", producers_text)

				producers_text = regex.sub(r".+?Produced by (.+?)\s*$", "\\1", producers_text, flags=regex.DOTALL)
				producers_text = regex.sub(r"\(.+?\)", "", producers_text, flags=regex.DOTALL)
				producers_text = regex.sub(r"(at )?https?://www\.pgdp\.net", "", producers_text, flags=regex.DOTALL)
				producers_text = regex.sub(r"[\r\n]+", " ", producers_text, flags=regex.DOTALL)
				producers_text = regex.sub(r",? and ", ", and ", producers_text)
				producers_text = producers_text.replace(" and the Online", " and The Online")
				producers_text = producers_text.replace(", and ", ", ").strip()

				pg_producers = [producer.strip() for producer in regex.split(',|;', producers_text)]

			# Try to strip out the PG header
			for node in dom.xpath("//*[re:test(text(), '\\*\\*\\*\\s*START OF THIS')]", namespaces=namespaces):
				for sibling_node in node.xpath("./preceding-sibling::*"):
					easy_node = se.easy_xml.EasyXmlElement(sibling_node)
					easy_node.remove()

				easy_node = se.easy_xml.EasyXmlElement(node)
				easy_node.remove()

			# Try to strip out the PG license footer
			for node in dom.xpath("//*[re:test(text(), 'End of (the )?Project Gutenberg')]", namespaces=namespaces):
				for sibling_node in node.xpath("./following-sibling::*"):
					easy_node = se.easy_xml.EasyXmlElement(sibling_node)
					easy_node.remove()

				easy_node = se.easy_xml.EasyXmlElement(node)
				easy_node.remove()

			# lxml will put the xml declaration in a weird place, remove it first
			output = regex.sub(r"<\?xml.+?\?>", "", etree.tostring(dom, encoding="unicode"))

			# Now re-add it
			output = """<?xml version="1.0" encoding="utf-8"?>\n""" + output

			# lxml can also output duplicate default namespace declarations, so keep only the first one
			output = regex.sub(r"(xmlns=\".+?\")(\sxmlns=\".+?\")+", r"\1", output)

			with open(repo_path / "src" / "epub" / "text" / "body.xhtml", "w", encoding="utf-8") as file:
				file.write(output)
		except OSError as ex:
			raise se.InvalidFileException(f"Couldn’t write to ebook directory. Exception: {ex}") from ex
		except Exception:
			# Save this error for later, because it's still useful to complete the create-draft process
			# even if we've failed to parse PG's HTML source.
			is_pg_html_parsed = False
			se.quiet_remove(repo_path / "src" / "epub" / "text" / "body.xhtml")

	# Copy over templates
	_copy_template_file("gitignore", repo_path / ".gitignore")
	_copy_template_file("LICENSE.md", repo_path)
	_copy_template_file("container.xml", repo_path / "src" / "META-INF")
	_copy_template_file("mimetype", repo_path / "src")
	_copy_template_file("content.opf", repo_path / "src" / "epub")
	_copy_template_file("onix.xml", repo_path / "src" / "epub")
	_copy_template_file("toc.xhtml", repo_path / "src" / "epub")
	_copy_template_file("core.css", repo_path / "src" / "epub" / "css")
	_copy_template_file("local.css", repo_path / "src" / "epub" / "css")
	_copy_template_file("se.css", repo_path / "src" / "epub" / "css")
	_copy_template_file("logo.svg", repo_path / "src" / "epub" / "images")
	_copy_template_file("colophon.xhtml", repo_path / "src" / "epub" / "text")
	_copy_template_file("imprint.xhtml", repo_path / "src" / "epub" / "text")
	_copy_template_file("titlepage.xhtml", repo_path / "src" / "epub" / "text")
	_copy_template_file("uncopyright.xhtml", repo_path / "src" / "epub" / "text")
	_copy_template_file("titlepage.svg", repo_path / "images")
	_copy_template_file("cover.jpg", repo_path / "images" / "cover.jpg")
	_copy_template_file("cover.svg", repo_path / "images" / "cover.svg")

	# Try to find Wikipedia links if possible
	ebook_wiki_url = None
	# There's a "Short Fiction" Wikipedia article, so make an exception for that case
	if not args.offline and title != "Short Fiction":
		ebook_wiki_url, _ = _get_wikipedia_url(title, False)

	# Pre-fill a few templates
	_replace_in_file(repo_path / "src" / "epub" / "text" / "titlepage.xhtml", "TITLE_STRING", title_string)
	_replace_in_file(repo_path / "images" / "titlepage.svg", "TITLE_STRING", title_string)
	_replace_in_file(repo_path / "images" / "cover.svg", "TITLE_STRING", title_string)
	# Create the titlepage SVG
	contributors = {}
	if args.translator:
		contributors["translated by"] = _generate_contributor_string(translators, False)

	if args.illustrator:
		contributors["illustrated by"] = _generate_contributor_string(illustrators, False)

	with open(repo_path / "images" / "titlepage.svg", "w", encoding="utf-8") as file:
		file.write(_generate_titlepage_svg(title, [author["name"] for author in authors], contributors, title_string))

	# Create the cover SVG
	with open(repo_path / "images" / "cover.svg", "w", encoding="utf-8") as file:
		file.write(_generate_cover_svg(title, [author["name"] for author in authors], title_string))

	# Build the cover/titlepage for distribution
	epub = SeEpub(repo_path)
	epub.generate_cover_svg()
	epub.generate_titlepage_svg()

	if args.pg_url:
		_replace_in_file(repo_path / "src" / "epub" / "text" / "imprint.xhtml", "PG_URL", args.pg_url)

	# Fill out the colophon
	with open(repo_path / "src" / "epub" / "text" / "colophon.xhtml", "r+", encoding="utf-8") as file:
		colophon_xhtml = file.read()

		colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
		colophon_xhtml = colophon_xhtml.replace("TITLE", title)

		contributor_string = _generate_contributor_string(authors, True)

		if contributor_string == "":
			colophon_xhtml = colophon_xhtml.replace(" by<br/>\n\t\t\t<a href=\"AUTHOR_WIKI_URL\">AUTHOR</a>", contributor_string)
		else:
			colophon_xhtml = colophon_xhtml.replace("<a href=\"AUTHOR_WIKI_URL\">AUTHOR</a>", contributor_string)

		if translators:
			translator_block = f"It was translated from ORIGINAL_LANGUAGE in TRANSLATION_YEAR by<br/>\n\t\t\t{_generate_contributor_string(translators, True)}.</p>"
			colophon_xhtml = colophon_xhtml.replace("</p>\n\t\t\t<p>This ebook was produced for the<br/>", f"<br/>\n\t\t\t{translator_block}\n\t\t\t<p>This ebook was produced for the<br/>")

		if args.pg_url:
			colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

			if pg_publication_year:
				colophon_xhtml = colophon_xhtml.replace("PG_YEAR", pg_publication_year)

			if pg_producers:
				producers_xhtml = ""
				for i, producer in enumerate(pg_producers):
					if "Distributed Proofread" in producer:
						producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
					elif "anonymous" in producer.lower():
						producers_xhtml = producers_xhtml + "<b class=\"name\">An Anonymous Volunteer</b>"
					else:
						producers_xhtml = producers_xhtml + f"<b class=\"name\">{_add_name_abbr(producer).strip('.')}</b>"

					if i < len(pg_producers) - 1:
						producers_xhtml = producers_xhtml + ", "

					if i == len(pg_producers) - 2:
						producers_xhtml = producers_xhtml + "and "

				producers_xhtml = producers_xhtml + "<br/>"

				colophon_xhtml = colophon_xhtml.replace("<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>", producers_xhtml)

		file.seek(0)
		file.write(colophon_xhtml)
		file.truncate()

	# Fill out the metadata file
	with open(repo_path / "src" / "epub" / "content.opf", "r+", encoding="utf-8") as file:
		metadata_xml = file.read()

		metadata_xml = metadata_xml.replace("SE_IDENTIFIER", identifier)
		metadata_xml = metadata_xml.replace(">TITLE_SORT<", f">{sorted_title}<")
		metadata_xml = metadata_xml.replace(">TITLE<", f">{title}<")
		metadata_xml = metadata_xml.replace("VCS_IDENTIFIER", str(repo_name))

		if pg_producers:
			producers_xhtml = ""
			i = 1
			for producer in pg_producers:
				if "Distributed Proofread" in producer:
					producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">The Online Distributed Proofreading Team</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{i}\">https://pgdp.net</meta>\n"
				elif "anonymous" in producer.lower():
					producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">An Anonymous Volunteer</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Anonymous Volunteer, An</meta>\n"
				else:
					producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">{producer.strip('.')}</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">TRANSCRIBER_SORT</meta>\n"

				producers_xhtml = producers_xhtml + f"\t\t<meta property=\"role\" refines=\"#transcriber-{i}\" scheme=\"marc:relators\">trc</meta>\n"

				i = i + 1

			metadata_xml = regex.sub(r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>", "\t\t" + producers_xhtml.strip(), metadata_xml, flags=regex.DOTALL)

		if ebook_wiki_url:
			metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<", f">{ebook_wiki_url}<")

		authors_xml = _generate_metadata_contributor_xml(authors, "author")
		authors_xml = authors_xml.replace("dc:contributor", "dc:creator")
		metadata_xml = regex.sub(r"<dc:creator id=\"author\">AUTHOR</dc:creator>.+?scheme=\"marc:relators\">aut</meta>", authors_xml, metadata_xml, flags=regex.DOTALL)

		if translators:
			translators_xml = _generate_metadata_contributor_xml(translators, "translator")
			metadata_xml = regex.sub(r"<dc:contributor id=\"translator\">.+?scheme=\"marc:relators\">trl</meta>", translators_xml, metadata_xml, flags=regex.DOTALL)
		else:
			metadata_xml = regex.sub(r"<dc:contributor id=\"translator\">.+?scheme=\"marc:relators\">trl</meta>\n\t\t", "", metadata_xml, flags=regex.DOTALL)

		if illustrators:
			illustrators_xml = _generate_metadata_contributor_xml(illustrators, "illustrator")
			metadata_xml = regex.sub(r"<dc:contributor id=\"illustrator\">.+?scheme=\"marc:relators\">ill</meta>", illustrators_xml, metadata_xml, flags=regex.DOTALL)
		else:
			metadata_xml = regex.sub(r"<dc:contributor id=\"illustrator\">.+?scheme=\"marc:relators\">ill</meta>\n\t\t", "", metadata_xml, flags=regex.DOTALL)

		if args.pg_url:
			if pg_subjects:
				subject_xhtml = ""

				i = 1
				for subject in pg_subjects:
					subject_xhtml = subject_xhtml + f"\t\t<dc:subject id=\"subject-{i}\">{subject}</dc:subject>\n"
					i = i + 1

				i = 1
				for subject in pg_subjects:
					subject_xhtml = subject_xhtml + f"\t\t<meta property=\"authority\" refines=\"#subject-{i}\">LCSH</meta>\n"

					# Now, get the LCSH ID by querying LCSH directly.
					try:
						response = requests.get(f"https://id.loc.gov/search/?q=%22{urllib.parse.quote(subject)}%22")
						result = regex.search(fr"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{regex.escape(subject.replace(' -- ', '--'))}</a>", response.text)

						loc_id = "Unknown"
						try:
							loc_id = result.group(1)
						except Exception:
							pass

						subject_xhtml = subject_xhtml + f"\t\t<meta property=\"term\" refines=\"#subject-{i}\">{loc_id}</meta>\n"
					except Exception as ex:
						raise se.RemoteCommandErrorException(f"Couldn’t connect to [url][link=https://id.loc.gov]https://id.loc.gov[/][/]. Exception: {ex}") from ex
					i = i + 1

				metadata_xml = regex.sub(r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>", "\t\t" + subject_xhtml.strip(), metadata_xml)

			metadata_xml = metadata_xml.replace("<dc:language>LANG</dc:language>", f"<dc:language>{pg_language}</dc:language>")
			metadata_xml = metadata_xml.replace("<dc:source>PG_URL</dc:source>", f"<dc:source>{args.pg_url}</dc:source>")

		file.seek(0)
		file.write(metadata_xml)
		file.truncate()

	# Set up local git repo
	repo = git.Repo.init(repo_path)

	if args.email:
		with repo.config_writer() as config:
			config.set_value("user", "email", args.email)

	if args.pg_url and pg_ebook_html and not is_pg_html_parsed:
		raise se.InvalidXhtmlException("Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook.")
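# Illustrative example of the identifier scheme above (hypothetical book; the exact
# slug format depends on se.formatting.make_url_safe): author slugs are joined by
# underscores, then the title, then any translator and illustrator slugs, with each
# group separated by slashes, and the repo name replaces slashes with underscores:
#
#   identifier: "fyodor-dostoevsky/white-nights/constance-garnett"
#   repo_name:  "fyodor-dostoevsky_white-nights_constance-garnett"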
def build(self, metadata_xhtml, metadata_tree, run_epubcheck, build_kobo, build_kindle, output_directory, proof, build_covers, verbose):
	"""
	Entry point for `se build`
	"""

	calibre_app_mac_path = "/Applications/calibre.app/Contents/MacOS/"
	epubcheck_path = shutil.which("epubcheck")
	ebook_convert_path = shutil.which("ebook-convert")

	# Look for the default Mac calibre app path if none was found in $PATH
	if ebook_convert_path is None and os.path.exists(calibre_app_mac_path):
		ebook_convert_path = os.path.join(calibre_app_mac_path, "ebook-convert")

	rsvg_convert_path = shutil.which("rsvg-convert")
	convert_path = shutil.which("convert")
	navdoc2ncx_xsl_filename = resource_filename("se", os.path.join("data", "navdoc2ncx.xsl"))
	mathml_xsl_filename = resource_filename("se", os.path.join("data", "mathmlcontent2presentation.xsl"))

	# Check for some required tools
	if run_epubcheck and epubcheck_path is None:
		raise se.MissingDependencyException("Couldn’t locate epubcheck. Is it installed?")

	if rsvg_convert_path is None:
		raise se.MissingDependencyException("Couldn’t locate rsvg-convert. Is librsvg2-bin installed?")

	if build_kindle and ebook_convert_path is None:
		raise se.MissingDependencyException("Couldn’t locate ebook-convert. Is Calibre installed?")

	if build_kindle and convert_path is None:
		raise se.MissingDependencyException("Couldn’t locate convert. Is ImageMagick installed?")

	# Check the output directory and create it if it doesn't exist
	if output_directory is None:
		output_directory = os.getcwd()

	output_directory = os.path.abspath(output_directory)

	if os.path.exists(output_directory):
		if not os.path.isdir(output_directory):
			raise se.InvalidInputException("Not a directory: {}".format(output_directory))
	else:
		# Doesn't exist, try to create it
		try:
			os.makedirs(output_directory)
		except OSError as exception:
			if exception.errno != errno.EEXIST:
				raise se.FileExistsException("Couldn’t create output directory.") from exception

	# All clear to start building!
	if verbose:
		print("Building {} ...".format(self.directory))

	with tempfile.TemporaryDirectory() as work_directory:
		work_epub_root_directory = os.path.join(work_directory, "src")

		copy_tree(self.directory, work_directory)

		try:
			shutil.rmtree(os.path.join(work_directory, ".git"))
		except Exception:
			pass

		# By convention the ASIN is set to the SHA-1 sum of the book's identifying URL
		identifier = metadata_tree.xpath("//dc:identifier")[0].inner_html().replace("url:", "")
		asin = sha1(identifier.encode("utf-8")).hexdigest()

		title = metadata_tree.xpath("//dc:title")[0].inner_html()
		url_title = se.formatting.make_url_safe(title)

		url_author = ""
		for author in metadata_tree.xpath("//dc:creator"):
			url_author = url_author + se.formatting.make_url_safe(author.inner_html()) + "_"

		url_author = url_author.rstrip("_")

		epub_output_filename = "{}_{}{}.epub".format(url_author, url_title, ".proof" if proof else "")
		epub3_output_filename = "{}_{}{}.epub3".format(url_author, url_title, ".proof" if proof else "")
		kobo_output_filename = "{}_{}{}.kepub.epub".format(url_author, url_title, ".proof" if proof else "")
		kindle_output_filename = "{}_{}{}.azw3".format(url_author, url_title, ".proof" if proof else "")

		# Clean up old output files if any
		for kindle_thumbnail in glob.glob(os.path.join(output_directory, "thumbnail_{}_EBOK_portrait.jpg".format(asin))):
			se.quiet_remove(kindle_thumbnail)
		se.quiet_remove(os.path.join(output_directory, "cover.jpg"))
		se.quiet_remove(os.path.join(output_directory, "cover-thumbnail.jpg"))
		se.quiet_remove(os.path.join(output_directory, epub_output_filename))
		se.quiet_remove(os.path.join(output_directory, epub3_output_filename))
		se.quiet_remove(os.path.join(output_directory, kobo_output_filename))
		se.quiet_remove(os.path.join(output_directory, kindle_output_filename))

		# Are we including proofreading CSS?
		if proof:
			with open(os.path.join(work_epub_root_directory, "epub", "css", "local.css"), "a", encoding="utf-8") as local_css_file:
				with open(resource_filename("se", os.path.join("data", "templates", "proofreading.css")), "r", encoding="utf-8") as proofreading_css_file:
					local_css_file.write(proofreading_css_file.read())

		# Output the pure epub3 file
		if verbose:
			print("\tBuilding {} ...".format(epub3_output_filename), end="", flush=True)

		se.epub.write_epub(work_epub_root_directory, os.path.join(output_directory, epub3_output_filename))

		if verbose:
			print(" OK")

		if build_kobo:
			if verbose:
				print("\tBuilding {} ...".format(kobo_output_filename), end="", flush=True)
		else:
			if verbose:
				print("\tBuilding {} ...".format(epub_output_filename), end="", flush=True)

		# Now add epub2 compatibility.

		# Include compatibility CSS
		with open(os.path.join(work_epub_root_directory, "epub", "css", "core.css"), "a", encoding="utf-8") as core_css_file:
			with open(resource_filename("se", os.path.join("data", "templates", "compatibility.css")), "r", encoding="utf-8") as compatibility_css_file:
				core_css_file.write(compatibility_css_file.read())

		# Simplify CSS and tags
		total_css = ""

		# Simplify the CSS first. Later we'll update the document to match our simplified selectors.
		# While we're doing this, we store the original css into a single variable so we can extract the original selectors later.
		for root, _, filenames in os.walk(work_epub_root_directory):
			for filename in fnmatch.filter(filenames, "*.css"):
				with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
					css = file.read()

					# Before we do anything, we process a special case in core.css
					if "core.css" in filename:
						css = regex.sub(r"abbr{.+?}", "", css, flags=regex.DOTALL)

					total_css = total_css + css + "\n"
					file.seek(0)
					file.write(se.formatting.simplify_css(css))
					file.truncate()

		# Now get a list of original selectors

		# Remove @supports(){}
		total_css = regex.sub(r"@supports.+?{(.+?)}\s*}", "\\1}", total_css, flags=regex.DOTALL)

		# Remove CSS rules
		total_css = regex.sub(r"{[^}]+}", "", total_css)

		# Remove trailing commas
		total_css = regex.sub(r",", "", total_css)

		# Remove comments
		total_css = regex.sub(r"/\*.+?\*/", "", total_css, flags=regex.DOTALL)

		# Remove @ defines
		total_css = regex.sub(r"^@.+", "", total_css, flags=regex.MULTILINE)

		# Construct a dictionary of the original selectors
		selectors = set([line for line in total_css.splitlines() if line != ""])

		# Get a list of .xhtml files to simplify
		for root, _, filenames in os.walk(work_epub_root_directory):
			for filename in fnmatch.filter(filenames, "*.xhtml"):
				# Don't mess with the ToC, since if we have ol/li > first-child selectors we could screw it up
				if filename == "toc.xhtml":
					continue

				with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
					# We have to remove the default namespace declaration from our document, otherwise
					# xpath won't find anything at all. See http://stackoverflow.com/questions/297239/why-doesnt-xpath-work-when-processing-an-xhtml-document-with-lxml-in-python
					xhtml = file.read().replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "")
					processed_xhtml = xhtml

					try:
						tree = etree.fromstring(str.encode(xhtml))
					except Exception as ex:
						raise se.InvalidXhtmlException("Error parsing XHTML file: {}\n{}".format(filename, ex)) from ex

					# Now iterate over each CSS selector and see if it's used in any of the files we found
					force_convert = False
					for selector in selectors:
						try:
							sel = lxml.cssselect.CSSSelector(selector, translator="xhtml", namespaces=se.XHTML_NAMESPACES)

							# Add classes to elements that match any of our selectors to simplify.
							# For example, if we select :first-child, add a "first-child" class to all elements that match that.
							for selector_to_simplify in se.SELECTORS_TO_SIMPLIFY:
								if selector_to_simplify in selector:
									selector_to_simplify = selector_to_simplify.replace(":", "")
									for element in tree.xpath(sel.path, namespaces=se.XHTML_NAMESPACES):
										current_class = element.get("class")

										if current_class is not None and selector_to_simplify not in current_class:
											current_class = current_class + " " + selector_to_simplify
										else:
											current_class = selector_to_simplify

										element.set("class", current_class)
						except lxml.cssselect.ExpressionError:
							# This gets thrown if we use pseudo-elements, which lxml doesn't support.
							# We force a check if we get thrown this because we might miss some important ::before elements.
							force_convert = True

						# We've already replaced attribute/namespace selectors with classes in the CSS, now add those classes to the matching elements
						if force_convert or "[epub|type" in selector:
							for namespace_selector in regex.findall(r"\[epub\|type\~\=\"[^\"]*?\"\]", selector):
								sel = lxml.cssselect.CSSSelector(namespace_selector, translator="xhtml", namespaces=se.XHTML_NAMESPACES)

								for element in tree.xpath(sel.path, namespaces=se.XHTML_NAMESPACES):
									new_class = regex.sub(r"^\.", "", se.formatting.namespace_to_class(namespace_selector))
									current_class = element.get("class", "")

									if new_class not in current_class:
										current_class = "{} {}".format(current_class, new_class).strip()
										element.set("class", current_class)

					processed_xhtml = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + etree.tostring(tree, encoding=str, pretty_print=True)

					# We do this round in a second pass because if we modify the tree like this, it screws up how lxml does processing later.
					# If it's all done in one pass, we wind up in a race condition where some elements are fixed and some not.
					tree = etree.fromstring(str.encode(processed_xhtml))

					for selector in selectors:
						try:
							sel = lxml.cssselect.CSSSelector(selector, translator="xhtml", namespaces=se.XHTML_NAMESPACES)
						except lxml.cssselect.ExpressionError:
							# This gets thrown if we use pseudo-elements, which lxml doesn't support
							continue

						# Convert <abbr> to <span>
						if "abbr" in selector:
							for element in tree.xpath(sel.path, namespaces=se.XHTML_NAMESPACES):
								# Why would you want the tail to output by default?!?
								raw_string = etree.tostring(element, encoding=str, with_tail=False)

								# lxml--crap as usual--includes a bunch of namespace information in every element we print. Remove it here.
								raw_string = raw_string.replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "")
								raw_string = raw_string.replace(" xmlns:epub=\"http://www.idpf.org/2007/ops\"", "")

								# lxml doesn't let us modify the tree, so we do a straight-up regex replace to turn this <abbr> into a <span>
								processed_string = raw_string.replace("<abbr", "<span")
								processed_string = processed_string.replace("</abbr", "</span")

								# Now we have a nice, fixed string. But, since lxml can't replace elements, we write it ourselves.
								processed_xhtml = processed_xhtml.replace(raw_string, processed_string)

								tree = etree.fromstring(str.encode(processed_xhtml))

					# Now we just remove all stray abbr tags that were not styled by CSS
					processed_xhtml = regex.sub(r"</?abbr[^>]*?>", "", processed_xhtml)

					# Remove datetime="" attribute in <time> tags, which is not always understood by epubcheck
					processed_xhtml = regex.sub(r" datetime=\"[^\"]+?\"", "", processed_xhtml)

					tree = etree.fromstring(str.encode(processed_xhtml))

					if processed_xhtml != xhtml:
						file.seek(0)
						file.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + etree.tostring(tree, encoding=str, pretty_print=True).replace("<html", "<html xmlns=\"http://www.w3.org/1999/xhtml\""))
						file.truncate()

		# Done simplifying CSS and tags!

		# Extract cover and cover thumbnail.
		# We used to be able to use `convert` to convert svg -> jpg in one step, but at some point a bug
		# was introduced to `convert` that caused it to crash in this situation. Now, we first use rsvg-convert
		# to convert svg -> png, then `convert` to convert png -> jpg.
		subprocess.run([rsvg_convert_path, "--keep-aspect-ratio", "--format", "png", "--output", os.path.join(work_directory, "cover.png"), os.path.join(work_epub_root_directory, "epub", "images", "cover.svg")])
		subprocess.run([convert_path, "-format", "jpg", os.path.join(work_directory, "cover.png"), os.path.join(work_epub_root_directory, "epub", "images", "cover.jpg")])
		os.remove(os.path.join(work_directory, "cover.png"))

		if build_covers:
			shutil.copy2(os.path.join(work_epub_root_directory, "epub", "images", "cover.jpg"), os.path.join(output_directory, "cover.jpg"))
			shutil.copy2(os.path.join(work_epub_root_directory, "epub", "images", "cover.svg"), os.path.join(output_directory, "cover-thumbnail.svg"))
			subprocess.run([rsvg_convert_path, "--keep-aspect-ratio", "--format", "png", "--output", os.path.join(work_directory, "cover-thumbnail.png"), os.path.join(output_directory, "cover-thumbnail.svg")])
			subprocess.run([convert_path, "-resize", "{}x{}".format(COVER_THUMBNAIL_WIDTH, COVER_THUMBNAIL_HEIGHT), "-quality", "100", "-format", "jpg", os.path.join(work_directory, "cover-thumbnail.png"), os.path.join(output_directory, "cover-thumbnail.jpg")])
			os.remove(os.path.join(work_directory, "cover-thumbnail.png"))
			os.remove(os.path.join(output_directory, "cover-thumbnail.svg"))

		os.remove(os.path.join(work_epub_root_directory, "epub", "images", "cover.svg"))

		# Massage image references in content.opf
		metadata_xhtml = metadata_xhtml.replace("cover.svg", "cover.jpg")
		metadata_xhtml = metadata_xhtml.replace(".svg", ".png")
		metadata_xhtml = metadata_xhtml.replace("id=\"cover.jpg\" media-type=\"image/svg+xml\"", "id=\"cover.jpg\" media-type=\"image/jpeg\"")
		metadata_xhtml = metadata_xhtml.replace("image/svg+xml", "image/png")
		metadata_xhtml = regex.sub(r"properties=\"([^\"]*?)svg([^\"]*?)\"", "properties=\"\\1\\2\"", metadata_xhtml) # We may also have the `mathml` property

		# NOTE: even though the a11y namespace is reserved by the epub spec, we must declare it because epubcheck doesn't know that yet.
		# Once epubcheck understands the a11y namespace is reserved, we can remove it from the namespace declarations.
		metadata_xhtml = metadata_xhtml.replace(" prefix=\"se: https://standardebooks.org/vocab/1.0\"", " prefix=\"se: https://standardebooks.org/vocab/1.0, a11y: https://www.idpf.org/epub/vocab/package/a11y/\"")

		# Google Play Books chokes on https XML namespace identifiers (as of at least 2017-07)
		metadata_xhtml = metadata_xhtml.replace("https://standardebooks.org/vocab/1.0", "http://standardebooks.org/vocab/1.0")
		metadata_xhtml = metadata_xhtml.replace("https://www.idpf.org/epub/vocab/package/a11y/", "http://www.idpf.org/epub/vocab/package/a11y/")

		# Output the modified content.opf so that we can build the kobo book before making more epub2 compatibility hacks
		with open(os.path.join(work_epub_root_directory, "epub", "content.opf"), "w", encoding="utf-8") as file:
			file.write(metadata_xhtml)
			file.truncate()

		# Recurse over xhtml files to make some compatibility replacements
		for root, _, filenames in os.walk(work_epub_root_directory):
			for filename in filenames:
				if filename.lower().endswith(".svg"):
					# For night mode compatibility, give the titlepage a 1px white stroke attribute
					if filename.lower() == "titlepage.svg" or filename.lower() == "logo.svg":
						with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
							svg = file.read()
							paths = svg

							# What we're doing here is faking the `stroke-align: outside` property, which is an unsupported draft spec right now.
							# We do this by duplicating all the SVG paths, and giving the duplicates a 2px stroke. The originals are directly on top,
							# so the 2px stroke becomes a 1px stroke that's *outside* of the path instead of being *centered* on the path border.
							# This looks much nicer, but we also have to increase the image size by 2px in both directions, and re-center the whole thing.

							if filename.lower() == "titlepage.svg":
								stroke_width = SVG_TITLEPAGE_OUTER_STROKE_WIDTH
							else:
								stroke_width = SVG_OUTER_STROKE_WIDTH

							# First, strip out non-path, non-group elements
							paths = regex.sub(r"<\?xml[^<]+?\?>", "", paths)
							paths = regex.sub(r"</?svg[^<]*?>", "", paths)
							paths = regex.sub(r"<title>[^<]+?</title>", "", paths)
							paths = regex.sub(r"<desc>[^<]+?</desc>", "", paths)

							# `paths` is now our "duplicate". Add a 2px stroke.
							paths = paths.replace("<path", "<path style=\"stroke: #ffffff; stroke-width: {}px;\"".format(stroke_width))

							# Inject the duplicate under the old SVG paths. We do this by only replacing the first regex match for <g> or <path>
							svg = regex.sub(r"(<g|<path)", "{}\\1".format(paths), svg, 1)

							# If this SVG specifies height/width, then increase height and width by 2 pixels and translate everything by 1px
							try:
								height = int(regex.search(r"<svg[^>]+?height=\"([0-9]+)\"", svg).group(1)) + stroke_width
								svg = regex.sub(r"<svg([^<]*?)height=\"[0-9]+\"", "<svg\\1height=\"{}\"".format(height), svg)

								width = int(regex.search(r"<svg[^>]+?width=\"([0-9]+)\"", svg).group(1)) + stroke_width
								svg = regex.sub(r"<svg([^<]*?)width=\"[0-9]+\"", "<svg\\1width=\"{}\"".format(width), svg)

								# Add a grouping element to translate everything over 1px
								svg = regex.sub(r"(<g|<path)", "<g transform=\"translate({amount}, {amount})\">\n\\1".format(amount=(stroke_width / 2)), svg, 1)
								svg = svg.replace("</svg>", "</g>\n</svg>")
							except AttributeError:
								# Thrown when the regex doesn't match (i.e. the SVG doesn't specify height/width)
								pass
							file.seek(0)
							file.write(svg)
							file.truncate()

					# Convert SVGs to PNGs at 2x resolution.
					# We use `rsvg-convert` instead of `inkscape` or `convert` because it gives us an easy way of zooming in at 2x.
					subprocess.run([rsvg_convert_path, "--zoom", "2", "--keep-aspect-ratio", "--format", "png", "--output", regex.sub(r"\.svg$", ".png", os.path.join(root, filename)), os.path.join(root, filename)])

					os.remove(os.path.join(root, filename))

				if filename.lower().endswith(".xhtml"):
					with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
						xhtml = file.read()
						processed_xhtml = xhtml

						# Check if there's any MathML to convert.
						# We expect MathML to be the "content" type (versus the "presentational" type).
						# We use an XSL transform to convert from "content" to "presentational" MathML.
						# If we start with presentational, then nothing will be changed.
						# Kobo supports presentational MathML. After we build kobo, we convert the presentational MathML to PNG for the rest of the builds.
						mathml_transform = None
						for line in regex.findall(r"<(?:m:)?math[^>]*?>(.+?)</(?:m:)?math>", processed_xhtml, flags=regex.DOTALL):
							mathml_content_tree = se.easy_xml.EasyXmlTree("<?xml version=\"1.0\" encoding=\"utf-8\"?><math xmlns=\"http://www.w3.org/1998/Math/MathML\">{}</math>".format(regex.sub(r"<(/?)m:", "<\\1", line)))

							# Initialize the transform object, if we haven't yet
							if not mathml_transform:
								mathml_transform = etree.XSLT(etree.parse(mathml_xsl_filename))

							# Transform the mathml and get a string representation.
							# The XSLT comes from https://github.com/fred-wang/webextension-content-mathml-polyfill
							mathml_presentation_tree = mathml_transform(mathml_content_tree.etree)
							mathml_presentation_xhtml = etree.tostring(mathml_presentation_tree, encoding="unicode", pretty_print=True, with_tail=False).strip()

							# Plop our string back in to the XHTML we're processing
							processed_xhtml = regex.sub(r"<math[^>]*?>\{}\</math>".format(regex.escape(line)), mathml_presentation_xhtml, processed_xhtml, flags=regex.MULTILINE)

						# Add ARIA roles, which mostly duplicate the epub:type attributes
						# (with the exception of rearnotes -> endnotes, plus the `backlink` role which is not yet in epub 3.0)
						processed_xhtml = regex.sub(r"(epub:type=\"[^\"]*?rearnote(s?)[^\"]*?\")", "\\1 role=\"doc-endnote\\2\"", processed_xhtml)

						if filename == "endnotes.xhtml":
							processed_xhtml = processed_xhtml.replace(" epub:type=\"se:referrer\"", " role=\"doc-backlink\" epub:type=\"se:referrer\"")

						# iOS renders the left-arrow-hook character as an emoji; this fixes it and forces it to render as text.
						# See https://github.com/standardebooks/tools/issues/73
						# See http://mts.io/2015/04/21/unicode-symbol-render-text-emoji/
						processed_xhtml = processed_xhtml.replace("\u21a9", "\u21a9\ufe0e")

						for role in se.ARIA_ROLES:
							processed_xhtml = regex.sub(r"(epub:type=\"[^\"]*?{}[^\"]*?\")".format(role), "\\1 role=\"doc-{}\"".format(role), processed_xhtml)

						# Since we convert SVGs to raster, here we add the color-depth semantic for night mode
						processed_xhtml = processed_xhtml.replace("z3998:publisher-logo", "z3998:publisher-logo se:image.color-depth.black-on-transparent")
						processed_xhtml = regex.sub(r"class=\"([^\"]*?)epub-type-z3998-publisher-logo([^\"]*?)\"", "class=\"\\1epub-type-z3998-publisher-logo epub-type-se-image-color-depth-black-on-transparent\\2\"", processed_xhtml)

						# Special case for the titlepage
						if filename == "titlepage.xhtml":
							processed_xhtml = processed_xhtml.replace("<img", "<img class=\"epub-type-se-image-color-depth-black-on-transparent\" epub:type=\"se:image.color-depth.black-on-transparent\"")

						# Google Play Books chokes on https XML namespace identifiers (as of at least 2017-07)
						processed_xhtml = processed_xhtml.replace("https://standardebooks.org/vocab/1.0", "http://standardebooks.org/vocab/1.0")

						# We converted svgs to pngs, so replace references
						processed_xhtml = processed_xhtml.replace("cover.svg", "cover.jpg")
						processed_xhtml = processed_xhtml.replace(".svg", ".png")

						# To get popup footnotes in iBooks, we have to change epub:rearnote to epub:footnote.
						# Remember to get our custom style selectors too.
						processed_xhtml = regex.sub(r"epub:type=\"([^\"]*?)rearnote([^\"]*?)\"", "epub:type=\"\\1footnote\\2\"", processed_xhtml)
						processed_xhtml = regex.sub(r"class=\"([^\"]*?)epub-type-rearnote([^\"]*?)\"", "class=\"\\1epub-type-footnote\\2\"", processed_xhtml)

						# Include extra lang tag for accessibility compatibility
						processed_xhtml = regex.sub(r"xml:lang\=\"([^\"]+?)\"", "lang=\"\\1\" xml:lang=\"\\1\"", processed_xhtml)

						# Typography: replace double and triple em dash characters with extra em dashes
						processed_xhtml = processed_xhtml.replace("⸺", "—{}—".format(se.WORD_JOINER))
						processed_xhtml = processed_xhtml.replace("⸻", "—{}—{}—".format(se.WORD_JOINER, se.WORD_JOINER))

						# Typography: replace some other less common characters
						processed_xhtml = processed_xhtml.replace("⅒", "1/10")
						processed_xhtml = processed_xhtml.replace("℅", "c/o")
						processed_xhtml = processed_xhtml.replace("✗", "×")
						processed_xhtml = processed_xhtml.replace(" ", "{}{}".format(se.NO_BREAK_SPACE, se.NO_BREAK_SPACE)) # em-space to two nbsps

						# Many e-readers don't support the word joiner character (U+2060).
						# They DO, however, support the now-deprecated zero-width non-breaking space (U+FEFF).
						# For epubs, do this replacement. Kindle now seems to handle everything fortunately.
						processed_xhtml = processed_xhtml.replace(se.WORD_JOINER, se.ZERO_WIDTH_SPACE)

						if processed_xhtml != xhtml:
							file.seek(0)
							file.write(processed_xhtml)
							file.truncate()

				if filename.lower().endswith(".css"):
					with open(os.path.join(root, filename), "r+", encoding="utf-8") as file:
						css = file.read()
						processed_css = css

						# To get popup footnotes in iBooks, we have to change epub:rearnote to epub:footnote.
						# Remember to get our custom style selectors too.
						processed_css = processed_css.replace("rearnote", "footnote")

						# Add new break-* aliases for compatibility with newer readers
processed_css = regex.sub(r"(\s+)page-break-(.+?:\s.+?;)", "\\1page-break-\\2\t\\1break-\\2", processed_css) if processed_css != css: file.seek(0) file.write(processed_css) file.truncate() if build_kobo: with tempfile.TemporaryDirectory() as kobo_work_directory: copy_tree(work_epub_root_directory, kobo_work_directory) for root, _, filenames in os.walk(kobo_work_directory): # Add a note to content.opf indicating this is a transform build for filename in fnmatch.filter(filenames, "content.opf"): with open(os.path.join(root, filename), "r+", encoding="utf-8") as file: xhtml = file.read() xhtml = regex.sub(r"<dc:publisher", "<meta property=\"se:transform\">kobo</meta>\n\t\t<dc:publisher", xhtml) file.seek(0) file.write(xhtml) file.truncate() # Kobo .kepub files need each clause wrapped in a special <span> tag to enable highlighting. # Do this here. Hopefully Kobo will get their act together soon and drop this requirement. for filename in fnmatch.filter(filenames, "*.xhtml"): se.kobo.paragraph_counter = 1 se.kobo.segment_counter = 1 # Don't add spans to the ToC if filename == "toc.xhtml": continue with open(os.path.join(root, filename), "r+", encoding="utf-8") as file: xhtml = file.read() # Kobos don't have fonts that support the ↩ character in endnotes, so replace it with « if filename == "endnotes.xhtml": # Note that we replaced ↩ with \u21a9\ufe0e in an earlier iOS compatibility fix xhtml = regex.sub(r"epub:type=\"se:referrer\">\u21a9\ufe0e</a>", "epub:type=\"se:referrer\">«</a>", xhtml) # We have to remove the default namespace declaration from our document, otherwise # xpath won't find anything at all. See http://stackoverflow.com/questions/297239/why-doesnt-xpath-work-when-processing-an-xhtml-document-with-lxml-in-python try: tree = etree.fromstring(str.encode(xhtml.replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", ""))) except Exception as ex: raise se.InvalidXhtmlException("Error parsing XHTML file: {}\n{}".format(filename, ex)) se.kobo.add_kobo_spans_to_node(tree.xpath("./body", namespaces=se.XHTML_NAMESPACES)[0]) xhtml = etree.tostring(tree, encoding="unicode", pretty_print=True, with_tail=False) xhtml = regex.sub(r"<html:span", "<span", xhtml) xhtml = regex.sub(r"html:span>", "span>", xhtml) xhtml = regex.sub(r"<span xmlns:html=\"http://www.w3.org/1999/xhtml\"", "<span", xhtml) xhtml = regex.sub(r"<html", "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\"", xhtml) file.seek(0) file.write(xhtml) file.truncate() se.epub.write_epub(kobo_work_directory, os.path.join(output_directory, kobo_output_filename)) if verbose: print(" OK") print("\tBuilding {} ...".format(epub_output_filename), end="", flush=True) # Now work on more epub2 compatibility # Recurse over css files to make some compatibility replacements.
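# A sketch of the effect of the replacements below on hypothetical rules: `hyphens: auto;`
# is expanded to vendor-prefixed equivalents,
#     hyphens: auto;
#     adobe-hyphenate: auto;
#     -webkit-hyphens: auto;
#     -epub-hyphens: auto;
#     -moz-hyphens: auto;
# and page-break-* properties gain -webkit-column-break-* duplicates for Readium.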
for root, _, filenames in os.walk(work_epub_root_directory): for filename in filenames: if filename.lower().endswith(".css"): with open(os.path.join(root, filename), "r+", encoding="utf-8") as file: css = file.read() processed_css = css processed_css = regex.sub(r"(page\-break\-(before|after|inside)\s*:\s*(.+))", "\\1\n\t-webkit-column-break-\\2: \\3 /* For Readium */", processed_css) processed_css = regex.sub(r"^\s*hyphens\s*:\s*(.+)", "\thyphens: \\1\n\tadobe-hyphenate: \\1\n\t-webkit-hyphens: \\1\n\t-epub-hyphens: \\1\n\t-moz-hyphens: \\1", processed_css, flags=regex.MULTILINE) processed_css = regex.sub(r"^\s*hyphens\s*:\s*none;", "\thyphens: none;\n\tadobe-text-layout: optimizeSpeed; /* For Nook */", processed_css, flags=regex.MULTILINE) if processed_css != css: file.seek(0) file.write(processed_css) file.truncate() # Sort out MathML compatibility has_mathml = "mathml" in metadata_xhtml if has_mathml: firefox_path = shutil.which("firefox") if firefox_path is None: raise se.MissingDependencyException("firefox is required to process MathML, but firefox couldn't be located. Is it installed?") mathml_count = 1 for root, _, filenames in os.walk(work_epub_root_directory): for filename in filenames: if filename.lower().endswith(".xhtml"): with open(os.path.join(root, filename), "r+", encoding="utf-8") as file: xhtml = file.read() processed_xhtml = xhtml replaced_mathml = [] # Check if there's MathML we want to convert # We take a naive approach and use some regexes to try to simplify simple MathML expressions. # For each MathML expression, if our round of regexes finishes and there is still MathML in the processed result, we abandon the attempt and render to PNG using Firefox. for line in regex.findall(r"<(?:m:)math[^>]*?>(?:.+?)</(?:m:)math>", processed_xhtml, flags=regex.DOTALL): if line not in replaced_mathml: replaced_mathml.append(line) # Store converted lines to save time in case we have multiple instances of the same MathML mathml_tree = se.easy_xml.EasyXmlTree("<?xml version=\"1.0\" encoding=\"utf-8\"?>{}".format(regex.sub(r"<(/?)m:", "<\\1", line))) processed_line = line # If the mfenced element has more than one child, they are separated by commas when rendered. # This is too complex for our naive regexes to work around. So, if there is an mfenced element with more than one child, abandon the attempt. 
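# To give a sense of what the regexes below accomplish, a simple hypothetical expression like
#     <m:math><m:mi>x</m:mi><m:mo>+</m:mo><m:mn>1</m:mn></m:math>
# reduces to plain inline XHTML:
#     <i>x</i> + 1
# Anything more structured than that falls through to the Firefox/PNG path.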
if not mathml_tree.css_select("mfenced > * + *"): processed_line = regex.sub(r"</?(?:m:)?math[^>]*?>", "", processed_line) processed_line = regex.sub(r"<!--.+?-->", "", processed_line) processed_line = regex.sub(r"<(?:m:)?mfenced/>", "()", processed_line) processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mi)>(.+?)</\3><((?:m:)?mi)>(.+?)</\5></\1>", "<i>\\4</i><\\2><i>\\6</i></\\2>", processed_line) processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mi)>(.+?)</\3><((?:m:)?mn)>(.+?)</\5></\1>", "<i>\\4</i><\\2>\\6</\\2>", processed_line) processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mn)>(.+?)</\3><((?:m:)?mn)>(.+?)</\5></\1>", "\\4<\\2>\\6</\\2>", processed_line) processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mn)>(.+?)</\3><((?:m:)?mi)>(.+?)</\5></\1>", "\\4<\\2><i>\\6</i></\\2>", processed_line) processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mi) mathvariant=\"normal\">(.+?)</\3><((?:m:)?mi)>(.+?)</\5></\1>", "\\4<\\2><i>\\6</i></\\2>", processed_line) processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mi) mathvariant=\"normal\">(.+?)</\3><((?:m:)?mn)>(.+?)</\5></\1>", "\\4<\\2>\\6</\\2>", processed_line) processed_line = regex.sub(r"<(?:m:)?mo>{}</(?:m:)?mo>".format(se.FUNCTION_APPLICATION), "", processed_line, flags=regex.IGNORECASE) # The ignore case flag is required to match here with the special FUNCTION_APPLICATION character, it's unclear why processed_line = regex.sub(r"<(?:m:)?mfenced><((?:m:)(?:mo|mi|mn|mrow))>(.+?)</\1></(?:m:)?mfenced>", "(<\\1>\\2</\\1>)", processed_line) processed_line = regex.sub(r"<(?:m:)?mrow>([^>].+?)</(?:m:)?mrow>", "\\1", processed_line) processed_line = regex.sub(r"<(?:m:)?mi>([^<]+?)</(?:m:)?mi>", "<i>\\1</i>", processed_line) processed_line = regex.sub(r"<(?:m:)?mi mathvariant=\"normal\">([^<]+?)</(?:m:)?mi>", "\\1", processed_line) processed_line = regex.sub(r"<(?:m:)?mo>([+\-−=×])</(?:m:)?mo>", " \\1 ", processed_line) processed_line = regex.sub(r"<((?:m:)?m[no])>(.+?)</\1>", "\\2", processed_line) processed_line = regex.sub(r"</?(?:m:)?mrow>", "", processed_line) processed_line = processed_line.strip() processed_line = regex.sub(r"</i><i>", "", processed_line, flags=regex.DOTALL) # Did we succeed? Is there any more MathML in our string? if regex.findall("</?(?:m:)?m", processed_line): # Failure! Abandon all hope, and use Firefox to convert the MathML to PNG. se.images.render_mathml_to_png(regex.sub(r"<(/?)m:", "<\\1", line), os.path.join(work_epub_root_directory, "epub", "images", "mathml-{}.png".format(mathml_count))) processed_xhtml = processed_xhtml.replace(line, "<img class=\"mathml epub-type-se-image-color-depth-black-on-transparent\" epub:type=\"se:image.color-depth.black-on-transparent\" src=\"../images/mathml-{}.png\" />".format(mathml_count)) mathml_count = mathml_count + 1 else: # Success! Replace the MathML with our new string. 
processed_xhtml = processed_xhtml.replace(line, processed_line) if processed_xhtml != xhtml: file.seek(0) file.write(processed_xhtml) file.truncate() # Include epub2 cover metadata cover_id = metadata_tree.xpath("//opf:item[@properties=\"cover-image\"]/@id")[0].replace(".svg", ".jpg") metadata_xhtml = regex.sub(r"(<metadata[^>]+?>)", "\\1\n\t\t<meta content=\"{}\" name=\"cover\" />".format(cover_id), metadata_xhtml) # Add metadata to content.opf indicating this file is a Standard Ebooks compatibility build metadata_xhtml = metadata_xhtml.replace("<dc:publisher", "<meta property=\"se:transform\">compatibility</meta>\n\t\t<dc:publisher") # Add any new MathML images we generated to the manifest if has_mathml: for root, _, filenames in os.walk(os.path.join(work_epub_root_directory, "epub", "images")): filenames = se.natural_sort(filenames) filenames.reverse() for filename in filenames: if filename.lower().startswith("mathml-"): metadata_xhtml = metadata_xhtml.replace("<manifest>", "<manifest><item href=\"images/{}\" id=\"{}\" media-type=\"image/png\"/>".format(filename, filename)) metadata_xhtml = regex.sub(r"properties=\"([^\"]*?)mathml([^\"]*?)\"", "properties=\"\\1\\2\"", metadata_xhtml) metadata_xhtml = regex.sub(r"properties=\"\s*\"", "", metadata_xhtml) # Generate our NCX file for epub2 compatibility. # First find the ToC file. toc_filename = metadata_tree.xpath("//opf:item[@properties=\"nav\"]/@href")[0] metadata_xhtml = metadata_xhtml.replace("<spine>", "<spine toc=\"ncx\">") metadata_xhtml = metadata_xhtml.replace("<manifest>", "<manifest><item href=\"toc.ncx\" id=\"ncx\" media-type=\"application/x-dtbncx+xml\" />") # Now use an XSLT transform to generate the NCX toc_tree = se.epub.convert_toc_to_ncx(work_epub_root_directory, toc_filename, navdoc2ncx_xsl_filename) # Convert the <nav> landmarks element to the <guide> element in content.opf guide_xhtml = "<guide>" for element in toc_tree.xpath("//xhtml:nav[@epub:type=\"landmarks\"]/xhtml:ol/xhtml:li/xhtml:a"): element_xhtml = element.tostring() element_xhtml = regex.sub(r"epub:type=\"([^\"]*)(\s*frontmatter\s*|\s*backmatter\s*)([^\"]*)\"", "type=\"\\1\\3\"", element_xhtml) element_xhtml = regex.sub(r"epub:type=\"[^\"]*(acknowledgements|bibliography|colophon|copyright-page|cover|dedication|epigraph|foreword|glossary|index|loi|lot|notes|preface|bodymatter|titlepage|toc)[^\"]*\"", "type=\"\\1\"", element_xhtml) element_xhtml = element_xhtml.replace("type=\"copyright-page", "type=\"copyright page") # We add the 'text' attribute to the titlepage to tell the reader to start there element_xhtml = element_xhtml.replace("type=\"titlepage", "type=\"title-page text") element_xhtml = regex.sub(r"type=\"\s*\"", "", element_xhtml) element_xhtml = element_xhtml.replace("<a", "<reference") element_xhtml = regex.sub(r">(.+)</a>", " title=\"\\1\" />", element_xhtml) # Replace instances of the `role` attribute since it's illegal in content.opf element_xhtml = regex.sub(r" role=\".*?\"", "", element_xhtml) guide_xhtml = guide_xhtml + element_xhtml guide_xhtml = guide_xhtml + "</guide>" metadata_xhtml = metadata_xhtml.replace("</package>", "") + guide_xhtml + "</package>" # Guide is done, now write content.opf and clean it. # Output the modified content.opf before making more epub2 compatibility hacks. 
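# To make the <guide> conversion above concrete, a hypothetical landmark entry like
#     <a href="text/titlepage.xhtml" epub:type="frontmatter titlepage">Titlepage</a>
# comes out of the loop as
#     <reference href="text/titlepage.xhtml" type="title-page text" title="Titlepage" />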
with open(os.path.join(work_epub_root_directory, "epub", "content.opf"), "w", encoding="utf-8") as file: file.write(metadata_xhtml) file.truncate() # All done, clean the output for filename in se.get_target_filenames([work_epub_root_directory], (".xhtml", ".svg", ".opf", ".ncx")): se.formatting.format_xhtml_file(filename, False, filename.endswith("content.opf"), filename.endswith("endnotes.xhtml")) # Write the compatible epub se.epub.write_epub(work_epub_root_directory, os.path.join(output_directory, epub_output_filename)) if verbose: print(" OK") if run_epubcheck: if verbose: print("\tRunning epubcheck on {} ...".format(epub_output_filename), end="", flush=True) output = subprocess.run([epubcheck_path, "--quiet", os.path.join(output_directory, epub_output_filename)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT).stdout.decode().strip() # epubcheck on Ubuntu 18.04 outputs some seemingly harmless warnings; flush them here. if output: output = regex.sub(r"\s*Warning at char 3 in xsl:param/@select on line.+", "", output) output = regex.sub(r"\s*SXWN9000: The parent axis starting at a document node will never select anything", "", output) if output: if verbose: print("\n\t\t" + "\t\t".join(output.splitlines(True)), file=sys.stderr) else: print(output, file=sys.stderr) return if verbose: print(" OK") if build_kindle: if verbose: print("\tBuilding {} ...".format(kindle_output_filename), end="", flush=True) # Kindle doesn't go more than 2 levels deep for ToC, so flatten it here. with open(os.path.join(work_epub_root_directory, "epub", toc_filename), "r+", encoding="utf-8") as file: xhtml = file.read() soup = BeautifulSoup(xhtml, "lxml") for match in soup.select("ol > li > ol > li > ol"): match.unwrap() xhtml = str(soup) pattern = regex.compile(r"(<li>\s*<a href=\"[^\"]+?\">.+?</a>\s*)<li>") matches = 1 while matches > 0: xhtml, matches = pattern.subn(r"\1</li><li>", xhtml) pattern = regex.compile(r"</li>\s*</li>") matches = 1 while matches > 0: xhtml, matches = pattern.subn("</li>", xhtml) file.seek(0) file.write(xhtml) file.truncate() # Rebuild the NCX toc_tree = se.epub.convert_toc_to_ncx(work_epub_root_directory, toc_filename, navdoc2ncx_xsl_filename) # Clean just the ToC and NCX for filename in [os.path.join(work_epub_root_directory, "epub", "toc.ncx"), os.path.join(work_epub_root_directory, "epub", toc_filename)]: se.formatting.format_xhtml_file(filename, False) # Convert endnotes to Kindle popup compatible notes if os.path.isfile(os.path.join(work_epub_root_directory, "epub", "text", "endnotes.xhtml")): with open(os.path.join(work_epub_root_directory, "epub", "text", "endnotes.xhtml"), "r+", encoding="utf-8") as file: xhtml = file.read() # We have to remove the default namespace declaration from our document, otherwise # xpath won't find anything at all. 
See http://stackoverflow.com/questions/297239/why-doesnt-xpath-work-when-processing-an-xhtml-document-with-lxml-in-python try: tree = etree.fromstring(str.encode(xhtml.replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", ""))) except Exception as ex: raise se.InvalidXhtmlException("Error parsing XHTML file: endnotes.xhtml\n{}".format(ex)) notes = tree.xpath("//li[@epub:type=\"rearnote\" or @epub:type=\"footnote\"]", namespaces=se.XHTML_NAMESPACES) processed_endnotes = "" for note in notes: note_id = note.get("id") note_number = note_id.replace("note-", "") # First, fix up the reference link for this endnote try: ref_link = etree.tostring(note.xpath("p[last()]/a[last()]")[0], encoding="unicode", pretty_print=True, with_tail=False).replace(" xmlns:epub=\"http://www.idpf.org/2007/ops\"", "").strip() except Exception: raise se.InvalidXhtmlException("Can’t find ref link for #{}.".format(note_id)) new_ref_link = regex.sub(r">.*?</a>", ">" + note_number + "</a>.", ref_link) # Now remove the wrapping li node from the note note_text = regex.sub(r"^<li[^>]*?>(.*)</li>$", r"\1", etree.tostring(note, encoding="unicode", pretty_print=True, with_tail=False), flags=regex.IGNORECASE | regex.DOTALL) # Insert our new ref link result = regex.subn(r"^\s*<p([^>]*?)>", "<p\\1 id=\"" + note_id + "\">" + new_ref_link + " ", note_text) # Sometimes there is no leading <p> tag (for example, if the endnote starts with a blockquote). # If that's the case, just insert one in front. note_text = result[0] if result[1] == 0: note_text = "<p id=\"" + note_id + "\">" + new_ref_link + "</p>" + note_text # Now remove the old ref_link note_text = note_text.replace(ref_link, "") # Trim trailing spaces left over after removing the ref link note_text = regex.sub(r"\s+</p>", "</p>", note_text).strip() # Sometimes ref links are in their own p tag--remove that too note_text = regex.sub(r"<p>\s*</p>", "", note_text) processed_endnotes += note_text + "\n" # All done with endnotes, so drop them back in xhtml = regex.sub(r"<ol>.*</ol>", processed_endnotes, xhtml, flags=regex.IGNORECASE | regex.DOTALL) file.seek(0) file.write(xhtml) file.truncate() # While Kindle now supports soft hyphens, popup endnotes break words but don't insert the hyphen characters. So for now, remove soft hyphens from the endnotes file. with open(os.path.join(work_epub_root_directory, "epub", "text", "endnotes.xhtml"), "r+", encoding="utf-8") as file: xhtml = file.read() processed_xhtml = xhtml processed_xhtml = processed_xhtml.replace(se.SHY_HYPHEN, "") if processed_xhtml != xhtml: file.seek(0) file.write(processed_xhtml) file.truncate() # Do some compatibility replacements for root, _, filenames in os.walk(work_epub_root_directory): for filename in filenames: if filename.lower().endswith(".xhtml"): with open(os.path.join(root, filename), "r+", encoding="utf-8") as file: xhtml = file.read() processed_xhtml = xhtml # Kindle doesn't recognize most zero-width spaces or word joiners, so just remove them. # It does recognize the word joiner character, but only in the old mobi7 format. The new format renders them as spaces.
processed_xhtml = processed_xhtml.replace(se.ZERO_WIDTH_SPACE, "") # Remove the epub:type attribute, as Calibre turns it into just "type" processed_xhtml = regex.sub(r"epub:type=\"[^\"]*?\"", "", processed_xhtml) if processed_xhtml != xhtml: file.seek(0) file.write(processed_xhtml) file.truncate() # Include compatibility CSS with open(os.path.join(work_epub_root_directory, "epub", "css", "core.css"), "a", encoding="utf-8") as core_css_file: with open(resource_filename("se", os.path.join("data", "templates", "kindle.css")), "r", encoding="utf-8") as compatibility_css_file: core_css_file.write(compatibility_css_file.read()) # Add soft hyphens for filename in se.get_target_filenames([work_epub_root_directory], (".xhtml",)): se.typography.hyphenate_file(filename, None, True) # Build an epub file we can send to Calibre se.epub.write_epub(work_epub_root_directory, os.path.join(work_directory, epub_output_filename)) # Generate the Kindle file # We place it in the work directory because later we have to update the asin, and the se.mobi.update_asin() function will write to the final output directory cover_path = os.path.join(work_epub_root_directory, "epub", metadata_tree.xpath("//opf:item[@properties=\"cover-image\"]/@href")[0].replace(".svg", ".jpg")) return_code = subprocess.run([ebook_convert_path, os.path.join(work_directory, epub_output_filename), os.path.join(work_directory, kindle_output_filename), "--pretty-print", "--no-inline-toc", "--max-toc-links=0", "--prefer-metadata-cover", "--cover={}".format(cover_path)], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).returncode if return_code: raise se.InvalidSeEbookException("ebook-convert failed.") else: # Success! # Update the ASIN in the generated file se.mobi.update_asin(asin, os.path.join(work_directory, kindle_output_filename), os.path.join(output_directory, kindle_output_filename)) # Extract the thumbnail subprocess.run([convert_path, os.path.join(work_epub_root_directory, "epub", "images", "cover.jpg"), "-resize", "432x660", os.path.join(output_directory, "thumbnail_{}_EBOK_portrait.jpg".format(asin))], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) if verbose: print(" OK")
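# A minimal, runnable sketch of the endnote rewrite performed in the Kindle build above,
# applied to a single hypothetical endnote string (the real code walks the parsed lxml tree):
import regex

note_text = "<li id=\"note-1\" epub:type=\"footnote\"><p>Some note text. <a href=\"chapter-1.xhtml#noteref-1\" epub:type=\"se:referrer\">\u21a9</a></p></li>"

note_id = regex.search(r"id=\"(note-[0-9]+)\"", note_text).group(1)  # "note-1"
note_number = note_id.replace("note-", "")  # "1"
ref_link = regex.search(r"<a[^>]+?epub:type=\"se:referrer\">.*?</a>", note_text).group(0)
new_ref_link = regex.sub(r">.*?</a>", f">{note_number}</a>.", ref_link)

# Drop the wrapping <li>, move the numbered backlink to the front of the first <p>,
# then remove the old backlink and tidy the whitespace
note_text = regex.sub(r"^<li[^>]*?>(.*)</li>$", r"\1", note_text, flags=regex.DOTALL)
note_text = regex.sub(r"^\s*<p([^>]*?)>", f"<p\\1 id=\"{note_id}\">{new_ref_link} ", note_text)
note_text = regex.sub(r"\s+</p>", "</p>", note_text.replace(ref_link, "")).strip()

print(note_text)  # <p id="note-1"><a ...>1</a>. Some note text.</p>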
def _create_draft(args: Namespace): """ Implementation for `se create-draft` """ # Put together some variables for later use identifier = se.formatting.make_url_safe(args.author) + "/" + se.formatting.make_url_safe(args.title) title_string = args.title.replace("'", "’") + ", by " + args.author.replace("'", "’") sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", args.title) pg_producers = [] if args.translator: identifier = identifier + "/" + se.formatting.make_url_safe(args.translator) title_string = title_string + ". Translated by " + args.translator if args.illustrator: identifier = identifier + "/" + se.formatting.make_url_safe(args.illustrator) title_string = title_string + ". Illustrated by " + args.illustrator repo_name = identifier.replace("/", "_") repo_path = Path(repo_name).resolve() if repo_path.is_dir(): raise se.InvalidInputException(f"Directory already exists: [path][link=file://{repo_path}]{repo_path}[/][/].") # Download PG HTML and do some fixups if args.pg_url: if args.offline: raise se.RemoteCommandErrorException("Cannot download Project Gutenberg ebook when offline option is enabled.") args.pg_url = args.pg_url.replace("http://", "https://") # Get the ebook metadata try: response = requests.get(args.pg_url) pg_metadata_html = response.text except Exception as ex: raise se.RemoteCommandErrorException(f"Couldn’t download Project Gutenberg ebook metadata page. Exception: {ex}") soup = BeautifulSoup(pg_metadata_html, "lxml") # Get the ebook HTML URL from the metadata pg_ebook_url = None for element in soup.select("a[type^=\"text/html\"]"): pg_ebook_url = regex.sub(r"^//", "https://", element["href"]) pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/", pg_ebook_url) if not pg_ebook_url: raise se.RemoteCommandErrorException("Downloaded the ebook metadata, but couldn’t find the URL for the ebook HTML.") # Get the ebook LCSH categories pg_subjects = [] for element in soup.select("td[property=\"dcterms:subject\"]"): if element["datatype"] == "dcterms:LCSH": for subject_link in element.find_all("a"): pg_subjects.append(subject_link.get_text().strip()) # Get the PG publication date pg_publication_year = None for element in soup.select("td[itemprop=\"datePublished\"]"): pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", element.text) # Get the actual ebook URL try: response = requests.get(pg_ebook_url) pg_ebook_html = response.text except Exception as ex: raise se.RemoteCommandErrorException(f"Couldn’t download Project Gutenberg ebook HTML. Exception: {ex}") try: fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False) pg_ebook_html = se.strip_bom(fixed_pg_ebook_html) except Exception as ex: raise se.InvalidEncodingException(f"Couldn’t determine text encoding of Project Gutenberg HTML file. Exception: {ex}") # Try to guess the ebook language pg_language = "en-US" if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html: pg_language = "en-GB" # Create necessary directories (repo_path / "images").mkdir(parents=True) (repo_path / "src" / "epub" / "css").mkdir(parents=True) (repo_path / "src" / "epub" / "images").mkdir(parents=True) (repo_path / "src" / "epub" / "text").mkdir(parents=True) (repo_path / "src" / "META-INF").mkdir(parents=True) is_pg_html_parsed = True # Write PG data if we have it if args.pg_url and pg_ebook_html: try: soup = BeautifulSoup(pg_ebook_html, "html.parser") # Try to get the PG producers.
We only try this if there's a <pre> block with the header info (which is not always the case) for element in soup(text=regex.compile(r"\*\*\*\s*Produced by.+$", flags=regex.DOTALL)): if element.parent.name == "pre": producers_text = regex.sub(r".+?Produced by (.+?)\s*$", "\\1", element, flags=regex.DOTALL) producers_text = regex.sub(r"\(.+?\)", "", producers_text, flags=regex.DOTALL) producers_text = regex.sub( r"(at )?https?://www\.pgdp\.net", "", producers_text, flags=regex.DOTALL) producers_text = regex.sub(r"[\r\n]+", " ", producers_text, flags=regex.DOTALL) producers_text = regex.sub(r",? and ", ", and ", producers_text) producers_text = producers_text.replace( " and the Online", " and The Online") producers_text = producers_text.replace(", and ", ", ").strip() pg_producers = producers_text.split(", ") # Try to strip out the PG header for element in soup(text=regex.compile(r"\*\*\*\s*START OF THIS")): for sibling in element.parent.find_previous_siblings(): sibling.decompose() element.parent.decompose() # Try to strip out the PG license footer for element in soup( text=regex.compile(r"End of (the )?Project Gutenberg")): for sibling in element.parent.find_next_siblings(): sibling.decompose() element.parent.decompose() with open(repo_path / "src" / "epub" / "text" / "body.xhtml", "w", encoding="utf-8") as file: file.write(str(soup)) except OSError as ex: raise se.InvalidFileException( f"Couldn’t write to ebook directory. Exception: {ex}") except: # Save this error for later, because it's still useful to complete the create-draft process # even if we've failed to parse PG's HTML source. is_pg_html_parsed = False se.quiet_remove(repo_path / "src" / "epub" / "text" / "body.xhtml") # Copy over templates _copy_template_file("gitignore", repo_path / ".gitignore") _copy_template_file("LICENSE.md", repo_path) _copy_template_file("container.xml", repo_path / "src" / "META-INF") _copy_template_file("mimetype", repo_path / "src") _copy_template_file("content.opf", repo_path / "src" / "epub") _copy_template_file("onix.xml", repo_path / "src" / "epub") _copy_template_file("toc.xhtml", repo_path / "src" / "epub") _copy_template_file("core.css", repo_path / "src" / "epub" / "css") _copy_template_file("local.css", repo_path / "src" / "epub" / "css") _copy_template_file("logo.svg", repo_path / "src" / "epub" / "images") _copy_template_file("colophon.xhtml", repo_path / "src" / "epub" / "text") _copy_template_file("imprint.xhtml", repo_path / "src" / "epub" / "text") _copy_template_file("titlepage.xhtml", repo_path / "src" / "epub" / "text") _copy_template_file("uncopyright.xhtml", repo_path / "src" / "epub" / "text") _copy_template_file("titlepage.svg", repo_path / "images") _copy_template_file("cover.jpg", repo_path / "images" / "cover.jpg") _copy_template_file("cover.svg", repo_path / "images" / "cover.svg") # Try to find Wikipedia links if possible if args.offline: author_wiki_url = None author_nacoaf_url = None ebook_wiki_url = None translator_wiki_url = None translator_nacoaf_url = None else: author_wiki_url, author_nacoaf_url = _get_wikipedia_url( args.author, True) ebook_wiki_url = None if args.title != "Short Fiction": # There's a "Short Fiction" Wikipedia article, so make an exception for that case ebook_wiki_url, _ = _get_wikipedia_url(args.title, False) translator_wiki_url = None if args.translator: translator_wiki_url, translator_nacoaf_url = _get_wikipedia_url( args.translator, True) # Pre-fill a few templates _replace_in_file(repo_path / "src" / "epub" / "text" / "titlepage.xhtml", 
"TITLE_STRING", title_string) _replace_in_file(repo_path / "images" / "titlepage.svg", "TITLE_STRING", title_string) _replace_in_file(repo_path / "images" / "cover.svg", "TITLE_STRING", title_string) # Create the titlepage SVG contributors = {} if args.translator: contributors["translated by"] = args.translator if args.illustrator: contributors["illustrated by"] = args.illustrator with open(repo_path / "images" / "titlepage.svg", "w", encoding="utf-8") as file: file.write( _generate_titlepage_svg(args.title, args.author, contributors, title_string)) # Create the cover SVG with open(repo_path / "images" / "cover.svg", "w", encoding="utf-8") as file: file.write(_generate_cover_svg(args.title, args.author, title_string)) # Build the cover/titlepage for distribution epub = SeEpub(repo_path) epub.generate_cover_svg() epub.generate_titlepage_svg() if args.pg_url: _replace_in_file(repo_path / "src" / "epub" / "text" / "imprint.xhtml", "PG_URL", args.pg_url) with open(repo_path / "src" / "epub" / "text" / "colophon.xhtml", "r+", encoding="utf-8") as file: colophon_xhtml = file.read() colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier) colophon_xhtml = colophon_xhtml.replace(">AUTHOR<", f">{args.author}<") colophon_xhtml = colophon_xhtml.replace("TITLE", args.title) if author_wiki_url: colophon_xhtml = colophon_xhtml.replace("AUTHOR_WIKI_URL", author_wiki_url) if args.pg_url: colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url) if pg_publication_year: colophon_xhtml = colophon_xhtml.replace( "PG_YEAR", pg_publication_year) if pg_producers: producers_xhtml = "" for i, producer in enumerate(pg_producers): if "Distributed Proofread" in producer: producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>" elif "anonymous" in producer.lower(): producers_xhtml = producers_xhtml + "<b class=\"name\">An Anonymous Volunteer</b>" else: producers_xhtml = producers_xhtml + f"<b class=\"name\">{producer.strip('.')}</b>" if i < len(pg_producers) - 1: producers_xhtml = producers_xhtml + ", " if i == len(pg_producers) - 2: producers_xhtml = producers_xhtml + "and " producers_xhtml = producers_xhtml + "<br/>" colophon_xhtml = colophon_xhtml.replace( "<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>", producers_xhtml) file.seek(0) file.write(colophon_xhtml) file.truncate() with open(repo_path / "src" / "epub" / "content.opf", "r+", encoding="utf-8") as file: metadata_xml = file.read() metadata_xml = metadata_xml.replace("SE_IDENTIFIER", identifier) metadata_xml = metadata_xml.replace(">AUTHOR<", f">{args.author}<") metadata_xml = metadata_xml.replace(">TITLE_SORT<", f">{sorted_title}<") metadata_xml = metadata_xml.replace(">TITLE<", f">{args.title}<") metadata_xml = metadata_xml.replace("VCS_IDENTIFIER", str(repo_name)) if pg_producers: producers_xhtml = "" i = 1 for producer in pg_producers: if "Distributed Proofread" in producer: producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">The Online Distributed Proofreading Team</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{i}\">https://pgdp.net</meta>\n" elif "anonymous" in producer.lower(): producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">An Anonymous 
Volunteer</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">Anonymous Volunteer, An</meta>\n" else: producers_xhtml = producers_xhtml + f"\t\t<dc:contributor id=\"transcriber-{i}\">{producer.strip('.')}</dc:contributor>\n\t\t<meta property=\"file-as\" refines=\"#transcriber-{i}\">TRANSCRIBER_SORT</meta>\n" producers_xhtml = producers_xhtml + f"\t\t<meta property=\"role\" refines=\"#transcriber-{i}\" scheme=\"marc:relators\">trc</meta>\n" i = i + 1 metadata_xml = regex.sub( r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>", "\t\t" + producers_xhtml.strip(), metadata_xml, flags=regex.DOTALL) if author_wiki_url: metadata_xml = metadata_xml.replace(">AUTHOR_WIKI_URL<", f">{author_wiki_url}<") if author_nacoaf_url: metadata_xml = metadata_xml.replace(">AUTHOR_NACOAF_URL<", f">{author_nacoaf_url}<") if ebook_wiki_url: metadata_xml = metadata_xml.replace(">EBOOK_WIKI_URL<", f">{ebook_wiki_url}<") if args.translator: metadata_xml = metadata_xml.replace(">TRANSLATOR<", f">{args.translator}<") if translator_wiki_url: metadata_xml = metadata_xml.replace( ">TRANSLATOR_WIKI_URL<", f">{translator_wiki_url}<") if translator_nacoaf_url: metadata_xml = metadata_xml.replace( ">TRANSLATOR_NACOAF_URL<", f">{translator_nacoaf_url}<") else: metadata_xml = regex.sub( r"<dc:contributor id=\"translator\">.+?<dc:contributor id=\"artist\">", "<dc:contributor id=\"artist\">", metadata_xml, flags=regex.DOTALL) if args.pg_url: if pg_subjects: subject_xhtml = "" i = 1 for subject in pg_subjects: subject_xhtml = subject_xhtml + f"\t\t<dc:subject id=\"subject-{i}\">{subject}</dc:subject>\n" i = i + 1 i = 1 for subject in pg_subjects: subject_xhtml = subject_xhtml + f"\t\t<meta property=\"authority\" refines=\"#subject-{i}\">LCSH</meta>\n" # Now, get the LCSH ID by querying LCSH directly. try: response = requests.get( f"https://id.loc.gov/search/?q=%22{urllib.parse.quote(subject)}%22" ) result = regex.search( fr"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{regex.escape(subject.replace(' -- ', '--'))}</a>", response.text) loc_id = "Unknown" try: loc_id = result.group(1) except Exception as ex: pass subject_xhtml = subject_xhtml + f"\t\t<meta property=\"term\" refines=\"#subject-{i}\">{loc_id}</meta>\n" except Exception as ex: raise se.RemoteCommandErrorException( f"Couldn’t connect to [url][link=https://id.loc.gov]https://id.loc.gov[/][/]. 
Exception: {ex}" ) i = i + 1 metadata_xml = regex.sub( r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>", "\t\t" + subject_xhtml.strip(), metadata_xml) metadata_xml = metadata_xml.replace( "<dc:language>LANG</dc:language>", f"<dc:language>{pg_language}</dc:language>") metadata_xml = metadata_xml.replace( "<dc:source>PG_URL</dc:source>", f"<dc:source>{args.pg_url}</dc:source>") file.seek(0) file.write(metadata_xml) file.truncate() # Set up local git repo repo = git.Repo.init(repo_path) if args.email: with repo.config_writer() as config: config.set_value("user", "email", args.email) if args.pg_url and pg_ebook_html and not is_pg_html_parsed: raise se.InvalidXhtmlException( "Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook." )
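# A rough, self-contained sketch of the LCSH ID lookup used in _create_draft() above, with a
# hypothetical subject heading; the id.loc.gov markup pattern is taken from the regex in the
# code, not from any documented API, so treat it as an assumption.
import urllib.parse

import regex
import requests

subject = "Detective and mystery stories"
response = requests.get(f"https://id.loc.gov/search/?q=%22{urllib.parse.quote(subject)}%22")
result = regex.search(fr"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{regex.escape(subject.replace(' -- ', '--'))}</a>", response.text)
loc_id = result.group(1) if result else "Unknown"
print(loc_id)  # the LCSH identifier if the search page matched, otherwise "Unknown"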
def format_xhtml(xhtml: str, single_lines: bool = False, is_metadata_file: bool = False, is_endnotes_file: bool = False) -> str: """ Pretty-print well-formed XHTML. INPUTS xhtml: A string of well-formed XHTML single_lines: True to collapse hard-wrapped line breaks, like those found at Project Gutenberg, to single lines is_metadata_file: True if the passed XHTML is an SE content.opf metadata file is_endnotes_file: True if the passed XHTML is an SE endnotes file OUTPUTS A string of pretty-printed XHTML. """ xmllint_path = shutil.which("xmllint") if xmllint_path is None: raise se.MissingDependencyException("Couldn’t locate xmllint. Is it installed?") env = os.environ.copy() env["XMLLINT_INDENT"] = "\t" if single_lines: xhtml = xhtml.replace("\n", " ") xhtml = regex.sub(r"\s+", " ", xhtml) # Epub3 doesn't allow named entities, so convert them to their unicode equivalents # But, don't unescape the content.opf long-description accidentally if not is_metadata_file: xhtml = regex.sub(r"&#?\w+;", _replace_character_references, xhtml) # Remove unnecessary doctypes which can cause xmllint to hang xhtml = regex.sub(r"<!DOCTYPE[^>]+?>", "", xhtml, flags=regex.DOTALL) # Canonicalize XHTML result = subprocess.run([xmllint_path, "--c14n", "-"], input=xhtml.encode(), stdout=subprocess.PIPE, stderr=subprocess.PIPE) xhtml = result.stdout.decode() try: error = result.stderr.decode().strip() except UnicodeDecodeError as ex: raise se.InvalidEncodingException("Invalid encoding; UTF-8 expected: {}".format(ex)) if error: raise se.InvalidXhtmlException("Couldn’t parse file; files must be in XHTML format, which is not the same as HTML. xmllint says:\n{}".format(error.replace("-:", "Line "))) # Add the XML header that xmllint stripped during c14n xhtml = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + xhtml # Pretty-print XML xhtml = subprocess.run([xmllint_path, "--format", "-"], input=xhtml.encode(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env).stdout.decode() # Remove white space between some tags xhtml = regex.sub(r"<p([^>]*?)>\s+([^<\s])", "<p\\1>\\2", xhtml, flags=regex.DOTALL) xhtml = regex.sub(r"([^>\s])\s+</p>", "\\1</p>", xhtml, flags=regex.DOTALL) # xmllint has problems with removing spacing between some inline HTML5 elements. Try to fix those problems here. xhtml = regex.sub(r"</(abbr|cite|i|span)><(abbr|cite|i|span)", "</\\1> <\\2", xhtml) # Try to fix inline elements directly followed by an <a> tag, unless that <a> tag is a noteref. xhtml = regex.sub(r"</(abbr|cite|i|span)><(a(?! href=\"[^\"]+?\" id=\"noteref\-))", "</\\1> <\\2", xhtml) # Two sequential inline elements, when they are the only children of a block, are indented. But this messes up spacing if the 2nd element is a noteref. xhtml = regex.sub(r"</(abbr|cite|i|span)>\s+<(a href=\"[^\"]+?\" id=\"noteref\-)", "</\\1><\\2", xhtml, flags=regex.DOTALL) # Try to fix <cite> tags running next to referrer <a> tags. if is_endnotes_file: xhtml = regex.sub(r"</cite>(<a href=\"[^\"]+?\" epub:type=\"se:referrer\")", "</cite> \\1", xhtml) return xhtml
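# A stripped-down sketch of the xmllint pipeline in format_xhtml() above: canonicalize with
# --c14n (which drops the XML declaration), restore the declaration, then pretty-print with
# --format using tab indentation. The sample input is hypothetical.
import os
import shutil
import subprocess

xmllint_path = shutil.which("xmllint")
env = os.environ.copy()
env["XMLLINT_INDENT"] = "\t"

xhtml = "<?xml version=\"1.0\" encoding=\"utf-8\"?><html><body><p>Hello.</p></body></html>"
xhtml = subprocess.run([xmllint_path, "--c14n", "-"], input=xhtml.encode(), stdout=subprocess.PIPE).stdout.decode()
xhtml = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + xhtml
xhtml = subprocess.run([xmllint_path, "--format", "-"], input=xhtml.encode(), stdout=subprocess.PIPE, env=env).stdout.decode()
print(xhtml)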
def build(self, metadata_xhtml: str, metadata_tree: se.easy_xml.EasyXmlTree, run_epubcheck: bool, build_kobo: bool, build_kindle: bool, output_directory: Path, proof: bool, build_covers: bool, verbose: bool) -> None: """ Entry point for `se build` """ # Check for some required tools if build_kindle: which_ebook_convert = shutil.which("ebook-convert") if which_ebook_convert: ebook_convert_path = Path(which_ebook_convert) else: # Look for default Mac calibre app path if none found in path ebook_convert_path = Path("/Applications/calibre.app/Contents/MacOS/ebook-convert") if not ebook_convert_path.exists(): raise se.MissingDependencyException("Couldn’t locate ebook-convert. Is Calibre installed?") if run_epubcheck: if not shutil.which("java"): raise se.MissingDependencyException("Couldn’t locate java. Is it installed?") # Check the output directory and create it if it doesn't exist try: output_directory = output_directory.resolve() output_directory.mkdir(parents=True, exist_ok=True) except Exception: raise se.FileExistsException(f"Couldn’t create output directory: {output_directory}") # All clear to start building! if verbose: print(f"Building {self.path} ...") with tempfile.TemporaryDirectory() as temp_directory: work_directory = Path(temp_directory) work_epub_root_directory = work_directory / "src" copy_tree(self.path, str(work_directory)) try: shutil.rmtree(work_directory / ".git") except Exception: pass # By convention the ASIN is set to the SHA-1 sum of the book's identifying URL identifier = metadata_tree.xpath("//dc:identifier")[0].inner_html().replace("url:", "") asin = sha1(identifier.encode("utf-8")).hexdigest() title = metadata_tree.xpath("//dc:title")[0].inner_html() url_title = se.formatting.make_url_safe(title) url_author = "" for author in metadata_tree.xpath("//dc:creator"): url_author = url_author + se.formatting.make_url_safe(author.inner_html()) + "_" url_author = url_author.rstrip("_") epub_output_filename = "{}_{}{}.epub".format(url_author, url_title, ".proof" if proof else "") epub3_output_filename = "{}_{}{}.epub3".format(url_author, url_title, ".proof" if proof else "") kobo_output_filename = "{}_{}{}.kepub.epub".format(url_author, url_title, ".proof" if proof else "") kindle_output_filename = "{}_{}{}.azw3".format(url_author, url_title, ".proof" if proof else "") # Clean up old output files if any se.quiet_remove(output_directory / f"thumbnail_{asin}_EBOK_portrait.jpg") se.quiet_remove(output_directory / "cover.jpg") se.quiet_remove(output_directory / "cover-thumbnail.jpg") se.quiet_remove(output_directory / epub_output_filename) se.quiet_remove(output_directory / epub3_output_filename) se.quiet_remove(output_directory / kobo_output_filename) se.quiet_remove(output_directory / kindle_output_filename) # Are we including proofreading CSS? if proof: with open(work_epub_root_directory / "epub" / "css" / "local.css", "a", encoding="utf-8") as local_css_file: with importlib_resources.open_text("se.data.templates", "proofreading.css", encoding="utf-8") as proofreading_css_file: local_css_file.write(proofreading_css_file.read()) # Update the release date in the metadata and colophon if self.last_commit: last_updated_iso = regex.sub(r"\.[0-9]+$", "", self.last_commit.timestamp.isoformat()) + "Z" last_updated_iso = regex.sub(r"\+.+?Z$", "Z", last_updated_iso) # In the line below, we can't use %l (unpadded 12 hour clock hour) because it isn't portable to Windows. # Instead we use %I (padded 12 hour clock hour) and then do a string replace to remove leading zeros. 
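# For example, with a hypothetical commit timestamp of 2020-05-05 09:05, the format string
# below yields "May  5, 2020, 09:05 <abbr ...>AM</abbr>", which the replacements reduce to
# "May 5, 2020, 9:05 <abbr ...>a.m.</abbr>".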
last_updated_friendly = f"{self.last_commit.timestamp:%B %e, %Y, %I:%M <abbr class=\"time eoc\">%p</abbr>}".replace(" 0", " ") last_updated_friendly = regex.sub(r"\s+", " ", last_updated_friendly).replace("AM", "a.m.").replace("PM", "p.m.").replace(" <abbr", f"{se.NO_BREAK_SPACE}<abbr") # Assume a no-break space here so the time and its a.m./p.m. abbreviation don't wrap apart # Set modified date in content.opf self.metadata_xhtml = regex.sub(r"<meta property=\"dcterms:modified\">[^<]+?</meta>", f"<meta property=\"dcterms:modified\">{last_updated_iso}</meta>", self.metadata_xhtml) with open(work_epub_root_directory / "epub" / "content.opf", "w", encoding="utf-8") as file: file.seek(0) file.write(self.metadata_xhtml) file.truncate() # Update the colophon with release info with open(work_epub_root_directory / "epub" / "text" / "colophon.xhtml", "r+", encoding="utf-8") as file: xhtml = file.read() xhtml = xhtml.replace("<p>The first edition of this ebook was released on<br/>", f"<p>This edition was released on<br/>\n\t\t\t<b>{last_updated_friendly}</b><br/>\n\t\t\tand is based on<br/>\n\t\t\t<b>revision {self.last_commit.short_sha}</b>.<br/>\n\t\t\tThe first edition of this ebook was released on<br/>") file.seek(0) file.write(xhtml) file.truncate() # Output the pure epub3 file if verbose: print(f"\tBuilding {epub3_output_filename} ...", end="", flush=True) se.epub.write_epub(work_epub_root_directory, output_directory / epub3_output_filename) if verbose: print(" OK") if build_kobo: if verbose: print(f"\tBuilding {kobo_output_filename} ...", end="", flush=True) else: if verbose: print(f"\tBuilding {epub_output_filename} ...", end="", flush=True) # Now add epub2 compatibility. # Include compatibility CSS with open(work_epub_root_directory / "epub" / "css" / "core.css", "a", encoding="utf-8") as core_css_file: with importlib_resources.open_text("se.data.templates", "compatibility.css", encoding="utf-8") as compatibility_css_file: core_css_file.write(compatibility_css_file.read()) # Simplify CSS and tags total_css = "" # Simplify the CSS first. Later we'll update the document to match our simplified selectors. # While we're doing this, we store the original css into a single variable so we can extract the original selectors later.
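# To make the simplification concrete: a hypothetical selector like p:first-child is
# rewritten to p.first-child in the CSS, and the pass below then adds a literal
# class="first-child" to every matching <p>, since epub2-era readers handle plain classes
# much more reliably than pseudoclasses.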
for root, _, filenames in os.walk(work_epub_root_directory): for filename in fnmatch.filter(filenames, "*.css"): with open(Path(root) / filename, "r+", encoding="utf-8") as file: css = file.read() # Before we do anything, we process a special case in core.css if "core.css" in filename: css = regex.sub(r"abbr{.+?}", "", css, flags=regex.DOTALL) total_css = total_css + css + "\n" file.seek(0) file.write(se.formatting.simplify_css(css)) file.truncate() # Now get a list of original selectors # Remove @supports(){} total_css = regex.sub(r"@supports.+?{(.+?)}\s*}", "\\1}", total_css, flags=regex.DOTALL) # Remove CSS rules total_css = regex.sub(r"{[^}]+}", "", total_css) # Remove trailing commas total_css = regex.sub(r",", "", total_css) # Remove comments total_css = regex.sub(r"/\*.+?\*/", "", total_css, flags=regex.DOTALL) # Remove @ defines total_css = regex.sub(r"^@.+", "", total_css, flags=regex.MULTILINE) # Construct a set of the original selectors selectors = {line for line in total_css.splitlines() if line != ""} # Get a list of .xhtml files to simplify for root, _, filenames in os.walk(work_epub_root_directory): for filename in fnmatch.filter(filenames, "*.xhtml"): # Don't mess with the ToC, since if we have ol/li > first-child selectors we could screw it up if filename == "toc.xhtml": continue with open(Path(root) / filename, "r+", encoding="utf-8") as file: # We have to remove the default namespace declaration from our document, otherwise # xpath won't find anything at all. See http://stackoverflow.com/questions/297239/why-doesnt-xpath-work-when-processing-an-xhtml-document-with-lxml-in-python xhtml = file.read().replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "") processed_xhtml = xhtml try: tree = etree.fromstring(str.encode(xhtml)) except Exception as ex: raise se.InvalidXhtmlException(f"Error parsing XHTML file: {filename}\n{ex}") # Now iterate over each CSS selector and see if it's used in any of the files we found for selector in selectors: try: # Add classes to elements that match any of our selectors to simplify. For example, if we select :first-child, add a "first-child" class to all elements that match that. for selector_to_simplify in se.SELECTORS_TO_SIMPLIFY: while selector_to_simplify in selector: # Potentially the pseudoclass we’ll simplify isn’t at the end of the selector, # so we need to temporarily remove the trailing part to target the right elements. split_selector = regex.split(fr"({selector_to_simplify}(\(.*?\))?)", selector, 1) target_element_selector = ''.join(split_selector[0:2]) replacement_class = split_selector[1].replace(":", "").replace("(", "-").replace("n-", "n-minus-").replace("n+", "n-plus-").replace(")", "") selector = selector.replace(split_selector[1], "."
+ replacement_class, 1) sel = lxml.cssselect.CSSSelector(target_element_selector, translator="xhtml", namespaces=se.XHTML_NAMESPACES) for element in tree.xpath(sel.path, namespaces=se.XHTML_NAMESPACES): current_class = element.get("class") if current_class is not None and replacement_class not in current_class: current_class = current_class + " " + replacement_class else: current_class = replacement_class element.set("class", current_class) except lxml.cssselect.ExpressionError: # This gets thrown if we use pseudo-elements, which lxml doesn't support pass except lxml.cssselect.SelectorSyntaxError as ex: raise se.InvalidCssException(f"Couldn't parse CSS in or near this line: {selector}\n{ex}") # We've already replaced attribute/namespace selectors with classes in the CSS, now add those classes to the matching elements if "[epub|type" in selector: for namespace_selector in regex.findall(r"\[epub\|type\~\=\"[^\"]*?\"\]", selector): sel = lxml.cssselect.CSSSelector(namespace_selector, translator="xhtml", namespaces=se.XHTML_NAMESPACES) for element in tree.xpath(sel.path, namespaces=se.XHTML_NAMESPACES): new_class = regex.sub(r"^\.", "", se.formatting.namespace_to_class(namespace_selector)) current_class = element.get("class", "") if new_class not in current_class: current_class = f"{current_class} {new_class}".strip() element.set("class", current_class) processed_xhtml = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + etree.tostring(tree, encoding=str, pretty_print=True) # We do this round in a second pass because if we modify the tree like this, it screws up how lxml does processing later. # If it's all done in one pass, we wind up in a race condition where some elements are fixed and some not tree = etree.fromstring(str.encode(processed_xhtml)) for selector in selectors: try: sel = lxml.cssselect.CSSSelector(selector, translator="xhtml", namespaces=se.XHTML_NAMESPACES) except lxml.cssselect.ExpressionError: # This gets thrown if we use pseudo-elements, which lxml doesn't support continue except lxml.cssselect.SelectorSyntaxError as ex: raise se.InvalidCssException(f"Couldn't parse CSS in or near this line: {selector}\n{ex}") # Convert <abbr> to <span> if "abbr" in selector: for element in tree.xpath(sel.path, namespaces=se.XHTML_NAMESPACES): # Why would you want the tail to output by default?!? raw_string = etree.tostring(element, encoding=str, with_tail=False) # lxml--crap as usual--includes a bunch of namespace information in every element we print. # Remove it here. raw_string = raw_string.replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "") raw_string = raw_string.replace(" xmlns:epub=\"http://www.idpf.org/2007/ops\"", "") raw_string = raw_string.replace(" xmlns:m=\"http://www.w3.org/1998/Math/MathML\"", "") # Now lxml doesn't let us modify the tree, so we just do a straight up regex replace to turn this into a span processed_string = raw_string.replace("<abbr", "<span") processed_string = processed_string.replace("</abbr", "</span") # Now we have a nice, fixed string. But, since lxml can't replace elements, we write it ourselves. 
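# In other words, a hypothetical styled <abbr class="name">J. D.</abbr> comes out as
# <span class="name">J. D.</span>, swapped in with a plain string replace: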
processed_xhtml = processed_xhtml.replace(raw_string, processed_string) tree = etree.fromstring(str.encode(processed_xhtml)) # Now we just remove all stray abbr tags that were not styled by CSS processed_xhtml = regex.sub(r"</?abbr[^>]*?>", "", processed_xhtml) # Remove datetime="" attribute in <time> tags, which is not always understood by epubcheck processed_xhtml = regex.sub(r" datetime=\"[^\"]+?\"", "", processed_xhtml) tree = etree.fromstring(str.encode(processed_xhtml)) if processed_xhtml != xhtml: file.seek(0) file.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" + etree.tostring(tree, encoding=str, pretty_print=True).replace("<html", "<html xmlns=\"http://www.w3.org/1999/xhtml\"")) file.truncate() # Done simplifying CSS and tags! # Extract cover and cover thumbnail cover_svg_file = work_epub_root_directory / "epub" / "images" / "cover.svg" if not os.path.isfile(cover_svg_file): raise se.MissingDependencyException("Cover image is missing. Did you run build-images?") svg2png(url=str(cover_svg_file), write_to=str(work_directory / "cover.png")) cover = Image.open(work_directory / "cover.png") cover = cover.convert("RGB") # Remove alpha channel from PNG if necessary cover.save(work_epub_root_directory / "epub" / "images" / "cover.jpg") (work_directory / "cover.png").unlink() if build_covers: shutil.copy2(work_epub_root_directory / "epub" / "images" / "cover.jpg", output_directory / "cover.jpg") shutil.copy2(cover_svg_file, output_directory / "cover-thumbnail.svg") # Path arguments must be cast to string svg2png(url=str(output_directory / "cover-thumbnail.svg"), write_to=str(work_directory / "cover-thumbnail.png")) cover = Image.open(work_directory / "cover-thumbnail.png") cover = cover.resize((COVER_THUMBNAIL_WIDTH, COVER_THUMBNAIL_HEIGHT)) cover = cover.convert("RGB") # Remove alpha channel from PNG if necessary cover.save(output_directory / "cover-thumbnail.jpg") (work_directory / "cover-thumbnail.png").unlink() (output_directory / "cover-thumbnail.svg").unlink() cover_svg_file.unlink() # Massage image references in content.opf metadata_xhtml = metadata_xhtml.replace("cover.svg", "cover.jpg") metadata_xhtml = metadata_xhtml.replace(".svg", ".png") metadata_xhtml = metadata_xhtml.replace("id=\"cover.jpg\" media-type=\"image/svg+xml\"", "id=\"cover.jpg\" media-type=\"image/jpeg\"") metadata_xhtml = metadata_xhtml.replace("image/svg+xml", "image/png") metadata_xhtml = regex.sub(r"properties=\"([^\"]*?)svg([^\"]*?)\"", "properties=\"\\1\\2\"", metadata_xhtml) # We may also have the `mathml` property # Add an element noting the version of the se tools that built this ebook metadata_xhtml = regex.sub(r"<dc:publisher", f"<meta property=\"se:built-with\">{se.VERSION}</meta>\n\t\t<dc:publisher", metadata_xhtml) # Google Play Books chokes on https XML namespace identifiers (as of at least 2017-07) metadata_xhtml = metadata_xhtml.replace("https://standardebooks.org/vocab/1.0", "http://standardebooks.org/vocab/1.0") # Output the modified content.opf so that we can build the kobo book before making more epub2 compatibility hacks with open(work_epub_root_directory / "epub" / "content.opf", "w", encoding="utf-8") as file: file.write(metadata_xhtml) file.truncate() # Recurse over xhtml files to make some compatibility replacements for root, _, filenames in os.walk(work_epub_root_directory): for filename in filenames: if filename.lower().endswith(".svg"): # For night mode compatibility, give the titlepage a 1px white stroke attribute if filename.lower() == "titlepage.svg" or filename.lower() == 
"logo.svg": with open(Path(root) / filename, "r+", encoding="utf-8") as file: svg = file.read() paths = svg # What we're doing here is faking the `stroke-align: outside` property, which is an unsupported draft spec right now. # We do this by duplicating all the SVG paths, and giving the duplicates a 2px stroke. The originals are directly on top, # so the 2px stroke becomes a 1px stroke that's *outside* of the path instead of being *centered* on the path border. # This looks much nicer, but we also have to increase the image size by 2px in both directions, and re-center the whole thing. if filename.lower() == "titlepage.svg": stroke_width = SVG_TITLEPAGE_OUTER_STROKE_WIDTH else: stroke_width = SVG_OUTER_STROKE_WIDTH # First, strip out non-path, non-group elements paths = regex.sub(r"<\?xml[^<]+?\?>", "", paths) paths = regex.sub(r"</?svg[^<]*?>", "", paths) paths = regex.sub(r"<title>[^<]+?</title>", "", paths) paths = regex.sub(r"<desc>[^<]+?</desc>", "", paths) # `paths` is now our "duplicate". Add a 2px stroke. paths = paths.replace("<path", f"<path style=\"stroke: #ffffff; stroke-width: {stroke_width}px;\"") # Inject the duplicate under the old SVG paths. We do this by only replacing the first regex match for <g> or <path> svg = regex.sub(r"(<g|<path)", f"{paths}\\1", svg, 1) # If this SVG specifies height/width, then increase height and width by 2 pixels and translate everything by 1px try: height = int(regex.search(r"<svg[^>]+?height=\"([0-9]+)\"", svg).group(1)) + stroke_width svg = regex.sub(r"<svg([^<]*?)height=\"[0-9]+\"", f"<svg\\1height=\"{height}\"", svg) width = int(regex.search(r"<svg[^>]+?width=\"([0-9]+)\"", svg).group(1)) + stroke_width svg = regex.sub(r"<svg([^<]*?)width=\"[0-9]+\"", f"<svg\\1width=\"{width}\"", svg) # Add a grouping element to translate everything over 1px svg = regex.sub(r"(<g|<path)", "<g transform=\"translate({amount}, {amount})\">\n\\1".format(amount=(stroke_width / 2)), svg, 1) svg = svg.replace("</svg>", "</g>\n</svg>") except AttributeError: # Thrown when the regex doesn't match (i.e. SVG doesn't specify height/width) pass file.seek(0) file.write(svg) file.truncate() # Convert SVGs to PNGs at 2x resolution # Path arguments must be cast to string svg2png(url=str(Path(root) / filename), write_to=regex.sub(r"\.svg$", ".png", str(Path(root) / filename)), scale=2) (Path(root) / filename).unlink() if filename.lower().endswith(".xhtml"): with open(Path(root) / filename, "r+", encoding="utf-8") as file: xhtml = file.read() processed_xhtml = xhtml # Check if there's any MathML to convert. # We expect MathML to be the "content" type (versus the "presentational" type). # We use an XSL transform to convert from "content" to "presentational" MathML. # If we start with presentational, then nothing will be changed. # Kobo supports presentational MathML. After we build kobo, we convert the presentational MathML to PNG for the rest of the builds. 
mathml_transform = None for line in regex.findall(r"<(?:m:)?math[^>]*?>(.+?)</(?:m:)?math>", processed_xhtml, flags=regex.DOTALL): mathml_content_tree = se.easy_xml.EasyXmlTree("<?xml version=\"1.0\" encoding=\"utf-8\"?><math xmlns=\"http://www.w3.org/1998/Math/MathML\">{}</math>".format(regex.sub(r"<(/?)m:", "<\\1", line))) # Initialize the transform object, if we haven't yet if not mathml_transform: with importlib_resources.path("se.data", "mathmlcontent2presentation.xsl") as mathml_xsl_filename: mathml_transform = etree.XSLT(etree.parse(str(mathml_xsl_filename))) # Transform the mathml and get a string representation # XSLT comes from https://github.com/fred-wang/webextension-content-mathml-polyfill mathml_presentation_tree = mathml_transform(mathml_content_tree.etree) mathml_presentation_xhtml = etree.tostring(mathml_presentation_tree, encoding="unicode", pretty_print=True, with_tail=False).strip() # Plop our string back into the XHTML we're processing processed_xhtml = regex.sub(r"<math[^>]*?>\{}\</math>".format(regex.escape(line)), mathml_presentation_xhtml, processed_xhtml, flags=regex.MULTILINE) if filename == "endnotes.xhtml": # iOS renders the left-arrow-hook character as an emoji; this fixes it and forces it to render as text. # See https://github.com/standardebooks/tools/issues/73 # See http://mts.io/2015/04/21/unicode-symbol-render-text-emoji/ processed_xhtml = processed_xhtml.replace("\u21a9", "\u21a9\ufe0e") # Add ARIA roles, which are mostly just duplicates of the epub:type attribute for role in se.ARIA_ROLES: processed_xhtml = regex.sub(fr"(epub:type=\"[^\"]*?{role}[^\"]*?\")", f"\\1 role=\"doc-{role}\"", processed_xhtml) # Some ARIA roles can't apply to some elements. # For example, epilogue can't apply to <article> processed_xhtml = regex.sub(r"<article ([^>]*?)role=\"doc-epilogue\"", "<article \\1", processed_xhtml) if filename == "toc.xhtml": landmarks_xhtml = regex.findall(r"<nav epub:type=\"landmarks\">.*?</nav>", processed_xhtml, flags=regex.DOTALL) landmarks_xhtml = regex.sub(r" role=\"doc-.*?\"", "", landmarks_xhtml[0]) processed_xhtml = regex.sub(r"<nav epub:type=\"landmarks\">.*?</nav>", landmarks_xhtml, processed_xhtml, flags=regex.DOTALL) # But, remove ARIA roles we added to h# tags, because typically those roles are for sectioning content. # For example, we might have an h2 that is both a title and dedication. But ARIA can't handle it being a dedication.
		# But, remove ARIA roles we added to h# tags, because typically those roles are for sectioning content.
		# For example, we might have an h2 that is both a title and a dedication. But ARIA can't handle it being a dedication.
		# See The Man Who Was Thursday by G. K. Chesterton.
		processed_xhtml = regex.sub(r"(<h[1-6] [^>]*) role=\".*?\">", "\\1>", processed_xhtml)

		# Since we convert SVGs to raster, here we add the color-depth semantic for night mode
		processed_xhtml = processed_xhtml.replace("z3998:publisher-logo", "z3998:publisher-logo se:image.color-depth.black-on-transparent")
		processed_xhtml = regex.sub(r"class=\"([^\"]*?)epub-type-z3998-publisher-logo([^\"]*?)\"", "class=\"\\1epub-type-z3998-publisher-logo epub-type-se-image-color-depth-black-on-transparent\\2\"", processed_xhtml)

		# Special case for the titlepage
		if filename == "titlepage.xhtml":
			processed_xhtml = processed_xhtml.replace("<img", "<img class=\"epub-type-se-image-color-depth-black-on-transparent\" epub:type=\"se:image.color-depth.black-on-transparent\"")

		# Google Play Books chokes on https XML namespace identifiers (as of at least 2017-07)
		processed_xhtml = processed_xhtml.replace("https://standardebooks.org/vocab/1.0", "http://standardebooks.org/vocab/1.0")

		# We converted svgs to pngs, so replace references
		processed_xhtml = processed_xhtml.replace("cover.svg", "cover.jpg")
		processed_xhtml = processed_xhtml.replace(".svg", ".png")

		# To get popup footnotes in iBooks, we have to change epub:endnote to epub:footnote.
		# Remember to get our custom style selectors too.
		processed_xhtml = regex.sub(r"epub:type=\"([^\"]*?)endnote([^\"]*?)\"", "epub:type=\"\\1footnote\\2\"", processed_xhtml)
		processed_xhtml = regex.sub(r"class=\"([^\"]*?)epub-type-endnote([^\"]*?)\"", "class=\"\\1epub-type-footnote\\2\"", processed_xhtml)

		# Include extra lang tag for accessibility compatibility.
		processed_xhtml = regex.sub(r"xml:lang=\"([^\"]+?)\"", "lang=\"\\1\" xml:lang=\"\\1\"", processed_xhtml)

		# Typography: replace double and triple em dash characters with extra em dashes.
		processed_xhtml = processed_xhtml.replace("⸺", f"—{se.WORD_JOINER}—")
		processed_xhtml = processed_xhtml.replace("⸻", f"—{se.WORD_JOINER}—{se.WORD_JOINER}—")

		# Typography: replace some other less common characters.
		processed_xhtml = processed_xhtml.replace("⅒", "1/10")
		processed_xhtml = processed_xhtml.replace("℅", "c/o")
		processed_xhtml = processed_xhtml.replace("✗", "×")
		processed_xhtml = processed_xhtml.replace(" ", f"{se.NO_BREAK_SPACE}{se.NO_BREAK_SPACE}")  # Replace em-space with two no-break spaces

		# Many e-readers don't support the word joiner character (U+2060).
		# They DO, however, support the now-deprecated zero-width non-breaking space (U+FEFF).
		# For epubs, do this replacement. Kindle now seems to handle everything, fortunately.
		processed_xhtml = processed_xhtml.replace(se.WORD_JOINER, se.ZERO_WIDTH_SPACE)

		# Some minor code style cleanup
		processed_xhtml = processed_xhtml.replace(" >", ">")

		if processed_xhtml != xhtml:
			file.seek(0)
			file.write(processed_xhtml)
			file.truncate()
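# A toy illustration of the ARIA pass above: for each role name, the regex appends a
# matching role="doc-..." attribute next to any epub:type value that mentions it.
# `_demo_aria_roles` is a hypothetical helper for illustration only.
def _demo_aria_roles():
	import regex as demo_regex

	xhtml = "<section epub:type=\"preface\">"

	for role in ["preface"]:  # Stands in for se.ARIA_ROLES
		xhtml = demo_regex.sub(fr"(epub:type=\"[^\"]*?{role}[^\"]*?\")", f"\\1 role=\"doc-{role}\"", xhtml)

	# xhtml is now: <section epub:type="preface" role="doc-preface">
	return xhtml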
if filename.lower().endswith(".css"):
	with open(Path(root) / filename, "r+", encoding="utf-8") as file:
		css = file.read()
		processed_css = css

		# To get popup footnotes in iBooks, we have to change epub:endnote to epub:footnote.
		# Remember to get our custom style selectors too.
		processed_css = processed_css.replace("endnote", "footnote")

		# Add new break-* aliases for compatibility with newer readers.
		processed_css = regex.sub(r"(\s+)page-break-(.+?:\s.+?;)", "\\1page-break-\\2\t\\1break-\\2", processed_css)

		if processed_css != css:
			file.seek(0)
			file.write(processed_css)
			file.truncate()

if build_kobo:
	with tempfile.TemporaryDirectory() as temp_directory:
		kobo_work_directory = Path(temp_directory)
		copy_tree(str(work_epub_root_directory), str(kobo_work_directory))

		for root, _, filenames in os.walk(kobo_work_directory):
			# Add a note to content.opf indicating this is a transform build
			for filename in fnmatch.filter(filenames, "content.opf"):
				with open(Path(root) / filename, "r+", encoding="utf-8") as file:
					xhtml = file.read()

					xhtml = regex.sub(r"<dc:publisher", "<meta property=\"se:transform\">kobo</meta>\n\t\t<dc:publisher", xhtml)

					file.seek(0)
					file.write(xhtml)
					file.truncate()

			# Kobo .kepub files need each clause wrapped in a special <span> tag to enable highlighting.
			# Do this here. Hopefully Kobo will get their act together soon and drop this requirement.
			for filename in fnmatch.filter(filenames, "*.xhtml"):
				kobo.paragraph_counter = 1
				kobo.segment_counter = 1

				# Don't add spans to the ToC
				if filename == "toc.xhtml":
					continue

				with open(Path(root) / filename, "r+", encoding="utf-8") as file:
					xhtml = file.read()

					# Kobos don't have fonts that support the ↩ character in endnotes, so replace it with «
					if filename == "endnotes.xhtml":
						# Note that we replaced ↩ with \u21a9\ufe0e in an earlier iOS compatibility fix
						xhtml = regex.sub(r"epub:type=\"backlink\">\u21a9\ufe0e</a>", "epub:type=\"backlink\">«</a>", xhtml)

					# We have to remove the default namespace declaration from our document, otherwise
					# xpath won't find anything at all. See http://stackoverflow.com/questions/297239/why-doesnt-xpath-work-when-processing-an-xhtml-document-with-lxml-in-python
					try:
						tree = etree.fromstring(str.encode(xhtml.replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "")))
					except Exception as ex:
						raise se.InvalidXhtmlException(f"Error parsing XHTML file: {filename}\n{ex}") from ex

					kobo.add_kobo_spans_to_node(tree.xpath("./body", namespaces=se.XHTML_NAMESPACES)[0])

					xhtml = etree.tostring(tree, encoding="unicode", pretty_print=True, with_tail=False)
					xhtml = regex.sub(r"<html:span", "<span", xhtml)
					xhtml = regex.sub(r"html:span>", "span>", xhtml)
					xhtml = regex.sub(r"<span xmlns:html=\"http://www.w3.org/1999/xhtml\"", "<span", xhtml)
					xhtml = regex.sub(r"<html", "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n<html xmlns=\"http://www.w3.org/1999/xhtml\"", xhtml)

					file.seek(0)
					file.write(xhtml)
					file.truncate()

		se.epub.write_epub(kobo_work_directory, output_directory / kobo_output_filename)

	if verbose:
		print(" OK")

if verbose:
	print(f"\tBuilding {epub_output_filename} ...", end="", flush=True)
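# The namespace stripping above deserves a demonstration: with a default namespace in
# place, bare element names in an XPath query match nothing at all. A minimal sketch;
# `_demo_default_namespace` is a hypothetical helper and is never called.
def _demo_default_namespace():
	from lxml import etree as demo_etree

	xhtml = "<html xmlns=\"http://www.w3.org/1999/xhtml\"><body/></html>"

	tree = demo_etree.fromstring(str.encode(xhtml))
	assert tree.xpath("./body") == []  # The namespaced <body> isn't matched

	tree = demo_etree.fromstring(str.encode(xhtml.replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "")))
	assert len(tree.xpath("./body")) == 1  # With the declaration stripped, it is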
# Now work on more epub2 compatibility

# Recurse over css files to make some compatibility replacements.
for root, _, filenames in os.walk(work_epub_root_directory):
	for filename in filenames:
		if filename.lower().endswith(".css"):
			with open(Path(root) / filename, "r+", encoding="utf-8") as file:
				css = file.read()
				processed_css = css

				processed_css = regex.sub(r"(page-break-(before|after|inside)\s*:\s*(.+))", "\\1\n\t-webkit-column-break-\\2: \\3 /* For Readium */", processed_css)
				processed_css = regex.sub(r"^\s*hyphens\s*:\s*(.+)", "\thyphens: \\1\n\tadobe-hyphenate: \\1\n\t-webkit-hyphens: \\1\n\t-epub-hyphens: \\1\n\t-moz-hyphens: \\1", processed_css, flags=regex.MULTILINE)
				processed_css = regex.sub(r"^\s*hyphens\s*:\s*none;", "\thyphens: none;\n\tadobe-text-layout: optimizeSpeed; /* For Nook */", processed_css, flags=regex.MULTILINE)

				if processed_css != css:
					file.seek(0)
					file.write(processed_css)
					file.truncate()
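# What the hyphenation expansion above produces, on a one-declaration stylesheet.
# `_demo_hyphens_expansion` is a hypothetical helper for illustration only.
def _demo_hyphens_expansion():
	import regex as demo_regex

	css = "\thyphens: auto;"
	css = demo_regex.sub(r"^\s*hyphens\s*:\s*(.+)", "\thyphens: \\1\n\tadobe-hyphenate: \\1\n\t-webkit-hyphens: \\1\n\t-epub-hyphens: \\1\n\t-moz-hyphens: \\1", css, flags=demo_regex.MULTILINE)

	# css now declares hyphens, adobe-hyphenate, -webkit-hyphens, -epub-hyphens,
	# and -moz-hyphens, each set to `auto;`
	return css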
# Sort out MathML compatibility
has_mathml = "mathml" in metadata_xhtml
if has_mathml:
	# We import this late because we don't want to load selenium if we're not going to use it!
	from se import browser  # pylint: disable=import-outside-toplevel

	driver = browser.initialize_selenium_firefox_webdriver()

	mathml_count = 1
	for root, _, filenames in os.walk(work_epub_root_directory):
		for filename in filenames:
			if filename.lower().endswith(".xhtml"):
				with open(Path(root) / filename, "r+", encoding="utf-8") as file:
					xhtml = file.read()
					processed_xhtml = xhtml
					replaced_mathml: List[str] = []

					# Check if there's MathML we want to convert
					# We take a naive approach and use some regexes to try to simplify simple MathML expressions.
					# For each MathML expression, if our round of regexes finishes and there is still MathML in the processed result, we abandon the attempt and render to PNG using Firefox.
					for line in regex.findall(r"<(?:m:)math[^>]*?>(?:.+?)</(?:m:)math>", processed_xhtml, flags=regex.DOTALL):
						if line not in replaced_mathml:
							replaced_mathml.append(line)  # Store converted lines to save time in case we have multiple instances of the same MathML

							mathml_tree = se.easy_xml.EasyXmlTree("<?xml version=\"1.0\" encoding=\"utf-8\"?>{}".format(regex.sub(r"<(/?)m:", "<\\1", line)))
							processed_line = line

							# If the mfenced element has more than one child, the children are separated by commas when rendered.
							# This is too complex for our naive regexes to work around. So, if there is an mfenced element with more than one child, abandon the attempt.
							if not mathml_tree.css_select("mfenced > * + *"):
								processed_line = regex.sub(r"</?(?:m:)?math[^>]*?>", "", processed_line)
								processed_line = regex.sub(r"<!--.+?-->", "", processed_line)
								processed_line = regex.sub(r"<(?:m:)?mfenced/>", "()", processed_line)
								processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mi)>(.+?)</\3><((?:m:)?mi)>(.+?)</\5></\1>", "<i>\\4</i><\\2><i>\\6</i></\\2>", processed_line)
								processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mi)>(.+?)</\3><((?:m:)?mn)>(.+?)</\5></\1>", "<i>\\4</i><\\2>\\6</\\2>", processed_line)
								processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mn)>(.+?)</\3><((?:m:)?mn)>(.+?)</\5></\1>", "\\4<\\2>\\6</\\2>", processed_line)
								processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mn)>(.+?)</\3><((?:m:)?mi)>(.+?)</\5></\1>", "\\4<\\2><i>\\6</i></\\2>", processed_line)
								processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mi) mathvariant=\"normal\">(.+?)</\3><((?:m:)?mi)>(.+?)</\5></\1>", "\\4<\\2><i>\\6</i></\\2>", processed_line)
								processed_line = regex.sub(r"<((?:m:)?m(sub|sup))><((?:m:)?mi) mathvariant=\"normal\">(.+?)</\3><((?:m:)?mn)>(.+?)</\5></\1>", "\\4<\\2>\\6</\\2>", processed_line)
								# The ignore case flag is required to match here with the special FUNCTION_APPLICATION character; it's unclear why
								processed_line = regex.sub(fr"<(?:m:)?mo>{se.FUNCTION_APPLICATION}</(?:m:)?mo>", "", processed_line, flags=regex.IGNORECASE)
								processed_line = regex.sub(r"<(?:m:)?mfenced><((?:m:)(?:mo|mi|mn|mrow))>(.+?)</\1></(?:m:)?mfenced>", "(<\\1>\\2</\\1>)", processed_line)
								processed_line = regex.sub(r"<(?:m:)?mrow>([^>].+?)</(?:m:)?mrow>", "\\1", processed_line)
								processed_line = regex.sub(r"<(?:m:)?mi>([^<]+?)</(?:m:)?mi>", "<i>\\1</i>", processed_line)
								processed_line = regex.sub(r"<(?:m:)?mi mathvariant=\"normal\">([^<]+?)</(?:m:)?mi>", "\\1", processed_line)
								processed_line = regex.sub(r"<(?:m:)?mo>([+\-−=×])</(?:m:)?mo>", " \\1 ", processed_line)
								processed_line = regex.sub(r"<((?:m:)?m[no])>(.+?)</\1>", "\\2", processed_line)
								processed_line = regex.sub(r"</?(?:m:)?mrow>", "", processed_line)
								processed_line = processed_line.strip()
								processed_line = regex.sub(r"</i><i>", "", processed_line, flags=regex.DOTALL)

							# Did we succeed? Is there any more MathML in our string?
							if regex.findall("</?(?:m:)?m", processed_line):
								# Failure! Abandon all hope, and use Firefox to convert the MathML to PNG.
								se.images.render_mathml_to_png(driver, regex.sub(r"<(/?)m:", "<\\1", line), work_epub_root_directory / "epub" / "images" / f"mathml-{mathml_count}.png")

								processed_xhtml = processed_xhtml.replace(line, f"<img class=\"mathml epub-type-se-image-color-depth-black-on-transparent\" epub:type=\"se:image.color-depth.black-on-transparent\" src=\"../images/mathml-{mathml_count}.png\" />")
								mathml_count = mathml_count + 1
							else:
								# Success! Replace the MathML with our new string.
								processed_xhtml = processed_xhtml.replace(line, processed_line)

					if processed_xhtml != xhtml:
						file.seek(0)
						file.write(processed_xhtml)
						file.truncate()

# Include epub2 cover metadata
cover_id = metadata_tree.xpath("//opf:item[@properties=\"cover-image\"]/@id")[0].replace(".svg", ".jpg")
metadata_xhtml = regex.sub(r"(<metadata[^>]+?>)", f"\\1\n\t\t<meta content=\"{cover_id}\" name=\"cover\" />", metadata_xhtml)

# Add metadata to content.opf indicating this file is a Standard Ebooks compatibility build
metadata_xhtml = metadata_xhtml.replace("<dc:publisher", "<meta property=\"se:transform\">compatibility</meta>\n\t\t<dc:publisher")

# Add any new MathML images we generated to the manifest
if has_mathml:
	for root, _, filenames in os.walk(work_epub_root_directory / "epub" / "images"):
		filenames = se.natural_sort(filenames)
		filenames.reverse()

		for filename in filenames:
			if filename.lower().startswith("mathml-"):
				metadata_xhtml = metadata_xhtml.replace("<manifest>", "<manifest><item href=\"images/{0}\" id=\"{0}\" media-type=\"image/png\"/>".format(filename))

	metadata_xhtml = regex.sub(r"properties=\"([^\"]*?)mathml([^\"]*?)\"", "properties=\"\\1\\2\"", metadata_xhtml)
	metadata_xhtml = regex.sub(r"properties=\"\s*\"", "", metadata_xhtml)

# Generate our NCX file for epub2 compatibility.
# First find the ToC file.
toc_filename = metadata_tree.xpath("//opf:item[@properties=\"nav\"]/@href")[0]
metadata_xhtml = metadata_xhtml.replace("<spine>", "<spine toc=\"ncx\">")
metadata_xhtml = metadata_xhtml.replace("<manifest>", "<manifest><item href=\"toc.ncx\" id=\"ncx\" media-type=\"application/x-dtbncx+xml\" />")

# Now use an XSLT transform to generate the NCX
with importlib_resources.path("se.data", "navdoc2ncx.xsl") as navdoc2ncx_xsl_filename:
	toc_tree = se.epub.convert_toc_to_ncx(work_epub_root_directory, toc_filename, navdoc2ncx_xsl_filename)

# Convert the <nav> landmarks element to the <guide> element in content.opf
guide_xhtml = "<guide>"
for element in toc_tree.xpath("//xhtml:nav[@epub:type=\"landmarks\"]/xhtml:ol/xhtml:li/xhtml:a"):
	element_xhtml = element.tostring()

	element_xhtml = regex.sub(r"epub:type=\"([^\"]*)(\s*frontmatter\s*|\s*backmatter\s*)([^\"]*)\"", "type=\"\\1\\3\"", element_xhtml)
	element_xhtml = regex.sub(r"epub:type=\"[^\"]*(acknowledgements|bibliography|colophon|copyright-page|cover|dedication|epigraph|foreword|glossary|index|loi|lot|notes|preface|bodymatter|titlepage|toc)[^\"]*\"", "type=\"\\1\"", element_xhtml)
	element_xhtml = element_xhtml.replace("type=\"copyright-page", "type=\"copyright page")

	# We add the 'text' attribute to the titlepage to tell the reader to start there
	element_xhtml = element_xhtml.replace("type=\"titlepage", "type=\"title-page text")

	element_xhtml = regex.sub(r"type=\"\s*\"", "", element_xhtml)
	element_xhtml = element_xhtml.replace("<a", "<reference")
	element_xhtml = regex.sub(r">(.+)</a>", " title=\"\\1\" />", element_xhtml)

	# Replace instances of the `role` attribute since it's illegal in content.opf
	element_xhtml = regex.sub(r" role=\".*?\"", "", element_xhtml)

	guide_xhtml = guide_xhtml + element_xhtml

guide_xhtml = guide_xhtml + "</guide>"

metadata_xhtml = metadata_xhtml.replace("</package>", "") + guide_xhtml + "</package>"
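# A worked example of the landmark-to-guide conversion above, traced on one anchor.
# `_demo_guide_entry` is a hypothetical helper for illustration only.
def _demo_guide_entry():
	import regex as demo_regex

	element_xhtml = "<a href=\"text/titlepage.xhtml\" epub:type=\"frontmatter titlepage\">Titlepage</a>"

	# Drop the frontmatter keyword and rename the attribute to epub2's `type`
	element_xhtml = demo_regex.sub(r"epub:type=\"([^\"]*)(\s*frontmatter\s*|\s*backmatter\s*)([^\"]*)\"", "type=\"\\1\\3\"", element_xhtml)

	# The titlepage entry also gets the `text` keyword so readers open the book there
	element_xhtml = element_xhtml.replace("type=\"titlepage", "type=\"title-page text")

	# Convert the <a> element to a <reference> element with a `title` attribute
	element_xhtml = element_xhtml.replace("<a", "<reference")
	element_xhtml = demo_regex.sub(r">(.+)</a>", " title=\"\\1\" />", element_xhtml)

	# element_xhtml is now: <reference href="text/titlepage.xhtml" type="title-page text" title="Titlepage" />
	return element_xhtml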
# Guide is done, now write content.opf and clean it.
# Output the modified content.opf before making more epub2 compatibility hacks.
with open(work_epub_root_directory / "epub" / "content.opf", "w", encoding="utf-8") as file:
	file.write(metadata_xhtml)
	file.truncate()

# All done, clean the output
for filepath in se.get_target_filenames([work_epub_root_directory], (".xhtml", ".svg", ".opf", ".ncx")):
	se.formatting.format_xhtml_file(filepath, False, filepath.name == "content.opf", filepath.name == "endnotes.xhtml", filepath.name == "colophon.xhtml")

# Write the compatible epub
se.epub.write_epub(work_epub_root_directory, output_directory / epub_output_filename)

if verbose:
	print(" OK")

if run_epubcheck:
	if verbose:
		print(f"\tRunning epubcheck on {epub_output_filename} ...", end="", flush=True)

	# Path arguments must be cast to string for Windows compatibility.
	with importlib_resources.path("se.data.epubcheck", "epubcheck.jar") as jar_path:
		output = subprocess.run(["java", "-jar", str(jar_path), "--quiet", str(output_directory / epub_output_filename)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=False).stdout.decode().strip()

		if output:
			# Get the epubcheck version to print to the console
			version_output = subprocess.run(["java", "-jar", str(jar_path), "--version"], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=False).stdout.decode().strip()
			version = regex.search(r"[0-9]+\.([0-9]+\.?)*", version_output, flags=regex.MULTILINE).group(0)

			# Remove trailing lines from epubcheck output
			output = output.replace("\n\nCheck finished with errors", "")

			if verbose:
				print(f"\n\t\tepubcheck v{version} failed with:\n\t\t" + "\t\t".join(output.splitlines(True)), file=sys.stderr)
			else:
				print(f"epubcheck v{version} failed with:\n{output}", file=sys.stderr)

			return

	if verbose:
		print(" OK")

if build_kindle:
	if verbose:
		print(f"\tBuilding {kindle_output_filename} ...", end="", flush=True)

	# There's a bug in Calibre <= 3.48.0 where authors who have more than one MARC relator role
	# display as "unknown author" in the Kindle interface.
	# See: https://bugs.launchpad.net/calibre/+bug/1844578
	# Until the bug is fixed, we simply remove any other MARC relator on the dc:creator element.
	# Once the bug is fixed, we can remove this block.
	with open(work_epub_root_directory / "epub" / "content.opf", "r+", encoding="utf-8") as file:
		xhtml = file.read()
		processed_xhtml = xhtml

		for match in regex.findall(r"<meta property=\"role\" refines=\"#author\" scheme=\"marc:relators\">.*?</meta>", xhtml):
			if ">aut<" not in match:
				processed_xhtml = processed_xhtml.replace(match, "")

		if processed_xhtml != xhtml:
			file.seek(0)
			file.write(processed_xhtml)
			file.truncate()
	# Kindle doesn't go more than 2 levels deep for ToC, so flatten it here.
	with open(work_epub_root_directory / "epub" / toc_filename, "r+", encoding="utf-8") as file:
		xhtml = file.read()

		soup = BeautifulSoup(xhtml, "lxml")

		for match in soup.select("ol > li > ol > li > ol"):
			match.parent.insert_after(match)
			match.unwrap()

		file.seek(0)
		file.write(str(soup))
		file.truncate()

	# Rebuild the NCX
	with importlib_resources.path("se.data", "navdoc2ncx.xsl") as navdoc2ncx_xsl_filename:
		toc_tree = se.epub.convert_toc_to_ncx(work_epub_root_directory, toc_filename, navdoc2ncx_xsl_filename)

	# Clean just the ToC and NCX
	for filepath in [work_epub_root_directory / "epub" / "toc.ncx", work_epub_root_directory / "epub" / toc_filename]:
		se.formatting.format_xhtml_file(filepath, False)

	# Convert endnotes to Kindle popup compatible notes
	if (work_epub_root_directory / "epub" / "text" / "endnotes.xhtml").is_file():
		with open(work_epub_root_directory / "epub" / "text" / "endnotes.xhtml", "r+", encoding="utf-8") as file:
			xhtml = file.read()

			# We have to remove the default namespace declaration from our document, otherwise
			# xpath won't find anything at all. See http://stackoverflow.com/questions/297239/why-doesnt-xpath-work-when-processing-an-xhtml-document-with-lxml-in-python
			try:
				tree = etree.fromstring(str.encode(xhtml.replace(" xmlns=\"http://www.w3.org/1999/xhtml\"", "")))
			except Exception as ex:
				raise se.InvalidXhtmlException(f"Error parsing XHTML file: endnotes.xhtml\n{ex}") from ex

			notes = tree.xpath("//li[@epub:type=\"endnote\" or @epub:type=\"footnote\"]", namespaces=se.XHTML_NAMESPACES)

			processed_endnotes = ""

			for note in notes:
				note_id = note.get("id")
				note_number = note_id.replace("note-", "")

				# First, fixup the reference link for this endnote
				try:
					ref_link = etree.tostring(note.xpath("p[last()]/a[last()]")[0], encoding="unicode", pretty_print=True, with_tail=False).replace(" xmlns:epub=\"http://www.idpf.org/2007/ops\"", "").strip()
				except Exception as ex:
					raise se.InvalidXhtmlException(f"Can’t find ref link for #{note_id}.") from ex

				new_ref_link = regex.sub(r">.*?</a>", ">" + note_number + "</a>.", ref_link)

				# Now remove the wrapping li node from the note
				note_text = regex.sub(r"^<li[^>]*?>(.*)</li>$", r"\1", etree.tostring(note, encoding="unicode", pretty_print=True, with_tail=False), flags=regex.IGNORECASE | regex.DOTALL)

				# Insert our new ref link
				result = regex.subn(r"^\s*<p([^>]*?)>", "<p\\1 id=\"" + note_id + "\">" + new_ref_link + " ", note_text)

				# Sometimes there is no leading <p> tag (for example, if the endnote starts with a blockquote).
				# If that's the case, just insert one in front.
				note_text = result[0]
				if result[1] == 0:
					note_text = "<p id=\"" + note_id + "\">" + new_ref_link + "</p>" + note_text

				# Now remove the old ref_link
				note_text = note_text.replace(ref_link, "")

				# Trim trailing spaces left over after removing the ref link
				note_text = regex.sub(r"\s+</p>", "</p>", note_text).strip()

				# Sometimes ref links are in their own p tag--remove that too
				note_text = regex.sub(r"<p>\s*</p>", "", note_text)

				processed_endnotes += note_text + "\n"

			# All done with endnotes, so drop them back in
			xhtml = regex.sub(r"<ol>.*</ol>", processed_endnotes, xhtml, flags=regex.IGNORECASE | regex.DOTALL)

			file.seek(0)
			file.write(xhtml)
			file.truncate()
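	# The ref-link rewrite above, traced on a single toy endnote. regex.subn() reports
	# whether a leading <p> existed; if it didn't, one is prepended so the note still
	# opens with its back-reference. `_demo_popup_note` is a hypothetical helper.
	def _demo_popup_note():
		import regex as demo_regex

		new_ref_link = "<a href=\"chapter-1.xhtml#noteref-1\" epub:type=\"backlink\">1</a>."
		note_text = "<p>A note.</p>"

		result = demo_regex.subn(r"^\s*<p([^>]*?)>", "<p\\1 id=\"note-1\">" + new_ref_link + " ", note_text)

		note_text = result[0]
		if result[1] == 0:  # No leading <p> was found, so insert one
			note_text = "<p id=\"note-1\">" + new_ref_link + "</p>" + note_text

		# note_text is now: <p id="note-1"><a href="chapter-1.xhtml#noteref-1" epub:type="backlink">1</a>. A note.</p>
		return note_text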
	# While Kindle now supports soft hyphens, popup endnotes break words but don't insert the hyphen characters. So for now, remove soft hyphens from the endnotes file.
	with open(work_epub_root_directory / "epub" / "text" / "endnotes.xhtml", "r+", encoding="utf-8") as file:
		xhtml = file.read()
		processed_xhtml = xhtml

		processed_xhtml = processed_xhtml.replace(se.SHY_HYPHEN, "")

		if processed_xhtml != xhtml:
			file.seek(0)
			file.write(processed_xhtml)
			file.truncate()

	# Do some compatibility replacements
	for root, _, filenames in os.walk(work_epub_root_directory):
		for filename in filenames:
			if filename.lower().endswith(".xhtml"):
				with open(Path(root) / filename, "r+", encoding="utf-8") as file:
					xhtml = file.read()
					processed_xhtml = xhtml

					# Kindle doesn't recognize most zero-width spaces or word joiners, so just remove them.
					# It does recognize the word joiner character, but only in the old mobi7 format. The new format renders them as spaces.
					processed_xhtml = processed_xhtml.replace(se.ZERO_WIDTH_SPACE, "")

					# Remove the epub:type attribute, as Calibre turns it into just "type"
					processed_xhtml = regex.sub(r"epub:type=\"[^\"]*?\"", "", processed_xhtml)

					if processed_xhtml != xhtml:
						file.seek(0)
						file.write(processed_xhtml)
						file.truncate()

	# Include compatibility CSS
	with open(work_epub_root_directory / "epub" / "css" / "core.css", "a", encoding="utf-8") as core_css_file:
		with importlib_resources.open_text("se.data.templates", "kindle.css", encoding="utf-8") as compatibility_css_file:
			core_css_file.write(compatibility_css_file.read())

	# Add soft hyphens
	for filepath in se.get_target_filenames([work_epub_root_directory], (".xhtml",)):
		se.typography.hyphenate_file(filepath, None, True)

	# Build an epub file we can send to Calibre
	se.epub.write_epub(work_epub_root_directory, work_directory / epub_output_filename)

	# Generate the Kindle file.
	# We place it in the work directory because later we have to update the ASIN, and the mobi.update_asin() function will write to the final output directory.
	cover_path = work_epub_root_directory / "epub" / metadata_tree.xpath("//opf:item[@properties=\"cover-image\"]/@href")[0].replace(".svg", ".jpg")

	# Path arguments must be cast to string for Windows compatibility.
	return_code = subprocess.run([str(ebook_convert_path), str(work_directory / epub_output_filename), str(work_directory / kindle_output_filename), "--pretty-print", "--no-inline-toc", "--max-toc-links=0", "--prefer-metadata-cover", f"--cover={cover_path}"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=False).returncode

	if return_code:
		raise se.InvalidSeEbookException("ebook-convert failed.")

	# Success! Update the ASIN in the generated file
	mobi.update_asin(asin, work_directory / kindle_output_filename, output_directory / kindle_output_filename)

	# Extract the Kindle cover thumbnail
	kindle_cover_thumbnail = Image.open(work_epub_root_directory / "epub" / "images" / "cover.jpg")
	kindle_cover_thumbnail = kindle_cover_thumbnail.convert("RGB")  # Remove alpha channel from PNG if necessary
	kindle_cover_thumbnail = kindle_cover_thumbnail.resize((432, 648))
	kindle_cover_thumbnail.save(output_directory / f"thumbnail_{asin}_EBOK_portrait.jpg")

	if verbose:
		print(" OK")
def create_draft(args: list):
	"""
	Entry point for `se create-draft`
	"""

	# Put together some variables for later use
	identifier = se.formatting.make_url_safe(args.author) + "/" + se.formatting.make_url_safe(args.title)
	title_string = args.title.replace("'", "’") + ", by " + args.author.replace("'", "’")
	sorted_title = regex.sub(r"^(A|An|The) (.+)$", "\\2, \\1", args.title)
	pg_producers = []

	if args.translator:
		identifier = identifier + "/" + se.formatting.make_url_safe(args.translator)
		title_string = title_string + ". Translated by " + args.translator

	if args.illustrator:
		identifier = identifier + "/" + se.formatting.make_url_safe(args.illustrator)
		title_string = title_string + ". Illustrated by " + args.illustrator

	repo_name = Path(identifier.replace("/", "_"))

	if repo_name.is_dir():
		raise se.InvalidInputException("./{}/ already exists.".format(repo_name))

	# Download PG HTML and do some fixups
	if args.pg_url:
		args.pg_url = args.pg_url.replace("http://", "https://")

		# Get the ebook metadata
		try:
			response = requests.get(args.pg_url)
			pg_metadata_html = response.text
		except Exception as ex:
			raise se.RemoteCommandErrorException("Couldn’t download Project Gutenberg ebook metadata page. Error: {}".format(ex))

		soup = BeautifulSoup(pg_metadata_html, "lxml")

		# Get the ebook HTML URL from the metadata
		pg_ebook_url = None
		for element in soup.select("a[type^=\"text/html\"]"):
			pg_ebook_url = regex.sub(r"^//", "https://", element["href"])
			pg_ebook_url = regex.sub(r"^/", "https://www.gutenberg.org/", pg_ebook_url)

		if not pg_ebook_url:
			raise se.RemoteCommandErrorException("Downloaded the ebook metadata, but couldn’t find the URL for the ebook HTML.")

		# Get the ebook LCSH categories
		pg_subjects = []
		for element in soup.select("td[property=\"dcterms:subject\"]"):
			if element["datatype"] == "dcterms:LCSH":
				for subject_link in element.find("a"):
					pg_subjects.append(subject_link.strip())

		# Get the PG publication date
		pg_publication_year = None
		for element in soup.select("td[itemprop=\"datePublished\"]"):
			pg_publication_year = regex.sub(r".+?([0-9]{4})", "\\1", element.text)

		# Get the actual ebook URL
		try:
			response = requests.get(pg_ebook_url)
			pg_ebook_html = response.text
		except Exception as ex:
			raise se.RemoteCommandErrorException("Couldn’t download Project Gutenberg ebook HTML. Error: {}".format(ex))

		try:
			fixed_pg_ebook_html = fix_text(pg_ebook_html, uncurl_quotes=False)
			pg_ebook_html = se.strip_bom(fixed_pg_ebook_html)
		except Exception as ex:
			raise se.InvalidEncodingException("Couldn’t determine text encoding of Project Gutenberg HTML file. Error: {}".format(ex))

		# Try to guess the ebook language
		pg_language = "en-US"
		if "colour" in pg_ebook_html or "favour" in pg_ebook_html or "honour" in pg_ebook_html:
			pg_language = "en-GB"

	# Create necessary directories
	(repo_name / "images").mkdir(parents=True)
	(repo_name / "src" / "epub" / "css").mkdir(parents=True)
	(repo_name / "src" / "epub" / "images").mkdir(parents=True)
	(repo_name / "src" / "epub" / "text").mkdir(parents=True)
	(repo_name / "src" / "META-INF").mkdir(parents=True)

	is_pg_html_parsed = True

	# Write PG data if we have it
	if args.pg_url and pg_ebook_html:
		try:
			soup = BeautifulSoup(pg_ebook_html, "html.parser")
			# Try to get the PG producers. We only try this if there's a <pre> block with the header info (which is not always the case).
			for element in soup(text=regex.compile(r"\*\*\*\s*Produced by.+$", flags=regex.DOTALL)):
				if element.parent.name == "pre":
					pg_producers = regex.sub(r".+?Produced by (.+?)\s*$", "\\1", element, flags=regex.DOTALL)
					pg_producers = regex.sub(r"\(.+?\)", "", pg_producers, flags=regex.DOTALL)
					pg_producers = regex.sub(r"(at )?https?://www\.pgdp\.net", "", pg_producers, flags=regex.DOTALL)
					pg_producers = regex.sub(r"[\r\n]+", " ", pg_producers, flags=regex.DOTALL)
					pg_producers = regex.sub(r",? and ", ", and ", pg_producers)

					pg_producers = pg_producers.replace(" and the Online", " and The Online")
					pg_producers = pg_producers.replace(", and ", ", ").strip().split(", ")

			# Try to strip out the PG header
			for element in soup(text=regex.compile(r"\*\*\*\s*START OF THIS")):
				for sibling in element.parent.find_previous_siblings():
					sibling.decompose()

				element.parent.decompose()

			# Try to strip out the PG license footer
			for element in soup(text=regex.compile(r"End of (the )?Project Gutenberg")):
				for sibling in element.parent.find_next_siblings():
					sibling.decompose()

				element.parent.decompose()

			with open(repo_name / "src" / "epub" / "text" / "body.xhtml", "w", encoding="utf-8") as file:
				file.write(str(soup))
		except IOError as ex:
			raise se.InvalidFileException("Couldn’t write to ebook directory. Error: {}".format(ex))
		except Exception:
			# Save this error for later, because it's still useful to complete the create-draft process
			# even if we've failed to parse PG's HTML source.
			is_pg_html_parsed = False
			se.quiet_remove(repo_name / "src" / "epub" / "text" / "body.xhtml")

	# Copy over templates
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "gitignore")), repo_name / ".gitignore")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "LICENSE.md")), repo_name)
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "META-INF" / "container.xml")), repo_name / "src" / "META-INF")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "mimetype")), repo_name / "src")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "content.opf")), repo_name / "src" / "epub")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "onix.xml")), repo_name / "src" / "epub")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "toc.xhtml")), repo_name / "src" / "epub")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "core.css")), repo_name / "src" / "epub" / "css")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "local.css")), repo_name / "src" / "epub" / "css")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "logo.svg")), repo_name / "src" / "epub" / "images")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "colophon.xhtml")), repo_name / "src" / "epub" / "text")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "imprint.xhtml")), repo_name / "src" / "epub" / "text")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "titlepage.xhtml")), repo_name / "src" / "epub" / "text")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "uncopyright.xhtml")), repo_name / "src" / "epub" / "text")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "titlepage.svg")), repo_name / "images")
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "cover.jpg")), repo_name / "images" / "cover.jpg")
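	# How the header stripping above behaves, on a toy soup: everything before (and
	# including) the parent of the "*** START OF THIS" marker is decomposed, leaving
	# only the book text. `_demo_strip_pg_header` is a hypothetical helper.
	def _demo_strip_pg_header():
		import regex as demo_regex
		from bs4 import BeautifulSoup as DemoBeautifulSoup

		soup = DemoBeautifulSoup("<body><p>*** START OF THIS PROJECT GUTENBERG EBOOK ***</p><p>The book.</p></body>", "html.parser")

		for element in soup(text=demo_regex.compile(r"\*\*\*\s*START OF THIS")):
			for sibling in element.parent.find_previous_siblings():
				sibling.decompose()

			element.parent.decompose()

		# str(soup) is now: <body><p>The book.</p></body>
		return str(soup)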
	shutil.copy(resource_filename("se", str(Path("data") / "templates" / "cover.svg")), repo_name / "images" / "cover.svg")

	# Try to find Wikipedia links if possible
	author_wiki_url, author_nacoaf_url = _get_wikipedia_url(args.author, True)
	ebook_wiki_url, _ = _get_wikipedia_url(args.title, False)

	translator_wiki_url = None
	if args.translator:
		translator_wiki_url, translator_nacoaf_url = _get_wikipedia_url(args.translator, True)

	# Pre-fill a few templates
	se.replace_in_file(repo_name / "src" / "epub" / "text" / "titlepage.xhtml", "TITLE_STRING", title_string)
	se.replace_in_file(repo_name / "images" / "titlepage.svg", "TITLE_STRING", title_string)
	se.replace_in_file(repo_name / "images" / "cover.svg", "TITLE_STRING", title_string)

	# Create the titlepage SVG
	contributors = {}
	if args.translator:
		contributors["translated by"] = args.translator
	if args.illustrator:
		contributors["illustrated by"] = args.illustrator

	with open(repo_name / "images" / "titlepage.svg", "w", encoding="utf-8") as file:
		file.write(_generate_titlepage_svg(args.title, args.author, contributors, title_string))

	# Create the cover SVG
	with open(repo_name / "images" / "cover.svg", "w", encoding="utf-8") as file:
		file.write(_generate_cover_svg(args.title, args.author, title_string))

	if args.pg_url:
		se.replace_in_file(repo_name / "src" / "epub" / "text" / "imprint.xhtml", "PG_URL", args.pg_url)

	with open(repo_name / "src" / "epub" / "text" / "colophon.xhtml", "r+", encoding="utf-8") as file:
		colophon_xhtml = file.read()

		colophon_xhtml = colophon_xhtml.replace("SE_IDENTIFIER", identifier)
		colophon_xhtml = colophon_xhtml.replace(">AUTHOR<", ">{}<".format(args.author))
		colophon_xhtml = colophon_xhtml.replace("TITLE", args.title)

		if author_wiki_url:
			colophon_xhtml = colophon_xhtml.replace("AUTHOR_WIKI_URL", author_wiki_url)

		if args.pg_url:
			colophon_xhtml = colophon_xhtml.replace("PG_URL", args.pg_url)

			if pg_publication_year:
				colophon_xhtml = colophon_xhtml.replace("PG_YEAR", pg_publication_year)

			if pg_producers:
				producers_xhtml = ""

				for i, producer in enumerate(pg_producers):
					if "Distributed Proofreading" in producer:
						producers_xhtml = producers_xhtml + "<a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a>"
					else:
						producers_xhtml = producers_xhtml + "<b class=\"name\">{}</b>".format(producer)

					if i < len(pg_producers) - 1:
						producers_xhtml = producers_xhtml + ", "

					if i == len(pg_producers) - 2:
						producers_xhtml = producers_xhtml + "and "

				producers_xhtml = producers_xhtml + "<br/>"

				colophon_xhtml = colophon_xhtml.replace("<b class=\"name\">TRANSCRIBER_1</b>, <b class=\"name\">TRANSCRIBER_2</b>, and <a href=\"https://www.pgdp.net\">The Online Distributed Proofreading Team</a><br/>", producers_xhtml)

		file.seek(0)
		file.write(colophon_xhtml)
		file.truncate()

	with open(repo_name / "src" / "epub" / "content.opf", "r+", encoding="utf-8") as file:
		metadata_xhtml = file.read()

		metadata_xhtml = metadata_xhtml.replace("SE_IDENTIFIER", identifier)
		metadata_xhtml = metadata_xhtml.replace(">AUTHOR<", ">{}<".format(args.author))
		metadata_xhtml = metadata_xhtml.replace(">TITLE_SORT<", ">{}<".format(sorted_title))
		metadata_xhtml = metadata_xhtml.replace(">TITLE<", ">{}<".format(args.title))
		metadata_xhtml = metadata_xhtml.replace("VCS_IDENTIFIER", str(repo_name))

		if pg_producers:
			producers_xhtml = ""
			i = 1

			for producer in pg_producers:
				producers_xhtml = producers_xhtml + "\t\t<dc:contributor id=\"transcriber-{}\">{}</dc:contributor>\n".format(i, producer)

				if "Distributed Proofreading" in producer:
"\t\t<meta property=\"file-as\" refines=\"#transcriber-{0}\">Online Distributed Proofreading Team, The</meta>\n\t\t<meta property=\"se:url.homepage\" refines=\"#transcriber-{0}\">https://pgdp.net</meta>\n".format(i) else: producers_xhtml = producers_xhtml + "\t\t<meta property=\"file-as\" refines=\"#transcriber-{}\">TRANSCRIBER_SORT</meta>\n".format(i) producers_xhtml = producers_xhtml + "\t\t<meta property=\"role\" refines=\"#transcriber-{}\" scheme=\"marc:relators\">trc</meta>\n".format(i) i = i + 1 metadata_xhtml = regex.sub(r"\t\t<dc:contributor id=\"transcriber-1\">TRANSCRIBER</dc:contributor>\s*<meta property=\"file-as\" refines=\"#transcriber-1\">TRANSCRIBER_SORT</meta>\s*<meta property=\"se:url.homepage\" refines=\"#transcriber-1\">TRANSCRIBER_URL</meta>\s*<meta property=\"role\" refines=\"#transcriber-1\" scheme=\"marc:relators\">trc</meta>", "\t\t" + producers_xhtml.strip(), metadata_xhtml, flags=regex.DOTALL) if author_wiki_url: metadata_xhtml = metadata_xhtml.replace(">AUTHOR_WIKI_URL<", ">{}<".format(author_wiki_url)) if author_nacoaf_url: metadata_xhtml = metadata_xhtml.replace(">AUTHOR_NACOAF_URL<", ">{}<".format(author_nacoaf_url)) if ebook_wiki_url: metadata_xhtml = metadata_xhtml.replace(">EBOOK_WIKI_URL<", ">{}<".format(ebook_wiki_url)) if args.translator: metadata_xhtml = metadata_xhtml.replace(">TRANSLATOR<", ">{}<".format(args.translator)) if translator_wiki_url: metadata_xhtml = metadata_xhtml.replace(">TRANSLATOR_WIKI_URL<", ">{}<".format(translator_wiki_url)) if translator_nacoaf_url: metadata_xhtml = metadata_xhtml.replace(">TRANSLATOR_NACOAF_URL<", ">{}<".format(translator_nacoaf_url)) else: metadata_xhtml = regex.sub(r"<dc:contributor id=\"translator\">.+?<dc:contributor id=\"artist\">", "<dc:contributor id=\"artist\">", metadata_xhtml, flags=regex.DOTALL) if args.pg_url: if pg_subjects: subject_xhtml = "" i = 1 for subject in pg_subjects: subject_xhtml = subject_xhtml + "\t\t<dc:subject id=\"subject-{}\">{}</dc:subject>\n".format(i, subject) i = i + 1 i = 1 for subject in pg_subjects: subject_xhtml = subject_xhtml + "\t\t<meta property=\"authority\" refines=\"#subject-{}\">LCSH</meta>\n".format(i) # Now, get the LCSH ID by querying LCSH directly. try: response = requests.get("http://id.loc.gov/search/?q=%22{}%22".format(urllib.parse.quote(subject))) result = regex.search(r"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{}</a>".format(regex.escape(subject.replace(" -- ", "--"))), response.text) loc_id = "Unknown" try: loc_id = result.group(1) except Exception as ex: pass subject_xhtml = subject_xhtml + "\t\t<meta property=\"term\" refines=\"#subject-{}\">{}</meta>\n".format(i, loc_id) except Exception as ex: raise se.RemoteCommandErrorException("Couldn’t connect to id.loc.gov. 
						raise se.RemoteCommandErrorException("Couldn’t connect to id.loc.gov. Error: {}".format(ex))

					i = i + 1

				metadata_xhtml = regex.sub(r"\t\t<dc:subject id=\"subject-1\">SUBJECT_1</dc:subject>\s*<dc:subject id=\"subject-2\">SUBJECT_2</dc:subject>\s*<meta property=\"authority\" refines=\"#subject-1\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-1\">LCSH_ID_1</meta>\s*<meta property=\"authority\" refines=\"#subject-2\">LCSH</meta>\s*<meta property=\"term\" refines=\"#subject-2\">LCSH_ID_2</meta>", "\t\t" + subject_xhtml.strip(), metadata_xhtml)

			metadata_xhtml = metadata_xhtml.replace("<dc:language>LANG</dc:language>", "<dc:language>{}</dc:language>".format(pg_language))
			metadata_xhtml = metadata_xhtml.replace("<dc:source>PG_URL</dc:source>", "<dc:source>{}</dc:source>".format(args.pg_url))

		file.seek(0)
		file.write(metadata_xhtml)
		file.truncate()

	# Set up local git repo
	repo = git.Repo.init(repo_name)

	if args.email:
		with repo.config_writer() as config:
			config.set_value("user", "email", args.email)

	# Set up remote git repos
	if args.create_se_repo:
		git_command = git.cmd.Git(repo_name)
		git_command.remote("add", "origin", "standardebooks.org:/standardebooks.org/ebooks/{}.git".format(repo_name))

		# Set git to automatically push to SE
		git_command.config("branch.master.remote", "origin")
		git_command.config("branch.master.merge", "refs/heads/master")

		github_option = ""
		if args.create_github_repo:
			github_option = "--github"

		return_code = call(["ssh", "standardebooks.org", "/standardebooks.org/scripts/init-se-repo --repo-name={} --title-string=\"{}\" {}".format(repo_name, title_string, github_option)])
		if return_code != 0:
			raise se.RemoteCommandErrorException("Failed to create repository on Standard Ebooks server: ssh returned code {}.".format(return_code))

	if args.pg_url and pg_ebook_html and not is_pg_html_parsed:
		raise se.InvalidXhtmlException("Couldn’t parse Project Gutenberg ebook source. This is usually due to invalid HTML in the ebook.")
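# A sketch of the LCSH scrape above with the network call stubbed out. The authority
# page markup and the "sh..."-style identifier here are made-up examples; if the
# pattern ever fails to match, the term is recorded as "Unknown" rather than aborting
# the draft. `_demo_lcsh_lookup` is a hypothetical helper for illustration only.
def _demo_lcsh_lookup():
	import regex as demo_regex

	subject = "Adventure stories"
	response_text = "<a title=\"Click to view record\" href=\"/authorities/subjects/sh12345678\">Adventure stories</a>"

	result = demo_regex.search(r"<a title=\"Click to view record\" href=\"/authorities/subjects/([^\"]+?)\">{}</a>".format(demo_regex.escape(subject.replace(" -- ", "--"))), response_text)

	loc_id = "Unknown"
	if result:
		loc_id = result.group(1)

	return loc_id  # "sh12345678"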