def format_svg(svg: str) -> str:
    """
    Pretty-print well-formed SVG XML.

    INPUTS
    svg: A string of well-formed SVG XML.

    OUTPUTS
    A string of pretty-printed SVG XML.

    RAISES
    se.InvalidXmlException: If the SVG can't be parsed. The underlying error is chained as the cause.
    """

    try:
        tree = _format_xml_str(svg)
    except Exception as ex:
        # Chain explicitly so the traceback shows the parse failure as the direct cause
        raise se.InvalidXmlException(f"Couldn’t parse SVG file. Exception: {ex}") from ex

    # Make sure viewBox is correctly-cased on the root <svg> element
    for node in tree.xpath("/svg:svg", namespaces={"svg": "http://www.w3.org/2000/svg"}):
        for key, value in node.items():  # Iterate over attributes
            if key.lower() == "viewbox":
                node.attrib.pop(key)  # Remove the attribute, however it was cased
                node.attrib["viewBox"] = value  # Re-add the attribute, correctly-cased
                break  # Stop after the first match; the attribute set was just modified

    # Format <style> elements
    _format_style_elements(tree)

    return _xml_tree_to_string(tree)
def __init__(self, xml_string: str):
    """
    Parse a string of XML and keep the resulting tree on this object.

    INPUTS
    xml_string: A string of XML to parse.

    RAISES
    se.InvalidXmlException: If the parser reports an XML syntax error.
    """

    # Enable regular expressions in xpath
    exslt_regex_uri = "http://exslt.org/regular-expressions"
    self.namespaces = {"re": exslt_regex_uri}

    # The parser is handed bytes rather than the str itself
    xml_bytes = str.encode(xml_string)

    try:
        parsed_tree = etree.fromstring(xml_bytes)
    except etree.XMLSyntaxError as ex:
        raise se.InvalidXmlException(f"Couldn’t parse XML. Exception: {ex}") from ex

    self.etree = parsed_tree
    self.is_css_applied = False
def metadata_dom(self) -> se.easy_xml.EasyXmlTree:
    """
    Accessor for the parsed metadata tree.

    The tree is built from `self.metadata_xml` the first time it is requested,
    then cached in `self._metadata_dom` and reused on later calls.

    OUTPUTS
    The cached tree built from the metadata XML.

    RAISES
    se.InvalidXmlException: If the metadata XML can't be parsed. The underlying error is chained as the cause.
    """

    # Already parsed; reuse the cached tree
    if self._metadata_dom is not None:
        return self._metadata_dom

    try:
        self._metadata_dom = se.easy_xml.EasyOpfTree(self.metadata_xml)
    except Exception as ex:
        # Chain explicitly so the traceback shows the parse failure as the direct cause
        raise se.InvalidXmlException(f"Couldn’t parse [path][link=file://{self.metadata_file_path}]{self.metadata_file_path}[/][/]. Exception: {ex}") from ex

    return self._metadata_dom
def format_opf(xml: str) -> str:
    """
    Pretty-print well-formed OPF XML.

    INPUTS
    xml: A string of well-formed OPF XML

    OUTPUTS
    A string of pretty-printed XML.

    RAISES
    se.InvalidXmlException: If the OPF can't be parsed. The underlying error is chained as the cause.
    """

    # Replace html entities in the long description so we can clean it too.
    # We re-establish them later. Don't use html.unescape because that would also unescape
    # things like &amp; which would make an invalid XML document. (&amp; may appear in translator info,
    # or other parts of the metadata that are not the long description.)
    xml = xml.replace("&lt;", "<")
    xml = xml.replace("&gt;", ">")
    xml = xml.replace("&amp;amp;", "&amp;")  # Unescape escaped ampersands, which appear in the long description only

    # Canonicalize and format XML
    try:
        tree = _format_xml_str(xml)
    except Exception as ex:
        # Chain explicitly so the traceback shows the parse failure as the direct cause
        raise se.InvalidXmlException(f"Couldn’t parse OPF file. Exception: {ex}") from ex

    # Format the long description, then escape it
    for node in tree.xpath("/opf:package/opf:metadata/opf:meta[@property='se:long-description']", namespaces={"opf": "http://www.idpf.org/2007/opf"}):
        # Snapshot the children so they can be serialized and then removed without
        # modifying the node while it is being iterated
        children = list(node)

        # Convert the node contents to text. Starting from node.text preserves the
        # initial newline and indentation.
        xhtml = (node.text or "") + "".join(etree.tostring(child, encoding="unicode") for child in children)

        # After composing the string, lxml adds namespaces to every tag. The only way to remove them is with regex.
        xhtml = regex.sub(r"\sxmlns(:.+?)?=\"[^\"]+?\"", "", xhtml)

        # Remove the children so that we can replace them with the xhtml as plain text,
        # which gets escaped when the tree is serialized
        for child in children:
            node.remove(child)

        node.text = xhtml

    return _xml_tree_to_string(tree)
def format_xml(xml: str) -> str:
    """
    Pretty-print well-formed XML.

    INPUTS
    xml: A string of well-formed XML.

    OUTPUTS
    A string of pretty-printed XML.

    RAISES
    se.InvalidXmlException: If the XML can't be parsed. The underlying error is chained as the cause.
    """

    try:
        tree = _format_xml_str(xml)
    except Exception as ex:
        # Chain explicitly so the traceback shows the parse failure as the direct cause
        raise se.InvalidXmlException(f"Couldn’t parse XML file. Exception: {ex}") from ex

    return _xml_tree_to_string(tree)
def __init__(self, epub_root_directory: Union[str, Path]):
    """
    Locate the key files of an epub source tree and load its metadata.

    INPUTS
    epub_root_directory: The directory containing the epub source. It may hold the
    epub files directly, or hold them under a ./src subdirectory.

    RAISES
    se.InvalidSeEbookException: If the target is not a directory, has no usable
    container.xml or metadata file, or has no table of contents.
    se.InvalidXmlException: If the metadata file can't be parsed.
    """

    # Until the path has been resolved, the raw argument is the only thing we can
    # report; self.path may not have been assigned yet if resolving itself fails.
    reported_path = epub_root_directory

    try:
        self.path = Path(epub_root_directory).resolve()
        reported_path = self.path

        if not self.path.is_dir():
            raise NotADirectoryError(str(self.path))
    except Exception as ex:
        raise se.InvalidSeEbookException(f"Not a directory: [path][link=file://{reported_path}]{reported_path}[/][/].") from ex

    # Decide if this is an SE epub, or a white-label epub
    # SE epubs have a ./src dir and the identifier looks like an SE identifier
    if (self.path / "src" / "META-INF" / "container.xml").is_file():
        self.epub_root_path = self.path / "src"
    else:
        self.epub_root_path = self.path
        # NOTE(review): the flag is only ever cleared in this method, which presumably
        # relies on a class-level default of True for is_se_ebook - confirm.
        self.is_se_ebook = False

    # container.xml points at the package metadata file
    try:
        container_tree = self.get_dom(self.epub_root_path / "META-INF" / "container.xml")

        self.metadata_file_path = self.epub_root_path / container_tree.xpath("/container/rootfiles/rootfile[@media-type=\"application/oebps-package+xml\"]/@full-path")[0]
    except Exception as ex:
        raise se.InvalidSeEbookException("Target doesn’t appear to be an epub: no [path]container.xml[/] or no metadata file.") from ex

    # The content lives alongside the metadata file
    self.content_path = self.metadata_file_path.parent

    try:
        self.metadata_dom = self.get_dom(self.metadata_file_path)
    except Exception as ex:
        raise se.InvalidXmlException(f"Couldn’t parse [path][link=file://{self.metadata_file_path}]{self.metadata_file_path}[/][/]. Exception: {ex}") from ex

    # The table of contents is the manifest item flagged with the `nav` property.
    # The second argument presumably asks xpath() for a single string result - confirm.
    toc_href = self.metadata_dom.xpath("/package/manifest/item[contains(@properties, 'nav')]/@href", True)

    if not toc_href:
        raise se.InvalidSeEbookException("Couldn’t find table of contents.")

    self.toc_path = self.content_path / toc_href

    # If our identifier isn't SE-style, we're not an SE ebook
    identifier = self.metadata_dom.xpath("/package/metadata/dc:identifier/text()", True)

    if not identifier or not identifier.startswith("url:https://standardebooks.org/ebooks/"):
        self.is_se_ebook = False
def svg_text_to_paths(in_svg: Path, out_svg: Path, remove_style=True) -> None:
    """
    Convert SVG <text> elements into <path> elements, using SVG
    document's <style> tag and external font files.
    (These SVG font files are built-in to the SE tools).
    Resulting SVG file will have no dependency on external fonts.

    INPUTS
    in_svg: Path for the SVG file to convert <text> elements.
    out_svg: Path for where to write the result SVG file, with <path> elements.
    remove_style: If True, <style> elements are filtered out of the result.

    OUTPUTS
    None.

    RAISES
    se.InvalidXmlException: If the input SVG can't be parsed.
    se.InvalidFileException: If a <text> element has no text content.
    """

    name_list = {"league_spartan": ["league-spartan-bold.svg"], "sorts_mill_goudy": ["sorts-mill-goudy-italic.svg", "sorts-mill-goudy.svg"]}

    # Parse each font while its resource context is still open: the path handed out
    # by the context manager may be a temporary file that goes away on exit.
    fonts = []
    for font_family, font_names in name_list.items():
        for font_name in font_names:
            with importlib_resources.path(f"se.data.fonts.{font_family}", font_name) as font_path:
                fonts.append(_parse_font(font_path))

    # Read as UTF-8 explicitly instead of depending on the platform's default encoding
    with open(in_svg, "rt", encoding="utf-8") as svg_in_raw:
        try:
            xml = etree.fromstring(str.encode(svg_in_raw.read()))
        except Exception as ex:
            raise se.InvalidXmlException(f"Couldn’t parse SVG file: [path][link={in_svg.resolve()}]{in_svg}[/][/].") from ex

    svg_ns = "{http://www.w3.org/2000/svg}"

    # Grab the <style> element before it is possibly filtered out below; its CSS is
    # still needed to compute the properties of each <text> element.
    style = xml.find(svg_ns + "style")

    # Possibly remove style tag if caller wants that
    def filter_predicate(elem: etree.Element):
        if remove_style and elem.tag.endswith("style"):
            return None  # Remove <style> tag
        return elem  # Keep all other elements

    if remove_style:
        xml = _traverse_element(xml, filter_predicate)

    for elem in xml.iter():
        if not elem.tag.endswith("text"):
            continue

        # NOTE(review): `style` is None when the SVG has no <style> element, in which
        # case this line fails on `style.text` - confirm inputs always carry one.
        properties = _apply_css(elem, style.text)
        _get_properties_from_text_elem(properties, elem)
        _add_font_to_properties(properties, fonts)
        text = elem.text

        if not text:
            raise se.InvalidFileException(f"SVG [xml]<text>[/] element has no content. File: [path][link=file://{in_svg.resolve()}]{in_svg}[/].")

        # Replace <text> tag with <g> tag
        elem.tag = "g"

        # Keep just class attribute if class="title-box"; drop every other attribute.
        # Iterate over a snapshot because attributes are deleted inside the loop.
        for attr_name in list(elem.attrib.keys()):
            if attr_name != "class" or elem.attrib["class"] != "title-box":
                del elem.attrib[attr_name]

        elem.attrib["aria-label"] = text
        elem.tail = "\n"
        elem.text = ""
        _add_svg_paths_to_group(elem, properties)

    xmlstr = etree.tostring(xml, pretty_print=True).decode("UTF-8")

    # Strip the generated `ns0` namespace prefix from the serialized output
    result_all_text = xmlstr.replace("ns0:", "").replace(":ns0", "")
    result_all_text = se.formatting.format_xml(result_all_text)

    # Write as UTF-8 explicitly instead of depending on the platform's default encoding
    with open(out_svg, "wt", encoding="utf-8") as output:
        output.write(result_all_text)