def get_level(node: EasyXmlElement, toc_list: list) -> int:
    """
    Work out the ToC level of a heading node.

    INPUTS:
    node: the heading element being examined
    toc_list: ToC items already processed; searched for an item whose id equals the data-parent attribute found in the document

    OUTPUTS:
    the level of the heading as an int
    """

    # Depth of this heading within its own file: one per enclosing section/article, minimum 1.
    enclosing = node.xpath("./ancestor::*[name() = 'section' or name() = 'article']")
    depth = len(enclosing) if enclosing else 1

    # No parent element: must be at the top level.
    if not node.parent:
        return depth

    # Take the first element carrying a data-parent attribute, if there is one.
    flagged = node.xpath("//*[@data-parent]")
    data_parent = flagged[0].get_attr("data-parent") if flagged else None
    if not data_parent:
        return depth

    # The parent should already have been processed if the spine is correctly ordered.
    candidates = [entry for entry in toc_list if entry.id == data_parent]
    if not candidates:
        return depth

    level_below_parent = candidates[0].level + 1
    # Subtract one from depth because all headings should have depth >= 1.
    return level_below_parent + depth - 1
def output_toc(item_list: list, landmark_list, toc_path: str, work_type: str, work_title: str) -> str:
    """
    Build the XHTML of a new ToC from the lists of items and landmarks found,
    using the existing ToC file as the template.

    The existing file is only read here; the new markup is returned to the caller.

    INPUTS:
    item_list: list of ToC items (the first part of the ToC)
    landmark_list: list of landmark items (the second part of the ToC)
    toc_path: path to the existing ToC file
    work_type: "fiction" or "non-fiction"
    work_title: the title of the book

    OUTPUTS:
    a html string representing the new ToC

    RAISES:
    se.InvalidInputException: if there are fewer than two ToC items, if the existing
    ToC can't be read or parsed, or if it has fewer than two nav sections
    """

    if len(item_list) < 2:
        raise se.InvalidInputException("Too few ToC items found.")

    try:
        with open(toc_path, encoding="utf8") as file:
            toc_dom = se.easy_xml.EasyXhtmlTree(file.read())
    except Exception as ex:
        # Chain the original exception so the underlying cause stays visible in the traceback.
        raise se.InvalidInputException(f"Existing ToC not found. Exception: {ex}") from ex

    # There should be exactly two nav sections.
    navs = toc_dom.xpath("//nav")

    if len(navs) < 2:
        raise se.InvalidInputException("Existing ToC has too few nav sections.")

    # Now remove and then re-add the ol sections to clear them.
    for nav in navs:
        ols = nav.xpath("./ol")  # just want the immediate ol children
        for ol_item in ols:
            ol_item.remove()

    # This is ugly, but there's no obviously easier way: put a placeholder string inside
    # a fresh <ol> in each nav, serialise the tree, then swap the placeholders for the
    # generated markup.
    item_ol = EasyXmlElement(etree.Element("ol"), toc_dom.namespaces)
    item_ol.lxml_element.text = "TOC_ITEMS"
    navs[0].append(item_ol)
    landmark_ol = EasyXmlElement(etree.Element("ol"), toc_dom.namespaces)
    landmark_ol.lxml_element.text = "LANDMARK_ITEMS"
    navs[1].append(landmark_ol)
    xhtml = toc_dom.to_string()
    xhtml = xhtml.replace("TOC_ITEMS", process_items(item_list))
    xhtml = xhtml.replace("LANDMARK_ITEMS", process_landmarks(landmark_list, work_type, work_title))

    return se.formatting.format_xhtml(xhtml)
def get_place(node: EasyXmlElement) -> Position:
    """
    Classify where a file sits in the ebook from its epub:type attribute.

    INPUTS:
    node: EasyXmlElement representation of the file

    OUTPUTS:
    Position.BACK, Position.FRONT or Position.BODY when the epub:type contains
    "backmatter", "frontmatter" or "bodymatter" respectively; Position.NONE otherwise
    """

    epub_type = node.get_attr("epub:type")
    if epub_type:
        # Checked in this order; the first keyword found wins.
        keyword_places = (
            ("backmatter", Position.BACK),
            ("frontmatter", Position.FRONT),
            ("bodymatter", Position.BODY),
        )
        for keyword, place in keyword_places:
            if keyword in epub_type:
                return place

    return Position.NONE
def evaluate_descendants(node: EasyXmlElement, toc_item: TocItem) -> TocItem:
    """
    Walk the direct h1-h6 children of a node (eg an hgroup) and fill in the ToC item from them.

    INPUTS:
    node: EasyXmlElement object representing a hgroup
    toc_item: the ToC item to qualify; it is modified in place

    OUTPUTS:
    toc_item: the same ToC item, qualified
    """

    # Applied in order to the markup of a label/ordinal grouping.
    label_ordinal_rewrites = (
        # strip label
        (r"<span epub:type=\"label\">(.*?)</span>", " \\1 "),
        # remove ordinal if it's by itself in a span
        (r"<span epub:type=\"ordinal\">(.*?)</span>", " \\1 "),
        # remove ordinal if it's joined with a roman (which we want to keep)
        (r"\bordinal\b", ""),
        # remove extra spaces
        (r"[ ]{2,}", " "),
    )

    # we expect these to be h1, h2, h3, h4 etc
    for heading in node.xpath("./h1 | ./h2 | ./h3 | ./h4 | ./h5 | ./h6"):
        if not toc_item.lang:
            toc_item.lang = heading.get_attr("xml:lang")
        heading_type = heading.get_attr("epub:type")

        if heading.get_attr("hidden"):
            toc_item.hidden = True

        if not heading_type:
            # No epub:type on the heading itself: should be a label/ordinal grouping
            markup = get_child_strings(heading)
            if "label" in markup and "ordinal" in markup:  # quick test
                toc_item.title_is_ordinal = True
                for pattern, replacement in label_ordinal_rewrites:
                    markup = regex.sub(pattern, replacement, markup)
                # get rid of any endnotes
                markup = strip_notes(markup)
                toc_item.title = markup.strip()
            continue  # nothing below applies without an epub:type

        if "z3998:roman" in heading_type:
            toc_item.roman = extract_strings(heading)
            if not toc_item.title:
                toc_item.title = f"<span epub:type=\"z3998:roman\">{toc_item.roman}</span>"
        elif "ordinal" in heading_type and not toc_item.title:
            # an ordinal, but not a roman numeral or a labelled item (those cases are caught above)
            toc_item.title = extract_strings(heading)
            toc_item.title_is_ordinal = True

        # if the title is already filled, a further title must really be a subtitle
        title_already_known = toc_item.title or toc_item.roman or toc_item.title_is_ordinal
        if "subtitle" in heading_type or ("title" in heading_type and title_already_known):
            toc_item.subtitle = extract_strings(heading)
        elif "title" in heading_type:  # this allows for `fulltitle` to work here, too
            toc_item.title = extract_strings(heading)

        if toc_item.title and toc_item.subtitle:  # then we're done
            break

    return toc_item
def get_child_strings(node: EasyXmlElement) -> str:
    """
    Serialise the child elements of a node into one string.

    INPUTS:
    node: the element whose child elements are wanted

    OUTPUTS:
    the to_string() output of each child element, each followed by a newline;
    an empty string if there are no child elements
    """

    return "".join(child.to_string() + "\n" for child in node.xpath("*"))
def get_book_division(node: EasyXmlElement) -> BookDivision:
    """
    Determine the kind of book division.
    At present only Part and Division are important; but others stored for possible future logic.

    INPUTS:
    node: an EasyXml node representing a tag

    OUTPUTS:
    a BookDivision enum value representing the kind of division

    RAISES:
    se.InvalidInputException: if the node has no section, article or body ancestor
    """

    containers = node.xpath("./ancestor::*[name() = 'section' or name() = 'article']")
    if not containers:
        containers = node.xpath("./ancestor::body")
    if not containers:
        # couldn't find a parent, so throw an error
        raise se.InvalidInputException("Couldn't find a parent section, article, or body element.")

    # The last match is used (presumably the innermost ancestor, if results are in document order; verify).
    container = containers[-1]
    section_epub_type = container.get_attr("epub:type")
    if not section_epub_type:
        return BookDivision.NONE

    # These are substring tests and a later match overrides an earlier one.
    retval = BookDivision.NONE
    if "part" in section_epub_type:
        retval = BookDivision.PART
    if "division" in section_epub_type:
        retval = BookDivision.DIVISION
    if ("volume" in section_epub_type) and ("se:short-story" not in section_epub_type):
        retval = BookDivision.VOLUME
    if "chapter" in section_epub_type:
        retval = BookDivision.CHAPTER
    # "subchapter" contains "chapter", so it has to be tested after it; otherwise
    # the chapter test would always override it and SUBCHAPTER could never be returned.
    if "subchapter" in section_epub_type:
        retval = BookDivision.SUBCHAPTER
    if "article" in container.tag:
        retval = BookDivision.ARTICLE

    return retval
def extract_strings(node: EasyXmlElement) -> str:
    """
    Return the inner XML of a tag with notes stripped and newlines and tabs removed.

    INPUTS:
    node: a tag as xpath node

    OUTPUTS:
    the inner XML of the tag, passed through strip_notes() and with every
    newline and tab character deleted
    """

    without_notes = strip_notes(node.inner_xml())

    return regex.sub(r"[\n\t]", "", without_notes)
def get_parent_id(hchild: EasyXmlElement) -> str:
    """
    Find the id of the closest enclosing <section> or <article> that has an id.

    INPUTS:
    hchild: a heading tag for which we want to find the parent id

    OUTPUTS:
    the id of that ancestor, or an empty string if no such ancestor exists
    """

    # position() = 1 gets the nearest ancestor
    nearest = hchild.xpath(
        "./ancestor::*[name() = 'section' or name() = 'article'][@id][position() = 1]"
    )

    return nearest[0].get_attr("id") if nearest else ""
def process_a_heading(node: EasyXmlElement, textf: str, is_toplevel: bool, single_file: bool) -> TocItem:
    """
    Generate and return a single TocItem from this heading.

    INPUTS:
    node: an EasyXml node representing a heading
    textf: the path to the file
    is_toplevel: is this heading at the top-most level in the file?
    single_file: is there only one content file in the production (like some Poetry volumes)?

    OUTPUTS:
    a qualified ToCItem object

    RAISES:
    se.InvalidInputException: if an h1-h6 heading without an epub:type has no parent
    """

    toc_item = TocItem()

    enclosing = node.xpath("./ancestor::*[name() = 'section' or name() = 'article']")
    toc_item.level = len(enclosing) if enclosing else 1

    toc_item.division = get_book_division(node)

    # is_toplevel stops the first heading in a file getting an anchor id; we don't generally want that.
    # The exceptions are things like poems within a single-file volume, where even the
    # first heading links to its anchor.
    toc_item.id = get_parent_id(node)  # pylint: disable=invalid-name
    if toc_item.id != "" and (not is_toplevel or single_file):
        toc_item.file_link = f"{textf}#{toc_item.id}"
    else:
        toc_item.file_link = textf

    toc_item.lang = node.get_attr("xml:lang")

    epub_type = node.get_attr("epub:type")

    # It may be an empty header tag eg <h3>, so we pass its parent rather than itself
    # to evaluate the parent's descendants.
    if not epub_type and node.tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
        parent = node.parent
        if not parent:
            # shouldn't ever happen, but... just in case, raise an error
            raise se.InvalidInputException(
                f"Header without parent in file: [path][link=file://{textf}]{textf}[/][/]."
            )
        evaluate_descendants(parent, toc_item)
        return toc_item

    if epub_type:
        # A heading may include z3998:roman directly,
        # eg <h5 epub:type="title z3998:roman">II</h5>.
        if "z3998:roman" in epub_type:
            toc_item.roman = extract_strings(node)
            toc_item.title = f"<span epub:type=\"z3998:roman\">{toc_item.roman}</span>"
            return toc_item

        # An ordinal, but not a roman numeral (eg in Nietzche's Beyond Good and Evil).
        if "ordinal" in epub_type:
            toc_item.title = extract_strings(node)
            toc_item.title_is_ordinal = True
            return toc_item

        # An hgroup marked fulltitle may be the halftitle page with a subtitle, so it
        # needs burrowing into below rather than being read as a one-level title.
        is_fulltitle_hgroup = ("fulltitle" in epub_type) and (node.tag == "hgroup")

        # A straightforward one-level title eg: <h2 epub:type="title">Imprint</h2>
        if "title" in epub_type and not is_fulltitle_hgroup:
            toc_item.title = extract_strings(node)
            return toc_item

    # otherwise, burrow down into its structure to get the info
    evaluate_descendants(node, toc_item)

    return toc_item