def add_landmark(dom: EasyXmlTree, textf: str, landmarks: list) -> None:
    """
    Adds an item to landmark list with appropriate details.

    INPUTS:
    dom: EasyXmlTree representation of the file we are indexing in ToC
    textf: path to the file
    landmarks: the list of landmark items we are building (a TocItem is appended in place)

    OUTPUTS:
    None

    RAISES:
    se.InvalidInputException: if no top-level <section>/<article> or no <body> can be found
    """

    sections = dom.xpath("//body/*[name() = 'section' or name() = 'article']")
    if not sections:
        raise se.InvalidInputException(
            "Couldn’t locate first [xhtml]<section>[/] or [xhtml]<article>[/]."
        )

    bodys = dom.xpath("//body")
    if not bodys:
        raise se.InvalidInputException("Couldn’t locate [xhtml]<body>[/].")

    # Some productions don't have an epub:type in the outermost section, so fall back to
    # the body tag, and finally to an empty string if neither carries one.
    epub_type = sections[0].get_attr("epub:type") or bodys[0].get_attr("epub:type") or ""

    if epub_type in ["frontmatter", "bodymatter", "backmatter"]:
        # If epub_type is ONLY frontmatter, bodymatter or backmatter, we don't want this as a landmark.
        return

    # We may wind up with a (front|body|back)matter semantic in epub_type; remove it here since
    # it is added to the landmark later. The pattern only consumes whitespace *after* the token,
    # so strip the result: otherwise "halftitlepage frontmatter" becomes "halftitlepage " and the
    # equality test below silently fails.
    epub_type = regex.sub(r"(front|body|back)matter\s*", "", epub_type).strip()

    if not epub_type:
        # Nothing meaningful left to describe this file, so it gets no landmark.
        return

    landmark = TocItem()
    landmark.epub_type = epub_type
    landmark.file_link = textf
    landmark.place = get_place(bodys[0])

    if epub_type == "halftitlepage":
        landmark.title = "Half Title"
    else:
        # Use the page title as the landmark entry title.
        landmark.title = dom.xpath("//head/title/text()", True)
        if landmark.title is None:
            # This is a bit desperate, use this only if there's no proper <title> tag in file.
            landmark.title = landmark.epub_type.capitalize()

    landmarks.append(landmark)
def process_headings(dom: EasyXmlTree, textf: str, toc_list: list, nest_under_halftitle: bool, single_file: bool) -> None:
    """
    Find headings in current file and extract title data
    into items added to toc_list.

    INPUTS:
    dom: an EasyXmlTree representation of the current file
    textf: the path to the file
    toc_list: the list of ToC items we are building (items are appended in place)
    nest_under_halftitle: does this item need to be nested?
    single_file: is there only a single content item in the production?

    OUTPUTS:
    None

    RAISES:
    se.InvalidInputException: if the file has no <body>
    """

    body = dom.xpath("//body")
    if not body:
        raise se.InvalidInputException("Couldn't locate body node")
    place = get_place(body[0])

    # Find all the hgroups and h1, h2 etc headings.
    heads = dom.xpath("//hgroup | //h1 | //h2 | //h3 | //h4 | //h5 | //h6")

    # Special treatment where we can't find any header or hgroups.
    if not heads:
        # May be a dedication or an epigraph, with no heading tag.
        if single_file and nest_under_halftitle:
            # There's a halftitle, but only this one content file with no subsections,
            # so leave out of ToC because the ToC will link to the halftitle.
            return

        special_item = TocItem()

        # Need to determine level depth.
        # We don't have a heading, so get first content item.
        content_item = dom.xpath("//p | //header | //img")
        # An empty result is not None, so test truthiness before indexing [0];
        # otherwise a file with none of these elements raises IndexError.
        if content_item:
            parents = content_item[0].xpath("./ancestor::*[name() = 'section' or name() = 'article']")
            special_item.level = len(parents)
            if special_item.level == 0:
                special_item.level = 1
        if nest_under_halftitle:
            special_item.level += 1

        # Use the page title as the ToC entry title.
        special_item.title = dom.xpath("//head/title/text()", True)
        if special_item.title is None:
            special_item.title = "NO TITLE"
        special_item.file_link = textf
        toc_list.append(special_item)
        return

    # If it's not a bodymatter item we don't care about whether it's single_file.
    heading_single_file = single_file if place == Position.BODY else False
    only_one_head = len(heads) == 1
    is_toplevel = True

    for heading in heads:
        # Don't process a heading separately if it's within a hgroup.
        if heading.parent.tag == "hgroup":
            continue

        toc_item = process_a_heading(heading, textf, is_toplevel, heading_single_file)

        # Tricky check to see if we want to include the item because there's a halftitle
        # but only a single content file with no subsidiary sections.
        if is_toplevel and single_file and nest_under_halftitle and only_one_head:
            continue

        if nest_under_halftitle:
            toc_item.level += 1
        is_toplevel = False
        toc_list.append(toc_item)
def process_headings(dom: EasyXmlTree, textf: str, toc_list: list, nest_under_halftitle: bool, single_file: bool) -> None:
    """
    Find headings in current file and extract title data
    into items added to toc_list.

    INPUTS:
    dom: an EasyXmlTree representation of the current file
    textf: the path to the file
    toc_list: the list of ToC items we are building (items are appended in place)
    nest_under_halftitle: does this item need to be nested?
    single_file: is there only a single content item in the production?

    OUTPUTS:
    None

    RAISES:
    se.InvalidInputException: if the file has no <body>
    """

    body = dom.xpath("//body")
    if not body:
        raise se.InvalidInputException("Couldn’t locate [xhtml]<body>[/].")
    place = get_place(body[0])

    # Find all the hgroups and h1, h2 etc headings.
    heads = dom.xpath("//hgroup | //h1 | //h2 | //h3 | //h4 | //h5 | //h6")

    # Special treatment where we can't find any header or hgroups.
    if not heads:
        # May be a dedication or an epigraph, with no heading tag.
        if single_file and nest_under_halftitle:
            # There's a halftitle, but only this one content file with no subsections,
            # so leave out of ToC because the ToC will link to the halftitle.
            return

        special_item = TocItem()

        # Need to determine level depth.
        # We don't have a heading, so get first content item.
        content_item = dom.xpath("//p | //header | //img")
        # An empty result is not None, so test truthiness before indexing [0];
        # otherwise a file with none of these elements raises IndexError.
        if content_item:
            special_item.level = get_level(content_item[0], toc_list)

        # Use the page title as the ToC entry title.
        special_item.title = dom.xpath("//head/title/text()", True)
        if special_item.title is None:
            special_item.title = "NO TITLE"
        special_item.file_link = textf
        special_item.place = place
        toc_list.append(special_item)
        return

    # If it's not a bodymatter item we don't care about whether it's single_file.
    heading_single_file = single_file if place == Position.BODY else False

    # Exception: the titlepage is always titled "Titlepage" in the ToC.
    # The query depends only on the document, so run it once rather than once per heading.
    is_titlepage = bool(dom.xpath("//section[re:test(@epub:type, '\\btitlepage\\b')]"))

    is_toplevel = True
    for heading in heads:
        # Don't process a heading separately if it's within a hgroup.
        if heading.parent.tag == "hgroup":
            continue

        toc_item = process_a_heading(heading, textf, is_toplevel, heading_single_file)
        toc_item.level = get_level(heading, toc_list)
        toc_item.place = place

        if is_titlepage:
            toc_item.title = "Titlepage"

        is_toplevel = False
        toc_list.append(toc_item)
def add_landmark(dom: EasyXmlTree, textf: str, landmarks: list) -> None:
    """
    Adds an item to landmark list with appropriate details.

    INPUTS:
    dom: EasyXmlTree representation of the file we are indexing in ToC
    textf: path to the file
    landmarks: the list of landmark items we are building (a TocItem is appended in place)

    OUTPUTS:
    None

    RAISES:
    se.InvalidInputException: if no top-level <section>/<article>/<nav> or no <body> can be found
    """

    # According to the IDPF a11y best practices page: <http://idpf.org/epub/a11y/techniques/#sem-003>:
    # > it is recommended to include a link to the start of the body matter as well as to any major
    # > reference sections (e.g., table of contents, endnotes, bibliography, glossary, index).
    #
    # So, we only want the start of the text, and (endnotes,glossary,bibliography,loi) in the landmarks.

    sections = dom.xpath("//body/*[name() = 'section' or name() = 'article' or name() = 'nav']")
    if not sections:
        raise se.InvalidInputException(
            "Couldn’t locate first [xhtml]<section>[/], [xhtml]<article>[/], or [xhtml]<nav>[/]."
        )

    bodys = dom.xpath("//body")
    if not bodys:
        raise se.InvalidInputException("Couldn’t locate [xhtml]<body>[/].")

    # Some productions don't have an epub:type in the outermost section, so fall back to
    # the body tag, and finally to an empty string if neither carries one.
    epub_type = sections[0].get_attr("epub:type") or bodys[0].get_attr("epub:type") or ""

    if epub_type in ["frontmatter", "bodymatter", "backmatter"]:
        # If epub_type is ONLY frontmatter, bodymatter or backmatter, we don't want this as a landmark.
        return

    if dom.xpath("//*[contains(@epub:type, 'frontmatter')]"):
        # We don't want frontmatter in the landmarks.
        return

    # Only a truth value is needed here, so search() is enough; no need to collect every match.
    if dom.xpath("//*[contains(@epub:type, 'backmatter')]") and not regex.search(
            r"\b(loi|endnotes|bibliography|glossary|index)\b", epub_type):
        # We only want certain backmatter in the landmarks.
        return

    # We may wind up with a (front|body|back)matter semantic in epub_type; remove it here since
    # it is added to the landmark later. The pattern only consumes whitespace *after* the token,
    # so strip the result: otherwise "endnotes backmatter" becomes "endnotes " and the stored
    # type (and the equality tests below) carry a stray trailing space.
    epub_type = regex.sub(r"(front|body|back)matter\s*", "", epub_type).strip()

    if not epub_type:
        # Nothing meaningful left to describe this file, so it gets no landmark.
        return

    landmark = TocItem()
    landmark.epub_type = epub_type
    landmark.file_link = textf
    landmark.place = get_place(bodys[0])

    if epub_type == "halftitlepage":
        landmark.title = "Half Title"
    elif epub_type == "titlepage":
        # Exception: the titlepage is always titled "Titlepage" in the ToC.
        landmark.title = "Titlepage"
    else:
        # Use the page title as the landmark entry title.
        landmark.title = dom.xpath("//head/title/text()", True)
        if landmark.title is None:
            # This is a bit desperate, use this only if there's no proper <title> tag in file.
            landmark.title = landmark.epub_type.capitalize()

    landmarks.append(landmark)
def process_headings(dom: EasyXmlTree, textf: str, toc_list: list, single_file: bool, single_file_without_headers: bool) -> None:
    """
    Find headings in current file and extract title data
    into items added to toc_list.

    INPUTS:
    dom: an EasyXmlTree representation of the current file
    textf: the path to the file
    toc_list: the list of ToC items we are building (items are appended in place)
    single_file: is there only a single content item in the production?
    single_file_without_headers: when true and this file is the half title page, its ToC entry is titled "Half-Titlepage"

    OUTPUTS:
    None

    RAISES:
    se.InvalidInputException: if the file has no <body>
    """

    body = dom.xpath("//body")
    if not body:
        raise se.InvalidInputException("Couldn’t locate [xhtml]<body>[/].")
    place = get_place(body[0])

    # Find all the hgroups and h1, h2 etc headings.
    heads = dom.xpath("//hgroup | //h1 | //h2 | //h3 | //h4 | //h5 | //h6")

    # Special treatment where we can't find any header or hgroups.
    if not heads:
        # May be a dedication or an epigraph, with no heading tag.
        special_item = TocItem()

        # Need to determine level depth.
        # We don't have a heading, so get first content item.
        content_item = dom.xpath("//p | //header | //img")
        # An empty result is not None, so test truthiness before indexing [0];
        # otherwise a file with none of these elements raises IndexError.
        if content_item:
            special_item.level = get_level(content_item[0], toc_list)

        # Use the page title as the ToC entry title.
        special_item.title = dom.xpath("//head/title/text()", True)
        if special_item.title is None:
            special_item.title = "NO TITLE"
        special_item.file_link = textf
        special_item.place = place
        toc_list.append(special_item)
        return

    # If it's not a bodymatter item we don't care about whether it's single_file.
    heading_single_file = single_file if place == Position.BODY else False

    # Both title overrides depend only on the document, so evaluate them once
    # rather than re-running the xpath queries for every heading.

    # Exception: the titlepage is always titled "Titlepage" in the ToC.
    is_titlepage = bool(dom.xpath("//section[re:test(@epub:type, '\\btitlepage\\b')]"))

    # Exception: If there is only a single body item WITHOUT HEADERS (like Father Goriot or The Path to Rome),
    # the half title page is listed as "Half-Titlepage" instead of the work title,
    # so that we don't duplicate the work title in the ToC. We always include a link to the work body
    # in the ToC because readers on the web version need to have access to the text starting point, since
    # there are no back/forward nav buttons in XHTML files served on the web.
    is_lone_halftitle = bool(
        single_file_without_headers
        and dom.xpath("//section[re:test(@epub:type, '\\bhalftitlepage\\b')]")
    )

    is_toplevel = True
    for heading in heads:
        # Don't process a heading separately if it's within a hgroup.
        if heading.parent.tag == "hgroup":
            continue

        toc_item = process_a_heading(heading, textf, is_toplevel, heading_single_file)
        toc_item.level = get_level(heading, toc_list)
        toc_item.place = place

        if is_titlepage:
            toc_item.title = "Titlepage"
        if is_lone_halftitle:
            toc_item.title = "Half-Titlepage"

        is_toplevel = False
        toc_list.append(toc_item)