Python EasyXmlElementの例、se.easy_xml.EasyXmlElement Pythonの例

コード例 #1

0

ファイルを表示

ファイル: se_epub_generate_toc.py プロジェクト: mihailim/standardebooks-tools

def get_level(node: EasyXmlElement, toc_list: list) -> int:
    """
    Work out the ToC nesting level of a heading node.

    INPUTS:
    node: the heading node being placed in the ToC
    toc_list: the ToC items processed so far; each is read for `id` and `level`

    OUTPUTS:
    the nesting depth of the node within its own file, offset by the level of
    the already-processed item named by the file's `data-parent` attribute
    when such an item can be found
    """

    # Depth inside the current file: one level per enclosing section/article,
    # and never less than 1.
    enclosing = node.xpath(
        "./ancestor::*[name() = 'section' or name() = 'article']")
    depth = len(enclosing) if enclosing else 1

    # A node with no parent must be at the top level.
    if not node.parent:
        return depth

    # Only the first element carrying a data-parent attribute is consulted.
    carriers = node.xpath("//*[@data-parent]")
    parent_id = carriers[0].get_attr("data-parent") if carriers else None
    if not parent_id:
        return depth

    # The parent should already have been processed if the spine is correctly ordered.
    parent_item = next(
        (entry for entry in toc_list if entry.id == parent_id), None)
    if parent_item is None:
        return depth

    parent_level = parent_item.level + 1
    # Subtract one from depth because all headings have depth >= 1.
    return parent_level + depth - 1

コード例 #2

0

ファイルを表示

ファイル: se_epub_generate_toc.py プロジェクト: zoeypeterson/tools

def output_toc(item_list: list, landmark_list: list, toc_path: str, work_type: str,
               work_title: str) -> str:
    """
    Build the XHTML for a new ToC from the lists of items and landmarks found.

    The existing ToC file is read and used as a template; nothing is written
    to disk here, the caller decides what to do with the returned string.

    INPUTS:
    item_list: list of ToC items (the first part of the ToC)
    landmark_list: list of landmark items (the second part of the ToC)
    toc_path: path to the existing ToC file used as the template
    work_type: "fiction" or "non-fiction"
    work_title: the title of the book

    OUTPUTS:
    a html string representing the new ToC

    RAISES:
    se.InvalidInputException: if fewer than two ToC items were supplied, if
    the existing ToC cannot be read or parsed, or if it contains fewer than
    two nav sections
    """

    if len(item_list) < 2:
        raise se.InvalidInputException("Too few ToC items found.")

    try:
        with open(toc_path, encoding="utf8") as file:
            toc_dom = se.easy_xml.EasyXhtmlTree(file.read())
    except Exception as ex:
        # Chain the original error so the real cause (missing file, parse
        # failure, ...) stays visible in the traceback.
        raise se.InvalidInputException(
            f"Existing ToC not found. Exception: {ex}") from ex

    # At least two nav sections are required: the first receives the ToC
    # items and the second receives the landmarks.
    navs = toc_dom.xpath("//nav")

    if len(navs) < 2:
        raise se.InvalidInputException(
            "Existing ToC has too few nav sections.")

    # now remove and then re-add the ol sections to clear them
    for nav in navs:
        ols = nav.xpath("./ol")  # just want the immediate ol children
        for ol_item in ols:
            ol_item.remove()

    # Insert empty <ol> elements holding placeholder text, serialize the tree,
    # then swap each placeholder for the generated markup by string replacement.
    # this is ugly and stupid, but I can't figure out an easier way to do it
    item_ol = EasyXmlElement(etree.Element("ol"), toc_dom.namespaces)
    item_ol.lxml_element.text = "TOC_ITEMS"
    navs[0].append(item_ol)
    landmark_ol = EasyXmlElement(etree.Element("ol"), toc_dom.namespaces)
    landmark_ol.lxml_element.text = "LANDMARK_ITEMS"
    navs[1].append(landmark_ol)
    xhtml = toc_dom.to_string()
    xhtml = xhtml.replace("TOC_ITEMS", process_items(item_list))
    xhtml = xhtml.replace(
        "LANDMARK_ITEMS",
        process_landmarks(landmark_list, work_type, work_title))

    return se.formatting.format_xhtml(xhtml)

コード例 #3

0

ファイルを表示

ファイル: se_epub_generate_toc.py プロジェクト: zoeypeterson/tools

def get_place(node: EasyXmlElement) -> Position:
    """
    Classify where a file sits in the ebook from its `epub:type` attribute.

    INPUTS:
    node: EasyXmlElement representation of the file

    OUTPUTS:
    Position.BACK, Position.FRONT or Position.BODY for the first of
    "backmatter", "frontmatter", "bodymatter" (checked in that order) found
    in the attribute; Position.NONE if the attribute is missing, empty, or
    contains none of them
    """

    epub_type = node.get_attr("epub:type")
    if not epub_type:
        return Position.NONE

    # Order matters: the first keyword found in the attribute wins.
    matter_positions = (
        ("backmatter", Position.BACK),
        ("frontmatter", Position.FRONT),
        ("bodymatter", Position.BODY),
    )
    for keyword, position in matter_positions:
        if keyword in epub_type:
            return position

    return Position.NONE

コード例 #4

0

ファイルを表示

ファイル: se_epub_generate_toc.py プロジェクト: vr8hub/tools

def evaluate_descendants(node: EasyXmlElement, toc_item: TocItem) -> TocItem:
    """
    Burrow down into a hgroup structure to qualify the ToC item.

    Walks the direct h1-h6 children of `node` in order and fills in
    `toc_item` (lang, hidden, roman, title, subtitle, title_is_ordinal) from
    each child's attributes and contents. `toc_item` is modified in place and
    the same object is returned. Processing stops early as soon as both a
    title and a subtitle have been found.

    INPUTS:
    node: EasyXmlElement object, expected to represent a hgroup
    toc_item: the ToC item to qualify; fields already set on it influence
    which fields the children are allowed to fill

    OUTPUTS:
    toc_item: qualified ToC item
    """
    children = node.xpath("./h1 | ./h2 | ./h3 | ./h4 | ./h5 | ./h6")
    for child in children:  # we expect these to be h1, h2, h3, h4 etc
        # the first child that supplies a language wins; later ones don't override it
        if not toc_item.lang:
            toc_item.lang = child.get_attr("xml:lang")
        epub_type = child.get_attr("epub:type")

        # a single hidden child is enough to mark the whole item as hidden
        if child.get_attr("hidden"):
            toc_item.hidden = True

        if not epub_type:
            # should be a label/ordinal grouping
            child_strings = get_child_strings(child)
            if "label" in child_strings and "ordinal" in child_strings:  # quick test
                toc_item.title_is_ordinal = True
                # unwrap the label span, keeping its inner text
                child_strings = regex.sub(
                    r"<span epub:type=\"label\">(.*?)</span>", " \\1 ",
                    child_strings)
                # unwrap the ordinal span if ordinal is by itself in a span
                child_strings = regex.sub(
                    r"<span epub:type=\"ordinal\">(.*?)</span>", " \\1 ",
                    child_strings)
                # remove the word ordinal if it's joined with a roman (which we want to keep)
                child_strings = regex.sub(r"\bordinal\b", "", child_strings)
                # remove extra spaces
                child_strings = regex.sub(r"[ ]{2,}", " ", child_strings)
                # get rid of any endnotes
                child_strings = strip_notes(child_strings)
                toc_item.title = child_strings.strip()
            continue  # a child without epub:type never reaches the checks below
        if "z3998:roman" in epub_type:
            toc_item.roman = extract_strings(child)
            # the roman numeral only becomes the title if no title has been set yet
            if not toc_item.title:
                toc_item.title = f"<span epub:type=\"z3998:roman\">{toc_item.roman}</span>"
        elif "ordinal" in epub_type:  # but not a roman numeral or a labelled item, cases caught above
            if not toc_item.title:
                toc_item.title = extract_strings(child)
                toc_item.title_is_ordinal = True
        # this check runs in addition to the roman/ordinal handling above, not instead of it
        if "subtitle" in epub_type:
            toc_item.subtitle = extract_strings(child)
        else:
            if "title" in epub_type:  # this allows for `fulltitle` to work here, too
                if toc_item.title or toc_item.roman or toc_item.title_is_ordinal:  # if title already filled, must be a subtitle
                    toc_item.subtitle = extract_strings(child)
                else:
                    toc_item.title = extract_strings(child)
        if toc_item.title and toc_item.subtitle:  # then we're done
            return toc_item
    return toc_item

コード例 #5

0

ファイルを表示

ファイル: se_epub_generate_toc.py プロジェクト: zoeypeterson/tools

def get_child_strings(node: EasyXmlElement) -> str:
    """
    Serialize the child elements of a node into one string.

    INPUTS:
    node: the node whose child elements are to be serialized

    OUTPUTS:
    the `to_string()` output of every child element in order, each followed
    by a newline; an empty string if there are no child elements
    """

    return "".join(element.to_string() + "\n" for element in node.xpath("*"))

コード例 #6

0

ファイルを表示

ファイル: se_epub_generate_toc.py プロジェクト: zoeypeterson/tools

def get_book_division(node: EasyXmlElement) -> BookDivision:
    """
    Determine the kind of book division a node belongs to, based on its last
    matching section/article ancestor (or body, if it has neither).

    At present only Part and Division are important; but others stored for
    possible future logic.

    When several keywords are present in the ancestor's `epub:type`, the
    later checks override the earlier ones, giving the precedence
    part < division < volume < subchapter < chapter < article.

    INPUTS:
    node: an EasyXml node representing a tag

    OUTPUTS:
    a BookDivision enum value representing the kind of division;
    BookDivision.NONE if the ancestor has no `epub:type` or none of the
    known keywords apply

    RAISES:
    se.InvalidInputException: if the node has no section, article or body
    ancestor
    """

    parent_sections = node.xpath(
        "./ancestor::*[name() = 'section' or name() = 'article']")

    if not parent_sections:
        parent_sections = node.xpath("./ancestor::body")

    if not parent_sections:  # couldn't find a parent, so throw an error
        raise se.InvalidInputException(
            "Couldn't find a parent section, article, or body for the node.")

    # The last result is used; presumably results are in document order so
    # this is the nearest ancestor, confirm against EasyXmlElement.xpath.
    closest = parent_sections[-1]
    section_epub_type = closest.get_attr("epub:type")
    retval = BookDivision.NONE
    # Note: an ancestor without epub:type yields NONE regardless of its tag.
    if not section_epub_type:
        return retval

    if "part" in section_epub_type:
        retval = BookDivision.PART
    if "division" in section_epub_type:
        retval = BookDivision.DIVISION
    if ("volume" in section_epub_type) and ("se:short-story"
                                            not in section_epub_type):
        retval = BookDivision.VOLUME
    if "subchapter" in section_epub_type:
        retval = BookDivision.SUBCHAPTER
    # "chapter" is a substring of "subchapter", so a plain substring test
    # would always overwrite SUBCHAPTER with CHAPTER. Only count occurrences
    # of "chapter" that are not part of "subchapter".
    if "chapter" in section_epub_type.replace("subchapter", ""):
        retval = BookDivision.CHAPTER
    if "article" in closest.tag:
        retval = BookDivision.ARTICLE

    return retval

コード例 #7

0

ファイルを表示

ファイル: se_epub_generate_toc.py プロジェクト: zoeypeterson/tools

def extract_strings(node: EasyXmlElement) -> str:
    """
    Return the inner markup of a tag as a single line, with notes stripped.

    INPUTS:
    node: a tag as xpath node

    OUTPUTS:
    the node's inner XML after `strip_notes`, with every newline and tab
    character removed
    """

    without_notes = strip_notes(node.inner_xml())
    return regex.sub(r"[\n\t]", "", without_notes)

コード例 #8

0

ファイルを表示

ファイル: se_epub_generate_toc.py プロジェクト: zoeypeterson/tools

def get_parent_id(hchild: EasyXmlElement) -> str:
    """
    Look up the id of the closest enclosing <section> or <article> that has one.

    INPUTS:
    hchild: a heading tag for which we want to find the parent id

    OUTPUTS:
    the id of the parent section, or an empty string when no enclosing
    section/article carries an id
    """

    # [position() = 1] on the ancestor axis selects the nearest match.
    nearest_with_id = hchild.xpath(
        "./ancestor::*[name() = 'section' or name() = 'article'][@id][position() = 1]"
    )

    return nearest_with_id[0].get_attr("id") if nearest_with_id else ""

コード例 #9

0

ファイルを表示

ファイル: se_epub_generate_toc.py プロジェクト: zoeypeterson/tools

def process_a_heading(node: EasyXmlElement, textf: str, is_toplevel: bool,
                      single_file: bool) -> TocItem:
    """
    Build one TocItem describing the given heading.

    INPUTS:
    node: an EasyXml node representing a heading
    textf: the path to the file
    is_toplevel: is this heading at the top-most level in the file?
    single_file: is there only one content file in the production (like some Poetry volumes)?

    OUTPUTS:
    a qualified ToCItem object

    RAISES:
    se.InvalidInputException: if an h1-h6 heading without `epub:type` has no parent
    """

    toc_item = TocItem()

    # Level is the number of enclosing sections/articles, with a floor of 1.
    enclosing = node.xpath(
        "./ancestor::*[name() = 'section' or name() = 'article']")
    toc_item.level = len(enclosing) if enclosing else 1

    toc_item.division = get_book_division(node)

    # The link only gets an anchor when a parent id exists and the heading is
    # either not the first in its file, or the production is a single file
    # (e.g. poems within a single-file volume). We don't generally want an
    # anchor on the first heading of a file.
    toc_item.id = get_parent_id(node)  # pylint: disable=invalid-name
    wants_anchor = toc_item.id != "" and (not is_toplevel or single_file)
    toc_item.file_link = f"{textf}#{toc_item.id}" if wants_anchor else textf

    toc_item.lang = node.get_attr("xml:lang")

    epub_type = node.get_attr("epub:type")

    if not epub_type:
        # It may be an empty header tag eg <h3>, so its parent is evaluated
        # instead of the tag itself.
        if node.tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
            parent = node.parent
            if not parent:  # shouldn't ever happen, but... just in case, raise an error
                raise se.InvalidInputException(
                    f"Header without parent in file: [path][link=file://{textf}]{textf}[/][/]."
                )
            evaluate_descendants(parent, toc_item)
            return toc_item
    elif "z3998:roman" in epub_type:
        # A heading may include z3998:roman directly,
        # eg <h5 epub:type="title z3998:roman">II</h5>.
        toc_item.roman = extract_strings(node)
        toc_item.title = f"<span epub:type=\"z3998:roman\">{toc_item.roman}</span>"
        return toc_item
    elif "ordinal" in epub_type:
        # An ordinal that is not a roman numeral (eg in Nietzsche's Beyond Good and Evil).
        toc_item.title = extract_strings(node)
        toc_item.title_is_ordinal = True
        return toc_item
    elif ("fulltitle" in epub_type) and (node.tag == "hgroup"):
        # May be the halftitle page with a subtitle, so we need to burrow down.
        evaluate_descendants(node, toc_item)
        return toc_item
    elif "title" in epub_type:
        # A straightforward one-level title eg: <h2 epub:type="title">Imprint</h2>
        toc_item.title = extract_strings(node)
        return toc_item

    # Anything else: burrow down into the node's own structure to get the info.
    evaluate_descendants(node, toc_item)

    return toc_item