Exemple #1
0
    def transform_char(self, element: Element, /):
        """Replace ``element`` with a ``<span class="character">`` element.

        The span is created through ``element.makeelement``, inherits the tail
        of ``element`` and takes its place in the parent. The text of
        ``element`` is then matched against a placeholder pattern (optional
        ``U+`` prefix with a code point or ``XXXX``, optional ``[X]`` glyph
        placeholder, optional character name).

        NOTE(review): this definition appears to be cut off in the visible
        source -- ``cps`` is collected but not used in the lines shown.
        """

        # Every top-level group of the pattern is optional, so it can also
        # match the empty string. The pattern is compiled on each call.
        char_placeholder_pattern = re.compile(
            r"""
                (
                    (?P<u_plus> U\+ )?  # optional U+ prefix
                    (
                        (?P<code_point_placeholder> X{4} )  # XXXX as code point placeholder
                        | (?P<code_point> 1?[0-9A-F]?[0-9A-F]{4} )  # or actual code point
                    )
                )?
                (
                    \s
                    (?P<glyph_placeholder> \[X\] )  # [X] as glyph placeholder
                )?
                (
                    \s
                    (?P<name> [A-Z0-9 -]+ )  # actual name
                )?
            """,
            flags=re.VERBOSE,
        )

        # New span that will stand in for ``element``; it takes over the tail
        # so that the text following the element is kept.
        transformed: Element = element.makeelement("span", {"class": "character"}, None)
        transformed.tail = element.tail  # type: ignore

        # NOTE(review): ``.text`` may be None for an element without text, in
        # which case ``fullmatch`` below would raise TypeError -- confirm that
        # callers guarantee text content.
        text: str = element.text
        # Swap the span into the tree at the position of ``element``.
        # NOTE(review): assumes ``element`` has a parent.
        element.getparent().replace(element, transformed)

        if match := char_placeholder_pattern.fullmatch(text):

            # Code points gathered for this character; a given name is
            # resolved to its character through ``unicodedata2.lookup``.
            cps = set[int]()
            if name := match.group("name"):
                cps.add(ord(unicodedata2.lookup(name)))
Exemple #2
0
def remove_element(element: etree._Element, keep_children=False) -> None:
    """Detach ``element`` from its parent.

    With ``keep_children`` set to ``True`` every child is first moved in
    front of ``element`` (as a preceding sibling) and therefore stays in the
    tree. Otherwise the children are removed together with ``element``.
    """
    if keep_children:
        # Work on a snapshot of the children, since each one is relocated
        # out of ``element`` while we walk over them.
        surviving_children = list(element)
        for surviving_child in surviving_children:
            element.addprevious(surviving_child)

    owner = element.getparent()
    owner.remove(element)
Exemple #3
0
def _clear_context(elem: etree._Element) -> None:
    """Clear ``elem`` and delete the leading children of its parent.

    After ``elem.clear()``, the first child of the parent is deleted
    repeatedly until ``elem`` has no previous sibling left.
    """
    elem.clear()

    preceding = elem.getprevious()
    while preceding is not None:
        container = elem.getparent()
        del container[0]
        preceding = elem.getprevious()
Exemple #4
0
def get_item_details(section_number: str, department_name: str,
                     node: et._Element) -> Dict:
    """Extract a BOE diary entry's details.

    Args:
        section_number: Value stored under the ``'section'`` key.
        department_name: Value stored under the ``'department'`` key.
        node: Element describing the entry; it is searched for the title
            and the PDF/XML/HTM URL nodes, and its parent is inspected for
            an epigraph name.

    Returns:
        A dict with the keys ``id``, ``epigraph``, ``section``,
        ``department``, ``title``, ``pdf_url``, ``xml_url`` and ``htm_url``.
        ``epigraph`` is ``''`` unless the parent tag is ``epigrafe``
        (compared case-insensitively).
    """
    search = helpers.use_tree_for_search(node)

    def first_text(xpath):
        # Text of the first node matched by ``xpath``.
        return search(xpath)[0].text

    title = first_text(boe.SummaryXpath.item_title)
    pdf_url = first_text(boe.SummaryXpath.item_pdf_url)
    xml_url = first_text(boe.SummaryXpath.item_xml_url)
    htm_url = first_text(boe.SummaryXpath.item_htm_url)

    container = node.getparent()
    if container.tag.lower() == 'epigrafe':
        epigraph = container.get(boe.SummaryAttribute.epigraph_name)
    else:
        epigraph = ''

    return {
        'id': node.get(boe.SummaryAttribute.item_id),
        'epigraph': epigraph,
        'section': section_number,
        'department': department_name,
        'title': title,
        'pdf_url': pdf_url,
        'xml_url': xml_url,
        'htm_url': htm_url,
    }
    def _handle_text(cls, node: etree._Element, do_handle_tail_instead=False):
        """Turn the text (or tail) of ``node`` into word nodes.

        The string is passed to ``cls._str_2_word_nodes`` and the original
        text/tail is replaced by ``''``. For text, the resulting nodes are
        inserted at the beginning of ``node``; for tail, they are inserted
        into the parent of ``node`` directly after ``node``. Nothing happens
        when the string is missing or whitespace-only.

        Note that text is stripped before being split, whereas tail is
        passed on unstripped.
        """
        raw = node.tail if do_handle_tail_instead else node.text
        if not raw or not raw.strip():
            return

        if do_handle_tail_instead:
            text, node.tail = raw, ''
            insert_node = node.getparent()
            insert_start = insert_node.index(node) + 1
        else:
            text, node.text = raw.strip(), ''
            insert_node, insert_start = node, 0

        word_nodes = cls._str_2_word_nodes(text)

        # Each word node records the classes of the node it is inserted
        # into; that is used later in postproc.
        inherited_classes = insert_node.attrib.get('class', '')
        for word_node in word_nodes:
            word_node.attrib[cls.PARENT_CLASS_ATTRIB_NAME] = inherited_classes

        # Attach the word nodes in order, starting at the computed offset.
        for position, word_node in enumerate(word_nodes, start=insert_start):
            insert_node.insert(position, word_node)

        return
Exemple #6
0
def get_parent(xml_obj: _Element):
    """Look up the parent of an xml element.

    Args:
        xml_obj (Element): The xml element whose parent is requested
    Returns:
         Whatever ``xml_obj.getparent()`` yields
    """
    parent = xml_obj.getparent()
    return parent
Exemple #7
0
def remove_one_element(element: _Element) -> None:
    """
    Detach the given element from its parent; do nothing if it has none.

    element -- element to be removed
    """
    owner = element.getparent()
    if owner is None:
        return
    owner.remove(element)
Exemple #8
0
def remove_preserving_whitespace(element: Element) -> None:
    """Remove ``element`` from its parent while keeping its tail text.

    A non-empty tail is appended to the tail of the previous sibling or,
    when there is no previous sibling, to the text of the parent.
    """
    parent = element.getparent()
    trailing = element.tail
    if trailing:
        sibling = element.getprevious()
        if sibling is None:
            parent.text = (parent.text or "") + trailing
        else:
            sibling.tail = (sibling.tail or "") + trailing
    parent.remove(element)
Exemple #9
0
    def transform_element(self, element: Element, /):
        """Dispatch on ``element.tag`` and transform ``element`` accordingly.

        NOTE(review): only the first ``case`` branches are visible here; the
        ``match`` statement may continue beyond the lines shown.
        """

        match element.tag:

            # ``char`` elements are handled entirely by ``transform_char``.
            case "char":
                self.transform_char(element)

            # ``h1`` / ``figcaption`` / ``a`` elements, but only when
            # "numbering" is among ``element.keys()``; the guard also binds
            # ``mode`` to the string "numbering".
            case ("h1" | "figcaption" | "a") as tag if (mode := "numbering") in element.keys():

                self.expand_placeholder_in_element(element, mode)

                if tag == "a":

                    # Wrap the link in a new ``cite`` element: the wrapper
                    # takes over the link's tail, replaces the link in its
                    # parent, and then adopts the link as its child.
                    wrap = element.makeelement("cite", {}, None)
                    wrap.tail, element.tail = element.tail, None  # type: ignore

                    element.getparent().replace(element, wrap)
                    wrap.append(element)
Exemple #10
0
def get_path_to_root(e: etree._Element,
                     preserve_ns: bool = False) -> List[AnyStr]:
    """Return the tags of all ancestors of ``e``, outermost ancestor first.

    ``e`` itself is not included. Unless ``preserve_ns`` is true, a leading
    ``{...}`` part is removed from each tag.
    """
    tags_upward = []
    ancestor = e.getparent()
    while ancestor is not None:
        tag = ancestor.tag.strip()
        if not preserve_ns:
            tag = re.sub(r'\{.*\}(.*)', r'\1', tag)
        tags_upward.append(tag)
        ancestor = ancestor.getparent()

    # Tags were collected from the nearest ancestor outwards; flip them so
    # the outermost ancestor comes first.
    tags_upward.reverse()
    return tags_upward
Exemple #11
0
def get_parent_resource(resource_el: _Element) -> Optional[_Element]:
    """
    Return the parent element of a specified resource if that parent is a
    wrapper resource (as decided by is_wrapper_resource), otherwise None.
    Example: for a resource in group which is in clone, this function will
    return group element.

    resource_el -- resource element of which parent resource should be returned
    """
    candidate = resource_el.getparent()
    if candidate is None:
        return None
    return candidate if is_wrapper_resource(candidate) else None
    def fix_tail(self, item: etree._Element) -> None:
        """Fix self-closing elements.

        Designed only to work with self closing elements after item has just
        been inserted/appended

        Reassigns ``tail`` values around ``item`` using the parent's ``text``
        and the preceding sibling's ``tail``. ``parent.text`` is only read
        here, never modified.

        NOTE(review): the original inline comments described ``.text`` as the
        text after the last child element. In lxml/ElementTree ``.text`` is
        the text before the first child -- confirm which data model this
        method relies on.
        """
        parent = item.getparent()
        idx = parent.index(item)
        if idx == 0:
            # item is the first child element: it receives the parent's text
            # as its tail (the parent's text itself is left in place).
            item.tail = parent.text
        else:
            # There are other elements, possibly also text, before this child
            # element.
            # item receives the tail of the element directly before it.
            # NOTE(review): the original comment said this element's tail is
            # moved to the previous element; the assignment goes the other
            # way round -- confirm intent.
            item.tail = parent[idx - 1].tail
            # If this is the last child element, the preceding element's tail
            # is then overwritten with the parent's text.
            if idx == len(parent) - 1:
                parent[idx - 1].tail = parent.text
Exemple #13
0
    def remove_node(self, node: ET._Element, hold_tail=False):
        """
        Remove the given node from its parent (no-op if it has no parent)

        @param {ET._Element} node - the node to remove
        @param {bool} hold_tail=False - whether to keep the node's tail text
            by appending it to the previous sibling's tail, or to the
            parent's text when there is no previous sibling
        """
        _parent = node.getparent()
        if _parent is None:
            return

        if hold_tail and node.tail is not None:
            # Hand the tail over before the node disappears
            _tail = node.tail
            _previous = node.getprevious()
            if _previous is None:
                _parent.text = (_parent.text or '') + _tail
            else:
                _previous.tail = (_previous.tail or '') + _tail

        _parent.remove(node)
Exemple #14
0
def get_root(element: Element) -> Element:
    """Walk up from ``element`` and return the topmost ancestor.

    ``element`` itself is returned when it has no parent.
    """
    current = element
    while True:
        above = current.getparent()
        if above is None:
            return current
        current = above
Exemple #15
0
def cleanup(elem: etree._Element):
    """Clear ``elem`` and drop every sibling that precedes it."""
    elem.clear()
    while True:
        if elem.getprevious() is None:
            break
        # Deleting the parent's first child removes the earliest remaining
        # preceding sibling.
        siblings_owner = elem.getparent()
        del siblings_owner[0]
Exemple #16
0
    def _process_elem(self, parent_state: PTState, t_elem: etree._Element):
        """Recursively process ``t_elem`` and its subtree.

        Depending on the element's state this duplicates the subtree once per
        source, fetches attributes/text from a matching source element,
        delegates to ``_process_multi_branch``, removes the element, or
        queues deferred work (``_reorder``, ``_deferred_reqs``,
        ``_deferred_fills``). Child elements are processed recursively.

        Args:
            parent_state: State of the parent; used to build this element's
                ``PTState`` and forwarded to ``_process_multi_branch``.
            t_elem: Element to process. Comment nodes are skipped.

        Raises:
            PTFetchError: If more than one source element is found and the
                count disagrees with ``pt:multi``, or if no source element is
                found for an element whose state marks it as required.
        """
        # Comment nodes are not processed.
        if isinstance(t_elem, etree._Comment):
            return
        self._ext.set_elem_context(t_elem)
        qname = etree.QName(t_elem.tag)
        state = PTState(parent_state, t_elem)

        # States flagged for reordering are only collected here.
        if state["reorder"]:
            self._reorder.append(state)

        # duplicate subtree for each source
        if len(state["sources"].secondary):
            # prevent triggering this processing branch on sibling passes
            del t_elem.attrib[self._pt_clark("sources")]
            # We temporarily detach the t_elem subtree and insert each elem subtree at
            # the original location of t_elem before populating, which ensures that
            # resolved paths are always in the form /path/to/elem[1]/child, which will
            # match corresponding source elements (e.g. /path/to/elem/child) in the
            # multi source fetch scenario. Caveat: downstream deferred pt:fill or
            # pt:required will be evaluated in the context of their element's final
            # path (e.g. /path/to/elem[3]/child).
            #
            # Inserting and populating the subtrees in reverse order ensures that their
            # final document order for multi source fetches is aligned with the order of
            # the source_map sources.
            parent = t_elem.getparent()
            idx = parent.index(t_elem)
            parent.remove(t_elem)
            for source in reversed(
                (state["sources"].primary, *state["sources"].secondary)
            ):
                # The primary source reuses t_elem itself; every secondary
                # source gets its own deep copy.
                elem = (
                    t_elem if source is state["sources"].primary else deepcopy(t_elem)
                )
                # Narrow the shared state to this single source before
                # recursing with it as the parent state.
                state["sources"] = SourceGroup(source)
                parent.insert(idx, elem)
                self._process_elem(state, elem)
            return

        if state["fetch"]:
            # Look up source elements in the primary source using the
            # element path of t_elem.
            path = self.label.getelementpath(t_elem)
            s_elems = state["sources"].primary.findall(path)
            if len(s_elems) > 1:
                # Several matches: unless pt:multi is exactly True, the
                # match count has to equal it.
                if state["multi"] is not True and len(s_elems) != state["multi"]:
                    raise PTFetchError(
                        f"{len(s_elems)} source elements found but pt:multi is set to"
                        f" expect {int(state['multi'])}",
                        t_elem,
                    )  # cast False to 0 for readability
                # NOTE(review): the count passed is one less than the number
                # of matches -- presumably the number of extra copies; confirm
                # against _process_multi_branch.
                self._process_multi_branch(t_elem, parent_state, len(s_elems) - 1)
                return
            elif not len(s_elems):
                # No match: an error for required elements, otherwise the
                # element is dropped from the tree.
                if state["required"]:
                    url = state["sources"].primary.docinfo.URL
                    source_file = (
                        Path(url).name if url is not None else "<unresolved filename>"
                    )
                    raise PTFetchError(
                        f"{qname.localname} could not be located at path {path} in"
                        f" source {state.exp['sources']} from {source_file}",  # FIXME: .exp is None in descendants where source is inherited...
                        t_elem,
                    )
                t_elem.getparent().remove(t_elem)
                return
            # Exactly one match and t_elem has no children: copy the source
            # element's attributes and text onto it.
            elif not len(t_elem):  # len(s_elems) == 1:
                t_elem.attrib.update(s_elems[0].attrib)
                t_elem.text = s_elems[0].text
        else:
            # Not fetching: an integer pt:multi greater than 1 is handed to
            # _process_multi_branch.
            if isinstance(state["multi"], int) and state["multi"] > 1:
                self._process_multi_branch(t_elem, parent_state, state["multi"] - 1)
                return
            # non-fetch required condition; should be evaluated at export
            if state.exp["required"] is not None:
                self._deferred_reqs.append(state)

        if len(t_elem):
            # Recurse into the children, passing this element's state as
            # their parent state.
            # NOTE(review): lxml documents getchildren() as deprecated in
            # favour of list(t_elem) -- consider switching.
            for child_elem in t_elem.getchildren():
                self._process_elem(state, child_elem)
        elif state.exp["fill"]:
            # Childless element with a fill expression: either queue it for
            # later or evaluate and apply it right away.
            if state["defer"]:
                self._deferred_fills.append(state)
            else:
                self._handle_fill(state.t_elem, state.eval_deferred("fill"))

        state.remove_elem_pt_attrs()
Exemple #17
0
def getparent(dom: etree._Element) -> etree._Element:
    """Return ``dom.getparent()``, typed as an element for static checkers."""
    parent = dom.getparent()
    return cast(etree._Element, parent)