Python XML.xpathの例

プログラミング言語: Python

名前空間/パッケージ名: xml.etree.ElementTree

クラス/型: XML

メソッド/関数: xpath

hotexamples.comのコード掲載数: 2

Python XML.xpath - 2件のコード例が見つかりました。すべてオープンソースプロジェクトから抽出されたPythonのxml.etree.ElementTree.XML.xpathの実例で、最も評価が高いものを厳選しています。コード例の評価を行っていただくことで、より質の高いコード例が表示されるようになります。

よく使われるメソッド

表示 / 非表示

XML(30)

find(30)

findall(30)

iter(23)

get(9)

findtext(6)

getiterator(6)

getchildren(4)

append(4)

xpath(2)

makeelement(2)

parse(1)

set(1)

tail(1)

text(1)

textlist(1)

getroot(1)

コード例 #1

ファイルを表示

class _LxmlFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with third-party
    full-featured XML library, `lxml`, that supports
    XPath 1.0 and XSLT 1.0.
    """

    def __init__(self, *args, **kwargs) -> None:
        """Forward all positional and keyword arguments to the parent initializer."""
        super().__init__(*args, **kwargs)

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Build the XML tree with ``lxml`` and return the parsed rows.

        The source document is parsed first. When a stylesheet is set, it
        is parsed as well and the tree is replaced by the XSLT output. The
        xpath and names are then validated against the resulting tree
        before the nodes are extracted.
        """
        from lxml.etree import XML

        source_bytes = self._parse_doc(self.path_or_buffer)
        self.xml_doc = XML(source_bytes)

        if self.stylesheet is not None:
            stylesheet_bytes = self._parse_doc(self.stylesheet)
            self.xsl_doc = XML(stylesheet_bytes)

            # The transformed document replaces the original tree.
            transformed_bytes = self._transform_doc()
            self.xml_doc = XML(transformed_bytes)

        self._validate_path()
        self._validate_names()

        rows = self._parse_nodes()
        return rows

    def _parse_nodes(self) -> list[dict[str, str | None]]:
        """
        Convert the nodes selected by ``self.xpath`` into row dictionaries.

        Depending on ``elems_only`` / ``attrs_only`` each row is built from
        the node's attributes, its own non-blank text, and the text of its
        child elements. Child values are keyed by ``names`` (paired
        positionally) when names are given, otherwise by the child tag.
        Namespace prefixes of the form ``{uri}`` are removed from keys,
        every row is aligned to the union of all keys (missing entries
        become None), and finally the keys are replaced positionally by
        ``names`` when provided.

        Returns
        -------
        list of dict
            One dictionary per selected node; empty when nothing is selected.

        Raises
        ------
        ValueError
            * If both ``elems_only`` and ``attrs_only`` are set.
        """
        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)

        if self.elems_only and self.attrs_only:
            raise ValueError("Either element or attributes can be parsed not both.")

        # Attributes are skipped in elems_only mode. The node's own text is
        # left out only when elems_only is set without names.
        with_attrs = not self.elems_only
        with_own_text = bool(self.names) or not self.elems_only

        def build_row(el):
            # Insertion order matters: attributes, own text, then children.
            row = {}
            if with_attrs:
                row.update(el.attrib)
            if with_own_text and el.text and not el.text.isspace():
                row[el.tag] = el.text.strip()

            children = el.xpath("*")
            if self.names:
                keyed_children = zip(self.names, children)
            else:
                keyed_children = ((ch.tag, ch) for ch in children)

            for key, ch in keyed_children:
                row[key] = ch.text.strip() if ch.text else None
            return row

        dicts: list[dict[str, str | None]]
        if self.attrs_only:
            dicts = [dict(el.attrib) for el in elems]
        else:
            dicts = [build_row(el) for el in elems]

        # Strip "{uri}" prefixes from every key. Inspecting only the first key
        # of the first row would fail on an empty row and could leave
        # namespaced keys in later positions untouched.
        dicts = [
            {
                k.split("}")[1] if isinstance(k, str) and "}" in k else k: v
                for k, v in d.items()
            }
            for d in dicts
        ]

        # Align all rows to the union of keys, keeping first-seen order.
        keys = list(dict.fromkeys(k for d in dicts for k in d))
        dicts = [{k: d.get(k) for k in keys} for d in dicts]

        if self.names:
            dicts = [dict(zip(self.names, d.values())) for d in dicts]

        return dicts

    def _validate_path(self) -> None:
        """
        Ensure the xpath selects nodes that carry attributes or children.

        Raises
        ------
        ValueError
            * If the xpath returns an empty list.
            * If neither ``xpath/*`` nor ``xpath/@*`` returns anything.
        """
        tree = self.xml_doc
        ns = self.namespaces

        matched = tree.xpath(self.xpath, namespaces=ns)
        child_nodes = tree.xpath(self.xpath + "/*", namespaces=ns)
        attr_values = tree.xpath(self.xpath + "/@*", namespaces=ns)

        nothing_matched = matched == []
        nothing_to_read = attr_values == [] and child_nodes == []

        if nothing_matched or nothing_to_read:
            raise ValueError(
                "xpath does not return any nodes. "
                "Be sure row level nodes are in xpath. "
                "If document uses namespaces denoted with "
                "xmlns, be sure to define namespaces and "
                "use them in xpath."
            )

    def _validate_names(self) -> None:
        """
        Check ``names`` against the child elements found in the document.

        Nothing is checked when ``names`` is empty or unset. Otherwise the
        child elements selected by ``xpath + "[1]/*"`` are counted and
        compared with the number of names.

        Raises
        ------
        TypeError
            * If ``names`` is not list-like.
        ValueError
            * If ``names`` has fewer entries than there are child elements.
        """
        if not self.names:
            return

        first_children = self.xml_doc.xpath(
            self.xpath + "[1]/*", namespaces=self.namespaces
        )

        if not is_list_like(self.names):
            raise TypeError(
                f"{type(self.names).__name__} is not a valid type for names"
            )

        if len(self.names) < len(first_children):
            raise ValueError(
                "names does not match length of child elements in xpath."
            )

    def _parse_doc(self, raw_doc) -> bytes:
        """
        Read an XML or XSLT source with ``lxml`` and return it as bytes.

        Parameters
        ----------
        raw_doc
            Source handed to ``get_data_from_filepath`` together with this
            parser's encoding, compression and storage options.

        Returns
        -------
        bytes
            Result of ``lxml.etree.tostring`` for the parsed document.

        Raises
        ------
        TypeError
            * If the data is an ``io.StringIO`` and ``encoding`` is None.
        """
        from lxml.etree import (
            XMLParser,
            fromstring,
            parse,
            tostring,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)

            if isinstance(xml_data, io.StringIO):
                # Text must be encoded before parsing; without an encoding
                # ``str.encode(None)`` would fail with an opaque TypeError.
                if self.encoding is None:
                    raise TypeError(
                        "Can not pass encoding None when input is StringIO."
                    )

                doc = fromstring(
                    xml_data.getvalue().encode(self.encoding), parser=curr_parser
                )
            else:
                doc = parse(xml_data, parser=curr_parser)

        return tostring(doc)

    def _transform_doc(self) -> bytes:
        """
        Apply the parsed stylesheet to the parsed document.

        An XSLT transformer is built from ``self.xsl_doc`` and run against
        ``self.xml_doc``; the transformation result is returned as bytes.
        """
        from lxml.etree import XSLT

        return bytes(XSLT(self.xsl_doc)(self.xml_doc))

コード例 #2

ファイルを表示

ファイル: xml.py プロジェクト: YarShev/pandas

class _LxmlFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with third-party
    full-featured XML library, `lxml`, that supports
    XPath 1.0 and XSLT 1.0.
    """
    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse the XML source and return one dictionary per row node.

        Without ``iterparse`` the whole document is loaded into a tree,
        optionally transformed with the stylesheet, the xpath is validated
        and the selected nodes are handed to ``_parse_nodes``. With
        ``iterparse`` set, no tree is built here and ``_iterparse_nodes``
        is called with lxml's ``iterparse`` function instead. Names are
        validated in both cases.
        """
        from lxml.etree import (
            XML,
            iterparse,
        )

        if self.iterparse is None:
            source_bytes = self._parse_doc(self.path_or_buffer)
            self.xml_doc = XML(source_bytes)

            if self.stylesheet:
                stylesheet_bytes = self._parse_doc(self.stylesheet)
                self.xsl_doc = XML(stylesheet_bytes)
                # The transformed document replaces the original tree.
                self.xml_doc = XML(self._transform_doc())

            self._validate_path()
            elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)

        self._validate_names()

        xml_dicts: list[dict[str, str | None]]
        if self.iterparse is None:
            xml_dicts = self._parse_nodes(elems)
        else:
            xml_dicts = self._iterparse_nodes(iterparse)

        return xml_dicts

    def _validate_path(self) -> None:
        """
        Verify that the xpath yields nodes with attributes or child elements.

        Raises
        ------
        ValueError
            * If the xpath returns an empty list.
            * If neither ``xpath/*`` nor ``xpath/@*`` returns anything.
        """
        doc = self.xml_doc
        nsmap = self.namespaces

        row_nodes = doc.xpath(self.xpath, namespaces=nsmap)
        row_children = doc.xpath(self.xpath + "/*", namespaces=nsmap)
        row_attrs = doc.xpath(self.xpath + "/@*", namespaces=nsmap)

        if row_nodes == [] or (row_attrs == [] and row_children == []):
            raise ValueError(
                "xpath does not return any nodes. "
                "Be sure row level nodes are in xpath. "
                "If document uses namespaces denoted with "
                "xmlns, be sure to define namespaces and "
                "use them in xpath."
            )

    def _validate_names(self) -> None:
        """
        Check ``names`` against the expected child entries of a row.

        Nothing is checked when ``names`` is empty or unset. With
        ``iterparse`` set, the entries are the value stored under its first
        key; otherwise they are the child elements selected by
        ``xpath + "[1]/*"``.

        Raises
        ------
        TypeError
            * If ``names`` is not list-like.
        ValueError
            * If ``names`` has fewer entries than there are child entries.
        """
        if not self.names:
            return

        children: list[Any]
        if self.iterparse:
            first_key = next(iter(self.iterparse))
            children = self.iterparse[first_key]
        else:
            children = self.xml_doc.xpath(
                self.xpath + "[1]/*", namespaces=self.namespaces
            )

        if not is_list_like(self.names):
            raise TypeError(
                f"{type(self.names).__name__} is not a valid type for names"
            )

        if len(self.names) < len(children):
            raise ValueError(
                "names does not match length of child elements in xpath."
            )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> bytes:
        """
        Read an XML or XSLT source with ``lxml`` and return it as bytes.

        Parameters
        ----------
        raw_doc
            Source handed to ``get_data_from_filepath`` together with this
            parser's encoding, compression and storage options.

        Returns
        -------
        bytes
            Result of ``lxml.etree.tostring`` for the parsed document.

        Raises
        ------
        TypeError
            * If the data is an ``io.StringIO`` and ``encoding`` is None.
        """
        from lxml.etree import (
            XMLParser,
            fromstring,
            parse,
            tostring,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            lxml_parser = XMLParser(encoding=self.encoding)

            if not isinstance(xml_data, io.StringIO):
                doc = parse(xml_data, parser=lxml_parser)
            elif self.encoding is None:
                raise TypeError(
                    "Can not pass encoding None when input is StringIO."
                )
            else:
                # In-memory text has to be encoded before it is parsed.
                encoded = xml_data.getvalue().encode(self.encoding)
                doc = fromstring(encoded, parser=lxml_parser)

        return tostring(doc)

    def _transform_doc(self) -> bytes:
        """
        Run the XSLT stylesheet over the current XML tree.

        The transformer is created from ``self.xsl_doc`` and called with
        ``self.xml_doc``; the resulting document is converted to bytes.
        """
        from lxml.etree import XSLT

        apply_stylesheet = XSLT(self.xsl_doc)
        return bytes(apply_stylesheet(self.xml_doc))