コード例 #1
0
class _LxmlFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with third-party
    full-featured XML library, `lxml`, that supports
    XPath 1.0 and XSLT 1.0.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate xpath, names, optionally parse and run XSLT,
        and parse original or transformed XML and return specific nodes.
        """
        from lxml.etree import XML

        self.xml_doc = XML(self._parse_doc(self.path_or_buffer))

        if self.stylesheet is not None:
            self.xsl_doc = XML(self._parse_doc(self.stylesheet))
            self.xml_doc = XML(self._transform_doc())

        self._validate_path()
        self._validate_names()

        return self._parse_nodes()

    def _parse_nodes(self) -> list[dict[str, str | None]]:
        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
        dicts: list[dict[str, str | None]]

        if self.elems_only and self.attrs_only:
            raise ValueError("Either element or attributes can be parsed not both.")

        elif self.elems_only:
            if self.names:
                dicts = [
                    {
                        **(
                            {el.tag: el.text.strip()}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            nm: ch.text.strip() if ch.text else None
                            for nm, ch in zip(self.names, el.xpath("*"))
                        },
                    }
                    for el in elems
                ]
            else:
                dicts = [
                    {
                        ch.tag: ch.text.strip() if ch.text else None
                        for ch in el.xpath("*")
                    }
                    for el in elems
                ]

        elif self.attrs_only:
            dicts = [el.attrib for el in elems]

        else:
            if self.names:
                dicts = [
                    {
                        **el.attrib,
                        **(
                            {el.tag: el.text.strip()}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            nm: ch.text.strip() if ch.text else None
                            for nm, ch in zip(self.names, el.xpath("*"))
                        },
                    }
                    for el in elems
                ]
            else:
                dicts = [
                    {
                        **el.attrib,
                        **(
                            {el.tag: el.text.strip()}
                            if el.text and not el.text.isspace()
                            else {}
                        ),
                        **{
                            ch.tag: ch.text.strip() if ch.text else None
                            for ch in el.xpath("*")
                        },
                    }
                    for el in elems
                ]

        if self.namespaces or "}" in list(dicts[0].keys())[0]:
            dicts = [
                {k.split("}")[1] if "}" in k else k: v for k, v in d.items()}
                for d in dicts
            ]

        keys = list(dict.fromkeys([k for d in dicts for k in d.keys()]))
        dicts = [{k: d[k] if k in d.keys() else None for k in keys} for d in dicts]

        if self.names:
            dicts = [{nm: v for nm, v in zip(self.names, d.values())} for d in dicts]

        return dicts

    def _validate_path(self) -> None:

        msg = (
            "xpath does not return any nodes. "
            "Be sure row level nodes are in xpath. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )

        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
        children = self.xml_doc.xpath(self.xpath + "/*", namespaces=self.namespaces)
        attrs = self.xml_doc.xpath(self.xpath + "/@*", namespaces=self.namespaces)

        if elems == []:
            raise ValueError(msg)

        if elems != [] and attrs == [] and children == []:
            raise ValueError(msg)

    def _validate_names(self) -> None:
        """
        Validate names.

        This method will check if names is a list and aligns with
        length of parse nodes.

        Raises
        ------
        ValueError
            * If value is not a list and less then length of nodes.
        """
        if self.names:
            children = self.xml_doc.xpath(
                self.xpath + "[1]/*", namespaces=self.namespaces
            )

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(self, raw_doc) -> bytes:
        from lxml.etree import (
            XMLParser,
            fromstring,
            parse,
            tostring,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)

            if isinstance(xml_data, io.StringIO):
                doc = fromstring(
                    xml_data.getvalue().encode(self.encoding), parser=curr_parser
                )
            else:
                doc = parse(xml_data, parser=curr_parser)

        return tostring(doc)

    def _transform_doc(self) -> bytes:
        """
        Transform original tree using stylesheet.

        This method will transform original xml using XSLT script into
        am ideally flatter xml document for easier parsing and migration
        to Data Frame.
        """
        from lxml.etree import XSLT

        transformer = XSLT(self.xsl_doc)
        new_doc = transformer(self.xml_doc)

        return bytes(new_doc)
コード例 #2
0
ファイル: xml.py プロジェクト: YarShev/pandas
class _LxmlFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with third-party
    full-featured XML library, `lxml`, that supports
    XPath 1.0 and XSLT 1.0.
    """
    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate xpath, names, optionally parse and run XSLT,
        and parse original or transformed XML and return specific nodes.
        """
        from lxml.etree import (
            XML,
            iterparse,
        )

        if self.iterparse is None:
            self.xml_doc = XML(self._parse_doc(self.path_or_buffer))

            if self.stylesheet:
                self.xsl_doc = XML(self._parse_doc(self.stylesheet))
                self.xml_doc = XML(self._transform_doc())

            self._validate_path()
            elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)

        self._validate_names()

        xml_dicts: list[dict[str,
                             str | None]] = (self._parse_nodes(elems)
                                             if self.iterparse is None else
                                             self._iterparse_nodes(iterparse))

        return xml_dicts

    def _validate_path(self) -> None:

        msg = ("xpath does not return any nodes. "
               "Be sure row level nodes are in xpath. "
               "If document uses namespaces denoted with "
               "xmlns, be sure to define namespaces and "
               "use them in xpath.")

        elems = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
        children = self.xml_doc.xpath(self.xpath + "/*",
                                      namespaces=self.namespaces)
        attrs = self.xml_doc.xpath(self.xpath + "/@*",
                                   namespaces=self.namespaces)

        if elems == []:
            raise ValueError(msg)

        if elems != [] and attrs == [] and children == []:
            raise ValueError(msg)

    def _validate_names(self) -> None:
        children: list[Any]

        if self.names:
            if self.iterparse:
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                children = self.xml_doc.xpath(self.xpath + "[1]/*",
                                              namespaces=self.namespaces)

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
            self,
            raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]) -> bytes:
        from lxml.etree import (
            XMLParser,
            fromstring,
            parse,
            tostring,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)

            if isinstance(xml_data, io.StringIO):
                if self.encoding is None:
                    raise TypeError(
                        "Can not pass encoding None when input is StringIO.")

                doc = fromstring(xml_data.getvalue().encode(self.encoding),
                                 parser=curr_parser)
            else:
                doc = parse(xml_data, parser=curr_parser)

        return tostring(doc)

    def _transform_doc(self) -> bytes:
        """
        Transform original tree using stylesheet.

        This method will transform original xml using XSLT script into
        am ideally flatter xml document for easier parsing and migration
        to Data Frame.
        """
        from lxml.etree import XSLT

        transformer = XSLT(self.xsl_doc)
        new_doc = transformer(self.xml_doc)

        return bytes(new_doc)