def parse(src):
    """ Builds an element tree from `src` using `LXML <http://lxml.de/>`_.

    :param src: A readable object such as a :class:`wex.response.Response`.
                Anything that has no ``read`` attribute is handed back
                untouched.
    :returns: The populated element tree.  When parsing leaves the tree
              without a root, ``UNPARSEABLE`` is installed as its root.
    """
    if not hasattr(src, 'read'):
        return src

    tree = _ElementTree()
    try:
        stream = HTMLStream(src)
        # lxml rejects some characters in URLs
        # (e.g. "http:/foo.com/bar?this=array[]") and reacts by quoting
        # the whole URL.  Rather than detect that case we always quote
        # the URL here; the `base_url` function unquotes it again.
        if src.url:
            quoted_base_url = quote(src.url)
        else:
            quoted_base_url = src.url

        parsed = False
        while not parsed:
            try:
                parser = HTMLParser()
                fp = replace_invalid_ncr(stream)
                tree.parse(fp, parser=parser, base_url=quoted_base_url)
            except UnicodeDecodeError:
                # Decoding failed: advance the stream to its next encoding
                # and retry the parse from the top of the loop.
                stream.next_encoding()
            else:
                parsed = True
    except IOError as err:
        # An I/O failure is logged, not raised; the root check below
        # still runs on whatever the tree holds.
        logging.getLogger(__name__).warning(
            "IOError parsing %s (%s)", src.url, err)

    if tree.getroot() is None:
        tree._setroot(UNPARSEABLE)

    return tree
def setUp(self):
    """Load the XML fixtures the tests operate on.

    Sets ``self.text`` (open binary handle on ``sample.xml``),
    ``self.TEI`` (text built from that handle), ``self.treeroot`` (an
    empty ``etree._ElementTree``), ``self.text_complex`` and
    ``self.seneca`` (texts built from files that are closed again as
    soon as the objects have been constructed).
    """
    # NOTE(review): this handle is stored on the instance and is not closed
    # in this method -- confirm that a tearDown elsewhere closes it.
    self.text = open("tests/testing_data/texts/sample.xml", "rb")

    text_class = MyCapytain.resources.texts.local.capitains.cts.CapitainsCtsText
    urn = "urn:cts:latinLit:phi1294.phi002.perseus-lat2"

    self.TEI = text_class(resource=self.text, urn=urn)
    self.treeroot = etree._ElementTree()

    with open("tests/testing_data/texts/text_or_xpath.xml") as complex_file:
        self.text_complex = text_class(resource=complex_file, urn=urn)

    # The Seneca fixture is built without an explicit URN.
    with open("tests/testing_data/texts/seneca.xml") as seneca_file:
        self.seneca = text_class(resource=seneca_file)
def setUp(self):
    """Load the XML fixtures the tests operate on.

    Sets ``self.text`` (open binary handle on ``sample.xml``),
    ``self.TEI`` (text built from that handle), ``self.treeroot`` (an
    empty ``etree._ElementTree``), ``self.text_complex`` and
    ``self.seneca`` (texts built from files that are closed again as
    soon as the objects have been constructed).
    """
    # NOTE(review): this handle is stored on the instance and is not closed
    # in this method -- confirm that a tearDown elsewhere closes it.
    self.text = open("tests/testing_data/texts/sample.xml", "rb")

    text_class = MyCapytain.resources.texts.locals.tei.Text
    urn = "urn:cts:latinLit:phi1294.phi002.perseus-lat2"

    self.TEI = text_class(resource=self.text, urn=urn)
    self.treeroot = etree._ElementTree()

    with open("tests/testing_data/texts/text_or_xpath.xml") as complex_file:
        self.text_complex = text_class(resource=complex_file, urn=urn)

    # The Seneca fixture is built without an explicit URN.
    with open("tests/testing_data/texts/seneca.xml") as seneca_file:
        self.seneca = text_class(resource=seneca_file)
def parse(src):
    """ Builds an element tree from `src` using `LXML <http://lxml.de/>`_.

    :param src: A readable object such as a :class:`wex.response.Response`.
                Anything that has no ``read`` attribute is handed back
                untouched.
    :returns: The populated element tree.  When parsing leaves the tree
              without a root, ``UNPARSEABLE`` is installed as its root.
    """
    if not hasattr(src, 'read'):
        return src

    tree = _ElementTree()
    try:
        stream = HTMLStream(src)
        # lxml rejects some characters in URLs
        # (e.g. "http:/foo.com/bar?this=array[]") and reacts by quoting
        # the whole URL.  Rather than detect that case we always quote
        # the URL here; the `base_url` function unquotes it again.
        if src.url:
            quoted_base_url = quote_base_url(src.url)
        else:
            quoted_base_url = src.url

        parsed = False
        while not parsed:
            try:
                # `fp` is a Unicode stream.  The lxml FAQ
                # (http://lxml.de/FAQ.html#can-lxml-parse-from-file-objects-opened-in-unicode-text-mode)
                # calls parsing from such a stream inefficient, but it
                # appears to work fine provided the parser is told to
                # use 'utf-8'.
                fp = replace_invalid_ncr(stream)
                parser = HTMLParser(encoding='utf-8')
                tree.parse(fp, parser=parser, base_url=quoted_base_url)
            except UnicodeDecodeError:
                # Decoding failed: advance the stream to its next encoding
                # and retry the parse from the top of the loop.
                stream.next_encoding()
            else:
                parsed = True
    except IOError as err:
        # An I/O failure is logged, not raised; the root check below
        # still runs on whatever the tree holds.
        logging.getLogger(__name__).warning(
            "IOError parsing %s (%s)", src.url, err)

    if tree.getroot() is None:
        tree._setroot(UNPARSEABLE)

    return tree
def ElementTree(element: Union[GenericElement[AnyStr], None] = None, *,
                file: Union[AnyStr, IO[Any], None] = None,
                parser: Union[XMLParser, None] = None) -> GenericElementTree[AnyStr]:
    """Typed front end for ``_ElementTree``.

    All three arguments are optional and default to ``None``; the
    annotations spell that out explicitly instead of relying on implicit
    Optional, which strict type checkers reject.

    :param element: Passed positionally to ``_ElementTree``.
    :param file: Forwarded to ``_ElementTree`` as the ``file`` keyword.
    :param parser: Forwarded to ``_ElementTree`` as the ``parser`` keyword.
    :returns: Whatever ``_ElementTree`` returns, presented to the type
              checker as ``GenericElementTree[AnyStr]``.
    """
    # Both casts go through Any, so the checker performs no validation of
    # `element` on the way in or of the resulting tree on the way out.
    tree = _ElementTree(cast(Any, element), file=file, parser=parser)
    return cast(Any, tree)