def parse_xml(xml_str_or_file, parser=None, **parser_kwargs):
    """Parse XML given as text, bytes or a readable object and return the root.

    When no parser is supplied, an ``ET.XMLParser`` is built from
    ``parser_kwargs`` and configured to create ``ElementBaseXpathStr``
    elements. When a parser is supplied it is used as-is and
    ``parser_kwargs`` is ignored.

    Args:
        xml_str_or_file (str, bytes or object with ``read``): the XML input.
        parser: optional, already configured lxml parser.
        **parser_kwargs: options for the parser built by this function.

    Returns:
        The root element of the parsed document.

    Raises:
        NotImplementedError: if the input is neither ``str``, ``bytes`` nor
            an object exposing ``read``.
    """
    if not parser:
        parser = ET.XMLParser(**parser_kwargs)
        parser.set_element_class_lookup(
            ET.ElementDefaultClassLookup(element=ElementBaseXpathStr))

    # Text goes straight through ``fromstring``, which already yields a root.
    if isinstance(xml_str_or_file, str):
        return ET.fromstring(xml_str_or_file, parser=parser)

    # Everything else is parsed as a stream; bytes are wrapped in one first.
    if isinstance(xml_str_or_file, bytes):
        stream = io.BytesIO(xml_str_or_file)
    elif hasattr(xml_str_or_file, 'read'):
        stream = xml_str_or_file
    else:
        raise NotImplementedError(
            'We only know how to parse string, bytes or file objects. Use straight lxml methods'
        )
    return ET.parse(stream, parser=parser).getroot()
def fromfile(self, source, tags_factory=Tag, fragment=False, no_leading_text=False, encoding='utf-8', **kw):
    """Parse a XML file

    In:
      - ``source`` -- can be a filename, a ``http://``, ``https://`` or
        ``ftp://`` URL, or a file object
      - ``tags_factory`` -- class of the elements generated by the parser
      - ``fragment`` -- if ``True``, can parse a XML fragment i.e a XML without
        a unique root
      - ``no_leading_text`` -- if ``fragment`` is ``True``, ``no_leading_text``
        is ``False`` and the XML to parse begins with a text, this text is kept
      - ``encoding`` -- encoding given to the parser and used to encode the
        leading text of a fragment
      - ``kw`` -- keywords parameters are passed to the XML parser

    Return:
      - the root element of the parsed XML, if ``fragment`` is ``False``
      - a tuple of XML elements (optionally preceded by the encoded leading
        text), if ``fragment`` is ``True``

    The source is always closed, even when reading or parsing fails.
    """
    # Create a dedicated parser with the ``kw`` parameter. It is built before
    # the source is opened so that a bad ``kw`` cannot leak an open handle.
    parser = self._parser.__class__(encoding=encoding, **kw)
    # This parser will generate nodes of type ``tags_factory``
    parser.set_element_class_lookup(
        etree.ElementDefaultClassLookup(element=tags_factory))

    if isinstance(source, (str, type(u''))):
        if source.startswith(('http://', 'https://', 'ftp://')):
            source = urlopen(source)
        else:
            source = fileopen(source, encoding=encoding)

    if not fragment:
        # Parse a tree (only one root)
        # ----------------------------
        try:
            root = etree.parse(source, parser).getroot()
        finally:
            source.close()

        # Attach the renderer to the root
        root._renderer = self
        return root

    # Parse a fragment (multiple roots)
    # ---------------------------------
    try:
        content = source.read()
    finally:
        source.close()

    # Create a dummy root
    xml = BufferIO(b'<html><body>%s</body></html>' % content)
    root = etree.parse(xml, parser).getroot()[0]
    for e in root:
        if isinstance(e, tags_factory):
            # Attach the renderer to each root
            e._renderer = self

    # Return the children of the dummy root
    leading_text = (root.text.encode(encoding),) if root.text and not no_leading_text else ()
    return leading_text + tuple(root[:])
def parse_html(self, source, fragment=False, no_leading_text=False, xhtml=False, **kw):
    """Parse a (X)HTML file

    In:
      - ``source`` -- can be a filename or a file object
      - ``fragment`` -- if ``True``, can parse a HTML fragment i.e a HTML without
        a unique root
      - ``no_leading_text`` -- if ``fragment`` is ``True``, ``no_leading_text``
        is ``False`` and the HTML to parse begins with a text, this text is kept
      - ``xhtml`` -- is the HTML to parse a valid XHTML ? If so, a XML parser
        is used instead of a HTML parser
      - ``kw`` -- keywords parameters are passed to the parser constructor and
        forwarded to ``_parse_html``

    Return:
      - the root element of the parsed HTML, if ``fragment`` is ``False``
      - a list of HTML elements, if ``fragment`` is ``True``
    """
    parser = ET.XMLParser(**kw) if xhtml else ET.HTMLParser(**kw)
    # ``set_element_class_lookup`` is the supported spelling; the camelCase
    # ``setElementClassLookup`` alias is deprecated in lxml.
    parser.set_element_class_lookup(
        ET.ElementDefaultClassLookup(element=_HTMLTag))

    return self._parse_html(parser, source, fragment, no_leading_text, **kw)
class Renderer(xml.XmlRenderer):
    """XML renderer for RML documents whose parser generates ``Tag`` nodes."""

    doctype = '<!DOCTYPE document SYSTEM "rml.dtd">'
    content_type = 'application/pdf'
    namespace = 'http://namespaces.zope.org/rml'

    # Class-level parser shared by the renderer; it creates ``Tag`` elements
    _parser = etree.XMLParser()
    _parser.set_element_class_lookup(etree.ElementDefaultClassLookup(element=Tag))

    def __init__(self, parent=None, *args, **kw):
        """Initialize the renderer and declare ``namespace`` as the default namespace.

        In:
          - ``parent`` -- forwarded, with ``args`` and ``kw``, to the base class
        """
        super(Renderer, self).__init__(parent, *args, **kw)

        self.namespaces = {None: self.namespace}

    @classmethod
    def get_tags(cls, tags, tag, signature):
        """Collect the schema fields of ``tag`` and of its child directives

        In:
          - ``tags`` -- dictionary ``{tag name: set of fields}``, updated in place
          - ``tag`` -- name of the tag to register
          - ``signature`` -- schema interface describing the tag

        Return:
          - ``tags``
        """
        # NOTE(review): the original indentation was lost; the recursion is
        # assumed to run only for tags not registered yet -- confirm
        if tag not in tags:
            tags[tag] = set(schema.getFields(signature))

            for child in signature.queryTaggedValue('directives', ()):
                cls.get_tags(tags, child.tag, child.signature)

        return tags

    @classmethod
    def create_RML_tags(cls):
        """Add to the class a ``xml.TagProp`` attribute for each tag reachable
        from the ``document`` tag"""
        for tag, signature in cls.get_tags({}, 'document', document.IDocument).items():
            setattr(cls, tag, xml.TagProp(tag, signature))

    @classmethod
    def create_para_extra_tags(cls):
        """Add to the class a ``xml.TagProp`` attribute for each ``compile_*``
        entry found in ``Para.__dict__``"""
        # ``method[8:]`` strips the ``compile_`` prefix (8 characters)
        tags = [method[8:] for method in Para.__dict__ if method.startswith('compile_')]
        for tag in tags:
            setattr(cls, tag, xml.TagProp(tag))
async def __aiter__(self) -> _AsyncGenerator[NSElement, None]:
    """Stream the gzip-compressed XML at ``self.value`` and yield its records.

    Only elements whose tag equals ``self.name`` upper-cased with trailing
    ``S`` characters stripped are reported by the pull parser. Each element
    is cleared after the consumer resumes the generator, and siblings that
    precede it are deleted from its parent.

    Raises:
        RuntimeError: if ``Api.agent`` is not set.
    """
    if not Api.agent:
        raise RuntimeError("The API's user agent is not yet set.")

    dump_url = self.value  # pylint: disable=E1101
    record_tag = self.name.upper().rstrip("S")  # pylint: disable=E1101

    pull_parser = etree.XMLPullParser(
        ["end"], base_url=dump_url, remove_blank_text=True, tag=record_tag)
    node_lookup = etree.ElementDefaultClassLookup(element=NSElement)
    pull_parser.set_element_class_lookup(node_lookup)
    pending = pull_parser.read_events()

    # wbits of 16 + MAX_WBITS makes zlib expect a gzip header and trailer
    gunzip = zlib.decompressobj(16 + zlib.MAX_WBITS)

    request = Api.session.request(
        "GET", dump_url, headers={"User-Agent": Api.agent})
    async with request as response:
        async for chunk, _ in response.content.iter_chunks():
            pull_parser.feed(gunzip.decompress(chunk))
            for _, node in pending:
                yield node
                # Free what was just handed out
                node.clear()
                while (node.getparent() is not None
                       and node.getprevious() is not None):
                    del node.getparent()[0]
def createDefaultParser(self):
    """Build an XML parser from ``self.parser_config``.

    When ``self.element_class`` is not ``None``, the parser is configured to
    create elements of that class; otherwise lxml's default classes are kept.
    """
    xml_parser = _etree.XMLParser(**self.parser_config)
    custom_class = self.element_class
    if custom_class is None:
        return xml_parser

    xml_parser.set_element_class_lookup(
        _etree.ElementDefaultClassLookup(element=custom_class))
    return xml_parser
def __getattr__(cls, name: str) -> str:
    """
    Magic method to call a HTML element.

    Args:
        name: The HTML element name; it must be listed in ``cls.__elements__``

    Returns:
        A callable that builds the ``name`` element from its arguments

    Raises:
        AttributeError: If an HTML element name is invalid
    """
    # ``__elements__`` itself is tested first so that a missing registry
    # does not trigger another attribute lookup on the class.
    known = name != '__elements__' and name in cls.__elements__
    if not known:
        raise AttributeError('Invalid element')

    html_parser = etree.HTMLParser()
    element_lookup = etree.ElementDefaultClassLookup(element=HtmlElement)
    html_parser.set_element_class_lookup(element_lookup)

    def _wrapper(*args, **kwargs):
        maker = builder.ElementMaker(makeelement=html_parser.makeelement)
        factory = getattr(maker, name)
        return factory(*args, **kwargs)

    return _wrapper
def createFromExisting(cls, source):
    """
    Create a TouchOSCLayout instance from an existing TouchOSC Layout.

    @type source: filename or fileobject
    @param source: Path to an existing .touchosc file, or
        TouchOSC index.xml file (from unzipping .touchosc file),
        or a file object containing the index.xml content
    @rtype: Layout
    @return: An instance containing the layout. Its name is the file name
        without extension for a .touchosc file, C{None} otherwise.
    @raise ValueError: if C{source} is a path whose extension is neither
        C{.touchosc} nor C{.xml}
    """
    # Elements named ``tabpage`` become ``Tabpage`` instances; other elements
    # are mapped by their ``type`` attribute.
    fallback = etree.ElementDefaultClassLookup()
    lookupTabpages = etree.ElementNamespaceClassLookup(fallback)
    namespace = lookupTabpages.get_namespace(None)
    namespace['tabpage'] = tabpage.Tabpage
    lookupControls = etree.AttributeBasedElementClassLookup(
        'type', controls.type_class_mapping, lookupTabpages)
    layoutParser = etree.XMLParser(remove_blank_text=True)
    # ``setElementClassLookup`` is a deprecated alias of this method.
    layoutParser.set_element_class_lookup(lookupControls)

    if not isinstance(source, str):
        # File object: parse it directly, there is no file name to derive
        # a layout name from.
        return Layout(etree.parse(source, layoutParser), None)

    fname = os.path.split(source)[1]
    (name, extension) = os.path.splitext(fname)
    if extension == ".touchosc":
        f = ZipFile(source, "r")
        try:
            layoutTree = etree.parse(StringIO(f.read("index.xml")),
                                     layoutParser)
        finally:
            # Release the archive even when index.xml is missing or invalid
            f.close()
    elif extension == ".xml":
        name = None
        layoutTree = etree.parse(source, layoutParser)
    else:
        raise ValueError(
            "Unsupported layout file extension: %r" % (extension,))

    return Layout(layoutTree, name)
def parseXML(filename):
    """Parse an XML (or XHTML) document and return the parsed tree.

    The parser creates ``HtHtmlElement`` instances for every element. No
    special handling is needed here, unlike for HTML, because XML has no
    pre-existing custom element classes.
    """
    xml_parser = _etree.XMLParser()
    xml_parser.set_element_class_lookup(
        _etree.ElementDefaultClassLookup(element=HtHtmlElement))
    return _etree.parse(filename, parser=xml_parser)
def Element(name, attrib={}, **extra):
    """Create a standalone ``MyWriter`` element.

    ``attrib`` is copied before ``extra`` is merged into it, so neither the
    caller's mapping nor the shared default is modified.
    """
    attributes = attrib.copy()
    attributes.update(extra)

    # A dedicated parser is only needed for its element factory
    factory_parser = etree.XMLParser()
    factory_parser.set_element_class_lookup(
        etree.ElementDefaultClassLookup(element=MyWriter))
    return factory_parser.makeelement(name, attrib=attributes)
def class_init(cls, special_tags):
    """Class initialisation

    In:
      -- ``special_tags`` -- tags that have a special factory (not used by
         the statements of this initializer)
    """
    # Create a XML parser that generate ``_Tag`` nodes
    cls._xml_parser = ET.XMLParser()
    # ``set_element_class_lookup`` is the supported spelling; the camelCase
    # ``setElementClassLookup`` alias is deprecated in lxml.
    cls._xml_parser.set_element_class_lookup(
        ET.ElementDefaultClassLookup(element=_Tag))
def parse_xml(self, source, fragment=False, no_leading_text=False, **kw):
    """Parse a XML file

    In:
      - ``source`` -- can be a filename, a ``http://``, ``https://`` or
        ``ftp://`` URL, or a file object
      - ``fragment`` -- if ``True``, can parse a XML fragment i.e a XML without
        a unique root
      - ``no_leading_text`` -- if ``fragment`` is ``True``, ``no_leading_text``
        is ``False`` and the XML to parse begins with a text, this text is kept
      - ``kw`` -- keywords parameters are passed to the XML parser

    Return:
      - the root element of the parsed XML, if ``fragment`` is ``False``
      - a list of XML elements (optionally preceded by the leading text),
        if ``fragment`` is ``True``

    The source is always closed, even when reading or parsing fails.
    """
    # Create a dedicated XML parser with the ``kw`` parameter. It is built
    # before the source is opened so that a bad ``kw`` cannot leak a handle.
    parser = ET.XMLParser(**kw)
    # This parser will generate nodes of type ``_Tag``
    # (``setElementClassLookup`` is a deprecated alias of this method)
    parser.set_element_class_lookup(
        ET.ElementDefaultClassLookup(element=_Tag))

    if isinstance(source, basestring):
        if source.startswith(('http://', 'https://', 'ftp://')):
            source = urllib.urlopen(source)
        else:
            source = open(source)

    if not fragment:
        # Parse a XML file
        # ----------------
        try:
            root = ET.parse(source, parser).getroot()
        finally:
            source.close()

        # Attach the renderer to the root
        root._renderer = self
        return root

    # Parse a XML fragment
    # --------------------
    try:
        content = source.read()
    finally:
        source.close()

    # Create a dummy root
    xml = cStringIO.StringIO('<dummy>%s</dummy>' % content)
    root = ET.parse(xml, parser).getroot()
    for e in root[:]:
        # Attach the renderer to each root
        e._renderer = self

    # Return the children of the dummy root
    return ([root.text] if root.text and not no_leading_text else []) + root[:]
def create_tag(name, value, attrib={}, **extra):
    """Create a standalone ``MyWriter`` element and optionally set its text.

    ``attrib`` is copied before ``extra`` is merged into it. The text is only
    assigned when ``value`` is truthy.
    """
    attributes = attrib.copy()
    attributes.update(extra)

    # A dedicated parser is only needed for its element factory
    factory_parser = etree.XMLParser()
    factory_parser.set_element_class_lookup(
        etree.ElementDefaultClassLookup(element=MyWriter))

    node = factory_parser.makeelement(name, attrib=attributes)
    if value:
        node.text = value
    return node
def __init__(self, file_location):
    """
    Parser/iterator for the OAIRecord class. Selects ``record`` elements in
    any namespace (repox or oai-pmh) and builds them as ``OAIRecord``.

    :param file_location: forwarded unchanged to the base class initializer
    """
    record_parser = etree.XMLParser()
    record_parser.set_element_class_lookup(
        etree.ElementDefaultClassLookup(element=OAIRecord))
    super(OAIReader, self).__init__(
        file_location, '{*}record', parser=record_parser)
def __init__(self, file_location):
    """
    Parser/iterator for the MODSRecord class. Selects ``mods`` elements
    qualified by ``NAMESPACES['mods']`` and builds them as ``MODSRecord``.

    :param file_location: forwarded unchanged to the base class initializer
    """
    record_parser = etree.XMLParser()
    record_parser.set_element_class_lookup(
        etree.ElementDefaultClassLookup(element=MODSRecord))
    mods_tag = '{0}mods'.format(NAMESPACES['mods'])
    super(MODSReader, self).__init__(
        file_location, mods_tag, parser=record_parser)
def metadata(self):
    """
    Exposes the metadata content of an OAIRecord.

    The first child of the ``metadata`` element is serialized and parsed
    again with a parser that builds ``MODSRecord`` elements (tag containing
    ``mods``) or ``DCRecord`` elements (tag containing ``qualified`` or
    ``dc``).

    :return: The reparsed root element, or None when there is no
             ``metadata`` element, when it is empty, or when the tag of its
             first child matches none of the known formats.
    """
    record_data = self.find('./{*}metadata')
    if record_data is None:
        return None

    try:
        payload_tag = record_data[0].tag
        if 'mods' in payload_tag:
            record_class = MODSRecord
        elif 'qualified' in payload_tag or 'dc' in payload_tag:
            record_class = DCRecord
        else:
            return None

        record_parser = etree.XMLParser()
        record_parser.set_element_class_lookup(
            etree.ElementDefaultClassLookup(element=record_class))
        serialized = etree.tostring(record_data[0], encoding='UTF-8')
        return etree.XML(serialized.decode('utf-8'), parser=record_parser)
    except IndexError:
        # ``metadata`` has no child element
        return None
def _build(self):
    """Create the layout and define widget attributes from the tkouter html.

    Renders ``self.layout`` with ``self.context`` (as a template name when it
    contains ``.html`` or ``xml``, as an inline template otherwise), parses
    the result into a tree of ``TkOutElement``, applies the declarations of
    linked stylesheets as element attributes, then initializes and displays
    every element. Does nothing when ``self.layout`` is empty.

    A ``TagError`` raised by an element is re-raised after the offending
    element has been printed.
    """
    if not self.layout:
        return
    env = Environment(loader=self.loader)
    if '.html' in self.layout or 'xml' in self.layout:
        template = env.get_template(self.layout)
        self._html = template.render(self.context)
    else:
        self._html = Template(self.layout).render(self.context)

    # lxml parser
    parser_lookup = etree.ElementDefaultClassLookup(element=TkOutElement)
    self._parser = etree.XMLParser()
    self._parser.set_element_class_lookup(parser_lookup)
    self._tree = etree.parse(StringIO(self._html), self._parser)

    # we should cache the elements for storing data to it
    self._proxy_cache = list(self._tree.getroot().iter())

    # css
    # NOTE(review): ``css`` is never read in this method
    css = None
    for e in self._tree.getroot().iter():
        if e.is_css and e.get('href'):
            self._css = env.get_template(e.get('href')).render()
            self._css_parser = tinycss.make_parser()
            self._stylesheet = self._css_parser.parse_stylesheet(self._css)
            for rule in self._stylesheet.rules:
                # NOTE(review): ``e`` is rebound here while the outer loop
                # is still iterating with the same name
                for e in self._select(rule.selector.as_css()):
                    for d in rule.declarations:
                        # attributes already set on the element win over
                        # the stylesheet
                        if e.get(d.name) is None:
                            e.set(d.name, d.value.as_css())

    # post init etree elements and display their widgets
    for e in self._tree.getroot().iter():
        try:
            e.init(self)
            e.display()
        except TagError as err:
            print('Error when parsing tag: ')
            print(
                etree.tostring(e,
                               pretty_print=True,
                               encoding=str,
                               method='html'))
            raise err
def set_parser_to_relaxed(cls):
    """
    Install an XML parser which attempts to recover syntactically-flawed XML.

    ``cls._parse_etree`` is replaced by the parent implementation bound to a
    parser created with ``recover=True`` and ``resolve_entities=False`` that
    builds ``RestrictedElement`` nodes.

    Returns:
        None
    """
    lenient_parser = etree.XMLParser(recover=True, resolve_entities=False)
    lenient_parser.set_element_class_lookup(
        etree.ElementDefaultClassLookup(element=RestrictedElement))

    # Inject parser
    parent_parse = super()._parse_etree
    cls._parse_etree = partial(parent_parse, parser=lenient_parser)
def class_init(cls, specialTags):
    """Class initialisation

    Create the class-level HTML parser. Elements whose name is a key of
    ``cls._specialTags`` are built with the associated class; all the others
    are ``xhtml_base._HTMLTag`` nodes.

    In:
      -- ``specialTags`` -- tags that have a special factory; merged into
         ``cls._specialTags``
    """
    class CustomLookup(ET.CustomElementClassLookup):
        """Lookup of the element class by tag name"""

        def __init__(self, specialTags, defaultLookup):
            super(CustomLookup, self).__init__(defaultLookup)
            self._specialTags = specialTags

        def lookup(self, node_type, document, namespace, name):
            # ``None`` (unknown name) delegates to the fallback lookup
            return self._specialTags.get(name)

    cls._specialTags.update(specialTags)
    cls._custom_lookup = CustomLookup(
        cls._specialTags,
        ET.ElementDefaultClassLookup(element=xhtml_base._HTMLTag))

    cls._html_parser = ET.HTMLParser()
    # ``set_element_class_lookup`` is the supported spelling; the camelCase
    # ``setElementClassLookup`` alias is deprecated in lxml.
    cls._html_parser.set_element_class_lookup(cls._custom_lookup)
async def __aiter__(self):
    """Stream the gzip-compressed XML at ``self.value``.

    The first item yielded is a ``HEADERS`` element whose attributes are the
    response headers. Every element completed by the parser follows; each one
    is cleared once the consumer resumes the generator.
    """
    dump_url = self.value

    pull_parser = etree.XMLPullParser(
        ["end"], base_url=dump_url, remove_blank_text=True)
    node_lookup = etree.ElementDefaultClassLookup(element=_NSElement)
    pull_parser.set_element_class_lookup(node_lookup)
    pending = pull_parser.read_events()

    # wbits of 16 + MAX_WBITS makes zlib expect a gzip header and trailer
    gunzip = zlib.decompressobj(16 + zlib.MAX_WBITS)

    request = Api.session.request(
        "GET", dump_url, headers={"User-Agent": Api.agent})
    async with request as response:
        yield pull_parser.makeelement("HEADERS", attrib=response.headers)
        async for chunk, _ in response.content.iter_chunks():
            pull_parser.feed(gunzip.decompress(chunk))
            for _, node in pending:
                yield node
                node.clear()
async def __aiter__(
        self, *, no_clear: bool = False) -> _AsyncGenerator[NSElement, None]:
    """Request ``str(self)`` and yield the elements of the XML response.

    By default only the direct children of the root element are yielded, and
    each one is cleared (and its preceding siblings deleted) after the
    consumer resumes the generator. With ``no_clear=True`` every completed
    element is yielded and nothing is cleared.

    The response body is decoded with the charset announced in the
    ``Content-Type`` header.

    Raises:
        RuntimeError: if the user agent is not set, or if the request is an
            API telegram (``a=sendtg``).
        ValueError: if the request is empty.
    """
    # Function-scope import: keeps this block self-contained
    import codecs

    if not self.agent:
        raise RuntimeError("The API's user agent is not yet set.")
    if not self:
        # Preempt the request to conserve ratelimit
        raise ValueError("Bad request")
    if "a" in self and self["a"].lower() == "sendtg":
        raise RuntimeError(
            "This API wrapper does not support API telegrams.")

    url = str(self)
    parser = etree.XMLPullParser(["end"],
                                 base_url=url,
                                 remove_blank_text=True)
    parser.set_element_class_lookup(
        etree.ElementDefaultClassLookup(element=NSElement))
    events = parser.read_events()

    async with self.session.request("GET",
                                    url,
                                    headers={"User-Agent": self.agent
                                             }) as response:
        encoding = response.headers["Content-Type"].split(
            "charset=")[1].split(",")[0]
        # Network chunks can end in the middle of a multibyte character.
        # An incremental decoder buffers the incomplete tail instead of
        # raising UnicodeDecodeError as a per-chunk ``bytes.decode`` would.
        decoder = codecs.getincrementaldecoder(encoding)()
        async for data, _ in response.content.iter_chunks():
            text = decoder.decode(data)
            if text:
                parser.feed(text)
            for _, element in events:
                # Unless ``no_clear`` is set, skip everything except the
                # direct children of the root element.
                if not no_clear and (element.getparent() is None or
                                     element.getparent().getparent()
                                     is not None):
                    continue
                yield element
                if no_clear:
                    continue
                element.clear()
                while element.getprevious() is not None:
                    del element.getparent()[0]
async def __aiter__(self, *, clear: bool = True):
    """Request ``str(self)`` and yield the elements of the XML response.

    The first item yielded is a ``HEADERS`` element whose attributes are the
    response headers. Every element completed by the parser follows; when
    ``clear`` is true each one is cleared once the consumer resumes the
    generator.

    The response body is decoded with the charset announced in the
    ``Content-Type`` header.

    Raises:
        ValueError: if the request is empty.
    """
    # Function-scope import: keeps this block self-contained
    import codecs

    if not self:
        raise ValueError("Bad request")

    url = str(self)
    parser = etree.XMLPullParser(["end"],
                                 base_url=url,
                                 remove_blank_text=True)
    parser.set_element_class_lookup(
        etree.ElementDefaultClassLookup(element=_NSElement))
    events = parser.read_events()

    async with type(self).session.request(
            "GET", url, headers={"User-Agent":
                                 type(self).agent}) as response:
        yield parser.makeelement("HEADERS", attrib=response.headers)
        encoding = response.headers["Content-Type"].split(
            "charset=")[1].split(",")[0]
        # Network chunks can end in the middle of a multibyte character.
        # An incremental decoder buffers the incomplete tail instead of
        # raising UnicodeDecodeError as a per-chunk ``bytes.decode`` would.
        decoder = codecs.getincrementaldecoder(encoding)()
        async for data, _ in response.content.iter_chunks():
            text = decoder.decode(data)
            if text:
                parser.feed(text)
            for _, element in events:
                yield element
                if clear:
                    element.clear()
def __init__(self):
    """Create ``self.parser``, an XML parser mapping tag names to PDF classes.

    Tags of the un-namespaced registry listed below get a dedicated class;
    any other element is built as ``PDFXML``.
    """
    self.parser = etree.XMLParser()

    default_lookup = etree.ElementDefaultClassLookup(PDFXML)
    tag_lookup = etree.ElementNamespaceClassLookup(default_lookup)
    registry = tag_lookup.get_namespace(None)

    element_classes = (
        # leafs
        ('name', PDFName),
        ('string', PDFString),
        ('number', PDFNumber),
        ('null', PDFNull),
        ('bool', PDFBool),
        ('R', PDFR),
        ('header', PDFHeader),
        ('startxref', PDFStartxref),
        ('data', PDFData),
        # trees
        ('entry', PDFEntry),
        ('dictionary', PDFDictionary),
        ('stream', PDFStream),
        ('pdf', PDFPdf),
        ('pdf_update', PDFUpdate),
        ('indirect_object', PDFIndirect),
        ('array', PDFArray),
    )
    for tag_name, element_class in element_classes:
        registry[tag_name] = element_class

    self.parser.set_element_class_lookup(tag_lookup)
def main(benchmark_class):
    """Command-line entry point: select the libraries and run the benchmarks.

    Options are consumed from ``sys.argv``:

    - ``-i``      insert ``src`` at the front of ``sys.path``
    - ``-nolxml`` do not import ``lxml.etree``
    - ``-z``      write a ``callgrind.cmd`` file after the tree setup
    - ``-fel``    install a plain ``ElementDefaultClassLookup`` in lxml
    - ``-c``      also benchmark cElementTree
    - ``-a``      also benchmark cElementTree and ElementTree

    Whatever remains after ``sys.argv[0]`` is passed to ``buildSuites`` as
    the selection. Exits with status 1 when no library could be imported.
    """
    import_lxml = True
    callgrind_zero = False
    if len(sys.argv) > 1:
        try:
            sys.argv.remove('-i')
            # run benchmark 'inplace'
            sys.path.insert(0, 'src')
        except ValueError:
            pass

        try:
            sys.argv.remove('-nolxml')
            # run without lxml
            import_lxml = False
        except ValueError:
            pass

        try:
            sys.argv.remove('-z')
            # reset callgrind after tree setup
            callgrind_zero = True
        except ValueError:
            pass

    initArgs(sys.argv)

    _etrees = []
    if import_lxml:
        from lxml import etree
        _etrees.append(etree)

        try:
            sys.argv.remove('-fel')
        except ValueError:
            pass
        else:
            # use fast element creation in lxml.etree
            etree.set_element_class_lookup(etree.ElementDefaultClassLookup())

    if len(sys.argv) > 1:
        if '-a' in sys.argv or '-c' in sys.argv:
            # 'all' or 'C-implementations' ?
            try:
                sys.argv.remove('-c')
            except ValueError:
                pass
            # standalone package first, then the one shipped with Python
            try:
                import cElementTree as cET
                _etrees.append(cET)
            except ImportError:
                try:
                    import xml.etree.cElementTree as cET
                    _etrees.append(cET)
                except ImportError:
                    pass

        try:
            # 'all' ?
            sys.argv.remove('-a')
        except ValueError:
            pass
        else:
            # standalone package first, then the one shipped with Python
            try:
                from elementtree import ElementTree as ET
                _etrees.append(ET)
            except ImportError:
                try:
                    from xml.etree import ElementTree as ET
                    _etrees.append(ET)
                except ImportError:
                    pass

    if not _etrees:
        print("No library to test. Exiting.")
        sys.exit(1)

    print("Preparing test suites and trees ...")
    selected = set(sys.argv[1:])
    benchmark_suites, benchmarks = \
        buildSuites(benchmark_class, _etrees, selected)

    print("Running benchmark on", ', '.join(b.lib_name
                                            for b in benchmark_suites))
    print('')

    printSetupTimes(benchmark_suites)

    if callgrind_zero:
        cmd = open("callgrind.cmd", 'w')
        cmd.write('+Instrumentation\n')
        cmd.write('Zero\n')
        cmd.close()

    runBenchmarks(benchmark_suites, benchmarks)
class RDFXMLReader:
    """Reader turning an RDF/XML document into a stream of triples.

    The comments naming a production ("7.2.11", "7.2.13", ...) are section
    numbers carried over from the original code.
    """

    # Groups of reserved RDF terms used to reject illegal tags and attributes
    CORE_SYNTAX_TERMS = {
        RDF.RDF, RDF.ID, RDF.about, RDF.parseType, RDF.resource, RDF.nodeID,
        RDF.datatype
    }
    SYNTAX_TERMS = CORE_SYNTAX_TERMS | {RDF.Description, RDF.li}
    OLD_TERMS = {RDF.aboutEach, RDF.aboutEachPrefix, RDF.bagID}
    XML_TERMS = {XML.base, XML.lang}

    ILLEGAL_NODE_TAGS = CORE_SYNTAX_TERMS | {RDF.li} | OLD_TERMS
    ILLEGAL_PROPERTY_TAGS = CORE_SYNTAX_TERMS | {RDF.Description} | OLD_TERMS
    ILLEGAL_PROPERTY_ATTRS = SYNTAX_TERMS | OLD_TERMS

    # Lookup making the default parser build ``Element`` instances
    _PARSER_LOOKUP = etree.ElementDefaultClassLookup(element=Element)

    def __init__(self, parser=None):
        """Store ``parser``, or create one that drops comments and processing
        instructions and builds ``Element`` nodes."""
        if parser is None:
            parser = etree.XMLParser(remove_comments=True, remove_pis=True)
            parser.set_element_class_lookup(self._PARSER_LOOKUP)
        self.parser = parser

    def read(self, lines, base_uri=None):
        """Parse ``lines`` and yield ``(subject, predicate, object)`` triples.

        ``lines`` is handed to ``etree.parse`` and ``base_uri`` is used as
        the base URL of the document.
        """
        root = etree.parse(lines, self.parser, base_url=base_uri).getroot()
        # URIs already declared through rdf:ID; they must be unique
        ids = set()
        # rdf:RDF is not necessarily the root element.
        for element in root if root.uri == RDF.RDF else [root]:
            self._validate(element)
            for triple in self._node_element(element, ids):
                yield triple

    def _validate(self, element):
        """Remove ignorable attributes from ``element``; raise ``ParseError``
        on an invalid rdf:bagID or on an attribute in ``_OLD_ATTRS``."""
        for attr, value in element.items():
            attr = QName(attr)
            # Ignore unknown and reserved XML attributes.
            if attr.namespace is None or (attr.namespace == XML
                                          and attr not in _XML_ATTRS):
                del element.attrib[attr]
            # Validate but ignore old syntax terms.
            elif attr == QName(RDF, 'bagID'):
                if not _NCNAME.match(value):
                    raise ParseError(
                        "rdf:bagID does not match NCName: {!r}".format(value))
                del element.attrib[attr]
            elif attr in _OLD_ATTRS:
                raise ParseError

    def _node_element(self, element, ids):
        """Yield the triples of a node element and store its subject in
        ``element.subject``."""
        # 7.2.11 Production nodeElement
        self._validate(element)
        if element.uri in self.ILLEGAL_NODE_TAGS:
            raise ParseError("Illegal node element: {!s}".format(element.tag))
        element.subject = self._subject(element, ids)
        # 2.13 Typed Node Elements
        if element.uri != RDF.Description:
            yield (element.subject, RDF.type, element.uri)
        for triple in self._property_attrs(element):
            yield triple
        for triple in self._property_elements(element, ids):
            yield triple

    def _subject(self, element, ids):
        """Return the subject of a node element.

        rdf:ID, rdf:nodeID and rdf:about are mutually exclusive; combining
        them raises ``ParseError``. Without any of them a new blank node is
        returned.
        """
        id_ = self._id(element, ids)
        node_id = element.get(QName(RDF, 'nodeID'))
        about = element.get(QName(RDF, 'about'))
        if id_ is not None:
            if node_id is None:
                if about is None:
                    return id_
                raise ParseError
            raise ParseError
        elif node_id is not None:
            if about is None:
                if _NCNAME.match(node_id):
                    return BlankNode(node_id)
                raise ParseError
            raise ParseError
        elif about is not None:
            return self._uri(about, element.base_uri)
        return BlankNode()

    def _uri(self, uri, base_uri=None):
        """Resolve ``uri`` against ``base_uri`` and return it as a ``URI``."""
        # An empty reference resolves to the base URI without its fragment
        if base_uri and not uri:
            base_uri = base_uri.rsplit('#', 1)[0]
        return URI(urllib.parse.urljoin(base_uri or '', uri))

    def _id(self, element, ids):
        """Return the URI declared by rdf:ID on ``element``, or ``None``.

        The URI is recorded in ``ids``; a duplicate or a value that is not
        an NCName raises ``ParseError``.
        """
        name = element.get(QName(RDF, 'ID'))
        if name is not None:
            if _NCNAME.match(name):
                uri = self._uri('#' + name, element.base_uri)
                if uri not in ids:
                    ids.add(uri)
                    return uri
                else:
                    raise ParseError("rdf:ID is not unique: {!r}".format(uri))
            else:
                raise ParseError(
                    "rdf:ID does not match NCName: {!r}".format(name))

    def _property_attrs(self, element):
        """Yield one triple per property attribute of ``element``."""
        # 2.5 Property Attributes
        for attr, value in element.items():
            if attr not in _XML_ATTRS:
                predicate = URI(QName(attr))
                if predicate not in self.ILLEGAL_PROPERTY_ATTRS:
                    # rdf:type values are URIs, everything else is a literal
                    if predicate != RDF.type:
                        object_ = PlainLiteral(value, element.language)
                    else:
                        object_ = URI(value)
                    yield (element.subject, predicate, object_)
                elif predicate == RDF.li:
                    raise ParseError("rdf:li is not allowed as attribute")

    def _property_elements(self, parent, ids):
        """Yield the triples of every property element child of ``parent``."""
        # 7.2.13 Production propertyEltList
        li_counter = 1
        for element in parent:
            # 7.2.14 Production propertyElt
            self._validate(element)
            if element.uri in self.ILLEGAL_PROPERTY_TAGS:
                raise ParseError("Illegal property element: {!s}".format(
                    element.tag))
            elif element.uri == RDF.li:
                # Container Membership Property Elements: rdf:li and rdf:_n
                element.uri = RDF['_' + str(li_counter)]
                li_counter += 1
            parse_type = element.attrib.get(QName(RDF, 'parseType'))
            legal_attrs = _XML_ATTRS | {QName(RDF, 'ID')}
            if parse_type is not None:
                legal_attrs.add(QName(RDF, 'parseType'))
                if any(attr not in legal_attrs for attr in element.keys()):
                    raise ParseError
                elif parse_type == 'Resource':
                    triples = self._parse_type_resource_property(
                        element, parent, ids)
                elif parse_type == 'Collection':
                    triples = self._parse_type_collection_property(
                        element, parent, ids)
                else:
                    # any other parseType value is handled as a literal
                    triples = self._parse_type_literal_property(
                        element, parent, ids)
            elif len(element) == 1:
                # NOTE(review): this requires that *no* attribute is legal,
                # which rejects rdf:ID here although ``_resource_property``
                # reifies on it -- confirm whether ``attr in`` was intended
                if all(attr not in legal_attrs for attr in element.keys()):
                    triples = self._resource_property(element, parent, ids)
                else:
                    raise ParseError
            elif len(element) == 0:
                if element.text:
                    legal_attrs.add(QName(RDF, 'datatype'))
                    if all(attr in legal_attrs for attr in element.keys()):
                        triples = self._literal_property(element, parent, ids)
                    else:
                        raise ParseError
                else:
                    triples = self._empty_property(element, parent, ids)
            # NOTE(review): an element with several children and no
            # parseType assigns nothing to ``triples`` above -- confirm
            for triple in triples:
                yield triple

    def _reify(self, uri, triple):
        """Yield the four triples describing ``triple`` as the statement
        ``uri``."""
        yield (uri, RDF.type, RDF.Statement)
        yield (uri, RDF.subject, triple[0])
        yield (uri, RDF.predicate, triple[1])
        yield (uri, RDF.object, triple[2])

    def _resource_property(self, element, parent, ids):
        """Yield the triples of a property whose value is its single child
        node element."""
        # 7.2.15 Production resourcePropertyElt
        node_element = element[0]
        for triple in self._node_element(node_element, ids):
            yield triple
        triple = (parent.subject, element.uri, node_element.subject)
        yield triple
        id_ = self._id(element, ids)
        if id_ is not None:
            # 7.3 Reification Rules
            for triple in self._reify(id_, triple):
                yield triple

    def _literal_property(self, element, parent, ids):
        """Yield the triples of a property whose value is the element text,
        typed when rdf:datatype is present."""
        # 7.2.16 Production literalPropertyElt
        datatype = element.get(QName(RDF, 'datatype'))
        if datatype is not None:
            object_ = TypedLiteral(element.text, URI(datatype))
        else:
            object_ = PlainLiteral(element.text, element.language)
        triple = (parent.subject, element.uri, object_)
        yield triple
        id_ = self._id(element, ids)
        if id_ is not None:
            # 7.3 Reification Rules
            for triple in self._reify(id_, triple):
                yield triple

    def _parse_type_resource_property(self, element, parent, ids):
        """Yield the triples of a ``parseType="Resource"`` property."""
        # 7.2.18 Production parseTypeResourcePropertyElt
        # The children are moved under a synthetic rdf:Description node
        node_element = element.makeelement(QName(RDF, 'Description'))
        node_element[:] = element
        for triple in self._node_element(node_element, ids):
            yield triple
        triple = (parent.subject, element.uri, node_element.subject)
        yield triple
        id_ = self._id(element, ids)
        if id_ is not None:
            # 7.3 Reification Rules
            for triple in self._reify(id_, triple):
                yield triple

    def _parse_type_collection_property(self, element, parent, ids):
        """Yield the triples of a ``parseType="Collection"`` property, whose
        children become an rdf:first / rdf:rest list."""
        # 7.2.19 Production parseTypeCollectionPropertyElt
        node_ids = []
        for node_element in element:
            for triple in self._node_element(node_element, ids):
                yield triple
            # one fresh blank node per list cell
            node_ids.append((node_element, BlankNode()))
        # The property value is the first list cell, or rdf:nil when the
        # collection is empty
        for node_element, object_ in node_ids:
            break
        else:
            object_ = RDF.nil
        triple = (parent.subject, element.uri, object_)
        yield triple
        id_ = self._id(element, ids)
        if id_ is not None:
            # 7.3 Reification Rules
            for triple in self._reify(id_, triple):
                yield triple
        for i, (node_element, object_) in enumerate(node_ids):
            yield (object_, RDF.first, node_element.subject)
            try:
                next_pair = node_ids[i + 1]
            except IndexError:
                # last cell: the list is terminated by rdf:nil
                next_object = RDF.nil
            else:
                next_element, next_object = next_pair
            yield (object_, RDF.rest, next_object)

    def _parse_type_literal_property(self, element, parent, ids):
        """Yield the triples of a property whose value is an rdf:XMLLiteral
        built from the element content."""
        literal = element.text or ""
        if len(element):
            # NOTE(review): only the first child is serialized -- confirm
            tree = etree.ElementTree(element[0])
            bytes_io = BytesIO()
            tree.write_c14n(bytes_io, exclusive=True, with_comments=True)
            literal += bytes_io.getvalue().decode('utf-8')
            literal += element[0].tail or ""
        object_ = TypedLiteral(literal, RDF.XMLLiteral)
        triple = (parent.subject, element.uri, object_)
        yield triple
        id_ = self._id(element, ids)
        if id_ is not None:
            # 7.3 Reification Rules
            for triple in self._reify(id_, triple):
                yield triple

    def _empty_property(self, element, parent, ids):
        """Yield the triples of a property element without text or children."""
        # 7.2.21 Production emptyPropertyElt
        id_ = self._id(element, ids)
        literal_attrs = _XML_ATTRS | {QName(RDF, 'ID')}
        if all(attr in literal_attrs for attr in element.keys()):
            # Nothing but XML attributes and rdf:ID: the value is an empty
            # literal
            object_ = PlainLiteral("", element.language)
            triple = (parent.subject, element.uri, object_)
            yield triple
            if id_ is not None:
                for triple in self._reify(id_, triple):
                    yield triple
        else:
            resource = element.attrib.get(QName(RDF, 'resource'))
            node_id = element.attrib.get(QName(RDF, 'nodeID'))
            if resource is not None:
                if node_id is None:
                    object_ = self._uri(resource, element.base_uri)
                else:
                    raise ParseError
            elif node_id is not None:
                if _NCNAME.match(node_id):
                    object_ = BlankNode(node_id)
                else:
                    raise ParseError(
                        "rdf:nodeID does not match NCName: {!r}".format(
                            node_id))
            else:
                object_ = BlankNode()
            triple = (parent.subject, element.uri, object_)
            yield triple
            if id_ is not None:
                for triple in self._reify(id_, triple):
                    yield triple
            # The remaining attributes are properties of the object
            subject = object_
            property_attrs = set(element.keys())
            property_attrs -= literal_attrs | {
                QName(RDF, 'resource'),
                QName(RDF, 'nodeID')
            }
            for attr in property_attrs:
                predicate = URI(QName(attr))
                if predicate in self.XML_TERMS:
                    continue
                elif predicate in self.ILLEGAL_PROPERTY_ATTRS:
                    raise ParseError
                value = element.get(attr)
                if predicate != RDF.type:
                    object_ = PlainLiteral(value, element.language)
                else:
                    object_ = self._uri(value, element.base_uri)
                yield (subject, predicate, object_)
def HTMLParser(*args, **kwargs):
    """Return an ``etree.HTMLParser`` that builds ``HTMLElement`` nodes.

    All arguments are forwarded unchanged to ``etree.HTMLParser``.
    """
    html_parser = etree.HTMLParser(*args, **kwargs)
    html_parser.set_element_class_lookup(
        etree.ElementDefaultClassLookup(element=HTMLElement))
    return html_parser
def HTMLParser(*args, **kwargs):
    """Return an ``etree.HTMLParser`` that builds ``HTMLElement`` nodes.

    All arguments are forwarded to ``etree.HTMLParser``; ``encoding``
    defaults to ``'utf-8'`` when the caller does not pass it.
    """
    if 'encoding' not in kwargs:
        kwargs['encoding'] = 'utf-8'
    html_parser = etree.HTMLParser(*args, **kwargs)
    html_parser.set_element_class_lookup(
        etree.ElementDefaultClassLookup(element=HTMLElement))
    return html_parser
return True  # tail of a definition that begins before this chunk


class ExactlyOneError(ValueError):
    """Raised by ``one`` when its argument does not hold exactly one item."""
    pass


def one(mylist):
    """
    assert that there's only one thing, and get it.

    Returns the single item of ``mylist``. Raises ``ExactlyOneError``
    otherwise; the message lists the items, with ``etree.ElementBase``
    instances rendered through their ``tostring()`` method.
    """
    if len(mylist) != 1:
        raise ExactlyOneError(
            'Expected exactly one item. Got %i: %r' % (
                len(mylist),
                [
                    item.tostring()
                    if isinstance(item, etree.ElementBase)
                    else item
                    for item in mylist
                ]
            )
        )
    return mylist[0]


# Lookup making a parser build ``RefactorLibNodeBase`` elements
node_lookup = etree.ElementDefaultClassLookup(element=RefactorLibNodeBase)

__all__ = ('RefactorLibNodeBase',)
@staticmethod
def to_string(element):
    """Serialize ``element`` as pretty-printed UTF-8 XML with a declaration.

    ``element`` may also be a callable returning the element to serialize.
    """
    node = element() if callable(element) else element
    return etree.tostring(
        node,
        encoding='utf-8',
        xml_declaration=True,
        pretty_print=True,
    )


# Parser shared by the element makers below: namespace-based class lookup
# falling back to ``ElementBase``.
XMLParserLookup = etree.ElementNamespaceClassLookup(
    fallback=etree.ElementDefaultClassLookup(element=ElementBase))
XMLParser = etree.XMLParser(encoding='utf-8', no_network=False)
XMLParser.set_element_class_lookup(XMLParserLookup)

# Maker for elements without a default namespace
E = ElementMaker(
    nsmap=SOAP_NSMAP,
    makeelement=XMLParser.makeelement,
)

# Maker for elements in the ``SOAP_ENV_URI`` namespace
S = ElementMaker(
    namespace=SOAP_ENV_URI,
    nsmap=SOAP_NSMAP,
    makeelement=XMLParser.makeelement,
)
# create etree parser using custom Element class class LayoutElement(etree.ElementBase): @property def layout(self): if not hasattr(self, '_layout'): self._layout = None return self._layout @layout.setter def layout(self, value): self._layout = value parser_lookup = etree.ElementDefaultClassLookup(element=LayoutElement) parser = etree.XMLParser() parser.set_element_class_lookup(parser_lookup) # main class class PDFQuery(object): def __init__( self, file, merge_tags=('LTChar', 'LTAnno'), round_floats=True, round_digits=3, input_text_formatter=None, normalize_spaces=True, resort=True,