def testIdentityTransformWithNS(self):
    # A single 'one' predicate that matches when the element stack consists
    # of just the namespaced root element; the test expects exactly one
    # subtree back, equal to the whole parsed XML_NS document.
    builder = SubTreesTreeBuilder(buildFor={
        'one': lambda stack: [d['tag'] for d in stack] == ['{u:ri/default#}root'],
    })
    parser = XMLParser(target=builder)
    parser.feed(XML_NS)
    parser.close()

    subtrees = list(builder.getSubtrees())
    # assertEqual rather than the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(1, len(subtrees))
    # 'identifier' rather than 'id' so the builtin is not shadowed.
    identifier, lxml = subtrees[0]
    self.assertEqual('one', identifier)
    self.assertEqualsLxml(parseString(XML_NS), lxml)
def start(self):
    """Parse ``self._stream`` incrementally and pass each built subtree to
    ``self._callback``.

    The stream is read in 4096-unit chunks. After every chunk, and once more
    after the parser has been closed, all subtrees the builder has available
    are handed to the callback. A subtree is built when the list of tags on
    the element stack equals ``self._path``.
    """
    def matchesPath(stack):
        return [entry['tag'] for entry in stack] == self._path

    builder = SubTreesTreeBuilder(buildFor={'simple': matchesPath})
    parser = XMLParser(target=builder)

    def deliver():
        # The identifier half of each pair is not needed here.
        for _identifier, subtree in builder.getSubtrees():
            self._callback(subtree)

    while True:
        chunk = self._stream.read(4096)
        if not chunk:
            break
        parser.feed(chunk)
        deliver()

    parser.close()
    # Closing the parser may complete further subtrees; deliver those too.
    deliver()
def run(self, xmlPath, xmlFile):
    """Feed the records of an XML file to ``self.nodeTree`` and log the result.

    The file's lines are wrapped in a synthetic ``<Root>`` element and fed to
    an ``XMLParser`` (created with ``recover=True``) one line at a time.
    ``self.nodeTree.count()`` is called before each line is fed. A
    ``ParseError`` raised for a single line is logged and the remaining
    lines are still processed.

    Args:
        xmlPath: Path of the file that is opened and read.
        xmlFile: Name used for the file in log messages.

    Raises:
        Exception: Any failure other than a per-line ``ParseError`` is
            logged with its traceback and re-raised unchanged.
    """
    logger.info(f'{self.name}, normalise start ...')
    # Bound up front so the failure message in the outer handler can always
    # be formatted; previously an early failure hit an unbound name there
    # and masked the real exception.
    rowcount = None
    try:
        parser = XMLParser(target=self.nodeTree, recover=True)
        logger.info(f'parsing {xmlFile} ...')
        with open(xmlPath, 'r') as fhr:
            parser.feed('<Root>\n')
            for xmlRecord in fhr:
                try:
                    self.nodeTree.count()
                    parser.feed(xmlRecord)
                except ParseError:
                    # Logger.error() requires a message; calling it with
                    # only exc_info raised TypeError and aborted the run.
                    logger.error(f'failed to parse record in {xmlFile}', exc_info=True)
            # Properly close the synthetic wrapper element (was '<\Root>').
            parser.feed('</Root>\n')
        parser.close()
        rowcount = self.nodeTree.result()
        logger.info(f'### {xmlFile} rowcount : {rowcount}')
    except Exception:
        errMsg = f'xml inputFile, recnum : {xmlFile}, {rowcount}'
        logger.error(errMsg, exc_info=True)
        raise
def dictnode_to_lxml(tree, node_lookup=None, encoding=None):
    """
    Build an lxml tree from a dictionary-based node tree.

    Input: the root node dictionary.
    Output: the root lxml element of the equivalent tree.

    Each node dictionary is read with these keys:
        name -- The type of node, a string. In html, this would be the tag name.
        text -- The content of the node: <b>text</b>
        tail -- Any content after the end of this node, but before the start
                of the next: <br/>tail
        attrs -- Optional dictionary of extra attributes.
        children -- An ordered list of more node dictionaries.
    """
    if not node_lookup:
        from refactorlib.node import node_lookup

    from lxml.etree import Element, XMLParser

    def build_root(source):
        # The root comes from parsing a throwaway document rather than from
        # parser.makeelement(), because the latter always sets the encoding
        # to 'UTF8'.
        parser = XMLParser(encoding=encoding)
        parser.set_element_class_lookup(node_lookup)
        parser.feed(b'<a/>')
        element = parser.close()
        element.tag = source['name']
        return element

    root = None
    pending = [(tree, None)]
    while pending:
        current, parent_element = pending.pop()

        # Attributes are applied in sorted key order for determinism.
        raw_attrs = current.get('attrs', {})
        ordered_attrs = {key: raw_attrs[key] for key in sorted(raw_attrs)}

        if parent_element is not None:
            element = Element(current['name'], attrib=ordered_attrs)
            parent_element.append(element)
        else:
            element = build_root(current)
            element.attrib.update(ordered_attrs)
            root = element

        element.text = current['text']
        element.tail = current['tail']

        # Pushed in reverse so children are popped, and therefore appended,
        # in their original order.
        pending.extend((child, element) for child in reversed(current['children']))

    return root
def dictnode_to_lxml(tree, node_lookup=None, encoding=None):
    """
    Input: A dictionary-based representation of a node tree.
    Output: An lxml representation of the same.

    Each dictionary has five attributes:
        name -- The type of node, a string. In html, this would be the tag name.
        text -- The content of the node: <b>text</b>
        tail -- Any content after the end of this node, but before the start
                of the next: <br/>tail
        attrs -- A dictionary of any extra attributes (optional).
        children -- An ordered list of more node-dictionaries.
    """
    if not node_lookup:
        from node import node_lookup

    from lxml.etree import XMLParser
    lxml_parser_object = XMLParser(encoding=encoding)
    lxml_parser_object.set_element_class_lookup(node_lookup)
    Element = lxml_parser_object.makeelement

    root = None
    stack = [(tree, root)]

    while stack:
        node, parent = stack.pop()
        attrs = node.get('attrs', {})

        if parent is None:
            # We use this roundabout method because the encoding is always set
            # to 'UTF8' if we use parser.makeelement().
            # The placeholder must be bytes: a text string is handled by lxml
            # as already-decoded input, which bypasses the parser's encoding
            # override and defeats the purpose of this workaround.
            lxml_parser_object.feed(b'<trash></trash>')
            lxmlnode = lxml_parser_object.close()
            lxmlnode.tag = node['name']
            lxmlnode.attrib.update(attrs)
            root = lxmlnode
        else:
            lxmlnode = Element(node['name'], attrib=attrs)
            parent.append(lxmlnode)

        lxmlnode.text = node['text']
        lxmlnode.tail = node['tail']

        # Reversed so that popping from the stack visits children in order.
        for child in reversed(node['children']):
            stack.append((child, lxmlnode))

    return root
def parseIncrementallyBy20(builder, inputXml):
    """Feed ``inputXml`` to a parser targeting ``builder`` 20 characters at a time.

    After every chunk, and once more after the parser is closed, the
    ``(id, subtree)`` pairs yielded by ``builder.getSubtrees()`` are
    collected.

    Returns a tuple ``(pairs, loops)``: the collected pairs in the order they
    were produced, and the number of chunks that were fed.
    """
    parser = XMLParser(target=builder)
    source = StringIO(inputXml)
    collected = []

    def drain():
        collected.extend(
            (identifier, subtree) for identifier, subtree in builder.getSubtrees()
        )

    loops = 0
    while True:
        chunk = source.read(20)
        if not chunk:
            break
        loops += 1
        parser.feed(chunk)
        drain()

    closeResult = parser.close()
    drain()

    # Sanity checks: close() returned nothing and the chunk count matches the
    # input length.
    assert closeResult is None, 'Errr?'
    assert ceil(len(inputXml) / 20.0) == loops, 'Errr?'
    return collected, loops
def start(self, tag, attrib):
    """Handle an opening tag; only 'outline' elements are acted on.

    An outline without a truthy 'xmlUrl' attribute sets the current group
    name from its 'text' attribute. Any other outline is written out as a
    row of (group name, text, xmlUrl, htmlUrl or '').
    """
    if tag == 'outline':
        if attrib.get('xmlUrl'):
            # Output a podcast entry
            row = (
                self.group_name,
                attrib['text'],
                attrib['xmlUrl'],
                attrib.get('htmlUrl', ''),
            )
            self.writer.writerow(row)
        else:
            # Remember the current group
            self.group_name = attrib['text']


def end(self, tag):
    "Closing tags are ignored"


def data(self, data):
    "Data inside nodes is ignored"


def close(self):
    "Nothing to do when parsing finishes"


# Feed the OPML file to the parser line by line; rows go to stdout.
target = PodcastListToCSV(sys.stdout)
parser = XMLParser(target=target)
with open('podcasts.opml', 'rt') as f:
    for line in f:
        parser.feed(line)
parser.close()