def testIdentityTransformWithNS(self):
        """Check that a namespaced document comes back as one identical subtree.

        The only ``buildFor`` predicate matches when the tag stack consists of
        just the namespaced root element. Feeding ``XML_NS`` must therefore
        produce exactly one subtree, registered under the identifier ``'one'``
        and equal to the parsed ``XML_NS`` document.
        """
        builder = SubTreesTreeBuilder(buildFor={
            'one': lambda stack: [d['tag'] for d in stack] == ['{u:ri/default#}root'],
        })
        parser = XMLParser(target=builder)
        parser.feed(XML_NS)
        parser.close()

        subtrees = list(builder.getSubtrees())
        # assertEqual replaces the deprecated assertEquals alias, which no
        # longer exists in unittest as of Python 3.12.
        self.assertEqual(1, len(subtrees))

        # Avoid the names `id` and `lxml`, which shadow a builtin and a module.
        identifier, subtree = subtrees[0]
        self.assertEqual('one', identifier)
        self.assertEqualsLxml(parseString(XML_NS), subtree)
Esempio n. 2
0
    def start(self):
        """Stream ``self._stream`` through an XML parser and report subtrees.

        The stream is read in chunks of 4096. After each chunk has been fed,
        and once more after the parser has been closed, every subtree the
        builder has available is passed to ``self._callback``.
        """
        def matchesPath(stack):
            # True when the tags on the stack equal the configured path.
            return [entry['tag'] for entry in stack] == self._path

        builder = SubTreesTreeBuilder(buildFor={'simple': matchesPath})
        parser = XMLParser(target=builder)

        def flushSubtrees():
            # The identifier is not needed; only the subtree is handed on.
            for _identifier, subtree in builder.getSubtrees():
                self._callback(subtree)

        while True:
            chunk = self._stream.read(4096)
            if not chunk:
                break
            parser.feed(chunk)
            flushSubtrees()

        parser.close()
        flushSubtrees()
Esempio n. 3
0
    def start(self):
        """Parse ``self._stream`` incrementally, calling back for each subtree.

        A subtree is built whenever the stack of open tags equals
        ``self._path``. Data is read 4096 at a time; finished subtrees are
        delivered to ``self._callback`` after every chunk and again after the
        parser has been closed.
        """
        def onConfiguredPath(tagStack):
            currentTags = [item['tag'] for item in tagStack]
            return currentTags == self._path

        treeBuilder = SubTreesTreeBuilder(buildFor={'simple': onConfiguredPath})

        def deliverPending():
            for _key, tree in treeBuilder.getSubtrees():
                self._callback(tree)

        xmlParser = XMLParser(target=treeBuilder)

        while True:
            block = self._stream.read(4096)
            if not block:
                # An empty read signals the end of the stream.
                break
            xmlParser.feed(block)
            deliverPending()

        xmlParser.close()
        deliverPending()
Esempio n. 4
0
 def run(self, xmlPath, xmlFile):
     """Feed the records of an XML file through the ``self.nodeTree`` target.

     The file is read line by line and every line is fed to a recovering
     ``XMLParser``, wrapped between a synthetic ``<Root>`` opening tag and
     its matching ``</Root>`` closing tag. A ``ParseError`` raised while
     feeding a single record is logged and the loop continues with the next
     record.

     Args:
         xmlPath: Path of the file that is opened for reading.
         xmlFile: Name of the file, used only in log messages.

     Raises:
         Exception: Any other failure is logged together with the file name
             and the row count (``None`` when it was not yet known) and
             then re-raised unchanged.
     """
     logger.info(f'{self.name}, normalise start ...')
     # Bound before the try block so the error handler below can always
     # format its message, even when the failure happens early.
     rowcount = None
     try:
         parser = XMLParser(target=self.nodeTree, recover=True)
         logger.info(f'parsing {xmlFile} ...')
         with open(xmlPath, 'r') as fhr:
             parser.feed('<Root>\n')
             for xmlRecord in fhr:
                 try:
                     self.nodeTree.count()
                     parser.feed(xmlRecord)
                 except ParseError:
                     # Logger.error requires a message; exc_info adds the traceback.
                     logger.error(f'{self.name}, failed to parse record in {xmlFile}',
                                  exc_info=True)
             # Close the synthetic root element opened above.
             parser.feed('</Root>\n')
             parser.close()
         rowcount = self.nodeTree.result()
         logger.info(f'### {xmlFile} rowcount : {rowcount}')
     except Exception:
         errMsg = f'xml inputFile, recnum : {xmlFile}, {rowcount}'
         logger.error(errMsg, exc_info=True)
         raise
Esempio n. 5
0
def dictnode_to_lxml(tree, node_lookup=None, encoding=None):
    """
    Convert a dictionary-based node tree into the equivalent lxml tree.

    Every node dictionary is read through the following keys:
        name -- The type of node, a string. In html, this would be the tag name.
        text -- The content of the node: <b>text</b>
        tail -- Any content after the end of this node, but before the start of the next: <br/>tail
        attrs -- An optional dictionary of any extra attributes.
        children -- An ordered list of more node-dictionaries.

    Returns the lxml element built for `tree`, with all descendants attached.
    """
    if not node_lookup:
        from refactorlib.node import node_lookup

    from lxml.etree import Element, XMLParser

    def ordered_attrs(node):
        # Attributes are inserted in sorted key order for determinism.
        raw = node.get('attrs', {})
        return {key: raw[key] for key in sorted(raw)}

    def finish(node, element, pending):
        # Copy the textual content, then queue the children. They are pushed
        # in reverse so that popping the stack visits them in document order.
        element.text = node['text']
        element.tail = node['tail']
        pending.extend((child, element) for child in reversed(node['children']))

    # The root comes from a throwaway parsed document instead of
    # parser.makeelement(), because with makeelement() the encoding is
    # always set to 'UTF8'.
    root_attrs = ordered_attrs(tree)
    parser = XMLParser(encoding=encoding)
    parser.set_element_class_lookup(node_lookup)
    parser.feed(b'<a/>')
    root = parser.close()
    root.tag = tree['name']
    root.attrib.update(root_attrs)

    pending = []
    finish(tree, root, pending)

    while pending:
        node, parent = pending.pop()
        node_attrs = ordered_attrs(node)
        element = Element(node['name'], attrib=node_attrs)
        parent.append(element)
        finish(node, element, pending)

    return root
Esempio n. 6
0
def dictnode_to_lxml(tree, node_lookup=None, encoding=None):
    """
    Build an lxml tree from a dictionary-based representation of a node tree.

    Every node dictionary is read through the following keys:
        name -- The type of node, a string. In html, this would be the tag name.
        text -- The content of the node: <b>text</b>
        tail -- Any content after the end of this node, but before the start of the next: <br/>tail
        attrs -- An optional dictionary of any extra attributes.
        children -- An ordered list of more node-dictionaries.

    Returns the lxml element built for `tree`, with all descendants attached.
    """
    if not node_lookup:
        from node import node_lookup

    from lxml.etree import XMLParser
    parser = XMLParser(encoding=encoding)
    parser.set_element_class_lookup(node_lookup)
    make_element = parser.makeelement

    root = None
    pending = [(tree, root)]

    while pending:
        node, parent = pending.pop()

        if parent is not None:
            element = make_element(node['name'], attrib=node.get('attrs', {}))
            parent.append(element)
        else:
            # The root is taken from a throwaway parsed document instead of
            # parser.makeelement(), because with makeelement() the encoding
            # is always set to 'UTF8'.
            parser.feed('<trash></trash>')
            element = parser.close()
            element.tag = node['name']
            element.attrib.update(node.get('attrs', {}))
            root = element

        element.text = node['text']
        element.tail = node['tail']

        # Children go on the stack reversed so they pop in document order.
        pending.extend((child, element) for child in reversed(node['children']))

    return root
def parseIncrementallyBy20(builder, inputXml):
    """Parse `inputXml` in chunks of 20, collecting subtrees as they appear.

    After each chunk is fed, and once more after the parser is closed, the
    (id, subtree) pairs available from `builder.getSubtrees()` are gathered.

    Returns a tuple of the list of collected (id, subtree) pairs and the
    number of chunks that were fed.
    """
    parser = XMLParser(target=builder)
    xmlStream = StringIO(inputXml)
    collected = []

    def drain():
        collected.extend(
            (identifier, subtree)
            for identifier, subtree in builder.getSubtrees()
        )

    chunkCount = 0
    while True:
        chunk = xmlStream.read(20)
        if not chunk:
            break
        chunkCount += 1
        parser.feed(chunk)
        drain()

    closeResult = parser.close()
    drain()

    # Sanity checks: closing yields nothing and the chunk count matches
    # the input length.
    assert closeResult is None, 'Errr?'
    assert ceil(len(inputXml) / 20.0) == chunkCount, 'Errr?'
    return collected, chunkCount
Esempio n. 8
0
    def start(self, tag, attrib):
        """Handle an opening tag; only 'outline' elements are of interest.

        An outline with a non-empty 'xmlUrl' attribute is written out as a
        row of (current group name, text, xmlUrl, htmlUrl or ''). Any other
        outline sets the current group name from its 'text' attribute.
        """
        if tag != 'outline':
            # Anything that is not part of the outline is skipped.
            return
        if attrib.get('xmlUrl'):
            # A podcast entry: emit one row for it.
            self.writer.writerow((
                self.group_name,
                attrib['text'],
                attrib['xmlUrl'],
                attrib.get('htmlUrl', ''),
            ))
        else:
            # A group heading: remember it for the entries that follow.
            self.group_name = attrib['text']

    def end(self, tag):
        """Closing tags are ignored; this handler does nothing."""
        return None

    def data(self, data):
        """Text inside nodes is ignored; this handler does nothing."""
        return None

    def close(self):
        """No work is needed when parsing ends; returns None."""
        return None


# Script entry: parse 'podcasts.opml' with a PodcastListToCSV target that was
# constructed with sys.stdout; the parser forwards its parse events to that
# target object.
target = PodcastListToCSV(sys.stdout)
parser = XMLParser(target=target)
# Feed the file one line at a time rather than reading it in one go.
with open('podcasts.opml', 'rt') as f:
    for line in f:
        parser.feed(line)
# Tell the parser that no more input will follow.
parser.close()