def testWordParse(self):
    # Parse a bare word and verify the resulting tree: a document node with
    # empty doctype fields, and an <html> root holding an empty <head> plus a
    # <body> whose only child is the text 'Test'.
    # assertEqual/assertNotEqual are used instead of the assertEquals /
    # assertNotEquals aliases, which are deprecated and gone in Python 3.12.
    with gumboc.parse('Test') as output:
        doctype_node = output.contents.document.contents
        self.assertEqual(gumboc.NodeType.DOCUMENT, doctype_node.type)
        document = doctype_node.v.document
        self.assertEqual('', document.name)
        self.assertEqual('', document.public_identifier)
        self.assertEqual('', document.system_identifier)

        root = output.contents.root.contents
        self.assertEqual(gumboc.NodeType.ELEMENT, root.type)
        self.assertEqual(gumboc.Tag.HTML, root.tag)
        self.assertEqual(gumboc.Namespace.HTML, root.tag_namespace)
        self.assertEqual(2, len(root.children))

        # The <head> is expected to have no source text and no children.
        head = root.children[0]
        self.assertEqual(gumboc.NodeType.ELEMENT, head.type)
        self.assertEqual(gumboc.Tag.HEAD, head.tag)
        self.assertEqual('head', head.tag_name)
        self.assertEqual(gumboc.Namespace.HTML, head.tag_namespace)
        self.assertEqual(0, len(head.original_tag))
        self.assertEqual('', str(head.original_end_tag))
        self.assertEqual(0, head.children.length)

        body = root.children[1]
        self.assertNotEqual(body, doctype_node)
        self.assertEqual(gumboc.NodeType.ELEMENT, body.type)
        self.assertEqual(gumboc.Tag.BODY, body.tag)
        self.assertEqual('body', body.tag_name)
        self.assertEqual(1, len(body.children))

        text_node = body.children[0]
        self.assertEqual(gumboc.NodeType.TEXT, text_node.type)
        self.assertEqual('Test', text_node.text)
def parse(text, **kwargs):
    """Parse `text` with gumboc and return the result as a bs4 soup.

    Args:
        text: Markup handed unchanged to gumboc.parse.
        **kwargs: Extra options forwarded to gumboc.parse.

    Returns:
        The bs4.BeautifulSoup object populated from the parse output.
    """
    with gumboc.parse(text, **kwargs) as output:
        # Start from an empty soup and fill it from the gumbo document node.
        soup = bs4.BeautifulSoup('', "html.parser")
        document_node = output.contents.document.contents
        _add_document(soup, document_node)
        for child in document_node.children:
            converted = _add_node(soup, child)
            soup.append(converted)
        # Done while the parse output is still open.
        _add_next_prev_pointers(soup.html)
        return soup
def testFragment(self):
    # Fragment-parse a lone <div> with an SVG <title> as the fragment context
    # and check that the root has exactly one child: an HTML-namespace DIV
    # element.
    # assertEqual replaces the deprecated assertEquals alias (removed in
    # Python 3.12).
    with gumboc.parse('<div></div>',
                      fragment_context=gumboc.Tag.TITLE,
                      fragment_namespace=gumboc.Namespace.SVG) as output:
        root = output.contents.root.contents
        self.assertEqual(1, len(root.children))
        div = root.children[0]
        self.assertEqual(gumboc.NodeType.ELEMENT, div.type)
        self.assertEqual(gumboc.Tag.DIV, div.tag)
        self.assertEqual(gumboc.Namespace.HTML, div.tag_namespace)
def testFragment(self):
    # Fragment parse of '<div></div>' inside an SVG <title> container: the
    # root must end up with a single child, and that child must be an
    # HTML-namespace DIV element.
    parsed = gumboc.parse(
        '<div></div>',
        container=gumboc.Tag.TITLE,
        container_namespace=gumboc.Namespace.SVG)
    with parsed as output:
        fragment_root = output.contents.root.contents
        self.assertEqual(1, len(fragment_root.children))

        only_child = fragment_root.children[0]
        self.assertEqual(gumboc.NodeType.ELEMENT, only_child.type)
        self.assertEqual(gumboc.Tag.DIV, only_child.tag)
        self.assertEqual(gumboc.Namespace.HTML, only_child.tag_namespace)
def testSarcasm(self):
    # Parse a <sarcasm> element nested between two <div>s and check that it
    # is reported as an element with Tag.UNKNOWN, with its original start and
    # end tag text and its tag name intact.
    # assertEqual replaces the deprecated assertEquals alias (removed in
    # Python 3.12).
    with gumboc.parse('<div><sarcasm><div></div></sarcasm></div>') as output:
        root = output.contents.root.contents
        body = root.children[1]
        div = body.children[0]
        sarcasm = div.children[0]
        self.assertEqual(gumboc.NodeType.ELEMENT, sarcasm.type)
        self.assertEqual(gumboc.Tag.UNKNOWN, sarcasm.tag)
        self.assertEqual('<sarcasm>', str(sarcasm.original_tag))
        self.assertEqual('</sarcasm>', str(sarcasm.original_end_tag))
        self.assertEqual('sarcasm', sarcasm.tag_name.decode('utf-8'))
def testUnknownTag(self):
    # Parse markup whose first body child is an unrecognised <foo> element
    # and check that it is reported as Tag.UNKNOWN with its original start
    # tag text, an empty original end tag, and its bar=quux attribute.
    # assertEqual replaces the deprecated assertEquals alias (removed in
    # Python 3.12).
    with gumboc.parse('<foo bar=quux>1<p>2</foo>') as output:
        root = output.contents.root.contents
        body = root.children[1]
        foo = body.children[0]
        self.assertEqual(gumboc.NodeType.ELEMENT, foo.type)
        self.assertEqual(gumboc.Tag.UNKNOWN, foo.tag)
        self.assertEqual('<foo bar=quux>', str(foo.original_tag))
        self.assertEqual('', str(foo.original_end_tag))
        self.assertEqual('foo', foo.tag_name.decode('utf-8'))
        self.assertEqual('bar', foo.attributes[0].name)
        self.assertEqual('quux', foo.attributes[0].value)
def testBufferThatGoesAway(self):
    # Build the parse input from a StringIO that is closed before the parse
    # context is entered, then run the same checks as the unknown-tag case on
    # the resulting tree.
    # NOTE(review): presumably the ten repetitions exist to churn memory so
    # that a dangling input buffer would show up -- confirm intent.
    # assertEqual replaces the deprecated assertEquals alias (removed in
    # Python 3.12); the unused loop counter is named `_`.
    for _ in range(10):
        source = StringIO.StringIO('<foo bar=quux>1<p>2</foo>')
        parse_tree = gumboc.parse(source.read())
        source.close()
    with parse_tree as output:
        root = output.contents.root.contents
        body = root.children[1]
        foo = body.children[0]
        self.assertEqual(gumboc.NodeType.ELEMENT, foo.type)
        self.assertEqual(gumboc.Tag.UNKNOWN, foo.tag)
        self.assertEqual('<foo bar=quux>', str(foo.original_tag))
        self.assertEqual('', str(foo.original_end_tag))
        self.assertEqual('foo', foo.tag_name.decode('utf-8'))
        self.assertEqual('bar', foo.attributes[0].name)
        self.assertEqual('quux', foo.attributes[0].value)
def parse(self, text_or_file, **kwargs):
    """Parse a document with gumboc and build it into self.tree.

    Args:
        text_or_file: Either the markup itself or an object with a read()
            method that returns the markup.
        **kwargs: Extra options forwarded to gumboc.parse.

    Returns:
        The result of self.tree.getDocument() after the tree is populated.

    Raises:
        AssertionError: If a document-level child is neither a comment nor
            an element/template node.
    """
    try:
        text = text_or_file.read()
    except AttributeError:
        # Assume a string.
        text = text_or_file
    with gumboc.parse(text, **kwargs) as output:
        _convert_doctype(self.tree, output.contents.document.contents)
        for node in output.contents.document.contents.children:
            if node.type == gumboc.NodeType.COMMENT:
                self.tree.insertComment(
                    {'data': node.v.text.text.decode('utf-8')},
                    self.tree.document)
            elif node.type in (gumboc.NodeType.ELEMENT,
                               gumboc.NodeType.TEMPLATE):
                _insert_root(self.tree, output.contents.root.contents)
            else:
                # Raise explicitly: `assert '<message>'` tests a non-empty
                # string, which is always true, so it could never fail.
                raise AssertionError(
                    'Only comments and <html> nodes allowed at the root')
        return self.tree.getDocument()
def __init__(self, text, **kwargs):
    """Parse `text` with gumboc and build the matching BeautifulSoup tree.

    Args:
        text: Markup passed straight through to gumboc.parse.
        **kwargs: Extra options forwarded to gumboc.parse.
    """
    # GumboNode addresses are recorded as nodes are added and matched up with
    # the BeautifulSoup objects they turn into, so that next/previous
    # pointers can be wired to BeautifulSoup objects rather than ctypes ones.
    self._node_map = {}

    # Handler table: the document and element handlers, followed by one text
    # handler per BeautifulSoup string class listed below.
    # NOTE(review): presumably indexed by gumbo node type -- confirm against
    # the code that reads _HANDLERS.
    text_classes = (
        BeautifulSoup.NavigableString,
        BeautifulSoup.CData,
        BeautifulSoup.Comment,
        BeautifulSoup.NavigableString,
    )
    handlers = [_add_document, self._add_element]
    for text_class in text_classes:
        handlers.append(_add_text(text_class))
    self._HANDLERS = handlers

    self.soup = BeautifulSoup.BeautifulSoup()
    with gumboc.parse(text, **kwargs) as output:
        gumbo_root = output.contents.root.contents
        self.soup.append(self._add_node(gumbo_root))
        # Done while the parse output is still open.
        self._fix_next_prev_pointers(self.soup)
def parse(self, text_or_file, **kwargs):
    """Parse a document with gumboc and build it into self.tree.

    Args:
        text_or_file: Either the markup itself or an object with a read()
            method that returns the markup.
        **kwargs: Extra options forwarded to gumboc.parse.

    Returns:
        The result of self.tree.getDocument() after the tree is populated.

    Raises:
        AssertionError: If a document-level child is neither a comment nor
            an element node.
    """
    try:
        text = text_or_file.read()
    except AttributeError:
        # Assume a string.
        text = text_or_file
    with gumboc.parse(text, **kwargs) as output:
        _convert_doctype(self.tree, output.contents.document.contents)
        for node in output.contents.document.contents.children:
            if node.type == gumboc.NodeType.COMMENT:
                self.tree.insertComment(
                    {'data': node.v.text.text.decode('utf-8')},
                    self.tree.document)
            elif node.type == gumboc.NodeType.ELEMENT:
                _insert_root(self.tree, output.contents.root.contents)
            else:
                # Raise explicitly: `assert '<message>'` tests a non-empty
                # string, which is always true, so it could never fail.
                raise AssertionError(
                    'Only comments and <html> nodes allowed at the root')
        return self.tree.getDocument()
def parseFragment(self, text_or_file, container, **kwargs):
    """Parse a markup fragment with gumboc and build it into self.tree.

    Args:
        text_or_file: Either the markup itself or an object with a read()
            method that returns the markup.
        container: Name of the container tag, optionally preceded by a
            namespace name and a single space (e.g. 'svg title'). Without a
            prefix the namespace is "html".
        **kwargs: Extra options forwarded to gumboc.parse.

    Returns:
        The result of self.tree.getFragment() after the tree is populated.

    Raises:
        AssertionError: If a document-level child is neither an element nor
            a template node.
    """
    try:
        text = text_or_file.read()
    except AttributeError:
        # Assume a string.
        text = text_or_file
    if ' ' in container:
        container_ns, container = container.split(' ')
    else:
        container_ns = "html"

    with gumboc.parse(
            text,
            container=gumboc.Tag.from_str(container),
            container_namespace=getattr(gumboc.Namespace,
                                        container_ns.upper()),
            **kwargs) as output:
        for node in output.contents.document.contents.children:
            if node.type in (gumboc.NodeType.ELEMENT,
                             gumboc.NodeType.TEMPLATE):
                _insert_root(self.tree, output.contents.root.contents, False)
            else:
                # Raise explicitly: `assert '<message>'` tests a non-empty
                # string, which is always true, so it could never fail.
                raise AssertionError('Malformed fragment parse (??)')
        return self.tree.getFragment()
def parseFragment(self, text_or_file, container, **kwargs):
    """Parse a markup fragment with gumboc and build it into self.tree.

    Args:
        text_or_file: Either the markup itself or an object with a read()
            method that returns the markup.
        container: Name of the container tag, optionally preceded by a
            namespace name and a single space (e.g. 'svg title'). Without a
            prefix the namespace is "html".
        **kwargs: Extra options forwarded to gumboc.parse.

    Returns:
        The result of self.tree.getFragment() after the tree is populated.

    Raises:
        AssertionError: If a document-level child is neither an element nor
            a template node.
    """
    try:
        text = text_or_file.read()
    except AttributeError:
        # Assume a string.
        text = text_or_file
    if ' ' in container:
        container_ns, container = container.split(' ')
    else:
        container_ns = "html"

    with gumboc.parse(
            text,
            container=gumboc.Tag.from_str(container),
            container_namespace=getattr(gumboc.Namespace,
                                        container_ns.upper()),
            **kwargs) as output:
        for node in output.contents.document.contents.children:
            if node.type in (gumboc.NodeType.ELEMENT,
                             gumboc.NodeType.TEMPLATE):
                _insert_root(self.tree, output.contents.root.contents, False)
            else:
                # Raise explicitly: `assert '<message>'` tests a non-empty
                # string, which is always true, so it could never fail.
                raise AssertionError('Malformed fragment parse (??)')
        return self.tree.getFragment()
def parse(text, **kwargs):
    """Parse `text` with gumboc and return it as a BeautifulSoup tree.

    Args:
        text: Markup handed unchanged to gumboc.parse.
        **kwargs: Extra options forwarded to gumboc.parse.

    Returns:
        A BeautifulSoup.BeautifulSoup object with the converted root node
        appended to it.
    """
    with gumboc.parse(text, **kwargs) as output:
        soup = BeautifulSoup.BeautifulSoup()
        converted_root = _add_node(soup, output.contents.root.contents)
        soup.append(converted_root)
        return soup
def parse(text, **kwargs):
    """Parse `text` with gumboc and return it as a BeautifulSoup tree.

    Args:
        text: Markup handed unchanged to gumboc.parse.
        **kwargs: Extra options forwarded to gumboc.parse.

    Returns:
        A BeautifulSoup.BeautifulSoup object holding the converted root
        node, after _add_next_prev_pointers has been run over it.
    """
    with gumboc.parse(text, **kwargs) as output:
        soup = BeautifulSoup.BeautifulSoup()
        converted_root = _add_node(soup, output.contents.root.contents)
        soup.append(converted_root)
        # Done while the parse output is still open.
        _add_next_prev_pointers(soup)
        return soup