Exemple #1
0
    def testWordParse(self):
        """Parse the bare word 'Test' and check the resulting node tree.

        Verifies the document node, the <html> root, its <head> and <body>
        children, and the single text node holding the parsed word.
        """
        with gumboc.parse('Test') as output:
            # The input has no doctype, so all doctype fields should be empty.
            document_node = output.contents.document.contents
            self.assertEqual(gumboc.NodeType.DOCUMENT, document_node.type)
            document = document_node.v.document
            self.assertEqual('', document.name)
            self.assertEqual('', document.public_identifier)
            self.assertEqual('', document.system_identifier)

            root = output.contents.root.contents
            self.assertEqual(gumboc.NodeType.ELEMENT, root.type)
            self.assertEqual(gumboc.Tag.HTML, root.tag)
            self.assertEqual(gumboc.Namespace.HTML, root.tag_namespace)
            self.assertEqual(2, len(root.children))

            # <head> does not appear in the input, so it has no original
            # tag text and no children.
            head = root.children[0]
            self.assertEqual(gumboc.NodeType.ELEMENT, head.type)
            self.assertEqual(gumboc.Tag.HEAD, head.tag)
            self.assertEqual('head', head.tag_name)
            self.assertEqual(gumboc.Namespace.HTML, head.tag_namespace)
            self.assertEqual(0, len(head.original_tag))
            self.assertEqual('', str(head.original_end_tag))
            self.assertEqual(0, head.children.length)

            body = root.children[1]
            self.assertNotEqual(body, document_node)
            self.assertEqual(gumboc.NodeType.ELEMENT, body.type)
            self.assertEqual(gumboc.Tag.BODY, body.tag)
            self.assertEqual('body', body.tag_name)
            self.assertEqual(1, len(body.children))

            text_node = body.children[0]
            self.assertEqual(gumboc.NodeType.TEXT, text_node.type)
            self.assertEqual('Test', text_node.text)
Exemple #2
0
  def testWordParse(self):
    """Parse the bare word 'Test' and check the resulting node tree.

    Verifies the document node, the <html> root, its <head> and <body>
    children, and the single text node holding the parsed word.
    """
    with gumboc.parse('Test') as output:
      # The input has no doctype, so all doctype fields should be empty.
      document_node = output.contents.document.contents
      self.assertEqual(gumboc.NodeType.DOCUMENT, document_node.type)
      document = document_node.v.document
      self.assertEqual('', document.name)
      self.assertEqual('', document.public_identifier)
      self.assertEqual('', document.system_identifier)

      root = output.contents.root.contents
      self.assertEqual(gumboc.NodeType.ELEMENT, root.type)
      self.assertEqual(gumboc.Tag.HTML, root.tag)
      self.assertEqual(gumboc.Namespace.HTML, root.tag_namespace)
      self.assertEqual(2, len(root.children))

      # <head> does not appear in the input, so it has no original tag text
      # and no children.
      head = root.children[0]
      self.assertEqual(gumboc.NodeType.ELEMENT, head.type)
      self.assertEqual(gumboc.Tag.HEAD, head.tag)
      self.assertEqual('head', head.tag_name)
      self.assertEqual(gumboc.Namespace.HTML, head.tag_namespace)
      self.assertEqual(0, len(head.original_tag))
      self.assertEqual('', str(head.original_end_tag))
      self.assertEqual(0, head.children.length)

      body = root.children[1]
      self.assertNotEqual(body, document_node)
      self.assertEqual(gumboc.NodeType.ELEMENT, body.type)
      self.assertEqual(gumboc.Tag.BODY, body.tag)
      self.assertEqual('body', body.tag_name)
      self.assertEqual(1, len(body.children))

      text_node = body.children[0]
      self.assertEqual(gumboc.NodeType.TEXT, text_node.type)
      self.assertEqual('Test', text_node.text)
Exemple #3
0
def parse(text, **kwargs):
    """Parse `text` with gumbo and return the result as a bs4 soup object.

    Keyword arguments are forwarded unchanged to gumboc.parse.
    """
    with gumboc.parse(text, **kwargs) as output:
        soup = bs4.BeautifulSoup('', "html.parser")
        document = output.contents.document.contents
        _add_document(soup, document)
        # Convert each direct child of the document node and attach it to
        # the soup in document order.
        for child in document.children:
            converted = _add_node(soup, child)
            soup.append(converted)
        _add_next_prev_pointers(soup.html)
        return soup
def parse(text, **kwargs):
    """Parse `text` with gumbo and return the result as a bs4 soup object.

    Keyword arguments are forwarded unchanged to gumboc.parse.
    """
    with gumboc.parse(text, **kwargs) as output:
        soup = bs4.BeautifulSoup('', "html.parser")
        document = output.contents.document.contents
        _add_document(soup, document)
        # Convert each direct child of the document node and attach it to
        # the soup in document order.
        for child in document.children:
            converted = _add_node(soup, child)
            soup.append(converted)
        _add_next_prev_pointers(soup.html)
        return soup
Exemple #5
0
 def testFragment(self):
     """Parse '<div></div>' as a fragment inside an SVG <title> context.

     The fragment root should hold exactly one child: a <div> element in
     the HTML namespace.
     """
     with gumboc.parse('<div></div>',
                       fragment_context=gumboc.Tag.TITLE,
                       fragment_namespace=gumboc.Namespace.SVG) as output:
         root = output.contents.root.contents
         self.assertEqual(1, len(root.children))
         div = root.children[0]
         self.assertEqual(gumboc.NodeType.ELEMENT, div.type)
         self.assertEqual(gumboc.Tag.DIV, div.tag)
         self.assertEqual(gumboc.Namespace.HTML, div.tag_namespace)
 def testFragment(self):
     """Parse '<div></div>' as a fragment inside an SVG <title> container.

     The fragment root should hold exactly one child: a <div> element in
     the HTML namespace.
     """
     fragment_parse = gumboc.parse(
         '<div></div>',
         container=gumboc.Tag.TITLE,
         container_namespace=gumboc.Namespace.SVG)
     with fragment_parse as output:
         fragment_root = output.contents.root.contents
         child_count = len(fragment_root.children)
         self.assertEqual(1, child_count)
         only_child = fragment_root.children[0]
         self.assertEqual(gumboc.NodeType.ELEMENT, only_child.type)
         self.assertEqual(gumboc.Tag.DIV, only_child.tag)
         self.assertEqual(gumboc.Namespace.HTML, only_child.tag_namespace)
Exemple #7
0
 def testSarcasm(self):
   """Check that a <sarcasm> element is reported as an UNKNOWN tag.

   The original start/end tag text and the tag name must still be
   recoverable from the node.
   """
   with gumboc.parse('<div><sarcasm><div></div></sarcasm></div>') as output:
     root = output.contents.root.contents
     # The second child of the root is used as <body>.
     body = root.children[1]
     div = body.children[0]
     sarcasm = div.children[0]
     self.assertEqual(gumboc.NodeType.ELEMENT, sarcasm.type)
     self.assertEqual(gumboc.Tag.UNKNOWN, sarcasm.tag)
     self.assertEqual('<sarcasm>', str(sarcasm.original_tag))
     self.assertEqual('</sarcasm>', str(sarcasm.original_end_tag))
     self.assertEqual('sarcasm', sarcasm.tag_name.decode('utf-8'))
Exemple #8
0
 def testSarcasm(self):
   """Check that a <sarcasm> element is reported as an UNKNOWN tag.

   The original start/end tag text and the tag name must still be
   recoverable from the node.
   """
   with gumboc.parse('<div><sarcasm><div></div></sarcasm></div>') as output:
     root = output.contents.root.contents
     # The second child of the root is used as <body>.
     body = root.children[1]
     div = body.children[0]
     sarcasm = div.children[0]
     self.assertEqual(gumboc.NodeType.ELEMENT, sarcasm.type)
     self.assertEqual(gumboc.Tag.UNKNOWN, sarcasm.tag)
     self.assertEqual('<sarcasm>', str(sarcasm.original_tag))
     self.assertEqual('</sarcasm>', str(sarcasm.original_end_tag))
     self.assertEqual('sarcasm', sarcasm.tag_name.decode('utf-8'))
Exemple #9
0
 def testUnknownTag(self):
     """Check tag, original-text and attribute data for an unknown <foo>.

     The expected original end tag is the empty string for this input.
     """
     with gumboc.parse('<foo bar=quux>1<p>2</foo>') as output:
         root = output.contents.root.contents
         # The second child of the root is used as <body>.
         body = root.children[1]
         foo = body.children[0]
         self.assertEqual(gumboc.NodeType.ELEMENT, foo.type)
         self.assertEqual(gumboc.Tag.UNKNOWN, foo.tag)
         self.assertEqual('<foo bar=quux>', str(foo.original_tag))
         self.assertEqual('', str(foo.original_end_tag))
         self.assertEqual('foo', foo.tag_name.decode('utf-8'))
         self.assertEqual('bar', foo.attributes[0].name)
         self.assertEqual('quux', foo.attributes[0].value)
Exemple #10
0
 def testUnknownTag(self):
   """Check tag, original-text and attribute data for an unknown <foo>.

   The expected original end tag is the empty string for this input.
   """
   with gumboc.parse('<foo bar=quux>1<p>2</foo>') as output:
     root = output.contents.root.contents
     # The second child of the root is used as <body>.
     body = root.children[1]
     foo = body.children[0]
     self.assertEqual(gumboc.NodeType.ELEMENT, foo.type)
     self.assertEqual(gumboc.Tag.UNKNOWN, foo.tag)
     self.assertEqual('<foo bar=quux>', str(foo.original_tag))
     self.assertEqual('', str(foo.original_end_tag))
     self.assertEqual('foo', foo.tag_name.decode('utf-8'))
     self.assertEqual('bar', foo.attributes[0].name)
     self.assertEqual('quux', foo.attributes[0].value)
Exemple #11
0
 def testBufferThatGoesAway(self):
     """Check a parse stays usable after its source buffer is closed.

     The markup is read from a StringIO that is closed before the parse
     result is entered and inspected.
     """
     for _ in range(10):
         source = StringIO.StringIO('<foo bar=quux>1<p>2</foo>')
         parse_tree = gumboc.parse(source.read())
         source.close()
     # Only the parse created by the final iteration is inspected.
     with parse_tree as output:
         root = output.contents.root.contents
         body = root.children[1]
         foo = body.children[0]
         self.assertEqual(gumboc.NodeType.ELEMENT, foo.type)
         self.assertEqual(gumboc.Tag.UNKNOWN, foo.tag)
         self.assertEqual('<foo bar=quux>', str(foo.original_tag))
         self.assertEqual('', str(foo.original_end_tag))
         self.assertEqual('foo', foo.tag_name.decode('utf-8'))
         self.assertEqual('bar', foo.attributes[0].name)
         self.assertEqual('quux', foo.attributes[0].value)
Exemple #12
0
 def testBufferThatGoesAway(self):
   """Check a parse stays usable after its source buffer is closed.

   The markup is read from a StringIO that is closed before the parse
   result is entered and inspected.
   """
   for _ in range(10):
     source = StringIO.StringIO('<foo bar=quux>1<p>2</foo>')
     parse_tree = gumboc.parse(source.read())
     source.close()
   # Only the parse created by the final iteration is inspected.
   with parse_tree as output:
     root = output.contents.root.contents
     body = root.children[1]
     foo = body.children[0]
     self.assertEqual(gumboc.NodeType.ELEMENT, foo.type)
     self.assertEqual(gumboc.Tag.UNKNOWN, foo.tag)
     self.assertEqual('<foo bar=quux>', str(foo.original_tag))
     self.assertEqual('', str(foo.original_end_tag))
     self.assertEqual('foo', foo.tag_name.decode('utf-8'))
     self.assertEqual('bar', foo.attributes[0].name)
     self.assertEqual('quux', foo.attributes[0].value)
  def parse(self, text_or_file, **kwargs):
    """Parse a document with gumbo and load it into self.tree.

    Args:
      text_or_file: Either the markup itself or an object with a read()
        method that returns it.
      **kwargs: Forwarded unchanged to gumboc.parse.

    Returns:
      The value of self.tree.getDocument().

    Raises:
      AssertionError: If a child of the document node is neither a comment
        nor an element/template node.
    """
    try:
      text = text_or_file.read()
    except AttributeError:
      # Assume a string.
      text = text_or_file

    with gumboc.parse(text, **kwargs) as output:
      document = output.contents.document.contents
      _convert_doctype(self.tree, document)
      for node in document.children:
        if node.type == gumboc.NodeType.COMMENT:
          self.tree.insertComment({'data': node.v.text.text.decode('utf-8')},
                                  self.tree.document)
        elif node.type in (gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE):
          _insert_root(self.tree, output.contents.root.contents)
        else:
          # Raise explicitly: a bare `assert '<message>'` tests a non-empty
          # string, which is always true, so it could never fail.
          raise AssertionError(
              'Only comments and <html> nodes allowed at the root')
      return self.tree.getDocument()
    def __init__(self, text, **kwargs):
        """Parse `text` with gumbo and build self.soup from the root node.

        Keyword arguments are forwarded unchanged to gumboc.parse.
        """
        # GumboNode addresses are recorded as nodes are added, and matched
        # to the BeautifulSoup objects created from them.  That mapping is
        # what allows the next/previous pointers to be wired to
        # BeautifulSoup objects rather than ctypes ones.
        self._node_map = {}

        # Handler order is significant; presumably it mirrors the gumbo
        # node-type numbering -- confirm against _add_node.
        text_classes = (
            BeautifulSoup.NavigableString,
            BeautifulSoup.CData,
            BeautifulSoup.Comment,
            BeautifulSoup.NavigableString,
        )
        self._HANDLERS = [_add_document, self._add_element] + [
            _add_text(text_class) for text_class in text_classes
        ]

        self.soup = BeautifulSoup.BeautifulSoup()
        with gumboc.parse(text, **kwargs) as output:
            root_node = output.contents.root.contents
            converted_root = self._add_node(root_node)
            self.soup.append(converted_root)

        self._fix_next_prev_pointers(self.soup)
 def __init__(self, text, **kwargs):
   """Parse `text` with gumbo and build self.soup from the root node.

   Keyword arguments are forwarded unchanged to gumboc.parse.
   """
   # GumboNode addresses are recorded as nodes are added, and matched to the
   # BeautifulSoup objects created from them.  That mapping is what allows
   # the next/previous pointers to be wired to BeautifulSoup objects rather
   # than ctypes ones.
   self._node_map = {}

   # Handler order is significant; presumably it mirrors the gumbo node-type
   # numbering -- confirm against _add_node.
   text_classes = (
       BeautifulSoup.NavigableString,
       BeautifulSoup.CData,
       BeautifulSoup.Comment,
       BeautifulSoup.NavigableString,
   )
   self._HANDLERS = [_add_document, self._add_element] + [
       _add_text(text_class) for text_class in text_classes
   ]

   self.soup = BeautifulSoup.BeautifulSoup()
   with gumboc.parse(text, **kwargs) as output:
     root_node = output.contents.root.contents
     converted_root = self._add_node(root_node)
     self.soup.append(converted_root)

   self._fix_next_prev_pointers(self.soup)
    def parse(self, text_or_file, **kwargs):
        """Parse a document with gumbo and load it into self.tree.

        Args:
            text_or_file: Either the markup itself or an object with a
                read() method that returns it.
            **kwargs: Forwarded unchanged to gumboc.parse.

        Returns:
            The value of self.tree.getDocument().

        Raises:
            AssertionError: If a child of the document node is neither a
                comment nor an element node.
        """
        try:
            text = text_or_file.read()
        except AttributeError:
            # Assume a string.
            text = text_or_file

        with gumboc.parse(text, **kwargs) as output:
            document = output.contents.document.contents
            _convert_doctype(self.tree, document)
            for node in document.children:
                if node.type == gumboc.NodeType.COMMENT:
                    self.tree.insertComment(
                        {'data': node.v.text.text.decode('utf-8')},
                        self.tree.document)
                elif node.type == gumboc.NodeType.ELEMENT:
                    _insert_root(self.tree, output.contents.root.contents)
                else:
                    # Raise explicitly: a bare `assert '<message>'` tests a
                    # non-empty string, which is always true, so it could
                    # never fail.
                    raise AssertionError(
                        'Only comments and <html> nodes allowed at the root')
            return self.tree.getDocument()
  def parseFragment(self, text_or_file, container, **kwargs):
    """Parse a fragment with gumbo inside `container` and load self.tree.

    Args:
      text_or_file: Either the markup itself or an object with a read()
        method that returns it.
      container: Tag name of the context element, optionally written as
        '<namespace> <tag>' with a single space.  Without a namespace,
        "html" is used.
      **kwargs: Forwarded unchanged to gumboc.parse.

    Returns:
      The value of self.tree.getFragment().

    Raises:
      ValueError: If `container` contains more than one space, since the
        split no longer unpacks into a namespace and a tag.
      AttributeError: If the upper-cased namespace is not an attribute of
        gumboc.Namespace.
      AssertionError: If a child of the document node is neither an element
        nor a template node.
    """
    try:
      text = text_or_file.read()
    except AttributeError:
      # Assume a string.
      text = text_or_file
    if ' ' in container:
      container_ns, container = container.split(' ')
    else:
      container_ns = "html"

    with gumboc.parse(
        text,
        container=gumboc.Tag.from_str(container),
        container_namespace=getattr(gumboc.Namespace, container_ns.upper()),
        **kwargs) as output:
      for node in output.contents.document.contents.children:
        if node.type in (gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE):
          _insert_root(self.tree, output.contents.root.contents, False)
        else:
          # Raise explicitly: a bare `assert '<message>'` tests a non-empty
          # string, which is always true, so it could never fail.
          raise AssertionError('Malformed fragment parse (??)')
      return self.tree.getFragment()
    def parseFragment(self, text_or_file, container, **kwargs):
        """Parse a fragment with gumbo inside `container` and load self.tree.

        Args:
            text_or_file: Either the markup itself or an object with a
                read() method that returns it.
            container: Tag name of the context element, optionally written
                as '<namespace> <tag>' with a single space.  Without a
                namespace, "html" is used.
            **kwargs: Forwarded unchanged to gumboc.parse.

        Returns:
            The value of self.tree.getFragment().

        Raises:
            ValueError: If `container` contains more than one space, since
                the split no longer unpacks into a namespace and a tag.
            AttributeError: If the upper-cased namespace is not an
                attribute of gumboc.Namespace.
            AssertionError: If a child of the document node is neither an
                element nor a template node.
        """
        try:
            text = text_or_file.read()
        except AttributeError:
            # Assume a string.
            text = text_or_file
        if ' ' in container:
            container_ns, container = container.split(' ')
        else:
            container_ns = "html"

        with gumboc.parse(text,
                          container=gumboc.Tag.from_str(container),
                          container_namespace=getattr(gumboc.Namespace,
                                                      container_ns.upper()),
                          **kwargs) as output:
            for node in output.contents.document.contents.children:
                if node.type in (gumboc.NodeType.ELEMENT,
                                 gumboc.NodeType.TEMPLATE):
                    _insert_root(self.tree, output.contents.root.contents,
                                 False)
                else:
                    # Raise explicitly: a bare `assert '<message>'` tests a
                    # non-empty string, which is always true, so it could
                    # never fail.
                    raise AssertionError('Malformed fragment parse (??)')
            return self.tree.getFragment()
def parse(text, **kwargs):
    """Parse `text` with gumbo and return it as a BeautifulSoup object.

    Keyword arguments are forwarded unchanged to gumboc.parse.
    """
    with gumboc.parse(text, **kwargs) as output:
        soup = BeautifulSoup.BeautifulSoup()
        root_node = output.contents.root.contents
        # Convert the gumbo root node and attach the result to the soup.
        converted_root = _add_node(soup, root_node)
        soup.append(converted_root)
        return soup
Exemple #20
0
def parse(text, **kwargs):
  """Parse `text` with gumbo and return it as a BeautifulSoup object.

  Keyword arguments are forwarded unchanged to gumboc.parse.
  """
  with gumboc.parse(text, **kwargs) as output:
    soup = BeautifulSoup.BeautifulSoup()
    root_node = output.contents.root.contents
    # Convert the gumbo root node and attach the result to the soup, then
    # run the next/previous pointer pass over the whole soup.
    converted_root = _add_node(soup, root_node)
    soup.append(converted_root)
    _add_next_prev_pointers(soup)
    return soup