Exemple #1
0
    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        """Assert that parsing `to_parse` yields the expected document.

        The markup is parsed with this test case's default builder and
        decoded; the result must equal `self.document_for(...)` applied
        to `compare_parsed_to`. When `compare_parsed_to` is None, the
        input markup itself is used as the expectation.
        """
        parsed = ThoughtfulSoup(to_parse, builder=self.default_builder)
        expected_markup = (
            to_parse if compare_parsed_to is None else compare_parsed_to)
        self.assertEqual(parsed.decode(), self.document_for(expected_markup))
Exemple #2
0
 def __init__(self, namespaceHTMLElements, soup=None):
     """Set up the tree builder around a soup object.

     If `soup` is falsy, an empty ThoughtfulSoup built with
     "html.parser" is created and used instead.
     """
     if not soup:
         # Imported locally, as in the original code.
         from thoughtfulsoup import ThoughtfulSoup
         soup = ThoughtfulSoup("", "html.parser")
     self.soup = soup
     super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
Exemple #3
0
    def test_formatter_processes_script_tag_for_xml_documents(self):
        doc = """
  <script type="text/javascript">
  </script>
"""
        soup = ThoughtfulSoup(doc, "lxml-xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        soup.script.string = 'console.log("< < hey > > ");'
        encoded = soup.encode()
        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
Exemple #4
0
    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = thoughtfulsoup.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def noop(str):
                return None
            thoughtfulsoup.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue(u"\ufffd" in dammit.unicode_markup)

            soup = ThoughtfulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            logging.disable(logging.NOTSET)
            thoughtfulsoup.dammit.chardet_dammit = chardet
Exemple #5
0
    def test_thoughtfulsoup_constructor_does_lookup(self):
        # Not naming a parser explicitly triggers a warning; it is
        # captured here and deliberately ignored.
        with warnings.catch_warnings(record=True):
            # `features` may be a single string...
            ThoughtfulSoup("", features="html")
            # ...or a list of strings.
            ThoughtfulSoup("", features=["html", "fast"])

        # When no builder matches the requested feature, the
        # constructor must raise ValueError.
        unknown_feature = {"features": "no-such-feature"}
        self.assertRaises(ValueError, ThoughtfulSoup, "", **unknown_feature)
Exemple #6
0
 def run_against(self, *parser_names):
     """Parse `self.markup` with each named parser and record the results.

     For every parser name, the parse result (or an "[EXCEPTION] ..."
     string if anything raised) is stored in `self.results[parser]`.
     When the markup starts with "<div>", only the parsed <div> is
     recorded rather than the whole document.

     :param parser_names: Parser names to pass to ThoughtfulSoup.
     :return: True if every parser's output compared equal to the
         first recorded output, False otherwise.
     """
     uniform_results = True
     previous_output = None
     for parser in parser_names:
         try:
             soup = ThoughtfulSoup(self.markup, parser)
             # Use the instance's markup; a bare `markup` name is not
             # defined in this method and would raise NameError, which
             # the broad except below would silently record as a
             # parser failure.
             if self.markup.startswith("<div>"):
                 # Extract the interesting part
                 output = soup.div
             else:
                 output = soup
         except Exception as e:
             # Best-effort: a failing parser is reported, not fatal.
             output = "[EXCEPTION] %s" % str(e)
         self.results[parser] = output
         if previous_output is None:
             # The first recorded output is the baseline for comparison.
             previous_output = output
         elif previous_output != output:
             uniform_results = False
     return uniform_results
Exemple #7
0
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Generates a document with `rdoc(num_elements)`, then parses it once
    with each parser configuration and prints the elapsed wall-clock
    time. A parser that raises is reported with its traceback and the
    benchmark continues with the next one.

    :param num_elements: Size argument forwarded to `rdoc`.
    """
    # Single-argument print(...) behaves the same as the print statement
    # on Python 2 and is also valid Python 3.
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        try:
            started = time.time()
            # Only the time taken matters; the parsed tree is discarded.
            ThoughtfulSoup(data, parser)
            finished = time.time()
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("BS4+%s parsed the markup in %.2fs." % (
                parser, finished - started))
Exemple #8
0
 def soup(self, markup, **kwargs):
     """Parse `markup` into a ThoughtfulSoup object.

     A `builder` keyword argument overrides `self.default_builder`;
     all remaining keyword arguments are forwarded to the constructor.
     """
     kwargs['builder'] = kwargs.pop('builder', self.default_builder)
     return ThoughtfulSoup(markup, **kwargs)
Exemple #9
0
 def fragmentClass(self):
     """Start a new fragment and return its root element.

     Replaces `self.soup` with a fresh, empty ThoughtfulSoup named
     "[document_fragment]" and returns an Element wrapping it.
     """
     from thoughtfulsoup import ThoughtfulSoup
     fragment_root = ThoughtfulSoup("", "html.parser")
     self.soup = fragment_root
     fragment_root.name = "[document_fragment]"
     return Element(fragment_root, fragment_root, None)
Exemple #10
0
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    """TreeBuilder subclass that builds its tree inside a ThoughtfulSoup.

    The soup object being populated is kept in `self.soup`; the factory
    methods below wrap soup objects in `Element` / `TextNode` adapters.
    """

    def __init__(self, namespaceHTMLElements, soup=None):
        """Store the soup to build into, creating an empty one if needed.

        :param namespaceHTMLElements: Passed through to the base class.
        :param soup: Soup object to populate. If falsy, an empty
            ThoughtfulSoup using "html.parser" is created.
        """
        if soup:
            self.soup = soup
        else:
            # Local import; presumably avoids a circular import at module
            # load time -- TODO confirm.
            from thoughtfulsoup import ThoughtfulSoup
            self.soup = ThoughtfulSoup("", "html.parser")
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        """Reset the soup and return an Element wrapping it."""
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        """Build a Doctype from `token` and hand it to the soup.

        :param token: Mapping with "name", "publicId" and "systemId" keys.
        """
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        """Create a new tag in the soup and return it wrapped in an Element."""
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        """Return a TextNode wrapping a Comment built from `data`."""
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        """Replace `self.soup` with a fresh fragment soup and wrap it.

        The new soup is empty, uses "html.parser", and is named
        "[document_fragment]".
        """
        from thoughtfulsoup import ThoughtfulSoup
        self.soup = ThoughtfulSoup("", "html.parser")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        """Append the soup object wrapped by `node` to the soup."""
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        """Return the soup object that has been built."""
        return self.soup

    def getFragment(self):
        """Return the `.element` of the fragment produced by the base class."""
        return treebuilder_base.TreeBuilder.getFragment(self).element

    def testSerializer(self, element):
        """Serialize `element` and its descendants to an indented text dump.

        Each output line starts with "|" followed by indentation; the
        lines are joined with newlines and returned as a single string.

        :param element: Root object to serialize.
        :return: The serialized tree as a string.
        """
        from thoughtfulsoup import ThoughtfulSoup
        rv = []
        # Group 1: doctype name. Groups 2/3: PUBLIC id and optional system
        # id. Group 4: SYSTEM id.
        doctype_re = re.compile(
            r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')

        def serializeElement(element, indent=0):
            # Appends lines for `element` to `rv`, then recurses into
            # its children with a deeper indent.
            if isinstance(element, ThoughtfulSoup):
                # No-op: the soup object falls through to the checks below.
                pass
            if isinstance(element, Doctype):
                # The Doctype is matched directly as a string --
                # presumably Doctype is a string subclass; TODO confirm.
                m = doctype_re.match(element)
                if m:
                    name = m.group(1)
                    # lastindex > 1 means one of the id groups matched.
                    if m.lastindex > 1:
                        publicId = m.group(2) or ""
                        systemId = m.group(3) or m.group(4) or ""
                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
                                  (' ' * indent, name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
                else:
                    rv.append("|%s<!DOCTYPE >" % (' ' * indent, ))
            elif isinstance(element, Comment):
                rv.append("|%s<!-- %s -->" % (' ' * indent, element))
            elif isinstance(element, NavigableString):
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                # Anything else is treated as a tag-like object with
                # namespace, name, attrs and children.
                if element.namespace:
                    # `prefixes` is defined outside this class; it is
                    # indexed by namespace here.
                    name = "%s %s" % (prefixes[element.namespace],
                                      element.name)
                else:
                    name = element.name
                rv.append("|%s<%s>" % (' ' * indent, name))
                if element.attrs:
                    attributes = []
                    for name, value in element.attrs.items():
                        if isinstance(name, NamespacedAttribute):
                            name = "%s %s" % (prefixes[name.namespace],
                                              name.name)
                        # List-valued attributes are flattened to a
                        # space-separated string.
                        if isinstance(value, list):
                            value = " ".join(value)
                        attributes.append((name, value))

                    # Attributes are emitted in sorted order, one per line.
                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' *
                                                  (indent + 2), name, value))
                indent += 2
                for child in element.children:
                    serializeElement(child, indent)

        serializeElement(element, 0)

        return "\n".join(rv)
Exemple #11
0
        data = data.read()
    elif os.path.exists(data):
        print '"%s" looks like a filename. Reading data from the file.' % data
        with open(data) as fp:
            data = fp.read()
    elif data.startswith("http:") or data.startswith("https:"):
        print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
        print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
        return
    print

    for parser in basic_parsers:
        print "Trying to parse your markup with %s" % parser
        success = False
        try:
            soup = ThoughtfulSoup(data, parser)
            success = True
        except Exception, e:
            print "%s could not parse the markup." % parser
            traceback.print_exc()
        if success:
            print "Here's what %s did with the markup:" % parser
            print soup.prettify()

        print "-" * 80

def lxml_trace(data, html=True, **kwargs):
    """Print out the lxml events that occur during parsing.

    This lets you see how lxml parses a document when no Beautiful
    Soup code is running.