def assertSoupEquals(self, to_parse, compare_parsed_to=None): builder = self.default_builder obj = ThoughtfulSoup(to_parse, builder=builder) if compare_parsed_to is None: compare_parsed_to = to_parse self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
def __init__(self, namespaceHTMLElements, soup=None): if soup: self.soup = soup else: from thoughtfulsoup import ThoughtfulSoup self.soup = ThoughtfulSoup("", "html.parser") super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
def test_formatter_processes_script_tag_for_xml_documents(self): doc = """ <script type="text/javascript"> </script> """ soup = ThoughtfulSoup(doc, "lxml-xml") # lxml would have stripped this while parsing, but we can add # it later. soup.script.string = 'console.log("< < hey > > ");' encoded = soup.encode() self.assertTrue(b"< < hey > >" in encoded)
def test_last_ditch_entity_replacement(self): # This is a UTF-8 document that contains bytestrings # completely incompatible with UTF-8 (ie. encoded with some other # encoding). # # Since there is no consistent encoding for the document, # Unicode, Dammit will eventually encode the document as UTF-8 # and encode the incompatible characters as REPLACEMENT # CHARACTER. # # If chardet is installed, it will detect that the document # can be converted into ISO-8859-1 without errors. This happens # to be the wrong encoding, but it is a consistent encoding, so the # code we're testing here won't run. # # So we temporarily disable chardet if it's present. doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?> <html><b>\330\250\330\252\330\261</b> <i>\310\322\321\220\312\321\355\344</i></html>""" chardet = thoughtfulsoup.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None thoughtfulsoup.dammit.chardet_dammit = noop dammit = UnicodeDammit(doc) self.assertEqual(True, dammit.contains_replacement_characters) self.assertTrue(u"\ufffd" in dammit.unicode_markup) soup = ThoughtfulSoup(doc, "html.parser") self.assertTrue(soup.contains_replacement_characters) finally: logging.disable(logging.NOTSET) thoughtfulsoup.dammit.chardet_dammit = chardet
def test_thoughtfulsoup_constructor_does_lookup(self): with warnings.catch_warnings(record=True) as w: # This will create a warning about not explicitly # specifying a parser, but we'll ignore it. # You can pass in a string. ThoughtfulSoup("", features="html") # Or a list of strings. ThoughtfulSoup("", features=["html", "fast"]) # You'll get an exception if BS can't find an appropriate # builder. self.assertRaises(ValueError, ThoughtfulSoup, "", features="no-such-feature")
def run_against(self, *parser_names): uniform_results = True previous_output = None for parser in parser_names: try: soup = ThoughtfulSoup(self.markup, parser) if markup.startswith("<div>"): # Extract the interesting part output = soup.div else: output = soup except Exception, e: output = "[EXCEPTION] %s" % str(e) self.results[parser] = output if previous_output is None: previous_output = output elif previous_output != output: uniform_results = False
def benchmark_parsers(num_elements=100000): """Very basic head-to-head performance benchmark.""" print "Comparative parser benchmark on Beautiful Soup %s" % __version__ data = rdoc(num_elements) print "Generated a large invalid HTML document (%d bytes)." % len(data) for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: success = False try: a = time.time() soup = ThoughtfulSoup(data, parser) b = time.time() success = True except Exception, e: print "%s could not parse the markup." % parser traceback.print_exc() if success: print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
def soup(self, markup, **kwargs): """Build a Beautiful Soup object from markup.""" builder = kwargs.pop('builder', self.default_builder) return ThoughtfulSoup(markup, builder=builder, **kwargs)
def fragmentClass(self): from thoughtfulsoup import ThoughtfulSoup self.soup = ThoughtfulSoup("", "html.parser") self.soup.name = "[document_fragment]" return Element(self.soup, self.soup, None)
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): def __init__(self, namespaceHTMLElements, soup=None): if soup: self.soup = soup else: from thoughtfulsoup import ThoughtfulSoup self.soup = ThoughtfulSoup("", "html.parser") super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) def documentClass(self): self.soup.reset() return Element(self.soup, self.soup, None) def insertDoctype(self, token): name = token["name"] publicId = token["publicId"] systemId = token["systemId"] doctype = Doctype.for_name_and_ids(name, publicId, systemId) self.soup.object_was_parsed(doctype) def elementClass(self, name, namespace): tag = self.soup.new_tag(name, namespace) return Element(tag, self.soup, namespace) def commentClass(self, data): return TextNode(Comment(data), self.soup) def fragmentClass(self): from thoughtfulsoup import ThoughtfulSoup self.soup = ThoughtfulSoup("", "html.parser") self.soup.name = "[document_fragment]" return Element(self.soup, self.soup, None) def appendChild(self, node): # XXX This code is not covered by the BS4 tests. self.soup.append(node.element) def getDocument(self): return self.soup def getFragment(self): return treebuilder_base.TreeBuilder.getFragment(self).element def testSerializer(self, element): from thoughtfulsoup import ThoughtfulSoup rv = [] doctype_re = re.compile( r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$') def serializeElement(element, indent=0): if isinstance(element, ThoughtfulSoup): pass if isinstance(element, Doctype): m = doctype_re.match(element) if m: name = m.group(1) if m.lastindex > 1: publicId = m.group(2) or "" systemId = m.group(3) or m.group(4) or "" rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" % (' ' * indent, name, publicId, systemId)) else: rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name)) else: rv.append("|%s<!DOCTYPE >" % (' ' * indent, )) elif isinstance(element, Comment): rv.append("|%s<!-- %s -->" % (' ' * indent, element)) elif isinstance(element, NavigableString): rv.append("|%s\"%s\"" % (' ' * indent, element)) else: if element.namespace: name = "%s %s" % (prefixes[element.namespace], element.name) else: name = element.name rv.append("|%s<%s>" % (' ' * indent, name)) if element.attrs: attributes = [] for name, value in element.attrs.items(): if isinstance(name, NamespacedAttribute): name = "%s %s" % (prefixes[name.namespace], name.name) if isinstance(value, list): value = " ".join(value) attributes.append((name, value)) for name, value in sorted(attributes): rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value)) indent += 2 for child in element.children: serializeElement(child, indent) serializeElement(element, 0) return "\n".join(rv)
data = data.read() elif os.path.exists(data): print '"%s" looks like a filename. Reading data from the file.' % data with open(data) as fp: data = fp.read() elif data.startswith("http:") or data.startswith("https:"): print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." return print for parser in basic_parsers: print "Trying to parse your markup with %s" % parser success = False try: soup = ThoughtfulSoup(data, parser) success = True except Exception, e: print "%s could not parse the markup." % parser traceback.print_exc() if success: print "Here's what %s did with the markup:" % parser print soup.prettify() print "-" * 80 def lxml_trace(data, html=True, **kwargs): """Print out the lxml events that occur during parsing. This lets you see how lxml parses a document when no Beautiful Soup code is running.