def demo4():
    """Run a small HTML table of URL text through ``lx.autolink`` and print it.

    The markup is parsed with ``lx.to_doc``, passed through ``lx.autolink``
    and the result of ``lx.prettify`` is written to stdout.
    """
    markup = (
        " <html> <table> <tr><td>http://google.ca</td></tr>"
        " <tr><td>http://reddit.com</td></tr> </table> </html> "
    )
    linked = lx.autolink(lx.to_doc(markup))
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(lx.prettify(linked))
def test_prettify():
    """Check ``lx.prettify`` against each supported ``method`` value.

    ``ex.UGLY`` is parsed once with the ``scraper.LXML_HTML`` parser; the
    resulting document is then prettified with every method under test.
    """
    def has_closing_tags(markup):
        # A prettified document must contain both closing tags.
        return '</h1>' in markup and '</html>' in markup

    parsed = lx.to_doc(ex.UGLY, parser=scraper.LXML_HTML)

    assert has_closing_tags(lx.prettify(parsed, method=scraper.LXML_HTML))

    # NOTE: method=scraper.HTML5PARSER is not exercised; that call was left
    # commented out in this test and marked "missing".

    assert has_closing_tags(lx.prettify(parsed, method=scraper.BEAUTIFULSOUP))

    assert has_closing_tags(lx.prettify(parsed, method=scraper.TIDY))

    # With no method selected, prettify is expected to return None.
    assert lx.prettify(parsed, method=None) is None
def demo3():
    """Parse an HTML page containing script/style/onload content and print it.

    The document is built with ``lx.to_doc`` and printed via ``lx.prettify``
    using the ``scraper.BEAUTIFULSOUP`` method.

    NOTE(review): a second ``demo3`` is defined further down in the visible
    source; if both live in the same module, the later definition replaces
    this one at import time.
    """
    html = (
        '<html> <head>'
        ' <script type="text/javascript" src="stuff.js"></script>'
        ' <link rel="alternate" type="text/rss" src="some-rss">'
        ' <style> body {background-image: url(javascript:do_something)};'
        ' div {color: expression(something)}; </style>'
        ' </head> <body onload="some_function()"> Hello World! </body> </html>'
    )
    doc = lx.to_doc(html)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(lx.prettify(doc, method=scraper.BEAUTIFULSOUP))
def demo1():
    """Parse an unclosed table fragment with html5lib and print it prettified.

    An ``html5lib.HTMLParser`` backed by the "lxml" tree builder (with
    ``namespaceHTMLElements=False``) parses ``"<table><td>foo"``; the
    resulting document is printed via ``lx.prettify`` using the
    ``scraper.BEAUTIFULSOUP`` method.

    NOTE(review): another ``demo1`` is defined further down in the visible
    source; if both live in the same module, the later definition replaces
    this one at import time.
    """
    fragment = "<table><td>foo"
    lxml_builder = treebuilders.getTreeBuilder("lxml")
    parser = html5lib.HTMLParser(tree=lxml_builder,
                                 namespaceHTMLElements=False)
    doc = parser.parse(fragment)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(lx.prettify(doc, method=scraper.BEAUTIFULSOUP))
def demo3():
    """Parse ``text`` with the BeautifulSoup parser and print it prettified.

    NOTE(review): ``text`` is not defined inside this function; it is
    presumably a module-level variable -- confirm it exists where this
    function is called. An earlier ``demo3`` also appears in the visible
    source; if both live in the same module, this later definition wins.
    """
    parsed = lx.to_doc(text, parser=scraper.BEAUTIFULSOUP)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(lx.prettify(parsed, method=scraper.BEAUTIFULSOUP))
def demo2():
    """Parse ``text`` with the html5 parser and print it prettified.

    The document is built with ``parser=scraper.HTML5PARSER`` and printed
    via ``lx.prettify`` using the ``scraper.BEAUTIFULSOUP`` method.

    NOTE(review): ``text`` is not defined inside this function; it is
    presumably a module-level variable -- confirm it exists where this
    function is called.
    """
    parsed = lx.to_doc(text, parser=scraper.HTML5PARSER)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(lx.prettify(parsed, method=scraper.BEAUTIFULSOUP))
def demo1():
    """Parse ``text`` with the default parser and print it prettified.

    The document is built with ``lx.to_doc`` (no explicit parser) and
    printed via ``lx.prettify`` using the ``scraper.BEAUTIFULSOUP`` method.

    NOTE(review): ``text`` is not defined inside this function; it is
    presumably a module-level variable -- confirm it exists where this
    function is called. An earlier ``demo1`` also appears in the visible
    source; if both live in the same module, this later definition wins.
    """
    parsed = lx.to_doc(text)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(lx.prettify(parsed, method=scraper.BEAUTIFULSOUP))