E.TITLE(title) ), E.BODY( E.H1(E.CLASS("heading"), title), lxml.html.fromstring(html) ) ) html_out.getroottree().write(file="summarized-roanoke.html", method="html") if __name__ == "__main__": cleaner = Cleaner() cleaner.javascript = True cleaner.scripts = True cleaner.frame = True cleaner.meta = True cleaner.comments = True cleaner.links = True cleaner.style = True cleaner.kill_tags = ["cite", "sup", "img", "noscript", "label", "video"] url = "https://en.wikipedia.org/wiki/Roanoke_Colony" doc = urllib2.urlopen(url) tree = lxml.html.parse(doc) title = tree.find(".//title").text tree = cleaner.clean_html(tree) netloc = urlparse(url).netloc