Esempio n. 1
0
            E.TITLE(title)
            ),
        E.BODY(
            E.H1(E.CLASS("heading"), title),
            lxml.html.fromstring(html)
            )
        )

    html_out.getroottree().write(file="summarized-roanoke.html", method="html")

if __name__ == "__main__":
    # Script entry point: download the Wikipedia article on the Roanoke
    # Colony, remember its <title> text, and strip non-content markup from
    # the parsed page.

    # Configure the lxml Cleaner; each flag set to True asks the cleaner to
    # remove that category of markup.
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.scripts = True
    # The Cleaner option is spelled "frames". The former "frame" assignment
    # only created an extra attribute that the cleaner never consults.
    cleaner.frames = True
    cleaner.meta = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.style = True
    # Tags listed here are dropped together with everything inside them.
    cleaner.kill_tags = ["cite", "sup", "img", "noscript", "label", "video"]

    url = "https://en.wikipedia.org/wiki/Roanoke_Colony"
    doc = urllib2.urlopen(url)

    # Read the title from the freshly parsed tree, before `tree` is rebound
    # to the cleaned document below.
    tree = lxml.html.parse(doc)
    title = tree.find(".//title").text

    tree = cleaner.clean_html(tree)

    # Network-location (host) component of the source URL.
    netloc = urlparse(url).netloc