def cleanTags(string):
    """Remove all html tags from the string.

    Only the text content is kept. Runs of two or more whitespace
    characters are collapsed into a single space and the result is
    stripped. Empty, C{None} or whitespace-only input yields an empty
    string.

    >>> cleanTags("<html><head><title>Hello</title><body>Test</body></html>")
    'HelloTest'

    @type string: string
    @param string: the string to clean
    @rtype: string
    @return: the cleaned up string
    """
    # Nothing to clean: the result would be empty after stripping anyway,
    # and returning early avoids passing lxml a document without content
    # (or substituting placeholder text that would leak into the output).
    if not string or not string.strip():
        return u""

    # http://lxml.de/api/lxml.html.clean.Cleaner-class.html
    # allow_tags=[''] whitelists no real tag, so every element is dropped
    # while its text is kept; remove_unknown_tags must be False whenever
    # allow_tags is given; style=True additionally removes style content.
    cleaner = Cleaner(allow_tags=[''], remove_unknown_tags=False, style=True)
    cleaned = cleaner.clean_html(string)

    # The cleaner may wrap the remaining text in a <div>...</div>; unwrap it.
    wrapper_open = "<div>"
    if cleaned.startswith(wrapper_open):
        cleaned = cleaned[len(wrapper_open):-6]  # -6 drops the closing "</div>"

    # One pass is enough: each maximal whitespace run of length >= 2 is
    # replaced by a single space, so no such run can remain afterwards.
    return resub(r"\s\s+", " ", cleaned).strip()