def strip_and_clean(self, html):
    """
    Clean up the HTML structure and strip all tags.

    Processing steps, in order:

    1. Every line of ``html`` is stripped of leading/trailing whitespace
       and the lines are joined back together with no separator.
    2. Any DOCTYPE declaration is removed (matched case-insensitively).
    3. ``StringUtil.br2nl`` is applied to the markup.
    4. The markup is parsed with BeautifulSoup and all text nodes are
       concatenated.

    :param html: HTML markup as a string.
    :return: the concatenated text nodes of the cleaned markup.
    """
    # Collapse the markup onto a single line. Lines are joined without a
    # separator, so text on adjacent lines ends up directly adjacent.
    html = "".join(line.strip() for line in html.split("\n"))

    # HTML allows the DOCTYPE keyword in any letter case (e.g.
    # "<!doctype html>"), so match case-insensitively; a declaration that
    # is left in place may be returned by the parser as a text node.
    html = re.sub(r'(?i)<!DOCTYPE.*?>', '', html)

    # NOTE(review): presumably converts <br> tags into newlines -- confirm
    # against StringUtil.br2nl.
    html = StringUtil.br2nl(html)

    # strip html: keep only the text nodes of the parsed document
    html = ''.join(BeautifulSoup(html).findAll(text=True))
    return html