def strip_html_and_tags(s, invalid_tags):
    '''
    Return the text of ``s`` with markup stripped and the elements named in
    ``invalid_tags`` dropped entirely.

    A falsy ``s`` (empty string, None, ...) is returned unchanged, before
    BeautifulSoup is imported. Otherwise the literal spellings '<br>',
    '<br/>' and '<br />' are turned into newlines, the result is parsed, each
    element whose name appears in ``invalid_tags`` is replaced with an empty
    string, and the remaining unicode nodes of the tree are concatenated.
    '''
    if not s:
        return s

    # Imported lazily so the guard above works without the parser loaded.
    from util.BeautifulSoup import BeautifulSoup

    # Line breaks become real newlines before parsing; only these three
    # exact spellings are recognised.
    markup = s
    for br_spelling in ('<br>', '<br/>', '<br />'):
        markup = markup.replace(br_spelling, '\n')
    soup = BeautifulSoup(markup)

    # Swap every unwanted element for an empty string so that none of its
    # content reaches the output.
    for tag_name in invalid_tags:
        for node in soup.findAll(name=tag_name):
            node.replaceWith("")

    text_nodes = [child for child in soup.recursiveChildGenerator()
                  if isinstance(child, unicode)]
    return ''.join(text_nodes)
def strip_html_and_tags(s, invalid_tags):
    '''
    Strip markup from ``s``, discarding every element listed in
    ``invalid_tags`` together with whatever it contains.

    Falsy input is handed back as-is. For anything else, '<br>', '<br/>' and
    '<br />' are first rewritten to newline characters, the text is parsed
    with BeautifulSoup, each element found for a name in ``invalid_tags`` is
    replaced by an empty string, and the unicode nodes left in the tree are
    joined into the returned string.
    '''
    if not s:
        return s

    # Deferred import: nothing below the guard runs for falsy input.
    from util.BeautifulSoup import BeautifulSoup

    # Rewrite the three literal line-break spellings one after another.
    without_plain_br = s.replace('<br>', '\n')
    without_closed_br = without_plain_br.replace('<br/>', '\n')
    with_newlines = without_closed_br.replace('<br />', '\n')
    soup = BeautifulSoup(with_newlines)

    for unwanted in invalid_tags:
        matches = soup.findAll(name=unwanted)
        for element in matches:
            # Replacing with "" removes the element and its content.
            element.replaceWith("")

    pieces = []
    for item in soup.recursiveChildGenerator():
        if isinstance(item, unicode):
            pieces.append(item)
    return ''.join(pieces)
def strip_html2(s):
    '''
    Return the text content of ``s`` with HTML markup removed, using the
    BeautifulSoup library.

    A falsy ``s`` is returned unchanged. Otherwise every unicode node in the
    parsed tree is kept, and each non-unicode node named 'br' contributes a
    newline; all other nodes contribute nothing.

    >>> strip_html2('<html><body><b>Some <i>ugly</i></b> html.</body></html>')
    u'Some ugly html.'
    '''
    if not s:
        return s

    # Imported lazily so falsy input never needs the parser.
    from util.BeautifulSoup import BeautifulSoup

    def _fragments(tree):
        # Yield the output pieces in document-traversal order.
        for node in tree.recursiveChildGenerator():
            if isinstance(node, unicode):
                yield node
                continue
            if node.name == 'br':
                yield '\n'

    return ''.join(_fragments(BeautifulSoup(s)))
def strip_html2(s):
    '''
    Strip the HTML out of ``s`` with the BeautifulSoup library and return
    the remaining text.

    Falsy input is handed back as-is. For anything else the parsed tree is
    walked: unicode nodes are emitted verbatim, non-unicode nodes named 'br'
    are emitted as a newline, and everything else is skipped.

    >>> strip_html2('<html><body><b>Some <i>ugly</i></b> html.</body></html>')
    u'Some ugly html.'
    '''
    if not s:
        return s

    # Deferred import: only reached when there is something to parse.
    from util.BeautifulSoup import BeautifulSoup

    parsed = BeautifulSoup(s)
    # Keep text nodes and <br> elements only; a <br> is rendered as '\n'.
    kept = [
        node if isinstance(node, unicode) else '\n'
        for node in parsed.recursiveChildGenerator()
        if isinstance(node, unicode) or node.name == 'br'
    ]
    return ''.join(kept)