def clean_text(self, html_text):
    """Reduce an HTML fragment to plain, trimmed text.

    Args:
        html_text: Markup to clean, as a string.

    Returns:
        The input with tags removed by ``w3lib.html.remove_tags``, escape
        characters handled by ``w3lib.html.replace_escape_chars`` (called
        with its default arguments), HTML character references decoded,
        and leading/trailing whitespace stripped.
    """
    tagless = w3lib.html.remove_tags(html_text)
    # Entities are decoded only after the tags are gone, so text such as
    # "&lt;b&gt;" ends up as literal "<b>" instead of being stripped as a tag.
    decoded = html.unescape(w3lib.html.replace_escape_chars(tagless))
    return decoded.strip()
def filteHTML(string):
    """Strip HTML comments, decode entities and delete space characters.

    Args:
        string: HTML markup as text.

    Returns:
        The text with comments removed (via ``remove_comments``), character
        references decoded, and every ASCII space, no-break space (U+00A0)
        and ideographic space (U+3000) deleted. Other whitespace such as
        tabs and newlines is left in place.
    """
    # Spelled with escapes so the characters stay distinguishable in an
    # editor; U+00A0 is what html.unescape produces for "&nbsp;".
    unwanted_spaces = str.maketrans('', '', ' \xa0\u3000')

    content = remove_comments(string)  # filter out comments
    content = html.unescape(content)  # decode entity characters
    # One pass over the text instead of several chained replace() calls.
    return content.translate(unwanted_spaces)