Example #1
0
 def clean_text(self, html_text):
     """Reduce an HTML fragment to trimmed plain text.

     The input is passed, in order, through ``w3lib.html.remove_tags``,
     ``w3lib.html.replace_escape_chars`` (called with its default
     arguments) and ``html.unescape``; finally leading and trailing
     whitespace is stripped.

     Args:
         html_text: HTML markup to clean.

     Returns:
         The cleaned text.
     """
     markup_free = w3lib.html.remove_tags(html_text)
     flattened = w3lib.html.replace_escape_chars(markup_free)
     # Entity decoding runs after tag removal, then the result is trimmed.
     return html.unescape(flattened).strip()
Example #2
0
def filteHTML(string):
    """Strip comments, decode HTML entities and remove spaces from markup.

    Args:
        string: HTML text to filter.

    Returns:
        The text after being passed through ``remove_comments`` and
        ``html.unescape``, with every space character removed.
    """
    # ``remove_comments`` is defined/imported elsewhere in the file
    # (presumably w3lib.html.remove_comments -- confirm).
    content = remove_comments(string)  # filter out comments
    content = html.unescape(content)  # decode entity / character references
    # The original issued this exact replace three times in a row; once the
    # first pass has run the repeats cannot match anything, so one call is
    # enough.
    # NOTE(review): if the repeats were meant to target other whitespace code
    # points (e.g. U+00A0 produced by unescaping &nbsp;), add them back with
    # explicit escapes such as '\xa0' -- confirm the intent.
    return content.replace(' ', '')