# -*- coding: utf-8 -*- from __future__ import print_function import re import os import lxml from bs4 import BeautifulSoup from lxml.html.clean import Cleaner from lxml.etree import XMLSyntaxError from store_helper import StoreHelper from text_helper import TextHelper cleaner = Cleaner() cleaner.javascript = True # This is True because we want to activate the javascript filter cleaner.style = True # This is True because we want to activate the styles & stylesheet filter cleaner.inline_style = True cleaner.whitelist_tags = set([]) cleaner.remove_tags = [ 'p', 'ul', 'li', 'b', 'br', 'article', 'div', 'body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'span' ] cleaner.kill_tags = ['footer', 'a', 'noscript', 'header', 'label'] class HTMLHelper(object): @staticmethod def remove_tag(web_source): text = re.sub(r'<[^>]+>', '', web_source) return text @staticmethod def get_text(web_source):