def clean(self, doc):
    """Return *doc* with HTML comments and ignored-tag elements removed.

    The input is first passed through ``to_unicode_if_htmlentity`` (defined
    elsewhere in this module). After that, ``<!-- ... -->`` comments are
    deleted, and then, for every name in ``self.ignore_tags``, each span
    running from ``<name`` up to the nearest following ``/name>`` is deleted.

    All patterns are non-greedy and compiled with ``re.DOTALL``, so a match
    may extend across line breaks but stops at the first closing marker.

    NOTE(review): tag names are interpolated into the pattern as-is, so any
    regex metacharacter in ``self.ignore_tags`` is interpreted by ``re``.
    """
    text = to_unicode_if_htmlentity(doc)

    # Comments are stripped before the tag elements.
    comment_pattern = re.compile('<!--.*?-->', re.DOTALL)
    text = comment_pattern.sub('', text)

    # One pass per ignored tag, in the order given by self.ignore_tags.
    for tag_name in self.ignore_tags:
        element_pattern = re.compile(
            '<%s.*?/%s>' % (tag_name, tag_name), re.DOTALL
        )
        text = element_pattern.sub('', text)

    return text
def clean(self, doc):
    """Return *doc* after applying each module-level stripping regex in turn.

    The input is first passed through ``to_unicode_if_htmlentity`` (defined
    elsewhere in this module). Every match of the following objects is then
    replaced with the empty string, in this order: ``script_regx``,
    ``noscript_regx``, ``style_regx``, ``iframe_regx``, ``select_regx``,
    ``comment_regx``.

    NOTE(review): the ``*_regx`` objects are defined outside this block;
    presumably they are precompiled patterns for the corresponding HTML
    elements and for comments -- confirm against their definitions.
    """
    cleaned = to_unicode_if_htmlentity(doc)

    # Order matters: each substitution runs on the previous one's output.
    strippers = (
        script_regx,
        noscript_regx,
        style_regx,
        iframe_regx,
        select_regx,
        comment_regx,
    )
    for stripper in strippers:
        cleaned = stripper.sub('', cleaned)

    return cleaned
def sweep(ss):
    """Return *ss* with script/noscript/style/iframe/select spans and
    HTML comments removed.

    The input is first passed through ``to_unicode_if_htmlentity`` (defined
    elsewhere in this module). Each pattern below is then applied in order,
    deleting every match. The patterns are non-greedy and compiled with
    ``re.DOTALL``, so a match may span line breaks but ends at the first
    closing marker that follows the opening one.
    """
    text = to_unicode_if_htmlentity(ss)

    # Applied sequentially; comments are removed last.
    patterns = (
        '<script.*?/script>',
        '<noscript.*?/noscript>',
        '<style.*?/style>',
        '<iframe.*?/iframe>',
        '<select.*?/select>',
        '<!--.*?-->',
    )
    for pattern in patterns:
        text = re.compile(pattern, re.DOTALL).sub('', text)

    return text