def cleanpage(html):
    """Strip active content and boilerplate tags from an HTML page.

    Parameters
    ----------
    html : str or bytes
        Raw page markup.

    Returns
    -------
    str
        Cleaned markup, or ``u""`` when lxml rejects the input
        (e.g. ``ValueError: Unicode strings with encoding declaration
        are not supported``).
    """
    # cleaner setup — keep document structure and attributes intact,
    # drop scripts, styles, frames, embeds, comments and images.
    cleaner = Cleaner()
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.javascript = True       # activate the javascript filter
    cleaner.style = True            # activate the styles & stylesheet filter
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    cleaner.kill_tags = ['img', 'script']

    # invoke cleaner; was a bare `except:` which also swallowed
    # KeyboardInterrupt/SystemExit — narrowed to Exception, keeping the
    # original best-effort "return empty on failure" contract.
    try:
        content = cleaner.clean_html(html)
    except Exception:
        # Typical failure: ValueError for unicode input that still
        # carries an XML encoding declaration.
        content = u""
    return content
def get_text(self, html_content: str):
    """Print the long text runs (>128 chars after strip) found in *html_content*.

    The page is first cleaned of styles, then every text node is taken in
    document order; consecutive non-whitespace nodes are merged into one
    "run", with whitespace-only nodes acting as separators.

    Parameters
    ----------
    html_content : str
        Raw HTML markup.
    """
    cleaner = Cleaner()
    cleaner.style = True
    cleaner.inline_style = True
    cleaned = cleaner.clean_html(html_content)
    soup = BeautifulSoup(cleaned, 'lxml')
    text_lines = soup.findAll(text=True)

    text_lines_merged = []
    merge_str = ''
    text_lines_merged.append(text_lines[0])
    for line in text_lines[1:]:
        if '\n' == line or '' == line or ' ' == line:
            # Whitespace-only node closes the current run.
            # Bug fix: was `merge_str is not ''` — an identity comparison
            # against a literal, which is not a reliable emptiness test.
            if merge_str != '':
                text_lines_merged.append(merge_str)
                merge_str = ''
        else:
            merge_str += (' ' + line)
    # Bug fix: flush the trailing run, which was silently dropped when the
    # document did not end with a whitespace-only node.
    if merge_str != '':
        text_lines_merged.append(merge_str)

    text_lines_merged = [
        self.strip(line) for line in text_lines_merged
        if len(self.strip(line)) > 128
    ]
    print(' '.join(text_lines_merged))
#!/usr/bin/python # -*- coding: utf-8 -*- from __future__ import print_function import re import os import lxml from bs4 import BeautifulSoup from lxml.html.clean import Cleaner from lxml.etree import XMLSyntaxError from store_helper import StoreHelper from text_helper import TextHelper cleaner = Cleaner() cleaner.javascript = True # This is True because we want to activate the javascript filter cleaner.style = True # This is True because we want to activate the styles & stylesheet filter cleaner.inline_style = True cleaner.whitelist_tags = set([]) cleaner.remove_tags = [ 'p', 'ul', 'li', 'b', 'br', 'article', 'div', 'body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'span' ] cleaner.kill_tags = ['footer', 'a', 'noscript', 'header', 'label'] class HTMLHelper(object): @staticmethod def remove_tag(web_source): text = re.sub(r'<[^>]+>', '', web_source) return text @staticmethod
def scrape(lineHashDB, html, encoding):
    """Reduce an HTML page to its main article text.

    Cleans the markup down to <div>/<p> containers, strips attributes and
    short inter-tag fragments, then extracts long (>100 char) text nodes
    via XPath and concatenates them one sentence per line.

    Parameters
    ----------
    lineHashDB : object
        Unused in this function body — kept for interface compatibility.
    html : str or bytes
        Raw page markup.
    encoding : str
        Unused in this function body — kept for interface compatibility.

    Returns
    -------
    str
        Extracted article text (newline-separated), or ``u""`` when the
        cleaner rejects the input.
    """
    # cleaner setup: keep only <div> and <p> containers
    cleaner = Cleaner(allow_tags=['div', 'p'], remove_unknown_tags=False)
    cleaner.javascript = True       # activate the javascript filter
    cleaner.style = True            # activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    cleaner.page_structure = False
    cleaner.remove_tags = ['b', 'a', 'h']
    cleaner.kill_tags = ['script']

    # invoke cleaner; was a bare `except:` — narrowed to Exception while
    # keeping the original best-effort "return empty on failure" contract.
    try:
        page = cleaner.clean_html(html)
    except Exception:
        # Typical failure: ValueError for unicode input that still
        # carries an XML encoding declaration.
        content = u""
        return content

    page8 = page
    page8 = re.sub(u'\n', ' ', page8)  # remove NL
    # NOTE(review): the pattern below is a literal space in the source —
    # the "remove CR" comment suggests it may once have been '\r'; confirm.
    page8 = re.sub(u' ', ' ', page8)  # remove CR
    page8 = re.sub(u'<!--.*?-->', ' ', page8)  # remove comments
    page8 = re.sub(u' class=".*?"', ' ', page8)  # remove attributes
    page8 = re.sub(u' id=".*?"', ' ', page8)
    page8 = re.sub(u' rel=".*?"', ' ', page8)
    page8 = re.sub(u'\[an error occurred while processing this directive\]', ' ', page8)
    page8 = re.sub(u'>\s*?<', '><', page8)  # remove blanks between tags

    # cycle to remove spurious divs
    for count in range(1, 20):
        page8 = re.sub(u'>.{0,10}<', '><', page8)  # drop <=10-char runs between tags
        page8 = re.sub(u'<div></div>', ' ', page8)
        page8 = re.sub(u'<p></p>', ' ', page8)
        page8 = re.sub(u'<span></span>', ' ', page8)
    page8 = re.sub(u'\s+', ' ', page8)  # collapse repeated blanks

    # XPath: text nodes longer than 100 chars inside <p>/<div> containers.
    # (A stricter 300-char variant was dead code — assigned and immediately
    # overwritten — and has been removed.)
    xpath = '//*[((p) or (div))]/node()[(string-length() > 100)]/text()'
    sel = Selector(text=page8, type="html")
    text = sel.xpath(xpath).extract()

    content = u""
    if text:
        for s in text:
            # squash duplicate whitespaces
            # (bug fix: the join result was computed and discarded)
            s = ' '.join(s.split())
            # remove short lines: on empirical analysis, no unfrequent
            # sentence under 40 chars is a relevant part of the article
            # text (titles, authors, dates, etc.)
            if len(s) < 40:
                continue  # bug fix: was a bare `next` expression (a no-op)
            # remove leading whitespace
            if s.startswith(" "):
                s = s[1:]
            content += s
            content += "\n"
    return content