import lxml.html
from lxml.html.clean import Cleaner

def lxml_extractor(html, url):
    '''LXML PARSER'''
    cleaner = Cleaner()
    cleaner.javascript = True      # activate the javascript filter
    cleaner.style = True           # activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.embedded = True
    cleaner.forms = True
    cleaner.frames = True
    cleaner.annoying_tags = True
    cleaner.kill_tags = NEGATIVE_K
    cleaner.allow_tags = POSITIVE_K        # was `allow_tag`, which lxml silently ignores
    cleaner.remove_unknown_tags = False    # required when allow_tags is set
    cleaner.safe_attrs_only = True
    try:
        html = lxml.html.fromstring(html, base_url=url)   # was base_url="url" (a literal string)
        tree = cleaner.clean_html(html)
        #tree.make_links_absolute(url)
        doc = lxml.html.tostring(tree)
        doc = soup_extractor(doc, url)
    except ValueError:
        # lxml rejects unicode strings that carry an encoding declaration
        doc = soup_extractor(html, url)
    return doc
from lxml.html.clean import Cleaner

def cleanpage(html):
    # cleaner setup
    cleaner = Cleaner()
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.javascript = True   # activate the javascript filter
    cleaner.style = True        # activate the styles & stylesheet filter
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    # cleaner.remove_tags = ['b','img','h']
    cleaner.kill_tags = ['img', 'script']

    # invoke cleaner
    try:
        content = cleaner.clean_html(html)
    except ValueError:
        # lxml raises ValueError for unicode strings that carry an encoding
        # declaration; bytes input is required in that case
        content = u""
    return content
import lxml.html
from lxml.html.clean import Cleaner

def clean_html(html_text,
               javascript=True, scripts=True, style=True, embedded=True,
               links=True, forms=True, frames=True, comments=True,
               annoying_tags=True, meta=True, safe_attrs_only=True,
               remove_unknown_tags=True, processing_instructions=True):
    """Strip javascript, styles and other unwanted markup from the HTML,
    returning a string with only the page content."""
    # True = Remove | False = Keep
    cleaner = Cleaner()
    cleaner.javascript = javascript
    cleaner.scripts = scripts
    cleaner.style = style
    cleaner.embedded = embedded
    cleaner.links = links
    cleaner.forms = forms
    cleaner.frames = frames
    cleaner.comments = comments
    cleaner.page_structure = False  # keep page structure
    cleaner.annoying_tags = annoying_tags
    cleaner.meta = meta
    cleaner.safe_attrs_only = safe_attrs_only
    cleaner.remove_unknown_tags = remove_unknown_tags
    cleaner.processing_instructions = processing_instructions
    clean_content = cleaner.clean_html(lxml.html.fromstring(html_text))
    return lxml.html.tostring(clean_content)
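# A minimal usage sketch of the function above; the sample markup is
# illustrative, not part of the original snippet. With the defaults, the
# <script> element and the inline event handler are both stripped:
raw = '<html><body onload="init()"><script>alert(1)</script><p>Hello</p></body></html>'
print(clean_html(raw))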
from lxml.html.clean import Cleaner

def cleaner_parameters():
    # tags removed together with their content
    reject_list = ['script', 'noscript', 'style', 'meta', 'semantics', 'img',
                   'label', 'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd',
                   'sub', 'sup', 'math']
    # tags stripped while keeping their text content
    accept_list = ['div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                   'span', 'b', 'a', 'u', 'i', 'body']   # was missing the comma after 'h6'
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list
    html_cleaner.kill_tags = reject_list
    return html_cleaner
from lxml.html.clean import Cleaner

def get_cleaner():
    cleaner = Cleaner()
    cleaner.embedded = True
    cleaner.frames = True
    cleaner.style = True
    cleaner.remove_unknown_tags = True
    cleaner.processing_instructions = True
    cleaner.annoying_tags = True
    # tags stripped while keeping their text content
    cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'b', 'a',
                           'u', 'i', 'body', 'div', 'span', 'p']
    # tags removed together with their content
    cleaner.kill_tags = ['table', 'img', 'semantics', 'script', 'noscript',
                         'style', 'meta', 'label', 'li', 'ul', 'ol', 'sup',
                         'math', 'nav', 'dl', 'dd', 'sub']
    return cleaner
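# Hypothetical usage of get_cleaner(); the sample document is made up for
# illustration. Tags in remove_tags are unwrapped (their text is kept), while
# tags in kill_tags disappear along with their content:
cleaner = get_cleaner()
doc = '<html><body><div><p>kept <b>text</b></p><table><tr><td>dropped</td></tr></table></div></body></html>'
print(cleaner.clean_html(doc))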
import lxml.html
from lxml.html.clean import Cleaner
from bs4 import BeautifulSoup

def clean_text(data):
    # `data` is expected to be an already-parsed lxml tree
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.meta = True
    cleaner.annoying_tags = True
    stuff = lxml.html.tostring(cleaner.clean_html(data))
    soup = BeautifulSoup(stuff.decode('utf-8', 'ignore'))
    all_text = ' '.join(x.strip() for x in soup.findAll(text=True) if x.strip())
    return all_text
from lxml import html
from lxml.html.clean import Cleaner
from bs4 import BeautifulSoup, UnicodeDammit

def clean_text(data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.meta = True
    cleaner.annoying_tags = True
    # detect the document encoding before parsing
    doc = UnicodeDammit(data, is_html=True)
    parser = html.HTMLParser(encoding=doc.original_encoding)
    root = html.document_fromstring(data, parser=parser)
    stuff = html.tostring(cleaner.clean_html(root))
    soup = BeautifulSoup(stuff.decode('utf-8', 'ignore'))
    all_text = ' '.join(x.strip() for x in soup.findAll(text=True) if x.strip())
    return all_text.encode('ascii', 'ignore')
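# A minimal check of the encoding-aware variant above; the byte string is a
# made-up example. The cleaner drops the <script> content, so this should
# print something like b'Hello world':
print(clean_text(b'<html><body><p>Hello <script>x()</script>world</p></body></html>'))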
import logging
import re

from lxml import etree
from lxml.html.clean import Cleaner

from .filters import duplicate_test, textfilter
from .settings import CUT_EMPTY_ELEMS, DEFAULT_CONFIG, MANUALLY_CLEANED, MANUALLY_STRIPPED
from .utils import trim

LOGGER = logging.getLogger(__name__)

# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
# https://lxml.de/apidoc/lxml.html.clean.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
#HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
#HTML_CLEANER.kill_tags = MANUALLY_CLEANED
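# A hedged sketch of how such a module-level cleaner is typically applied; the
# sample markup is illustrative. With the conservative flags above, only
# comments and processing instructions are dropped:
import lxml.html
tree = lxml.html.fromstring('<html><body><p>text<!-- a comment --></p></body></html>')
print(lxml.html.tostring(HTML_CLEANER.clean_html(tree)))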
import re
import sys
import urllib.request

from lxml.html.clean import Cleaner
from django.http import HttpResponse   # needed by the view below

import testApp.processing as process

cleaner = Cleaner()
cleaner.javascript = True  # activate the javascript filter
cleaner.style = True
cleaner.scripts = True
cleaner.links = True
cleaner.meta = True
cleaner.page_structure = True
cleaner.frames = True
cleaner.forms = True
cleaner.annoying_tags = True

def get_url_content(url):
    try:
        with urllib.request.urlopen(url) as page:
            text = page.read()
    except Exception:
        return "Couldn't load url"
    return text

def index(request):
    return HttpResponse("Hello, world. You're at the homepage.")
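# Illustrative only: fetching a page and running it through the cleaner
# configured above; the URL is a placeholder.
import lxml.html

raw = get_url_content('https://example.com')
if isinstance(raw, bytes):   # an error comes back as a plain str message
    cleaned = cleaner.clean_html(lxml.html.fromstring(raw))
    print(lxml.html.tostring(cleaned)[:200])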
    './/*[(self::div or self::section or self::ul)][starts-with(@class, "author-") or starts-with(@id, "shar") or starts-with(@class, "shar") or contains(@class, "share-") or contains(@id, "social") or contains(@class, "social") or starts-with(@id, "jp-") or starts-with(@id, "dpsp-content")]',
    './/*[(self::div or self::section)][contains(@id, "author") or contains(@class, "author")]',
    # './/aside',  # conflicts with text extraction
]

COMMENTS_DISCARD_XPATH = [
    './/*[(self::div or self::section)][starts-with(@id, "respond")]',
    './/cite',
    './/quote',
    './/*[starts-with(@id, "reply-") or starts-with(@class, "reply-title")]',
    './/*[contains(@id, "akismet") or contains(@class, "akismet")]',
]

# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = True
HTML_CLEANER.forms = True
HTML_CLEANER.frames = True
HTML_CLEANER.javascript = True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = True
HTML_CLEANER.style = False
HTML_CLEANER.remove_tags = [
    'a', 'abbr', 'acronym', 'address', 'big', 'cite', 'font', 'ins', 'meta',
import re

from lxml.html.clean import Cleaner
from scrapy.selector import Selector

def scrape(lineHashDB, html, encoding):
    # cleaner setup
    cleaner = Cleaner(allow_tags=['div', 'p'], remove_unknown_tags=False)
    cleaner.javascript = True    # activate the javascript filter
    cleaner.style = True         # activate the styles & stylesheet filter
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    cleaner.page_structure = False
    cleaner.remove_tags = ['b', 'a', 'h']
    cleaner.kill_tags = ['script']

    # invoke cleaner
    try:
        page = cleaner.clean_html(html)
    except ValueError:
        # lxml raises ValueError for unicode strings that carry an encoding declaration
        content = u""
        return content

    page8 = page
    page8 = re.sub(u'\n', ' ', page8)               # remove NL
    # page8 = re.sub(u'\s', '', page8, re.UNICODE)  # blanks -> space
    page8 = re.sub(u'\r', ' ', page8)               # remove CR
    page8 = re.sub(u'<!--.*?-->', ' ', page8)       # remove comments
    page8 = re.sub(u' class=".*?"', ' ', page8)     # remove attributes
    page8 = re.sub(u' id=".*?"', ' ', page8)
    page8 = re.sub(u' rel=".*?"', ' ', page8)
    page8 = re.sub(u'\[an error occurred while processing this directive\]', ' ', page8)
    page8 = re.sub(u'>\s*?<', '><', page8)          # remove blanks between tags

    # cycle to remove spurious divs
    for count in range(1, 20):
        page8 = re.sub(u'>.{0,10}<', '><', page8)   # remove words under 10 chars between tags
        page8 = re.sub(u'<div></div>', ' ', page8)
        page8 = re.sub(u'<p></p>', ' ', page8)
        page8 = re.sub(u'<span></span>', ' ', page8)
    page8 = re.sub(u'\s+', ' ', page8)              # collapse repeated blanks

    # XPaths: the second assignment overrides the first
    xpath = '//*[((p) or (a) or (b) or (div) or (span))]/node()[(string-length() > 300)]/text()'
    xpath = '//*[((p) or (div))]/node()[(string-length() > 100)]/text()'
    sel = Selector(text=page8, type="html")
    text = sel.xpath(xpath).extract()

    content = u""
    if text:
        for s in text:
            # squash duplicate whitespace; this also strips leading and
            # trailing blanks (the original discarded this result)
            s = ' '.join(s.split())
            # remove short lines: on empirical analysis, almost no sentence
            # under 40 chars is a relevant part of the article text (mostly
            # repeats of title, authors, dates, etc.)
            if len(s) < 40:
                continue   # was `next`, a no-op expression
            content += s
            content += "\n"
    return content