def clean_html(html_text,
               javascript=True, scripts=True, style=True, embedded=True,
               links=True, forms=True, frames=True, comments=True,
               annoying_tags=True, meta=True, safe_attrs_only=True,
               remove_unknown_tags=True, processing_instructions=True):
    """Strip JavaScript, styles and other unwanted markup from the HTML
    and return the cleaned markup as a string."""
    # For every flag: True = remove that kind of content, False = keep it.
    cleaner = Cleaner()
    cleaner.javascript = javascript
    cleaner.scripts = scripts
    cleaner.style = style
    cleaner.embedded = embedded
    cleaner.links = links
    cleaner.forms = forms
    cleaner.frames = frames
    cleaner.comments = comments
    cleaner.page_structure = False  # keep the page structure (html/head/body)
    cleaner.annoying_tags = annoying_tags
    cleaner.meta = meta
    cleaner.safe_attrs_only = safe_attrs_only
    cleaner.remove_unknown_tags = remove_unknown_tags
    cleaner.processing_instructions = processing_instructions
    clean_content = cleaner.clean_html(lxml.html.fromstring(html_text))
    return lxml.html.tostring(clean_content)
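# Minimal usage sketch for clean_html() above, assuming `import lxml.html` and
# `from lxml.html.clean import Cleaner` are in scope; the HTML string is a
# made-up example.
sample = ('<html><head><script>alert(1)</script><style>p {color: red}</style></head>'
          '<body><p onclick="go()">Hello <b>world</b></p></body></html>')
print(clean_html(sample))
# With the default arguments the <script>/<style> elements and the onclick
# handler are removed, while the paragraph text is kept.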
def html2text(html):
    cleaner = Cleaner()
    # True = remove the corresponding content
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True
    #html = unicodedata.normalize('NFKD', html).encode('ascii', 'ignore')
    try:
        document = lxml.html.document_fromstring(html)
        c = cleaner.clean_html(document)
        html = lxml.html.tostring(c)
        soup = BeautifulSoup(html, 'lxml')
        parsed_text = soup.get_text()
        if len(parsed_text) > MINSIZE_CHARSDOC:  # minimum length defined at module level
            return parsed_text.lower()
        return None
    except Exception:
        return None
def lxml_extractor(html, url):
    """LXML-based extractor."""
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True  # also covers stylesheets
    cleaner.comments = True
    cleaner.embedded = True
    cleaner.forms = True
    cleaner.frames = True
    cleaner.annoying_tags = True
    cleaner.kill_tags = NEGATIVE_K       # tag lists defined at module level
    cleaner.allow_tags = POSITIVE_K
    cleaner.remove_unknown_tags = False  # required when allow_tags is set
    cleaner.safe_attrs_only = True
    try:
        html = lxml.html.fromstring(html, base_url=url)
        tree = cleaner.clean_html(html)
        #tree.make_links_absolute(url)
        doc = lxml.html.tostring(tree)
        doc = soup_extractor(doc, url)
    except ValueError:
        doc = soup_extractor(html, url)
    return doc
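# Side note on the allow/kill lists used in lxml_extractor(): NEGATIVE_K and
# POSITIVE_K are defined elsewhere in the original project, so the values below
# are hypothetical stand-ins. lxml raises a ValueError when allow_tags is
# combined with remove_unknown_tags=True, hence the explicit override.
from lxml.html.clean import Cleaner

NEGATIVE_K = ['nav', 'aside', 'footer']         # removed together with their content
POSITIVE_K = ['html', 'body', 'div', 'p', 'a']  # other tags are dropped, their text kept

demo_cleaner = Cleaner()
demo_cleaner.kill_tags = NEGATIVE_K
demo_cleaner.allow_tags = POSITIVE_K
demo_cleaner.remove_unknown_tags = False        # required when allow_tags is set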
def clearTag_old(self, text: str) -> str:
    import lxml.html
    from lxml.html.clean import Cleaner

    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.forms = True
    cleaner.embedded = True
    cleaner.frames = True
    cleaner.remove_unknown_tags = True
    cleaner.kill_tags = ["img"]
    cleaner.remove_tags = [
        "strong", "div", "body", "br", "a", "p", "blockquote",
        "h3", "ol", "li", "font",
    ]
    # clean_html() returns an element here, so serialize it before decoding
    cleaned = cleaner.clean_html(lxml.html.document_fromstring(text))
    return lxml.html.tostring(cleaned).decode("utf-8")
def cleaner_parameters():
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math',
    ]
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'span', 'b', 'a', 'u', 'i', 'body',
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list  # tags dropped but their text kept
    html_cleaner.kill_tags = reject_list    # tags dropped together with their content
    return html_cleaner
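# Usage sketch for cleaner_parameters(); the input document is a made-up
# example. remove_tags (the accept_list) drops the tag itself but keeps its
# children and text, while kill_tags (the reject_list) removes the element
# together with everything inside it.
import lxml.html

page = lxml.html.document_fromstring(
    '<html><body><nav>menu</nav><div><h1>Title</h1><p>Body <b>text</b></p></div></body></html>')
print(lxml.html.tostring(cleaner_parameters().clean_html(page)))
# the <nav> block disappears entirely; the headline and paragraph text remain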
def sanitize_html(rendered):
    if rendered:
        cleaner = Cleaner(safe_attrs_only=False)
        cleaner.host_whitelist = ('www.youtube.com', 'player.vimeo.com',
                                  'embed.ted.com', 'prezi.com',
                                  'www.google.com', 'www.slideshare.net',
                                  'maps.google.com', 'docs.google.com')
        cleaner.forms = False
        rendered = cleaner.clean_html(rendered)
        # remove div wrapper if there is one
        if rendered.startswith('<div>'):
            rendered = rendered[5:-6]
    return rendered
def sanitize_html(rendered):
    if rendered:
        cleaner = Cleaner(safe_attrs_only=False)
        cleaner.host_whitelist = (
            'www.youtube.com',
            'player.vimeo.com',
            'embed.ted.com',
        )
        cleaner.forms = False
        rendered = cleaner.clean_html(rendered)
        # remove div wrapper if there is one
        if rendered.startswith('<div>'):
            rendered = rendered[5:-6]
    return rendered
def sanitize_html(rendered):
    if rendered:
        cleaner = Cleaner(safe_attrs_only=False)
        cleaner.host_whitelist = (
            "www.youtube.com",
            "player.vimeo.com",
            "embed.ted.com",
            "prezi.com",
            "www.google.com",
            "www.slideshare.net",
            "maps.google.com",
            "docs.google.com",
        )
        cleaner.forms = False
        rendered = cleaner.clean_html(rendered)
        # remove div wrapper if there is one
        if rendered.startswith("<div>"):
            rendered = rendered[5:-6]
    return rendered
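# Usage sketch for sanitize_html(): hosts in host_whitelist keep their embedded
# <iframe>/<embed> content, while embeds from any other host are stripped. Both
# URLs below are made-up examples.
kept = sanitize_html('<iframe src="https://www.youtube.com/embed/abc"></iframe>')
dropped = sanitize_html('<iframe src="https://ads.example.com/banner"></iframe>')
# `kept` should still contain the YouTube iframe; `dropped` should no longer
# contain an iframe.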
def f_parse(args):
    def isAlphabet(word):
        alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'i', 'l', 'm',
                    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'w', 'z',
                    'à', 'è', 'é', 'ì', 'í', 'ò', 'ó', 'ù', 'ú']
        guard = True
        for t in word:
            if t not in alphabet:
                guard = False
        return guard

    loc = args[0]
    corpuses = args[1]

    MINSIZE_WORD = 4
    MAXSIZE_WORD = 15
    MINSIZE_CHARSDOC = 100
    MINSIZE_WORDSDOC = 50

    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    ret = []
    for document in corpuses:
        if len(document) > 0:
            try:
                document = lxml.html.document_fromstring(document)
                c = cleaner.clean_html(document)
                html = lxml.html.tostring(c)
                soup = BeautifulSoup(html, 'lxml')
                parsed_text = soup.get_text()
                if len(parsed_text) > MINSIZE_CHARSDOC:
                    parsed_text = parsed_text.lower()
                    tokenizer = RegexpTokenizer(r'\w+')
                    # stop-word lists for the supported languages
                    en_stop = get_stop_words('en')
                    it_stop = get_stop_words('it')
                    sp_stop = get_stop_words('es')
                    ge_stop = get_stop_words('de')
                    fr_stop = get_stop_words('fr')
                    # tokenize the cleaned document and drop stop words
                    tokens = tokenizer.tokenize(parsed_text)
                    stopped_tokens = [t for t in tokens
                                      if t not in en_stop and t not in it_stop
                                      and t not in sp_stop and t not in ge_stop
                                      and t not in fr_stop]
                    for word in stopped_tokens:
                        # keep words without digits, longer than one character,
                        # made only of (accented) alphabet characters
                        if not any(char.isdigit() for char in word):
                            if len(word) > 1 and isAlphabet(word):
                                ret.append(word)
            except Exception:
                print('Exception: document empty')
    return [loc, ret]
import lxml
from lxml import etree
from lxml.html.clean import autolink_html
from lxml.html.clean import Cleaner

# LXML parser
cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True  # also removes stylesheets
cleaner.comments = True
cleaner.embedded = True
cleaner.forms = True
cleaner.frames = True
#cleaner.safe_attrs_only = True

import re
notalpha = re.compile('[^a-zA-Z]')

# BeautifulSoup parser
from bs4 import BeautifulSoup as bs
from bs4 import Comment


def make_links_absolute(soup, url):
    return [urlparse.urljoin(url, tag['href'])
            for tag in soup.findAll('a', href=True)]


def clean_html(soup):
    soup = bs(" ".join([s.extract() for s in soup('script')]))
    soup = bs(" ".join([s.extract() for s in soup('iframe')]))
    soup = bs(" ".join([s.extract() for s in soup('form')]))
    soup = bs(" ".join([s.extract() for s in soup('embed')]))
    soup = bs(" ".join([s.extract() for s in soup('style')]))
from lxml.html.clean import Cleaner

from .filters import duplicate_test, textfilter
from .settings import CUT_EMPTY_ELEMS, DEFAULT_CONFIG, MANUALLY_CLEANED, MANUALLY_STRIPPED
from .utils import trim

LOGGER = logging.getLogger(__name__)

# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
# https://lxml.de/apidoc/lxml.html.clean.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
#HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
#HTML_CLEANER.kill_tags = MANUALLY_CLEANED


def tree_cleaning(tree, include_tables, include_images=False):
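# The body of tree_cleaning() is not shown above; as a minimal sketch, a
# pre-configured Cleaner like HTML_CLEANER can be applied to an already parsed
# tree as follows (the helper name and the sample markup are hypothetical).
import lxml.html

def basic_cleaning(tree):
    # clean_html() returns a cleaned copy; HTML_CLEANER(tree) would clean in place
    return HTML_CLEANER.clean_html(tree)

cleaned = basic_cleaning(
    lxml.html.document_fromstring('<html><body><p>text<!-- note --></p></body></html>'))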
"//*[contains(@id, 'date') or contains(@id, 'Date') or contains(@id, 'datum') or contains(@id, 'Datum') or contains(@id, 'time') or contains(@class, 'post-meta-time')]", "//*[contains(@class, 'date') or contains(@class, 'Date') or contains(@class, 'datum') or contains(@class, 'Datum')]", "//*[contains(@class, 'postmeta') or contains(@class, 'post-meta') or contains(@class, 'entry-meta') or contains(@class, 'postMeta') or contains(@class, 'post_meta') or contains(@class, 'post__meta')]", "//*[@class='meta' or @class='meta-before' or @class='asset-meta' or contains(@id, 'article-metadata') or contains(@class, 'article-metadata') or contains(@class, 'byline') or contains(@class, 'subline')]", "//*[contains(@class, 'published') or contains(@class, 'posted') or contains(@class, 'submitted') or contains(@class, 'created-post')]", "//*[contains(@id, 'lastmod') or contains(@itemprop, 'date') or contains(@class, 'time')]", "//footer", "//*[@class='post-footer' or @class='footer' or @id='footer']", "//small", "//*[contains(@class, 'author') or contains(@class, 'autor') or contains(@class, 'field-content') or @class='meta' or contains(@class, 'info') or contains(@class, 'fa-clock-o')]", ] CLEANER = Cleaner() CLEANER.comments = False CLEANER.embedded = True CLEANER.forms = False CLEANER.frames = True CLEANER.javascript = True CLEANER.links = False CLEANER.meta = False CLEANER.page_structure = True CLEANER.processing_instructions = True CLEANER.remove_unknown_tags = False CLEANER.safe_attrs_only = False CLEANER.scripts = False CLEANER.style = True CLEANER.kill_tags = [ 'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'rdf', 'svg', 'video' ] # 'embed', 'figure', 'img', 'table'
import re
import sys
import urllib.request

from bs4 import BeautifulSoup
from django.http import HttpResponse
from lxml.html.clean import Cleaner

import testApp.processing as process

cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True
cleaner.scripts = True
cleaner.links = True
cleaner.meta = True
cleaner.page_structure = True
cleaner.frames = True
cleaner.forms = True
cleaner.annoying_tags = True


def get_url_content(url):
    try:
        with urllib.request.urlopen(url) as page:
            text = page.read()
    except Exception:
        return "Couldn't load url"
    return text


def index(request):
    return HttpResponse("Hello, world. You're at the homepage.")
]

COMMENTS_DISCARD_XPATH = [
    './/*[(self::div or self::section)][starts-with(@id, "respond")]',
    './/cite',
    './/quote',
    './/*[starts-with(@id, "reply-") or starts-with(@class, "reply-title")]',
    './/*[contains(@id, "akismet") or contains(@class, "akismet")]',
]

# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = True
HTML_CLEANER.forms = True
HTML_CLEANER.frames = True
HTML_CLEANER.javascript = True
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = True
HTML_CLEANER.style = False
HTML_CLEANER.remove_tags = [
    'a', 'abbr', 'acronym', 'address', 'big', 'cite', 'font',
    'ins', 'meta', 'small', 'sub', 'sup', 'wbr',
]
# 'center', 'table', 'tbody', 'td', 'th', 'tr', 'span',
HTML_CLEANER.kill_tags = [
MAX_YEAR = datetime.date.today().year

## DateDataParser object
PARSERCONFIG = {
    'PREFER_DAY_OF_MONTH': 'first',
    'PREFER_DATES_FROM': 'past',
    'DATE_ORDER': 'DMY',
}

logger.debug('settings: %s %s %s', MIN_YEAR, TODAY, MAX_YEAR)
logger.debug('dateparser configuration: %s', PARSERCONFIG)

cleaner = Cleaner()
cleaner.comments = True
cleaner.embedded = True
cleaner.forms = False
cleaner.frames = True
cleaner.javascript = False
cleaner.links = False
cleaner.meta = False
cleaner.page_structure = True
cleaner.processing_instructions = True
cleaner.remove_unknown_tags = False
cleaner.safe_attrs_only = False
cleaner.scripts = False
cleaner.style = False
cleaner.kill_tags = [
    'audio', 'canvas', 'label', 'map', 'math', 'object',
    'picture', 'table', 'svg', 'video',
]
# 'embed', 'figure', 'img',
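# Usage sketch for the date-extraction cleaner above; the HTML string is a
# made-up example. Elements listed in kill_tags (here including <table>)
# disappear together with their content, while scripts and styles are left
# alone because those filters are disabled in this configuration.
import lxml.html

doc = lxml.html.document_fromstring(
    '<html><body><p>Posted on 1 June 2017</p><table><tr><td>ad</td></tr></table></body></html>')
print(lxml.html.tostring(cleaner.clean_html(doc)))
# the date-bearing paragraph survives; the table is gone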