def html2content(html, allowed_tags=["a", "abbr", "article", "aside", "b", "base", "blockquote", "body", "br", "caption", "cite", "code", "col", "colgroup", "dd", "del", "dfn", "dl", "dt", "em", "embed", "figcaption", "figure", "footer", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", "i", "img", "li", "map", "mark", "math", "meta", "meter", "nav", "noscript", "object", "ol", "optgroup", "option", "output", "p", "param", "pre", "progress", "q", "rp", "rt", "ruby", "s", "samp", "section", "small", "source", "span", "strong", "sub", "sup", "svg", "table", "tbody", "td", "th", "thead", "tfoot", "time", "title", "tr", "track", "u", "ul", "var", "video", "wbr"]): cleaner = Cleaner() cleaner.allow_tags = allowed_tags cleaner.remove_unknown_tags = False cleaner.page_structure = False cleaner.meta = False cleaner.style = True cleaner.embeded = False return cleaner.clean_html(html)
def clearTag_old(self, text: str) -> str:
    import lxml.html
    from lxml.html.clean import Cleaner

    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.forms = True
    cleaner.embedded = True
    cleaner.frames = True
    cleaner.remove_unknown_tags = True
    cleaner.kill_tags = ["img"]
    cleaner.remove_tags = [
        "strong", "div", "body", "br", "a", "p", "blockquote",
        "h3", "ol", "li", "font",
    ]
    # clean_html() on a parsed tree returns an element, so it has to be
    # serialized before decoding (calling .decode() on the element fails)
    cleaned = cleaner.clean_html(lxml.html.document_fromstring(text))
    return lxml.html.tostring(cleaned).decode("utf-8")
def cleanpage(html):
    # cleaner setup
    cleaner = Cleaner()
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True       # activate the styles & stylesheet filter
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    # cleaner.remove_tags = ['b', 'img', 'h']
    cleaner.kill_tags = ['img', 'script']

    # invoke cleaner
    try:
        content = cleaner.clean_html(html)
    except ValueError:
        # "Unicode strings with encoding declaration are not supported.
        # Please use bytes input or XML fragments without declaration."
        content = u""
    return content
def clean_html(html_text,
               javascript=True, scripts=True, style=True, embedded=True,
               links=True, forms=True, frames=True, comments=True,
               annoying_tags=True, meta=True, safe_attrs_only=True,
               remove_unknown_tags=True, processing_instructions=True):
    """Clean all the javascript and styles from the HTML, returning the string
    with only the html content. For every flag: True = remove, False = keep."""
    cleaner = Cleaner()
    cleaner.javascript = javascript
    cleaner.scripts = scripts
    cleaner.style = style
    cleaner.embedded = embedded
    cleaner.links = links
    cleaner.forms = forms
    cleaner.frames = frames
    cleaner.comments = comments
    cleaner.page_structure = False  # keep the <html>/<head>/<body> structure
    cleaner.annoying_tags = annoying_tags
    cleaner.meta = meta
    cleaner.safe_attrs_only = safe_attrs_only
    cleaner.remove_unknown_tags = remove_unknown_tags
    cleaner.processing_instructions = processing_instructions

    clean_content = cleaner.clean_html(lxml.html.fromstring(html_text))
    return lxml.html.tostring(clean_content)
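A hedged usage example for the function above; the sample page is made up, and the exact serialization depends on the lxml version:

import lxml.html
from lxml.html.clean import Cleaner

page = ("<html><head><script>alert(1)</script></head>"
        "<body><p style='color:red' onclick='x()'>text</p></body></html>")
print(clean_html(page))
# roughly b'<html><body><p>text</p></body></html>': the script element,
# the inline style and the event handler are all removed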
def cleaner_parameters():
    reject_list = [
        'script', 'noscript', 'style', 'meta', 'semantics', 'img', 'label',
        'table', 'li', 'ul', 'ol', 'nav', 'dl', 'dd', 'sub', 'sup', 'math'
    ]
    accept_list = [
        'div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'span', 'b', 'a', 'u', 'i', 'body'
    ]
    html_cleaner = Cleaner()
    html_cleaner.remove_unknown_tags = True
    html_cleaner.processing_instructions = True
    html_cleaner.style = True
    html_cleaner.comments = True
    html_cleaner.scripts = True
    html_cleaner.javascript = True
    html_cleaner.meta = True
    html_cleaner.links = True
    html_cleaner.embedded = True
    html_cleaner.annoying_tags = True
    html_cleaner.frames = True
    html_cleaner.forms = True
    html_cleaner.remove_tags = accept_list  # strip these tags but keep their text
    html_cleaner.kill_tags = reject_list    # drop these tags together with their content
    return html_cleaner
def get_clean_html(self, html_text, text_only=True):
    try:
        etree = lxml.html.document_fromstring(html_text)
        self._is_etree(etree)
        # enable filters to remove Javascript and CSS from the HTML document
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.page_structure = False
        cleaner.meta = False
        cleaner.safe_attrs_only = False
        cleaner.links = False

        html = cleaner.clean_html(etree)
        if text_only:
            return ' '.join(html.text_content().split())

        res = lxml.html.tostring(html)
    except Exception as e:
        logger.error(f"While parsing email in get_clean_html {e}")
        res = "junk"
    return res
def html2text(html):
    cleaner = Cleaner()
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    # html = unicodedata.normalize('NFKD', html).encode('ascii', 'ignore')
    try:
        document = lxml.html.document_fromstring(html)
        c = cleaner.clean_html(document)
        html = lxml.html.tostring(c)
        soup = BeautifulSoup(html, 'lxml')
        parsed_text = soup.get_text()
        if len(parsed_text) > MINSIZE_CHARSDOC:
            return parsed_text.lower()
        return None
    except Exception:
        return None
def convertHtmlToDicts(url, content):
    """ given a url and content, create file and article dictionaries
    content has to include normal newlines, no \a or #N# replacers
    returns None, None on error
    """
    # lxml does not like unicode if the document has an explicit encoding
    if " encoding=" not in content:
        content = pubGeneric.forceToUnicode(content)
    logging.debug("Converting to text: %s " % (repr(url)))
    artDict = pubStore.createEmptyArticleDict(source="bing", fulltextUrl=url)

    if "<html" not in content:
        return None, None

    try:
        logging.debug("Parsing html with lxml, html size %d" % len(content))
        tree = lxml.html.document_fromstring(content)
        logging.debug("end parse html")
    except lxml.etree.XMLSyntaxError:
        return None, None

    titleEl = tree.find("head/title")
    if titleEl is not None:
        title = titleEl.text
    else:
        logging.debug("No title found?")
        title = ""

    metaTags = tree.findall("head/meta")
    artDict = parseMetaData(metaTags, artDict)

    logging.debug("Cleaning html tree")
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.embedded = True
    cleaner.page_structure = True
    # cleaner.remove_tags = ["a", "li", "td"]
    cleanTree = cleaner.clean_html(tree)
    logging.debug("Cleaning done, now converting to ASCII")
    # text = cleanTree.text_content()
    newlineTags = ["p", "br"]
    asciiText = pubXml.treeToAsciiText(cleanTree, newlineTags)
    logging.debug("ASCII conversion done")
    logging.debug("title: %s" % title)

    if "title" not in artDict or artDict["title"] == "":
        artDict["title"] = title
    if artDict["abstract"] == "":
        abstract = unidecode.unidecode(asciiText[0:1500]).strip()
        artDict["abstract"] = abstract
    logging.debug("abstract: %s" % artDict["abstract"])

    fileDict = pubStore.createEmptyFileDict(url=url, content=asciiText,
                                            mimeType="text/html")
    logging.debug("meta data extract success: %s" % artDict)
    return artDict, fileDict
def cleaner_li(self):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.meta = True
    cleaner.safe_attrs_only = True
    cleaner.remove_tags = ['i', 'span', 'b', 'li']  # unwrap these tags, keep their text
    cleaner.safe_attrs = ['href']  # drop every attribute except href
    return cleaner
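A sketch of how the returned cleaner might be applied; obj stands in for an instance of whatever class defines cleaner_li, and the fragment is made up:

import lxml.html

fragment = lxml.html.fromstring(
    '<ul><li><a href="/x" target="_blank"><b>link</b></a></li></ul>')
cleaned = obj.cleaner_li().clean_html(fragment)
print(lxml.html.tostring(cleaned))
# remove_tags unwraps <b> and <li> (their text survives);
# safe_attrs drops every attribute except href, so target="_blank" is gone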
def trim_html(html):
    """Takes an html string as input and returns the html without any styles or javascript"""
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.javascript = True  # get rid of the javascript and the styles
    cleaner.style = True
    cleaner.meta = False  # keeping the meta tags is important for page redirection purposes
    cleaner.safe_attrs_only = False
    return cleaner.clean_html(html)
def clean_text(data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.meta = True
    cleaner.annoying_tags = True

    stuff = lxml.html.tostring(cleaner.clean_html(data))
    soup = BeautifulSoup(stuff.decode('utf-8', 'ignore'), 'lxml')
    # join all non-empty text nodes with single spaces
    all_text = ' '.join(filter(None, (x.strip() for x in soup.findAll(text=True))))
    return all_text
def get_clean_html(etree, text_only=False):
    _is_etree(etree)
    # enable filters to remove Javascript and CSS from the HTML document
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False

    html = cleaner.clean_html(etree)
    if text_only:
        return html.text_content()
    return lxml.html.tostring(html)
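A usage sketch, assuming _is_etree merely validates its argument; the sample document is illustrative. clean_html() works on a copy, so the input tree can be reused:

import lxml.html

tree = lxml.html.document_fromstring(
    '<html><body><p>kept</p><script>dropped()</script></body></html>')
print(get_clean_html(tree, text_only=True))  # 'kept' (modulo whitespace)
print(get_clean_html(tree))                  # serialized cleaned markup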
def get_text(session, url, title, dir):
    r = session.get(url, stream=True)
    doc = lxml.html.fromstring(r.text)

    # drop the sidebar before cleaning
    sidebar = doc.find_class('course-sidebar')[0]
    sidebar.getparent().remove(sidebar)

    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.meta = True
    cleaner.kill_tags = ['header']
    # serialize as text so it can be written to a text-mode file
    cleantext = lxml.html.tostring(cleaner.clean_html(doc), encoding='unicode')

    filename = os.path.join(dir, title + '.html')
    with open(filename, 'w') as fout:
        print('Downloading [T] ' + title + ' ...')
        fout.write(cleantext)
def gettextonly(self, html, url):
    cleaner = Cleaner()
    cleaner.scripts = True
    cleaner.style = True
    cleaner.links = True
    cleaner.meta = False
    cleaner.page_structure = False
    cleaner.remove_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'span',
                           'img', 'area', 'map', 'noscript', 'td', 'tr',
                           'table', 'a', 'p', 'br', 'li', 'ul']

    doc = lxml.html.fromstring(html)
    path = '/html/body'
    try:
        body = doc.xpath(path)[0]
    except Exception as detail:
        print(detail)
        return False
    return cleaner.clean_html(body).text_content().split()
def clean_text(data):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.meta = True
    cleaner.annoying_tags = True

    # sniff the original encoding, then parse with it explicitly
    doc = UnicodeDammit(data, is_html=True)
    parser = html.HTMLParser(encoding=doc.original_encoding)
    root = html.document_fromstring(data, parser=parser)

    stuff = lxml.html.tostring(cleaner.clean_html(root))
    soup = BeautifulSoup(stuff.decode('utf-8', 'ignore'), 'lxml')
    # join all non-empty text nodes with single spaces
    all_text = ' '.join(filter(None, (x.strip() for x in soup.findAll(text=True))))
    return all_text.encode('ascii', 'ignore')
def get_clean_text(filename):
    utf8_parser = html.HTMLParser(encoding='utf-8')
    htmltxt = lxml.html.parse(filename, parser=utf8_parser)

    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    htmltxt = cleaner.clean_html(htmltxt)

    txt = etree.tostring(htmltxt, encoding='unicode')
    # drop the remaining tags, then collapse whitespace and stray entities
    txtresub = re.sub(r'<.+?>', ' ', txt)
    txtresub = re.sub(r'(\s|&?(amp;|apos;|quot;|gt;|lt;|nbsp;))+', ' ', txtresub)
    return txtresub
PARSERCONFIG = {
    'PREFER_DAY_OF_MONTH': 'first',
    'PREFER_DATES_FROM': 'past',
    'DATE_ORDER': 'DMY',
}

logger.debug('settings: %s %s %s', MIN_YEAR, TODAY, MAX_YEAR)
logger.debug('dateparser configuration: %s', PARSERCONFIG)

cleaner = Cleaner()
cleaner.comments = True
cleaner.embedded = True
cleaner.forms = False
cleaner.frames = True
cleaner.javascript = False
cleaner.links = False
cleaner.meta = False
cleaner.page_structure = True
cleaner.processing_instructions = True
cleaner.remove_unknown_tags = False
cleaner.safe_attrs_only = False
cleaner.scripts = False
cleaner.style = False
cleaner.kill_tags = ['audio', 'canvas', 'label', 'map', 'math', 'object',
                     'picture', 'table', 'svg', 'video']
# 'embed', 'figure', 'img',


def date_validator(datestring, outputformat):
    """Validate a string with respect to the chosen outputformat and basic heuristics"""
def f_parse(args):
    def isAlphabet(word):
        alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'i', 'l',
                    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y',
                    'w', 'z', 'à', 'è', 'é', 'ì', 'í', 'ò', 'ó', 'ù', 'ú']
        return all(t in alphabet for t in word)

    loc = args[0]
    corpuses = args[1]
    MINSIZE_WORD = 4
    MAXSIZE_WORD = 15
    MINSIZE_CHARSDOC = 100
    MINSIZE_WORDSDOC = 50

    cleaner = Cleaner()
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    ret = []
    for document in corpuses:
        # html = unicodedata.normalize('NFKD', html).encode('ascii', 'ignore')
        if len(document) == 0:
            continue
        try:
            document = lxml.html.document_fromstring(document)
            c = cleaner.clean_html(document)
            html = lxml.html.tostring(c)
            parsed_text = BeautifulSoup(html, 'lxml').get_text()
            if len(parsed_text) > MINSIZE_CHARSDOC:
                parsed_text = parsed_text.lower()
                tokenizer = RegexpTokenizer(r'\w+')
                # combined stop-word list for the five target languages
                stop_words = set(get_stop_words('en') + get_stop_words('it') +
                                 get_stop_words('es') + get_stop_words('de') +
                                 get_stop_words('fr'))
                # tokenize, then keep words that are not stop words, contain
                # no digits, are longer than one character and consist of
                # alphabet characters only
                for word in tokenizer.tokenize(parsed_text):
                    if (word not in stop_words
                            and not any(char.isdigit() for char in word)
                            and len(word) > 1
                            and isAlphabet(word)):
                        ret.append(word)
        except Exception:
            print('Exception : Document empty')
    return [loc, ret]
def handle(self, **options):
    since = get_last_change()
    writer = get_writer()
    last_change = since

    while True:
        doc = {}
        changes = settings.db.changes(since=since)
        since = changes["last_seq"]
        if since != last_change:
            print("Detected {} new tasks".format(len(changes["results"])))
            print("=== changes ===")
            pprint(changes)

        for changeset in changes["results"]:
            try:
                doc = settings.db[changeset["id"]]
            except couchdb.http.ResourceNotFound:
                print("resource not found")
                continue

            if not ("type" in doc and "page" in doc["type"]):
                if since != last_change:
                    print("not processing doc: {}".format(str(doc)))
                last_change = since
                continue

            print("indexing", doc["url"])

            # raw, html, text
            raw = doc["content"]
            print("type(RAW) = %s" % type(raw))
            tree = document_fromstring(str(raw))
            title = " ".join(tree.xpath("//title/text()"))

            # enable filters to remove Javascript and CSS from the HTML document
            cleaner = Cleaner()
            cleaner.javascript = True
            cleaner.style = True
            cleaner.page_structure = False
            cleaner.meta = False
            cleaner.safe_attrs_only = False
            cleaner.links = False

            html = cleaner.clean_html(tree)
            text_content = html.text_content()
            description = " ".join(tree.xpath("//meta[@name='description']/@content"))

            writer.update_document(
                title=title,
                url=doc["url"],
                desc=description,
                rank=doc["rank"],
                content="\n".join([title, doc["url"], text_content]),
                raw=raw,
            )
            writer.commit()
            writer = get_writer()

        set_last_change(since)
        last_change = since
from .utils import trim

LOGGER = logging.getLogger(__name__)

# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
# https://lxml.de/apidoc/lxml.html.clean.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
# HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
# HTML_CLEANER.kill_tags = MANUALLY_CLEANED


def tree_cleaning(tree, include_tables, include_images=False):
    '''Prune the tree by discarding unwanted elements'''
    # determine cleaning strategy
    cleaning_list, stripping_list = \
        MANUALLY_CLEANED.copy(), MANUALLY_STRIPPED.copy()
"//*[contains(@class, 'published') or contains(@class, 'posted') or contains(@class, 'submitted') or contains(@class, 'created-post')]", "//*[contains(@id, 'lastmod') or contains(@itemprop, 'date') or contains(@class, 'time')]", "//footer", "//*[@class='post-footer' or @class='footer' or @id='footer']", "//small", "//*[contains(@class, 'author') or contains(@class, 'autor') or contains(@class, 'field-content') or @class='meta' or contains(@class, 'info') or contains(@class, 'fa-clock-o')]", ] CLEANER = Cleaner() CLEANER.comments = False CLEANER.embedded = True CLEANER.forms = False CLEANER.frames = True CLEANER.javascript = True CLEANER.links = False CLEANER.meta = False CLEANER.page_structure = True CLEANER.processing_instructions = True CLEANER.remove_unknown_tags = False CLEANER.safe_attrs_only = False CLEANER.scripts = False CLEANER.style = True CLEANER.kill_tags = [ 'audio', 'canvas', 'label', 'map', 'math', 'object', 'picture', 'rdf', 'svg', 'video' ] # 'embed', 'figure', 'img', 'table' ## REGEX cache JSON_PATTERN = re.compile( r'"date(?:Modified|Published)":"([0-9]{4}-[0-9]{2}-[0-9]{2})') # use of regex module for speed
from django.template.loader import get_template
from django.http import HttpResponse
import rake
from bs4 import BeautifulSoup
import urllib.request
import sys
import testApp.processing as process
import re
from lxml.html.clean import Cleaner

cleaner = Cleaner()
cleaner.javascript = True  # activate the javascript filter
cleaner.style = True
cleaner.scripts = True
cleaner.links = True
cleaner.meta = True
cleaner.page_structure = True
cleaner.frames = True
cleaner.forms = True
cleaner.annoying_tags = True


def get_url_content(url):
    try:
        with urllib.request.urlopen(url) as page:
            text = page.read()
    except Exception:
        return "Couldn't load url"
    return text
try:
    readline.read_history_file(histfile)
except IOError:
    pass

try:
    from lxml.html.clean import Cleaner
    import lxml
    from lxml.html import document_fromstring
    import requests

    resp = requests.get('http://en.wikipedia.org/')
    tree = document_fromstring(resp.text)
    raw = resp.text

    # enable filters to remove Javascript and CSS from the HTML document
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False

    html = cleaner.clean_html(tree)
    text_content = html.text_content()
except ImportError:
    pass

atexit.register(readline.write_history_file, histfile)
del os, histfile