def clear_html(in_str: str, tag_whitelist: list = None):
    """Remove HTML markup and JavaScript from *in_str*.

    Args:
        in_str: Markup to clean. Falsy values (``None``, ``""``) are
            returned unchanged without invoking the cleaner.
        tag_whitelist: Tag names handed to the cleaner as ``allow_tags``.
            ``None`` (the default) means an empty whitelist.

    Returns:
        The output of ``Cleaner.clean_html`` with a ``<p>`` at the very
        start and a ``</p>`` at the very end removed when present.
    """
    if not in_str:
        return in_str
    if tag_whitelist is None:
        # Build a fresh list per call: a list used as the default value
        # would be one shared object across every call.
        tag_whitelist = []
    cleaner = Cleaner(allow_tags=tag_whitelist, remove_unknown_tags=False)
    cleaner.javascript = True
    # NOTE(review): ``html`` does not appear among lxml's documented Cleaner
    # options; kept because the code has always set it -- confirm it is needed.
    cleaner.html = True
    out_str = cleaner.clean_html(in_str)
    # Drop the wrapping paragraph tags from the cleaned fragment.
    return re.sub(r'</p>$', '', re.sub(r'^<p>', '', out_str))
def cleanpage(html):
    """Run *html* through a configured lxml ``Cleaner`` and return the result.

    The cleaner keeps the page structure, ``<meta>`` and ``<link>`` tags and
    all attributes, while the javascript, style, inline-style, frames,
    embedded, comments and annoying-tags filters are switched on. ``img`` and
    ``script`` are listed in ``kill_tags``.

    Returns:
        Whatever ``cleaner.clean_html(html)`` returns, or an empty string if
        cleaning raises.
    """
    cleaner = Cleaner()
    cleaner.html = True
    # Things to keep.
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    # Filters to activate.
    cleaner.javascript = True
    cleaner.style = True
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    cleaner.kill_tags = ['img', 'script']

    try:
        content = cleaner.clean_html(html)
    except Exception:
        # Best effort: e.g. "ValueError: Unicode strings with encoding
        # declaration are not supported. Please use bytes input ...".
        # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt
        # and SystemExit still propagate.
        content = u""
    return content
def get_clean_html(self, html_text, text_only=True):
    """Parse *html_text*, filter out JavaScript and CSS, and return the result.

    With ``text_only`` true the cleaned document's text content is returned
    with every run of whitespace collapsed to a single space; otherwise the
    cleaned document is serialized with ``lxml.html.tostring``. Any exception
    raised along the way is logged and the string ``"junk"`` is returned.
    """
    # Applied one by one after construction, in this order.
    cleaner_options = (
        ('javascript', True),
        ('style', True),
        ('html', True),
        ('page_structure', False),
        ('meta', False),
        ('safe_attrs_only', False),
        ('links', False),
    )
    try:
        document = lxml.html.document_fromstring(html_text)
        self._is_etree(document)
        cleaner = Cleaner()
        for option, value in cleaner_options:
            setattr(cleaner, option, value)
        cleaned = cleaner.clean_html(document)
        if text_only:
            res = ' '.join(cleaned.text_content().split())
        else:
            res = lxml.html.tostring(cleaned)
    except Exception as e:
        logger.error(f"While parsing email in get_clean_html {e}")
        res = "junk"
    return res
def url2count(title):
    """Fetch the page for *title* and count its word-like tokens.

    The page at ``makeurl(title)`` is downloaded (5 second timeout), tabs and
    line breaks are turned into spaces, and the markup is passed through the
    lxml cleaner and then ``nltk.clean_html``. The resulting text is split on
    single spaces, and the tokens matching ``.*[A-Za-z0-9].*`` are counted.
    """
    cleaner = Cleaner()
    for flag in ('javascript', 'style', 'html'):
        setattr(cleaner, flag, True)

    response = requests.get(makeurl(title), timeout=5)
    page = response.text
    for control_char in ('\t', '\n', '\r'):
        page = page.replace(control_char, ' ')

    # NOTE(review): nltk.clean_html is believed to raise NotImplementedError
    # in NLTK 3.x -- confirm against the installed NLTK version.
    text = nltk.clean_html(cleaner.clean_html(page))
    squeezed = re.sub(r'\s{2,}', ' ', text)

    has_alnum = re.compile('.*[A-Za-z0-9].*')
    return sum(1 for token in squeezed.split(' ') if has_alnum.match(token))
def get_clean_html(etree, text_only=False):
    """Return *etree* with the JavaScript and CSS filters applied.

    *etree* is first passed to ``_is_etree``. The result is the cleaned
    document's ``text_content()`` when ``text_only`` is true, otherwise its
    ``lxml.html.tostring`` serialization.
    """
    _is_etree(etree)

    # Applied one by one after construction, in this order.
    settings_in_order = {
        'javascript': True,
        'style': True,
        'html': True,
        'page_structure': False,
        'meta': False,
        'safe_attrs_only': False,
        'links': False,
    }
    cleaner = Cleaner()
    for name, enabled in settings_in_order.items():
        setattr(cleaner, name, enabled)

    cleaned = cleaner.clean_html(etree)
    return cleaned.text_content() if text_only else lxml.html.tostring(cleaned)
def get_clean_text(filename):
    """Return the text of the HTML file *filename* with markup stripped.

    The file is parsed with a UTF-8 ``HTMLParser``, run through the lxml
    cleaner and serialized to a unicode string. Every ``<...>`` span is then
    replaced by a space, and each run of whitespace and/or the listed entity
    fragments (amp, apos, quot, gt, lt, nbsp) is collapsed into one space.
    """
    parser = html.HTMLParser(encoding='utf-8')
    document = lxml.html.parse(filename, parser=parser)

    cleaner = Cleaner()
    for option, value in (
        ('javascript', True),
        ('style', True),
        ('html', True),
        ('page_structure', False),
        ('meta', False),
        ('safe_attrs_only', False),
        ('links', False),
    ):
        setattr(cleaner, option, value)

    cleaned = cleaner.clean_html(document)
    markup = etree.tostring(cleaned, encoding='unicode')
    without_tags = re.sub(r'<.+?>', ' ', markup)
    return re.sub(
        r'(\s|&?(amp;|apos;|quot;|gt;|lt;|nbsp;))+', ' ', without_tags
    )
def handle(self, **options):
    """Poll the database changes feed and index every changed page document.

    Starting from the sequence returned by ``get_last_change()``, the loop
    repeatedly asks ``settings.db`` for changes, loads each changed document,
    skips those whose ``type`` does not contain ``"page"``, and writes the
    rest to the index writer. After each batch the new sequence is stored
    with ``set_last_change``. The loop has no exit condition, so this method
    does not return.

    Args:
        **options: Accepted for the command interface; not used here.
    """
    # NOTE(review): the nesting below (per-document work inside the ``for``
    # loop, checkpointing once per poll) should be confirmed against the
    # intended behaviour.
    since = get_last_change()
    writer = get_writer()
    last_change = since

    # The cleaner configuration never changes, so build it once.
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False

    while True:
        changes = settings.db.changes(since=since)
        since = changes["last_seq"]
        if since != last_change:
            # The message needs a placeholder for the count to be shown,
            # and the count of interest is the number of change entries.
            print("Detected new tasks {}".format(len(changes["results"])))
            print("=== changes ===")
            pprint(changes)

        for changeset in changes["results"]:
            try:
                doc = settings.db[changeset["id"]]
            except couchdb.http.ResourceNotFound:
                print("resource not found")
                continue

            if not ("type" in doc and "page" in doc["type"]):
                # Report a skipped document only once per new sequence.
                if since != last_change:
                    print("not processing doc: {}".format(str(doc)))
                    last_change = since
                continue

            print("indexing", doc["url"])

            # Derive title, plain text and description from the raw content.
            raw = doc["content"]
            print("type(RAW) = %s" % type(raw))
            tree = document_fromstring(str(raw))
            title = " ".join(tree.xpath("//title/text()"))
            html = cleaner.clean_html(tree)
            text_content = html.text_content()
            description = " ".join(
                tree.xpath("//meta[@name='description']/@content")
            )

            writer.update_document(
                title=title,
                url=doc["url"],
                desc=description,
                rank=doc["rank"],
                content="\n".join([title, doc["url"], text_content]),
                raw=raw,
            )
            writer.commit()
            # A new writer is obtained after every commit.
            writer = get_writer()

        set_last_change(since)
        last_change = since
# Load the saved interactive history; a missing or unreadable history file
# is not an error.
try:
    readline.read_history_file(histfile)
except IOError:
    pass

# Optional demo setup: when lxml and requests are importable, fetch a page
# and leave ``resp``, ``raw``, ``tree``, ``cleaner``, ``html`` and
# ``text_content`` bound in the session.
try:
    from lxml.html.clean import Cleaner
    import lxml
    from lxml.html import document_fromstring
    import requests

    try:
        # Bounded wait: without a timeout a stalled connection would block
        # startup indefinitely.
        resp = requests.get('http://en.wikipedia.org/', timeout=10)
    except requests.RequestException as exc:
        # A network failure must not abort the rest of this script,
        # otherwise the history-saving hook below is never registered.
        print("startup page fetch skipped: %s" % exc)
    else:
        tree = document_fromstring(resp.text)
        raw = resp.text

        # enable filters to remove Javascript and CSS from HTML document
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        cleaner.html = True
        cleaner.page_structure = False
        cleaner.meta = False
        cleaner.safe_attrs_only = False
        cleaner.links = False

        html = cleaner.clean_html(tree)
        text_content = html.text_content()
except ImportError:
    pass

# Save the history on interpreter exit, then drop the helper names from the
# namespace.
atexit.register(readline.write_history_file, histfile)
del os, histfile
def handle(self, **options):
    """Poll the database changes feed and index every changed page document.

    Starting from the sequence returned by ``get_last_change()``, the loop
    repeatedly asks ``settings.db`` for changes, loads each changed document,
    skips those whose ``type`` does not contain ``'page'``, and writes the
    rest to the index writer. After each batch the new sequence is stored
    with ``set_last_change``. The loop has no exit condition, so this method
    does not return.

    Args:
        **options: Accepted for the command interface; not used here.
    """
    # NOTE(review): the nesting below (per-document work inside the ``for``
    # loop, checkpointing once per poll) should be confirmed against the
    # intended behaviour.
    since = get_last_change()
    writer = get_writer()
    last_change = since

    # The cleaner configuration never changes, so build it once.
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False

    while True:
        changes = settings.db.changes(since=since)
        since = changes["last_seq"]
        if since != last_change:
            # The message needs a placeholder for the count to be shown,
            # and the count of interest is the number of change entries.
            print("Detected new tasks {}".format(len(changes["results"])))
            print("=== changes ===")
            pprint(changes)

        for changeset in changes["results"]:
            try:
                doc = settings.db[changeset["id"]]
            except couchdb.http.ResourceNotFound:
                print("resource not found")
                continue

            if not ("type" in doc and "page" in doc["type"]):
                # Report a skipped document only once per new sequence.
                if since != last_change:
                    print("not processing doc: {}".format(str(doc)))
                    last_change = since
                continue

            print("indexing", doc["url"])

            # Derive title, plain text and description from the raw content.
            raw = doc['content']
            print("type(RAW) = %s" % type(raw))
            tree = document_fromstring(str(raw))
            title = ' '.join(tree.xpath('//title/text()'))
            html = cleaner.clean_html(tree)
            text_content = html.text_content()
            description = ' '.join(
                tree.xpath("//meta[@name='description']/@content"))

            writer.update_document(
                title=title,
                url=doc['url'],
                desc=description,
                rank=doc['rank'],
                content='\n'.join([title, doc['url'], text_content]),
                raw=raw,
            )
            writer.commit()
            # A new writer is obtained after every commit.
            writer = get_writer()

        set_last_change(since)
        last_change = since