def sanitize_payload(payload):
    "Sanitize HTML"
    if not payload:
        return '', ''
    styles = []
    payload = clean_payload(payload)
    body_style, body_class = get_body_style(payload)
    if body_style:
        styles.append(body_style)
    safe_attrs = set(defs.safe_attrs)
    safe_attrs.add('style')
    cleaner = Cleaner(remove_tags=UNCLEANTAGS, safe_attrs_only=True,
                      safe_attrs=safe_attrs)
    payload = HTMLTITLE_RE.sub('', payload)
    try:
        html = cleaner.clean_html(payload)
    except ValueError:
        payload = bytes(bytearray(payload, encoding='utf-8'))
        html = cleaner.clean_html(payload)
    except XMLSyntaxError:
        html = ''
    mainstyle = sanitize_css(get_style(html))
    if mainstyle:
        styles.append(decode(mainstyle))
    style = u'\n'.join(styles)
    html = clean_styles(CSS_COMMENT_RE.sub('', html))
    html = set_body_class(html, body_class)
    return html.strip(), style.strip()
def clean_html(html, safe_attrs=('src', 'href'), input_encoding='unicode',
               output_encoding='unicode', **kwargs):
    """
    Fix HTML structure and remove non-allowed attributes from all tags.
    """
    from lxml.html.clean import Cleaner

    # Convert HTML to Unicode
    html = render_html(parse_html(html, encoding=input_encoding), make_unicode=True)

    # Strip unwanted markup with default lxml tools
    cleaner = Cleaner(page_structure=True, **kwargs)
    html = cleaner.clean_html(html)

    # Keep only allowed attributes
    tree = parse_html(html)
    for elem in tree.xpath('./descendant-or-self::*'):
        for key in elem.attrib.keys():
            if safe_attrs:
                if key not in safe_attrs:
                    del elem.attrib[key]

    return render_html(tree, encoding=output_encoding)
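# A minimal, self-contained sketch (not part of the snippet above) showing the
# same attribute-whitelisting idea with stock lxml only; assumes lxml is
# installed, and the sample markup is invented purely for illustration.
from lxml.html.clean import Cleaner

_attr_cleaner = Cleaner(
    safe_attrs_only=True,                    # drop every attribute not in safe_attrs
    safe_attrs=frozenset(['src', 'href']),   # keep only these two
    page_structure=False,
)

dirty = '<p onclick="steal()"><a href="/ok" style="color:red">link</a></p>'
print(_attr_cleaner.clean_html(dirty))
# expected: the onclick and style attributes are gone, href survives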
def learn_stopwords(self):
    req = urllib2.Request(self.html_url, headers={
        'Host': 'github.com',
        'Referer': 'https://github.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36'})
    r = urllib2.urlopen(req)
    page = r.read()
    tree = html.fromstring(page)

    # get readme part
    readme_tree = tree.xpath('//*[@id="readme"]/article')
    if len(readme_tree) < 1:
        return
    readme_tree = readme_tree[0]
    self.origin_readme = readme_tree.text_content()

    cleaner = Cleaner(allow_tags=['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'pre'],
                      remove_unknown_tags=False)
    readme_tree = cleaner.clean_html(readme_tree)

    header = ""
    # iterate each header and paragraph
    for sub in readme_tree.iterchildren():
        if sub is None:
            break
        if sub.tag == 'pre' and header:
            self.add_stopwords(self.filter_all(header))
            header = ""
        elif sub.tag in ['h1', 'h2', 'h3', 'h4'] and sub.text is not None:
            header = sub.text.strip().lower()
def handle_item(path):
    # url = "http://news.39.net/" + path.split("/root/39_data/news.39.net/")[1]
    flag, title, text = False, "", ""
    try:
        # request = requests.get(url, proxies=get_proxy(), timeout=5)
        # if request.status_code != 200: raise
        with open(path, "r") as file:
            content = file.read()
        html = lxml.html.fromstring(content.decode("gbk"))
        try:
            # re-parse as UTF-8 if the page declares a UTF charset
            if re.search("utf", html.xpath("//meta/@charset")[0]):
                html = lxml.html.fromstring(content.decode("utf-8"))
        except:
            pass
        try:
            if len(html.xpath("//div[@class='art_box']/h1/text()")) > 0:
                title = html.xpath("//div[@class='art_box']/h1/text()")[0]
            else:
                title = html.xpath("//div[@class='artbox']/h1/text()")[0]
        except:
            title = ""
        print("title:%s" % title)
        if len(html.xpath("//div[@id='contentText']")) > 0:
            div1 = html.xpath("//div[@id='contentText']")[0]
        elif len(html.xpath("//div[@class='article']")) > 0:
            div1 = html.xpath("//div[@class='article']")[0]
        else:
            raise ValueError("no content container found")
        cleaner = Cleaner(scripts=True)
        for p in div1.xpath("./p"):
            p = cleaner.clean_html(p)
            try:
                text += p.text_content().strip() + "\n"
            except:
                pass
        print("text:%s" % text)
        flag = True
    except Exception as e:
        print(e)
def createPages():
    items = source.contentItems()
    for item in items:
        doc = parse(item).getroot()
        cleaner = Cleaner(style=True, links=False, page_structure=True,
                          safe_attrs_only=False)
        cleaned = cleaner.clean_html(doc)

        # snag the page title - find_class returns a list, there's really only one
        titles = cleaned.find_class('Pagetitle')
        title = titles[0].text_content()

        # get the description
        descrips = cleaned.find_class('Summarytext')
        descrip = descrips[0].text_content()

        # Need to have a temporary id
        uid = str(random.randint(0, 99999999))
        target.invokeFactory("Document", id=uid)
        obj = target[uid]
        obj.setTitle(title)
        obj.setDescription(descrip)
        # assumption: the body text comes from the cleaned document
        obj.setText(cleaned.text_content())

        # Will finish Archetypes content item creation process,
        # rename-after-creation and such
        obj.processForm()
        return obj
def _statistica_(url_string):
    """Extract the document and its metadata from rivista-statistica."""
    url = urlparse.urlparse(url_string)
    conn = httplib.HTTPConnection(url.hostname)
    conn.request("GET", url.path)
    res = conn.getresponse()
    body = res.read()
    my_page = html.fromstring(body)

    # Remove the ***** cookie banner
    for el in my_page.xpath('//*[@id="cookiesAlert"]'):
        el.getparent().remove(el)

    # Remove all script tags and their content
    cleaner = Cleaner()
    cleaner.javascript = True
    my_page = cleaner.clean_html(my_page)

    title = my_page.xpath('//*[@id="articleTitle"]/h3')
    full_content = my_page.xpath('//*[@id="content"]')
    doi = my_page.xpath('//*[@id="pub-id::doi"]')
    full_content = ''.join(
        [etree.tostring(fix_links(el, url_string)) for el in full_content])

    result = {
        'title': title[0].text_content(),
        'content': full_content,
        'doi': doi[0].text_content()
    }
    return json.JSONEncoder().encode(result)
def truncate(content, max_length=DEFAULT_TRUNCATE_LENGTH, allowed_tags=ALLOWED_TAGS,
             full_link=None):
    """
    Truncate a body of text to the expected 'max_length' and strip the body
    of text of all html tags that are not in 'allowed_tags'. You can also
    specify a 'strip' value (True -> strip html tags, False -> escape html
    tags and leave them in the text).
    """
    if not content:
        return ''

    cleaner = Cleaner(
        page_structure=False,
        links=True,
        safe_attrs_only=True,
        remove_unknown_tags=False,
        allow_tags=allowed_tags
    )

    content = defaultfilters.truncatechars_html(cleaner.clean_html(content), max_length)

    if full_link:
        try:
            insert_point = content.rindex('</p>')
        except ValueError:
            insert_point = content.rindex('<')
        ending = content[insert_point:]
        content = content[:insert_point]
        content += ' <a href="' + full_link + '">(Read More)</a>' + ending

    return content
def get_content(self, site):
    sel = None
    if site.id_type == "css":
        # translates csspath into xpath
        s = CSSSelector(site.identifier)
        sel = s.path
    else:
        sel = site.identifier

    try:
        page = requests.get(site.url)
        parser = le.HTMLParser()
        tree = le.parse(StringIO(page.text), parser)
        xp = tree.xpath(sel)
        if len(xp) < 1:
            return None
        html = lxml.html.tostring(xp[0])
        cleaner = Cleaner(style=True, links=False, page_structure=False,
                          embedded=False, frames=False, forms=False)
        cleaned_html = cleaner.clean_html(html)
        self._print("Cleaning html: " + str(len(html)) + " -> " + str(len(cleaned_html)))
        return cleaned_html
    except Exception as e:
        self._print("EXCEPTION! " + str(e.message))
        return None
def getFormatHtml(htmlContent):
    try:
        doc = soupparser.fromstring(htmlContent)
    except Exception as e:
        # fall back to cleaning the markup first, then re-parse
        cleaner = Cleaner()
        htmlContent = cleaner.clean_html(htmlContent)
        doc = soupparser.fromstring(htmlContent)
    return doc
def analyze(request):
    url = request.GET['url']
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1')]
    response = opener.open(url)
    raw_html = response.read()

    cleaner = Cleaner(kill_tags=['style', 'script', 'head'],
                      allow_tags=[''], remove_unknown_tags=False)
    raw_text = cleaner.clean_html(raw_html)
    ptn = re.compile('<div>|</div>')
    raw_text = re.sub(ptn, '', raw_text)
    ptn = re.compile(r'\s+')
    raw_text = re.sub(ptn, ' ', raw_text)
    raw_text = raw_text.strip().lower()

    prd, score = MLearn.predict(raw_text)
    donut = score * 100
    results = MLearn.predict_other(raw_text)
    related_headline = results[0][2]
    related_verdict = results[0][0]
    related_score = results[0][1] * 100

    context = {
        'url': url,
        'verdict': prd,
        'score': donut,
        'related_headline': related_headline,
        'related_verdict': related_verdict,
        'related_score': related_score,
        'results': results,
    }
    return render(request, 'results.html', context)
def get_intro_text(text):
    """ Returns only the first <p> tag and preceding nodes """
    # cut the text to the first paragraph
    index = text.lower().find('</p>', 1000)
    if index != -1:
        text = text[:index] + '</p>'
    cleaner = Cleaner(
        scripts=False,
        javascript=False,
        comments=False,
        style=False,
        links=False,
        meta=False,
        page_structure=False,
        processing_instructions=False,
        embedded=False,
        forms=False,
        remove_unknown_tags=True,
    )
    text = cleaner.clean_html(text)
    return text
def sanitize(html):
    if not html:
        return html

    cleaner = Cleaner(allow_tags=_safe_tags, safe_attrs_only=True,
                      safe_attrs=_safe_attrs, remove_unknown_tags=False)
    html = autolink_html(cleaner.clean_html(html))

    parts = re.split('(<.*?>)', html)
    output = ''
    in_a_tag = False
    for part in parts:
        if not len(part):
            continue
        is_tag = part[0] == '<'
        if is_tag or in_a_tag:
            output += part
            if part[0:2].lower() == '<a':
                in_a_tag = True
            elif part[0:3].lower() == '</a':
                in_a_tag = False
            continue
        part = re.sub("([a-zA-Z0-9_\\-+\\.\']*[a-zA-Z0-9]@[0-9a-zA-Z\\-\\.]+\\.[a-zA-Z]{2,})",
                      '<a href="mailto:\\1">\\1</a>', part)
        # After linking up emails, only look for twitter in the remaining parts
        sub_parts = re.split('(<.*?>)', part)
        part = ''
        for sub_part in sub_parts:
            part += re.sub("(?<![a-zA-Z0-9])@([0-9a-zA-Z_]{1,15})",
                           '<a href="https://twitter.com/\\1">@\\1</a>', sub_part)
        output += part
    return output
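# The snippet above builds on lxml's autolink_html helper. A tiny standalone
# sketch of just that helper (assumes lxml is installed; the input string is
# made up for illustration):
from lxml.html.clean import autolink_html

print(autolink_html('<p>See http://example.com for details</p>'))
# expected: the bare URL is wrapped in an <a href="http://example.com"> tag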
def _remove_tags(self):
    cleaner = Cleaner(
        scripts=True,
        javascript=True,
        comments=True,
        style=False,
        links=True,
        meta=False,
        page_structure=None,
        processing_instructions=True,
        embedded=True,
        frames=True,
        forms=True,
        # annoying_tags=True,
        # remove_tags=None,
        # allow_tags=allowed_tags,
        remove_unknown_tags=False,
        # safe_attrs_only=True,
        # add_nofollow=False,
    )

    # patch to add space in tags
    for el in self.root.iter():
        if el is not None and el.text:
            el.text = el.text + ' '
        if el is not None and el.tail:
            el.tail = el.tail + ' '

    # remove tags
    self.root = cleaner.clean_html(self.root)
    for el in self.root.iter():
        if el.tag == 'a' and el.get('rel') == 'nofollow':
            el.text = ''
            el.drop_tag()
def get_hidden_comments(post_id_with_minus):
    """
    Load the hidden comments and return a list of CommentPostInfo objects
    """
    params = urllib.urlencode({
        'act': 'get_replies',
        'al': 1,
        'count': 'false',
        'post': post_id_with_minus
    })
    # send the form parameters with the request
    request = urllib2.Request('http://vkontakte.ru/al_wall.php', data=params)
    request.add_header("X-Requested-With", "XMLHttpRequest")
    request.add_header("Origin", "http://vkontakte.ru")
    data = urllib2.urlopen(request).read()
    with open('b:/1.html', 'w') as f:
        f.write(data)
    data = data.decode('cp1251')

    # document_fromstring returns an element, not a tree
    html = lxml.html.document_fromstring(data)
    cleaner = Cleaner(style=True, page_structure=False)
    cleaned_html = cleaner.clean_html(html)

    hidden_comments = list()
    for reply_element in cleaned_html.cssselect('div.reply.clear'):
        hidden_comments.append(
            VkontakteGroupNewsReader.get_reply_from_response_part(reply_element))
    return hidden_comments
def html_cleanup(input):
    cleaner = Cleaner(
        scripts=True,
        javascript=True,
        comments=True,
        style=False,
        links=True,
        meta=True,
        page_structure=True,
        processing_instructions=True,
        embedded=False,
        frames=False,
        forms=True,
        annoying_tags=True,
        allow_tags=['a', 'img', 'span', 'div', 'p', 'br',
                    'iframe',  # for google cal
                    'strong', 'em', 'b', 'i', 'u', 'strike', 'blockquote',
                    'sub', 'sup', 'ul', 'ol', 'li',
                    'table', 'tdata', 'tr', 'th', 'td',
                    'h1', 'h2', 'h3', 'h4'],
        remove_unknown_tags=False,
        safe_attrs_only=True,
        host_whitelist=['youtube.com', 'www.google.com'],
        whitelist_tags=['iframe', 'embed', 'script', 'img']
    )
    sane = cleaner.clean_html("<div>%s</div>" % input)
    return sane[len('<div>'):-len('</div>')]
def parse(self, content):
    """Clean and parse HTML content."""
    cleaner = Cleaner(style=True, links=False, page_structure=False,
                      meta=True, safe_attrs_only=False, remove_unknown_tags=False)
    clean_content = cleaner.clean_html(content)
    html = etree.iterparse(StringIO(clean_content), events=("start", "end"))
    level = -1
    css = ''

    # We do not want to style these elements.
    ignore_tags = ['html', 'body', 'head', 'meta', 'title', 'script']

    if self.options.delimiter == 'spaces':
        delimiter = ' '
    else:
        delimiter = '\t'

    for action, elem in html:
        if action == 'start':
            identifier = self.identify_ruleset(elem)
            if elem.tag not in ignore_tags:
                level += 1
                css += delimiter * level + identifier + ' {\n'
                if not self.options.clean_mode:
                    css += delimiter + delimiter * level + '/* enter your CSS here... */\n'
        else:
            if elem.tag not in ignore_tags:
                css += delimiter * level + '}\n'
                level -= 1

    return css.strip()
def visit(url):
    if url.startswith(base_url) == False:
        return
    try:
        resp = urlopen(url)
    except URLError as e:
        return
    page = resp.read()

    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ELEMENTS_TO_IGNORE

    # soup = BeautifulSoup(page, "lxml")
    # for link in soup.findAll('a'):
    #     if link.has_attr('href'):
    #         if link.has_attr('class') and 'history' in link['class']:
    #             continue
    #         next_link = urljoin(url, link['href'])
    #         next_link = urldefrag(next_link)[0]
    #         if next_link not in visited_pages:
    #             visited_pages.append(next_link)
    #             pages_to_visit.append(next_link)

    clean_page = cleaner.clean_html(page)
    soup = BeautifulSoup(clean_page, "lxml")
    extract(soup, url)
def gettextonly(self, tree):
    cleaner = Cleaner(style=True, links=True, add_nofollow=True,
                      page_structure=False, safe_attrs_only=False)
    try:
        v = tostring(tree, method='text', encoding=unicode)
    except:
        v = None

    if v == None:
        c = lxml.html.tostring(tree)
        print 'v== null'
        # resulttext = ''
        # for t in c:
        #     subtext = self.gettextonly(t)
        #     resulttext += subtext + '\n'
        # return resulttext
        return c
    else:
        # Clean up the javascript and comment.
        try:
            v = cleaner.clean_html(v)
        except:
            # Ignore clean error
            pass
        return v.strip()
def test_allow_tags(self):
    html = """
        <html>
        <head>
        </head>
        <body>
        <p>some text</p>
        <table>
        <tr>
        <td>hello</td><td>world</td>
        </tr>
        <tr>
        <td>hello</td><td>world</td>
        </tr>
        </table>
        <img>
        </body>
        </html>
        """
    html_root = lxml.html.document_fromstring(html)
    cleaner = Cleaner(
        remove_unknown_tags=False,
        allow_tags=['table', 'tr', 'td'])
    result = cleaner.clean_html(html_root)

    self.assertEqual(12 - 5 + 1, len(list(result.iter())))
def _load(self):
    """
    Load the ElementTree from the source
    """
    # Convert directional quotation marks to regular quotes
    double_quotes = ur'[\u201c\u201d]'
    self.source = re.sub(double_quotes, u'"', self.source)
    single_quotes = ur'[\u2019\u2018]'
    self.source = re.sub(single_quotes, u"'", self.source)
    # Convert colons
    self.source = self.source.replace(u'\uff1a', u':')
    # Remove line breaks and tabs
    self.source = self.source.replace(u'\n', u'')
    self.source = self.source.replace(u'\t', u'')
    # There are also some "zero width joiners" in random places in the text.
    # Should remove them here, since they make string search unreliable.
    # These are the codes: &#8205;, &#160; (nbsp), \xa0 (nbsp), \u200d
    zero_width_joiners = u'\u200d'
    self.source = self.source.replace(zero_width_joiners, u'')
    # Also previously had some non-breaking spaces in unicode \u00a0, but this
    # may have been fixed by changing the parser below

    # Use the lxml cleaner
    cleaner = Cleaner()
    parser = HTMLParser(encoding='utf-8')
    # Finally, load the cleaned string to an ElementTree
    self.tree = cleaner.clean_html(lxml.html.fromstring(to_string(self.source), parser=parser))
def sanitize_html(html, bad_tags=['body']):
    """Removes identified malicious HTML content from the given string."""
    if html is None or html == '':
        return html
    cleaner = Cleaner(style=False, page_structure=True, remove_tags=bad_tags,
                      safe_attrs_only=False)
    return cleaner.clean_html(html)
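# A quick standalone contrast of remove_tags (used above) with kill_tags, since
# the two are easy to mix up: remove_tags drops only the tag itself and keeps
# its content, kill_tags drops the whole subtree. Sketch assuming lxml is
# installed; the sample markup is invented.
from lxml.html.clean import Cleaner

sample = '<div>keep <span>this</span><script>drop_this()</script></div>'
print(Cleaner(remove_tags=['span'], kill_tags=['script']).clean_html(sample))
# expected: "this" survives without its <span>, the <script> block disappears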
def getArticles(keyword):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True

    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'chrome')]

    term = keyword.replace(" ", "+")
    query = "http://www.google.ca/search?&tbm=nws&num=10&q=" + term
    htmltext = br.open(query).read()
    # print htmltext

    soup = BeautifulSoup(htmltext)
    search = soup.findAll('div', attrs={'id': 'search'})
    # print search[0]
    searchtext = str(search[0])
    soup1 = BeautifulSoup(searchtext)
    list_items = soup1.findAll('li')

    regex = "q=.*?&"
    pattern = re.compile(regex)
    results_array = []
    for li in list_items:
        soup2 = BeautifulSoup(str(li))
        links = soup2.findAll('a')
        source_link = links[0]
        # print source_link
        source_url = re.findall(pattern, str(source_link))
        if len(source_url) > 0:
            results_array.append(str(source_url[0].replace("q=", "").replace("&", "")))
    return results_array
def tokenize(n, tagsDict):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    i = 0
    df = pandas.DataFrame(columns=list(tagsDict))

    while (i < n):
        allVector = {}
        if (os.path.isfile("spam/%d.txt" % i)):
            try:
                for word in tagsDict:
                    allVector[word] = 0
                readInFile = open("spam/%d.txt" % i)
                content = readInFile.read()
                noSymbols = re.sub('[^A-Za-z-]+', ' ', content.lower())  # noSymbols is stripped of symbols
                allCopy = noSymbols.split()  # allCopy is the set of words without symbols
                for tag in allCopy:
                    df.ix[i, tag] = df.ix[i, tag] + 1
                df.ix[i, 'isSpam'] = 'spam'
            except Exception, err:
                print traceback.format_exc()
                print sys.exc_info()[0]
        i = i + 1
def create_plaintext_message(message):
    """ Create clean plain text version of email message

        Parse the html and remove style and javascript tags and then
        create a plain-text-message by parsing the html
        and attaching links as endnotes
    """
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.kill_tags = ['style']
    doc = message.decode('utf-8', 'ignore')
    to_clean = lxml.html.fromstring(doc)
    cleaned_msg = lxml.html.tostring(cleaner.clean_html(to_clean))
    plain_text_maxcols = 72
    textout = cStringIO.StringIO()
    formtext = formatter.AbstractFormatter(formatter.DumbWriter(
        textout, plain_text_maxcols))
    parser = HTMLParser(formtext)
    parser.feed(cleaned_msg)
    parser.close()
    # append the anchorlist at the bottom of a message
    # to keep the message readable.
    counter = 0
    anchorlist = "\n\n" + ("-" * plain_text_maxcols) + "\n\n"
    for item in parser.anchorlist:
        counter += 1
        if item.startswith('https://'):
            new_item = item.replace('https://', 'http://')
        else:
            new_item = item
        anchorlist += "[%d] %s\n" % (counter, new_item)
    text = textout.getvalue() + anchorlist
    del textout, formtext, parser, anchorlist
    return text
def __init__(self, input):
    self.title = input.get('post_title')
    self.content = input.get('post_content')
    self.category = input.get('post_category')
    self.is_public = input.get('post_is_public')

    if self.is_public:
        self.is_public = True
    else:
        self.is_public = False

    if self.category not in config.get('post_categories'):
        raise exceptions.CantValidateForm

    if self.title:
        # strip markup
        html_string = lxml.html.fromstring(self.title)
        self.title = unicode(html_string.text_content())
    else:
        self.title = ''

    if self.content:
        # clean markup
        cleaner = Cleaner(**post_rules)
        self.content = cleaner.clean_html(self.content)
        # replace newlines
        self.content = self.content.replace('\r\n', '<br />')
    else:
        raise exceptions.CantValidateForm
def strip_comments__lxml(html_string=""):
    if not html_string:
        return html_string

    params = {
        'comments': True,
        'scripts': False,
        'javascript': False,
        'style': False,
        'links': False,
        'meta': False,
        'page_structure': False,
        'processing_instructions': False,
        'embedded': False,
        'frames': False,
        'forms': False,
        'annoying_tags': False,
        'remove_tags': None,
        'allow_tags': None,
        'remove_unknown_tags': True,
        'safe_attrs_only': False,
    }
    try:
        cleaner = Cleaner(**params)
        html = lxml.html.fromstring(html_string)
        clean_html = cleaner.clean_html(html)
        return lxml.etree.tostring(clean_html)
    except (XMLSyntaxError, ParserError):
        return html_string
def _get_breakdowns(self):
    """ returns breakdowns from GWDG in given timewindow """
    # load feed first, since not working with lxml directly
    r = requests.get(URL)

    # load url and parse it with html parser
    root = lxml.etree.fromstring(r.text.encode("utf-8"))

    # get items
    items = []
    for x in root.findall("channel/item"):
        pubdate = datetime.datetime.fromtimestamp(
            email.utils.mktime_tz(
                email.utils.parsedate_tz(
                    x.find("pubDate").text[:-6]
                )
            )
        )
        if pubdate >= OLDEST_NEWS:
            cleaner = Cleaner(allow_tags=[''], remove_unknown_tags=False)
            title = cleaner.clean_html(x.find("title").text)[5:-6]
            content = cleaner.clean_html(x.find("description").text)[5:-6]
            item = {
                "title": title,
                "pubdate": str(pubdate),
                "content": content,
            }
            items.append(item)

    return sorted(items, key=lambda x: x["pubdate"], reverse=True)
def parse(self, response):
    item = JournalItem()
    base_url = "http://journals.ametsoc.org"

    journalTitle = response.xpath('//*[@id="journalBlurbPanel"]/div[2]/h3/text()').extract_first()
    item['title'] = journalTitle

    # remove whitespace at start and end
    journalIssue = response.xpath('//*[@id="articleToolsHeading"]/text()').extract_first().strip()
    item['issue'] = journalIssue

    # setup html cleaner to strip html tags from the string
    # (journal titles often use sub/superscript, which can split the article title)
    html_cleaner = Cleaner(allow_tags=[''], remove_unknown_tags=False)

    journalDescription = response.xpath('//*[@id="journalBlurbPanel"]/div[4]').extract()
    journalDescription = "".join(journalDescription)
    # remove any html tags and then trim the <div> tags that the cleaner inserts
    journalDescription = html_cleaner.clean_html(journalDescription)[5:-6]
    # remove any \n\r\t characters
    journalDescription = removeNewlines(journalDescription)
    journalDescription = journalDescription.strip()
    item['description'] = journalDescription

    coverImage = response.xpath('//*[@id="smallIssueCover"]/img/@src').extract_first().strip()
    print(coverImage)
    item['coverURL'] = base_url + coverImage

    yield item
def parse(self, response):
    sel = Selector(response)
    # urls = sel.xpath('//@href').extract()
    urls = sel.xpath('//li[@class="next_article"]/a/@href').extract()

    item = ZiwuItem()
    item['url'] = response.url
    item['title'] = ''.join(sel.xpath(
        '//div[@id="article_details"]/div[@class="article_title"]/h1/span/a/text()').extract())
    itemcontent = ''.join(sel.xpath(
        '//div[@id="article_details"]/div[@id="article_content"]/node()').extract())
    cleaner = Cleaner(page_structure=False, links=False, safe_attrs_only=True,
                      safe_attrs=frozenset([]))
    cleansed = cleaner.clean_html(itemcontent)
    item['content'] = cleansed
    yield item

    for url in urls:
        utf8_url = url.encode('utf-8')
        base_url = get_base_url(response)
        # The following regexes match the prefix and postfix of urls
        postfix = re.compile(r'.+\.((jpg)|(ico)|(rar)|(zip)|(doc)|(ppt)|(xls)|(css)|(exe)|(pdf))x?$')
        prefix = re.compile(r'^((javascript:)|(openapi)).+')
        if postfix.match(utf8_url):
            continue
        if prefix.match(utf8_url):
            continue
        if not utf8_url.startswith('http://'):
            weburl = urljoin_rfc(base_url, utf8_url)
            yield Request(weburl, callback=self.parse)
def buildDicts(n):
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    i = 0
    tagsDict = set()
    while (i < n):
        if (os.path.isfile("spam/%d.txt" % i)):
            try:
                readInFile = open("spam/%d.txt" % i)
                content = readInFile.read()
                noSymbols = re.sub('[^A-Za-z-]+', ' ', content.lower())  # noSymbols is stripped of symbols
                tags = set(noSymbols.split())  # the set of words without symbols
                tagsDict = tagsDict.union(tags)
            except Exception, err:
                print traceback.format_exc()
                print sys.exc_info()[0]
        if (os.path.isfile("notspam/%d.txt" % i)):
            try:
                readInFile = open("notspam/%d.txt" % i)
                content = readInFile.read()
                noSymbols = re.sub('[^A-Za-z-]+', ' ', content.lower())  # noSymbols is stripped of symbols
                tags = set(noSymbols.split())  # the set of words without symbols
                tagsDict = tagsDict.union(tags)
            except Exception, err:
                print traceback.format_exc()
                print sys.exc_info()[0]
        i = i + 1
def clean_cachefiles(self):
    """Clean silly html from all cachefiles in the cachedir"""
    if input(
            'Do you really want to strip all cache files from bloating tags such as <script> and <style>? '
    ).startswith('y'):
        import lxml.html
        from lxml.html.clean import Cleaner
        cleaner = Cleaner()
        cleaner.style = True
        cleaner.scripts = True
        cleaner.javascript = True
        for file in self._get_all_cache_files():
            cfile = CompressedFile(file)
            data = cfile.read()
            cleaned = lxml.html.tostring(
                cleaner.clean_html(lxml.html.fromstring(data)))
            cfile.write(cleaned)
            logger.info('Cleaned {}. Size before: {}, after {}'.format(
                file, len(data), len(cleaned)))
RE_BLANK_LINE = re.compile(r'(\n\s*)(\n\s*)+')

lxml_html_parser = lxml.html.HTMLParser(
    remove_blank_text=True, remove_comments=True, collect_ids=False)

lxml_text_html_cleaner = Cleaner(
    scripts=True,
    javascript=True,
    comments=True,
    style=True,
    links=True,
    meta=True,
    page_structure=True,
    processing_instructions=True,
    embedded=True,
    frames=True,
    forms=True,
    annoying_tags=True,
    remove_tags=set(['body']),
    kill_tags=set(['code', 'pre', 'img', 'video', 'noscript']),
)


def story_html_to_text(content, clean=True):
    """
    >>> content = '''<html><body>
    ... <pre>hello world</pre>
    ...
    'figcaption', 'footer', 'form', 'header', 'hr', 'legend', 'li', 'main',
    'nav', 'table', 'tr'
])
DOUBLE_NEWLINE_TAGS = frozenset([
    'blockquote', 'dl', 'figure', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol',
    'p', 'pre', 'title', 'ul'
])

_clean_html = Cleaner(
    scripts=True,
    javascript=False,  # onclick attributes are fine
    comments=True,
    style=True,
    links=True,
    meta=True,
    page_structure=False,  # <title> may be nice to have
    processing_instructions=True,
    embedded=True,
    frames=True,
    forms=False,  # keep forms
    annoying_tags=False,
    remove_unknown_tags=False,
    safe_attrs_only=False,
).clean_html


def _cleaned_html_tree(html):
    if isinstance(html, lxml.html.HtmlElement):
        tree = html
    else:
        tree = parse_html(html)
    return _clean_html(tree)
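# Not part of the module above: a reminder of how a configured cleaner like
# _clean_html is typically applied. Cleaner.clean_html() accepts either markup
# or a parsed tree and returns the same kind of object it was given (string in,
# string out; element in, element out).
from lxml.html.clean import Cleaner

_demo_cleaner = Cleaner(scripts=True, style=True)
cleaned_str = _demo_cleaner.clean_html('<div><script>x()</script><p>text</p></div>')
print(type(cleaned_str))   # str: same type as the input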
def _clean_html_body(request, email, body, charset): """Clean up a html part as best we can Doesn't catch LXML errors """ html_tree = lxml_html.fromstring(body) # if the HTML doc says its a different encoding, use that for meta_tag in html_tree.xpath("/html/head/meta"): if meta_tag.get("http-equiv", None) == "Content-Type": try: content = meta_tag.attrib["content"] content = content.split(";", 1)[1] charset = dict(HEADER_PARAMS.findall(content))["charset"] break except (KeyError, IndexError): pass elif "charset" in meta_tag.attrib: charset = meta_tag.attrib["charset"] break try: # check there's a body for premailer if html_tree.find("body") is not None: html_tree = InboxenPremailer(html_tree).transform() except Exception as exc: # Yeah, a pretty wide catch, but Premailer likes to throw up everything and anything messages.info(request, _("Part of this message could not be parsed - it may not display correctly")) _log.warning("Failed to render CSS for %s: %s", email["eid"], exc) # Mail Pile uses this, give back if you come up with something better cleaner = Cleaner( allow_tags=HTML_ALLOW_TAGS, kill_tags=["style"], # remove style tags, not attrs remove_unknown_tags=False, safe_attrs=HTML_SAFE_ATTRS, safe_attrs_only=True, style=False, # keep style attrs ) html_tree = cleaner.clean_html(html_tree) # filter images if we need to if not email["display_images"]: for img in html_tree.xpath("//img"): try: # try to delete src first - we don't want to add a src where there wasn't one already del img.attrib["src"] # replace image with 1px png img.attrib["src"] = staticfiles_storage.url("imgs/placeholder.svg") email["has_images"] = True except KeyError: pass for link in html_tree.xpath("//a"): try: # proxy link url = link.attrib["href"] link.attrib["href"] = proxy_url(url) except KeyError: pass # open link in tab link.attrib["target"] = "_blank" # and prevent window.opener bug (noopener is only supported in newer # browsers, plus we already set noreferrer in the head) link.attrib["rel"] = "noreferrer" # finally, export to unicode body = unicode_damnit(etree.tostring(html_tree, method="html"), charset) return safestring.mark_safe(body)
def reflect_html(key: int, day: str, digest: str) -> Union[None, bool]: from selenium import webdriver from selenium.webdriver.common.keys import Keys from selenium.webdriver.chrome.options import Options from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait """ 1. すでに処理したファイルが存在していたらスキップ """ out_filename = f"{TOP_DIR}/var/Twitter/tweet/{day}/{digest}" if Path(out_filename).exists(): return True options = Options() options.add_argument("--headless") options.add_argument("--disable-gpu") options.add_argument("window-size=1024x1024") options.add_argument( f"user-data-dir=/tmp/{FILE.replace('.py', '')}_{key:06d}") options.binary_location = shutil.which("google-chrome") try: driver = webdriver.Chrome(executable_path=shutil.which("chromedriver"), options=options) driver.get(f"http://localhost/twitter/input/{day}/{digest}") print('ebug', f"http://localhost/twitter/input/{day}/{digest}") html = driver.page_source time.sleep(5) html = driver.page_source driver.save_screenshot(f"/home/gimpei/{digest}.png") driver.switch_to.frame(driver.find_element_by_tag_name("iframe")) # elm = driver.find_element_by_xpath("/html") time.sleep(1) inner_html = driver.page_source # print("inner", inner_html) # inner_html = driver.page_source # print(html) """get shadow-root""" # elm = driver.execute_script("""return document.querySelector("twitter-widget").shadowRoot""") # elm = driver.execute_script("""return document.querySelector("twitter-widget").shadowRoot""") # inner_html = elm.get_attribute("innerHTML") cleaner = Cleaner(style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False) # print(inner_html) soup = BeautifulSoup(inner_html, "lxml") imported_csses = [ el for el in soup.find_all("style", {"type": "text/css"}) ] # replace css text to local css for css in imported_csses: if "@import url" in css.text: css_url = re.search(r'url\("(.*?)"\)', css.text).group(1) css_digest = GetDigest.get_digest(css_url) # print(css_url, css_digest) with requests.get(css_url) as r: css_text = r.text Path(f"{TOP_DIR}/var/Twitter/css").mkdir(exist_ok=True, parents=True) with open(f"{TOP_DIR}/var/Twitter/css/{css_digest}", "w") as fp: fp.write(css_text) css.string = f'@import url("/twitter/css/{css_digest}")' # replace image src for img in soup.find_all(attrs={"src": True}): url = img.get("src") o = urlparse(url) if o.scheme == "": o = o._replace(scheme="https") url = o.geturl() url_digest = GetDigest.get_digest(url) if "format=jpg" in url or re.search(".jpg$", url) or re.search( ".jpeg$", url) or re.search(".JPG$", url): with requests.get(url, timeout=30) as r: binary = r.content Path(f"{TOP_DIR}/mnt/twitter_jpgs").mkdir(exist_ok=True, parents=True) with open(f"{TOP_DIR}/mnt/twitter_jpgs/{url_digest}", "wb") as fp: fp.write(binary) # print(f"downloaded! {TOP_DIR}/mnt/twitter_jpgs/{url_digest}") img["src"] = f"/twitter/jpgs/{url_digest}" elif "format=png" in url or re.search(".png$", url): with requests.get(url, timeout=30) as r: binary = r.content Path(f"{TOP_DIR}/var/Twitter/pngs").mkdir(exist_ok=True, parents=True) with open(f"{TOP_DIR}/var/Twitter/pngs/{url_digest}", "wb") as fp: fp.write(binary) img["src"] = f"/twitter/pngs/{url_digest}" elif "normal" in url or ".js" in url or ".svg" in url: continue else: continue # raise Exception(f"unsupported image! 
url={url}") """adhoc style edit""" if soup.find(attrs={"class": "EmbeddedTweet"}): soup.find(attrs={"class": "EmbeddedTweet" })["style"] = "margin: 0 auto; margin-top: 150px;" out_dir = f"{TOP_DIR}/var/Twitter/tweet/{day}" Path(out_dir).mkdir(exist_ok=True, parents=True) with open(f"{out_dir}/{digest}", "w") as fp: fp.write(soup.__str__()) driver.close() # if E.get("DEBUG"): print( f"[{NAME}] ordinally done, day = {day} digest = {digest}, filename = {out_dir}/{digest}" ) except Exception as exc: tb_lineno = sys.exc_info()[2].tb_lineno print( f"[{NAME}] exc = {exc}, tb_lineno = {tb_lineno}, day = {day}, digest = {digest}, filename = {out_filename}", file=sys.stderr) out_filename = f"{TOP_DIR}/var/Twitter/tweet/{day}/{digest}" Path(f"{TOP_DIR}/var/Twitter/tweet/{day}").mkdir(exist_ok=True, parents=True) # パースに失敗したやつを無視する時、有効にする # Path(out_filename).touch() time.sleep(5) return None return f"/twitter/tweet/{day}/{digest}"
def cleanpage(html):
    # cleaner setup
    cleaner = Cleaner()
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    cleaner.javascript = True  # activate the javascript filter
    cleaner.style = True       # activate the styles & stylesheet filter
    cleaner.links = False
    cleaner.frames = True
    cleaner.embedded = True
    cleaner.comments = True
    cleaner.annoying_tags = True
    cleaner.inline_style = True
    cleaner.page_structure = False
    # cleaner.remove_tags = ['b', 'img', 'h']
    cleaner.kill_tags = ['img', 'script']

    # invoke cleaner
    try:
        content = cleaner.clean_html(html)
    except:
        # error: ValueError: Unicode strings with encoding declaration are not
        # supported. Please use bytes input or XML fragments without declaration.
        content = u""
    return content
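# The except branch above guards against lxml refusing unicode input that
# carries an encoding declaration. A small sketch of the usual workaround
# (encode to bytes before parsing); the sample document is invented.
import lxml.html
from lxml.html.clean import Cleaner

page = u'<?xml version="1.0" encoding="utf-8"?><html><body><p>hi</p></body></html>'
doc = lxml.html.fromstring(page.encode('utf-8'))   # bytes input avoids the ValueError
print(lxml.html.tostring(Cleaner().clean_html(doc)))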
class Topic(models.Model): author = models.ForeignKey(Member, related_name='topic_created', verbose_name=u"演讲者") in_event = models.ForeignKey(Event, related_name='topic_shown_in', blank=True, null=True, verbose_name=u"已安排在此活动中") description = models.TextField(u"简介", max_length=200, blank=False) content = models.TextField(u"内容", blank=True) html = models.TextField(u'HTML', blank=True, null=True) content_type = models.CharField(blank=False, default='html', max_length=30) accepted = models.BooleanField( default=False) #该话题是否已经被管理员接受,True才能在活动正式的公布页面显示, 同时in_event才能显示 name = models.CharField("名称", max_length=255, blank=False) created = models.DateTimeField(auto_now_add=True, auto_now=True, blank=True, null=True) last_modified = models.DateTimeField(auto_now_add=True, auto_now=True, blank=True, null=True) last_modified_by = models.ForeignKey( Member, related_name='%(class)s_last_modified') #aggrgated total_votes = models.PositiveIntegerField(default=0) total_favourites = models.PositiveIntegerField(default=0, editable=False) html_cleaner = Cleaner(style=False, embedded=False, safe_attrs_only=False) def set_author(self, user): author = user.get_profile() self.last_modified_by = author # last_modified_by 总是author? self.author = author return self @property def poll_status(self): if self.in_event: if self.accepted: if self.in_event.is_upcoming: return u'网络投票进行中' elif self.in_event.is_off: return u'本话题所属活动已经结束' else: return u'活动等待管理员审核中,审核完毕后即可开始投票' else: return u'该话题尚未加入任何活动,无法开始投票' return u'我们也不知道怎么了' @property def rendered_content(self): if self.content_type == 'restructuredtext': '''暂时取消restructuredtext的处理''' #return restructuredtext(self.content) #创建lxml的html过滤器,保留object,embed,去除js,iframe return self.html_cleaner.clean_html(self.content) #使用过滤器,返回安全的html elif self.content_type == 'html': return self.html else: return restructuredtext(self.content) @property def is_shown(self): '''该话题所属活动是否正在进行或已经结束''' return self.in_event and (self.in_event.is_off or self.in_event.is_running) @property def is_arranged(self): '''该话题是否已经加入到活动,并且活动尚未开始''' return self.in_event and (self.in_event.is_upcoming == True) @property def content_text(self): try: content = self.content.decode('utf-8') except UnicodeEncodeError: content = self.content content_element = html.fromstring(content) return content_element.text_content() @property def summary(self): content = self.content_text if len(content) > 60: return '%s...' 
% content[:60] else: return content def style_seed(self, range=4): '''用来显示一些随机的样式''' return self.id % range def get_absolute_url(self): return reverse('topic', args=[self.id]) def send_notification_mail(self, type): '''在话题提交及更新时发送提醒邮件''' type_dict = { 'created': u'建立', 'updated': u'更新', } subject = u"[Open Party] 话题%(type)s:%(name)s" % { 'type': type_dict[type.lower()], 'name': self.name } ctx = { 'topic': self, 'action': type_dict[type.lower()], 'modification_date': str(datetime.now()), 'site': settings.SITE_URL } message = render_to_string('core/topic_notification_email.txt', ctx) admin_user_set = User.objects.filter(is_staff=True) #给具有管理权限的用户发信 #没有用mail_admins(),更灵活一些 mail_queue = [] for each_admin in admin_user_set: email = EmailMessage(subject, message, settings.DEFAULT_FROM_EMAIL, [each_admin.email], '', headers={'Reply-To': each_admin.email}) email.content_subtype = "plain" mail_queue.append(email) #使用单次SMTP连接批量发送邮件 connection = mail.get_connection() # Use default e-mail connection connection.send_messages(mail_queue) return True def __unicode__(self): return self.name votes = generic.GenericRelation('Vote') #TODO Add a custom manager for most web voted & unshown topics, to add to a upcoming event def save(self, *args, **kwargs): self.total_votes = self.votes.count() if not self.content or self.content.strip() == '': self.content = self.description super(Topic, self).save(*args, **kwargs) class Meta: app_label = 'core'
def get_message_tree(self): tree = { 'id': self.get_msg_info(self.index.MSG_ID), 'tags': self.get_msg_info(self.index.MSG_TAGS).split(','), 'summary': self.get_msg_summary(), 'headers': {}, 'headers_lc': {}, 'attributes': {}, 'text_parts': [], 'html_parts': [], 'attachments': [], 'conversation': [], } conv_id = self.get_msg_info(self.index.MSG_CONV_MID) if conv_id: conv = Email(self.index, int(conv_id, 36)) tree['conversation'] = convs = [conv.get_msg_summary()] for rid in conv.get_msg_info(self.index.MSG_REPLIES).split(','): if rid: convs.append(Email(self.index, int(rid, 36)).get_msg_summary()) # FIXME: Decide if this is strict enough or too strict...? html_cleaner = Cleaner(page_structure=True, meta=True, links=True, javascript=True, scripts=True, frames=True, embedded=True, safe_attrs_only=True) msg = self.get_msg() for hdr in msg.keys(): tree['headers'][hdr] = self.index.hdr(msg, hdr) tree['headers_lc'][hdr.lower()] = self.index.hdr(msg, hdr) # Note: count algorithm must match that used in extract_attachment above count = 0 for part in msg.walk(): mimetype = part.get_content_type() if mimetype.startswith('multipart/'): continue count += 1 if (part.get('content-disposition', 'inline') == 'inline' and mimetype in ('text/plain', 'text/html')): payload, charset, openpgp = self.decode_payload(part) if (mimetype == 'text/html' or '<html>' in payload or '</body>' in payload): print "Adding stuff to pgp tree!" tree['html_parts'].append({ 'openpgp_status': openpgp and openpgp[0] or '', 'openpgp_data': openpgp and openpgp[1] or '', 'charset': charset, 'type': 'html', 'data': (payload.strip() and html_cleaner.clean_html(payload)) or '' }) tree['text_parts'][0]["openpgp_status"] = openpgp and openpgp[0] or '' tree['text_parts'][0]["openpgp_data"] = openpgp and openpgp[1] or '' else: tree['text_parts'].extend(self.parse_text_part(payload, charset, openpgp)) else: tree['attachments'].append({ 'mimetype': mimetype, 'count': count, 'part': part, 'length': len(part.get_payload(None, True) or ''), 'content-id': part.get('content-id', ''), 'filename': part.get_filename() or '' }) if self.is_editable(): tree['is_editable'] = True tree['editing_strings'] = self.get_editing_strings(tree) return tree
def as_clean_html(value):
    try:
        return Cleaner(style=True, scripts=True).clean_html(value.strip())
    except LxmlError:
        return '<p></p>'
        try:
            html = requests.get(url).text
            doc = fromstring(html)
            tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                    'div', 'span', 'img', 'area', 'map']
            args = {
                'meta': False,
                'safe_attrs_only': False,
                'page_structure': False,
                'scripts': True,
                'style': True,
                'links': True,
                'remove_tags': tags
            }
            cleaner = Cleaner(**args)
            path = '/html/body'
            body = doc.xpath(path)[0]
            result = cleaner.clean_html(body).text_content().encode('ascii', 'ignore')
            dict_result[el] += "\n\n " + " ".join(
                str(result).split(" ")[:count_of_words])
        except:
            print("error at ", el[:100])
            dict_result[el] = ""
    else:
        dict_result[el] = ""
    idx += 1
    count += 1
def clean_attributes(html):
    while htmlstrip.search(html):
        html = htmlstrip.sub('<\\1\\2>', html)
    return html


def normalize_spaces(s):
    """replace any sequence of whitespace characters with a single space"""
    if not s:
        return ''
    return ' '.join(s.split())


html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                       style=True, links=True, meta=False, add_nofollow=False,
                       page_structure=False, processing_instructions=True,
                       embedded=False, frames=False, forms=False,
                       annoying_tags=False, remove_tags=None,
                       remove_unknown_tags=False, safe_attrs_only=False)
single_blank_pat = re.compile(r'\s')
all_digit_pat = re.compile(r'^\d*$')
title_sep_pat = re.compile(r'[-_|-]')  # —
site_name_end_pat = re.compile(r'(网|在线|门户|频道|栏目|站点?|新闻|政府|办公室)$')
escape_pat = re.compile(r'&(nbsp|lt|gt);')
single_punc_pat = re.compile(r'[^ 0-9A-Za-z\u4E00-\u9FFF]')
article_date_pat = re.compile(
    r'(?:^|[^-+\d])((?:19|20)?\d{2})([\./\-_年]?)(1[0-2]|0?[1-9])([\./\-_月]?)([1-2][0-9]|3[0-1]|0?[1-9])' +
    r'(?:[^-+:\d](?:\s*((?:1|0?)[0-9]|2[0-3])[:点时]((?:[1-5]|0?)[0-9])(?:[:分]((?:[1-5]|0?)[0-9]))?(?:[^-+:\d]|$))?|$)'
)
blank_date_pat = re.compile(r'(?<!\d)\s|\s(?!\d)')
time_prefix_pat = re.compile(r'时间|日期|时期|发表|发布|提交|上传|于')
html_body_pat = re.compile(r'<\s*/\s*(html|body)\s*>', re.IGNORECASE)

cleaner = Cleaner()
cleaner.javascript = True  # This is True because we want to activate the javascript filter
cleaner.style = True       # This is True because we want to activate the styles & stylesheet filter


def remove_blanks(text):
    text = blank_pat.sub(' ', text)
    return blank_date_pat.sub('', text)


def strip_site_names_from_title(title):
    title = title_sep_pat.split(title, 1)[0].strip()
    parts = title.split()
    while len(parts) > 1:
        if site_name_end_pat.search(parts[-1]):
            parts.pop()
import logging
import re

from lxml import etree
from lxml.html.clean import Cleaner

from .filters import duplicate_test, textfilter
from .settings import CUT_EMPTY_ELEMS, DEFAULT_CONFIG, MANUALLY_CLEANED, MANUALLY_STRIPPED
from .utils import trim

LOGGER = logging.getLogger(__name__)

# HTML_CLEANER config
# http://lxml.de/api/lxml.html.clean.Cleaner-class.html
# https://lxml.de/apidoc/lxml.html.clean.html
HTML_CLEANER = Cleaner()
HTML_CLEANER.annoying_tags = False  # True
HTML_CLEANER.comments = True
HTML_CLEANER.embedded = False  # True
HTML_CLEANER.forms = False  # True
HTML_CLEANER.frames = False  # True
HTML_CLEANER.javascript = False
HTML_CLEANER.links = False
HTML_CLEANER.meta = False
HTML_CLEANER.page_structure = False
HTML_CLEANER.processing_instructions = True
HTML_CLEANER.remove_unknown_tags = False
HTML_CLEANER.safe_attrs_only = False
HTML_CLEANER.scripts = False
HTML_CLEANER.style = False
# HTML_CLEANER.remove_tags = MANUALLY_STRIPPED
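# Not from the module above: the configured HTML_CLEANER can be applied in two
# ways, and the distinction matters for pipelines that reuse the parsed tree.
# As far as I know, clean_html() works on a copy of a tree, while calling the
# cleaner object directly cleans the tree in place; verify against the lxml
# version you ship with.
import lxml.html

tree = lxml.html.fromstring('<div><!-- note --><p>text</p></div>')
cleaned_copy = HTML_CLEANER.clean_html(tree)   # cleaned copy, original kept
HTML_CLEANER(tree)                             # same cleaning, applied in place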
def get_chapters(self, extract_images=True, images_path=None, as_list=False, clean=True, cleaner_params={}): """ Extracts content of all files from epub into a single string and returns it. Args: extract_images (bool): If it should extract images from epub. Defaults to True. images_path (str): A path where all images src's should lead to. If not set, uses self.epub_info["images"], which should be set by self.extract_images() if extract_images = False. If self.epub_info["images"] is not set, uses "images/". as_list (bool): Return chapters as a list or as an HTML-string. Defaults to False. clean (bool): If chapters should be cleaned off of malicious HTML. cleaner_params (dict): Dictionary of cleaner params, a full list of which is available in the documentation to lxml.html.clean.Cleaner class. Returns: chapters (str|list): String or list of strings containing the text of the book formatted as html. None: if input file is not found. Raises: KeyError: if a file is not found in the epub archive. """ if not self.ifile: return #set paths to images in chapters' markup epub_images = self.get_epub_info().get("images") if images_path: images_path = images_path else: if epub_images: images_path = epub_images else: images_path = "images/" #extract images if extract_images: self.extract_images() files = self.__get_files() #create a cleaner cleaner = Cleaner(**cleaner_params) if clean else None if as_list: chapters = [] for filename in files: if ".htm" in filename or ".xml" in filename: original = find_file(self.ifile,filename) try: with self.ifile.open(original) as f: chapter = build_chapter(f, images_path=images_path, cleaner=cleaner) chapter.attrib["id"]=filename chapters.append(html.tostring(chapter).decode('utf-8')) except KeyError as e: handle_error(e) else: chapters = etree.Element("div") for filename in files: if ".htm" in filename or ".xml" in filename: original = find_file(self.ifile,filename) try: with self.ifile.open(original) as f: chapter = build_chapter(f, images_path=images_path, cleaner=cleaner) chapter.attrib["id"]=filename chapters.append(chapter) except KeyError as e: handle_error(e) chapters = html.tostring(chapters).decode('utf-8') return chapters
def f_parse(args): def isAlphabet(word): alphabet = [ 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'j', 'k', 'i', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'w', 'z', 'à', 'è', 'é', 'ì', 'í', 'ò', 'ó', 'ù', 'ú' ] guard = True for t in word: if t not in alphabet: guard = False return guard loc = args[0] corpuses = args[1] MINSIZE_WORD = 4 MAXSIZE_WORD = 15 MINSIZE_CHARSDOC = 100 MINSIZE_WORDSDOC = 50 cleaner = Cleaner() cleaner.javascript = True # This is True because we want to activate the javascript filter cleaner.style = True cleaner.scripts = True cleaner.comments = True cleaner.links = True cleaner.meta = True cleaner.page_structure = True cleaner.processing_instructions = True cleaner.forms = True cleaner.add_nofollow = True ret = [] for document in corpuses: #html = unicodedata.normalize('NFKD', html).encode('ascii','ignore') if len(document) > 0: try: document = lxml.html.document_fromstring(document) c = cleaner.clean_html(document) html = lxml.html.tostring(c) soup = BeautifulSoup(html, 'lxml') parsed_text = soup.get_text() if (len(parsed_text) > MINSIZE_CHARSDOC): parsed_text = parsed_text.lower() tokenizer = RegexpTokenizer(r'\w+') # create English stop words list en_stop = get_stop_words('en') it_stop = get_stop_words('it') sp_stop = get_stop_words('es') ge_stop = get_stop_words('de') fr_stop = get_stop_words('fr') # Create p_stemmer of class PorterStemmer #p_stemmer = PorterStemmer() # clean and tokenize document string tokens = tokenizer.tokenize(parsed_text) # remove stop words from tokens stopped_tokens1 = [i for i in tokens if not i in en_stop] stopped_tokens2 = [ i for i in stopped_tokens1 if not i in it_stop ] stopped_tokens3 = [ i for i in stopped_tokens2 if not i in sp_stop ] stopped_tokens4 = [ i for i in stopped_tokens3 if not i in ge_stop ] stopped_tokens5 = [ i for i in stopped_tokens4 if not i in fr_stop ] for word in stopped_tokens5: if not any(char.isdigit() for char in word): if len(word) > 1: #check if the word has the alphabet character if isAlphabet(word): ret.append(word) except: print('Exception : Document empty') return [loc, ret]
def html2text(html):
    cleaner = Cleaner()
    cleaner.javascript = True  # This is True because we want to activate the javascript filter
    cleaner.style = True
    cleaner.scripts = True
    cleaner.comments = True
    cleaner.links = True
    cleaner.meta = True
    cleaner.page_structure = True
    cleaner.processing_instructions = True
    cleaner.forms = True
    cleaner.add_nofollow = True

    # html = unicodedata.normalize('NFKD', html).encode('ascii', 'ignore')
    try:
        document = lxml.html.document_fromstring(html)
        c = cleaner.clean_html(document)
        html = lxml.html.tostring(c)
        soup = BeautifulSoup(html, 'lxml')
        parsed_text = soup.get_text()
        if (len(parsed_text) > MINSIZE_CHARSDOC):
            return parsed_text.lower()
        else:
            return None
    except:
        return None
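# An aside on the pattern above: once the document has been cleaned, lxml can
# produce the plain text directly via text_content(), so the round-trip through
# BeautifulSoup is optional. A minimal sketch of that shorter path (sample
# markup invented):
import lxml.html
from lxml.html.clean import Cleaner

doc = lxml.html.document_fromstring('<html><body><p>Hello</p><script>x()</script></body></html>')
text = Cleaner().clean_html(doc).text_content()
print(text.strip().lower())   # expected: "hello"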
class UclCopySpider(scrapy.contrib.spiders.CrawlSpider): # Track parsed items lock = threading.Lock() parsed = {} # Setup html cleaner to remove js and css cleaner = Cleaner() cleaner.javascript = True cleaner.style = True cleaner.page_structure = False # Store a counter of files parsed to save a unique filename counter = 0 name = "ucl" # Define the allowed domains for crawling allowed_domains = [ "advancedteaching.cs.ucl.ac.uk", "blogs.ucl.ac.uk", "busics.cs.ucl.ac.uk", "ccs.chem.ucl.ac.uk", "crest.cs.ucl.ac.uk", "crf.casa.ucl.ac.uk", "discovery.ucl.ac.uk", "geometry.cs.ucl.ac.uk", "haig.cs.ucl.ac.uk", "iris.ucl.ac.uk", "is.cs.ucl.ac.uk", "mediafutures.cs.ucl.ac.uk", "nrg.cs.ucl.ac.uk", "onlinestore.ucl.ac.uk", "pplv.cs.ucl.ac.uk", "readinglists.ucl.ac.uk", "reality.cs.ucl.ac.uk", "sec.cs.ucl.ac.uk", "vecg.cs.ucl.ac.uk", "vis.cs.ucl.ac.uk", "web4.cs.ucl.ac.uk", "www-mice.cs.ucl.ac.uk", "www.bartlett.ucl.ac.uk", "www.cege.ucl.ac.uk", "www.chem.ucl.ac.uk", "www.cs.ucl.ac.uk", "www.csml.ucl.ac.uk", "www.ee.ucl.ac.uk", "www.engineering.ucl.ac.uk", "www.gatsby.ucl.ac.uk", "www.geog.ucl.ac.uk", "www.grad.ucl.ac.uk", "www.homepages.ucl.ac.uk", "www.icn.ucl.ac.uk", "www.igp.ucl.ac.uk", "www.laws.ucl.ac.uk", "www.london-in-motion.ucl.ac.uk", "www.mailinglists.ucl.ac.uk", "www.mecheng.ucl.ac.uk", "www.meng.ucl.ac.uk", "www.phon.ucl.ac.uk", "www.silva-sandbox.ucl.ac.uk", "www.star.ucl.ac.uk", "www.ucl.ac.uk", "www0.cs.ucl.ac.uk", "zuserver2.star.ucl.ac.uk" ] # Define the starting pages to crawl start_urls = [ "http://advancedteaching.cs.ucl.ac.uk", "http://blogs.ucl.ac.uk", "http://busics.cs.ucl.ac.uk", "http://ccs.chem.ucl.ac.uk", "http://crest.cs.ucl.ac.uk", "http://crf.casa.ucl.ac.uk", "http://discovery.ucl.ac.uk", "http://geometry.cs.ucl.ac.uk", "http://haig.cs.ucl.ac.uk", "http://iris.ucl.ac.uk", "http://is.cs.ucl.ac.uk", "http://mediafutures.cs.ucl.ac.uk", "http://nrg.cs.ucl.ac.uk", "http://onlinestore.ucl.ac.uk", "http://pplv.cs.ucl.ac.uk", "http://readinglists.ucl.ac.uk", "http://reality.cs.ucl.ac.uk", "http://sec.cs.ucl.ac.uk", "http://vecg.cs.ucl.ac.uk", "http://vis.cs.ucl.ac.uk", "http://web4.cs.ucl.ac.uk", "http://www-mice.cs.ucl.ac.uk", "http://www.bartlett.ucl.ac.uk", "http://www.cege.ucl.ac.uk", "http://www.chem.ucl.ac.uk", "http://www.cs.ucl.ac.uk", "http://www.csml.ucl.ac.uk", "http://www.ee.ucl.ac.uk", "http://www.engineering.ucl.ac.uk", "http://www.gatsby.ucl.ac.uk", "http://www.geog.ucl.ac.uk", "http://www.grad.ucl.ac.uk", "http://www.homepages.ucl.ac.uk", "http://www.icn.ucl.ac.uk", "http://www.igp.ucl.ac.uk", "http://www.laws.ucl.ac.uk", "http://www.london-in-motion.ucl.ac.uk", "http://www.mailinglists.ucl.ac.uk", "http://www.mecheng.ucl.ac.uk", "http://www.meng.ucl.ac.uk", "http://www.phon.ucl.ac.uk", "http://www.silva-sandbox.ucl.ac.uk", "http://www.star.ucl.ac.uk", "http://www.ucl.ac.uk", "http://www0.cs.ucl.ac.uk", "http://zuserver2.star.ucl.ac.uk" ] # Define additional rules to limit crawlable_domains within allowed domains crawlable_domains = [ "http://advancedteaching.cs.ucl.ac.uk/.*", "http://blogs.ucl.ac.uk/.*", "http://busics.cs.ucl.ac.uk/.*", "http://ccs.chem.ucl.ac.uk/.*", "http://crest.cs.ucl.ac.uk/.*", "http://crf.casa.ucl.ac.uk/.*", "http://discovery.ucl.ac.uk/.*", "http://geometry.cs.ucl.ac.uk/.*", "http://haig.cs.ucl.ac.uk/.*", "http://iris.ucl.ac.uk/.*", "http://is.cs.ucl.ac.uk/.*", "http://mediafutures.cs.ucl.ac.uk/.*", "http://nrg.cs.ucl.ac.uk/.*", "http://onlinestore.ucl.ac.uk/.*", "http://pplv.cs.ucl.ac.uk/.*", "http://readinglists.ucl.ac.uk/.*", 
"http://reality.cs.ucl.ac.uk/.*", "http://sec.cs.ucl.ac.uk/.*", "http://vecg.cs.ucl.ac.uk/.*", "http://vis.cs.ucl.ac.uk/.*", "http://web4.cs.ucl.ac.uk/.*", "http://www-mice.cs.ucl.ac.uk/.*", "http://www.bartlett.ucl.ac.uk/.*", "http://www.cege.ucl.ac.uk/.*", "http://www.chem.ucl.ac.uk/.*", "http://www.cs.ucl.ac.uk/.*", "http://www.csml.ucl.ac.uk/.*", "http://www.ee.ucl.ac.uk/.*", "http://www.engineering.ucl.ac.uk/.*", "http://www.gatsby.ucl.ac.uk/.*", "http://www.geog.ucl.ac.uk/.*", "http://www.grad.ucl.ac.uk/.*", "http://www.homepages.ucl.ac.uk/.*", "http://www.icn.ucl.ac.uk/.*", "http://www.igp.ucl.ac.uk/.*", "http://www.laws.ucl.ac.uk/.*", "http://www.london-in-motion.ucl.ac.uk/.*", "http://www.mailinglists.ucl.ac.uk/.*", "http://www.mecheng.ucl.ac.uk/.*", "http://www.meng.ucl.ac.uk/.*", "http://www.phon.ucl.ac.uk/.*", "http://www.silva-sandbox.ucl.ac.uk/.*", "http://www.star.ucl.ac.uk/.*", "http://www.ucl.ac.uk/.*", "http://www0.cs.ucl.ac.uk/.*", "http://zuserver2.star.ucl.ac.uk/.*" ] rules = (Rule(LinkExtractor(allow_domains=crawlable_domains), callback='parse'), ) # The method called on a document retrieval def parse(self, response): # Ignore non html responses if not isinstance(response, HtmlResponse): return # Clean html responses of non-html clean_html = self.cleaner.clean_html(response.body) soup = BeautifulSoup(clean_html, "lxml") # Use a lock whilst tracking document numbers and urls crawled self.lock.acquire() try: with open('sitescrawled.txt', "a") as myfile: myfile.write(response.url + "\n") with open('sites/url' + str(self.counter), 'wb') as f: # Output BeautifulSoup formatted html, with additonal <url> header tag f.write("<url>" + response.url + "</url>\n" + soup.prettify("utf-8")) self.counter += 1 finally: self.lock.release() for href in response.css("a::attr('href')"): # Ignore php items and hyperlink tags in the url header url = response.urljoin(href.extract()) url = url.split('?')[0].split('#')[0] yield scrapy.Request(url)
dataset_dir = os.path.dirname(os.path.abspath(__file__)) + '/dataset/'

lines = open(dataset_dir + 'long_list_stop_words.txt', 'r').readlines()
stop_words = [
    word[:-1].lower().encode('utf-8', 'ignore') for word in lines if word[-1] == '\n'
] + [
    word.lower().encode('utf-8', 'ignore') for word in lines if word[-1] != '\n'
]

dictionary = pickle.load(open(dataset_dir + 'dictionary.pickle', "rb"))
reverse_dictionary = pickle.load(
    open(dataset_dir + 'reverse_dictionary.pickle', "rb"))
# embeddings = pickle.load(open(dataset_dir + 'embeddings.pickle', "rb"))
idf_values = pickle.load(open(dataset_dir + 'idf_values.pickle', "rb"))

topics = [
    'Science', 'Space', 'Astronomy', 'Photo', 'Comics', 'Cooking',
    'Neuroscience', 'Politics', 'Technology', 'Videogames', 'IT', 'Devices',
    'Management', 'Marketing', 'Design', 'Food', 'Startup', 'Console',
    'Economics', 'Education', 'YouTube'
]

cleaner = Cleaner(meta=False, page_structure=False, style=False,
                  kill_tags=['style', 'script', 'iframe', 'video'],
                  safe_attrs_only=False, remove_unknown_tags=False)
img_cleaner = Cleaner(kill_tags=['img'], remove_unknown_tags=False)
    f = ArchiveIterator(options.input.buffer)
else:
    f = ArchiveIterator(open(options.input, 'rb'))

if options.output == sys.stdout:
    fo = WARCWriter(options.output.buffer, gzip=True)
else:
    fo = WARCWriter(open(options.output, 'wb'), gzip=True)

if options.pdfpass is not None:
    po = WARCWriter(open(options.pdfpass, 'wb'), gzip=True)

if not options.pdfpass and options.pdfextract:
    extractor = ExtrP()

cleaner = Cleaner(style=True, links=True, add_nofollow=True,
                  page_structure=False, safe_attrs_only=False)

if options.output == sys.stdout:
    filename = options.input
else:
    filename = options.output

fo.write_record(fo.create_warcinfo_record(
    filename=filename,
    info={'software': 'bitextor/bitextor-warc2htmlwarc.py',
          'format': 'WARC File Format 1.0'}))

for record in f:
    # Initial checks
    if record.rec_type != 'response' and record.rec_type != 'resource':
        continue
    if record.rec_headers.get_header('WARC-Target-URI')[0] == '<' and \
            record.rec_headers.get_header('WARC-Target-URI')[-1] == '>':
        url = record.rec_headers.get_header('WARC-Target-URI')[1:-1]
    else:
def clean_html(text):
    cleaner = Cleaner(style=False)
    return cleaner.clean_html(text)
class HTMLSupport(object): """Provides helpers for HTML file context extraction.""" # this is from lxml/apihelpers.pxi RE_XML_ENCODING = re.compile( ur'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U) # noqa cleaner = Cleaner( page_structure=True, scripts=True, javascript=True, style=True, links=True, embedded=True, forms=True, frames=True, meta=True, # remove_tags=['a'], kill_tags=['head']) def get_meta(self, doc, field): for field_attr in ('property', 'name'): for el in doc.findall('.//meta[@%s="%s"]' % (field_attr, field)): content = collapse_spaces(el.get('content')) if content is not None and len(content): return content def extract_html_header(self, doc): """Get metadata from the HTML head element.""" self.update('summary', self.get_meta(doc, 'og:title')) self.update('title', doc.findtext('.//title')) self.update('summary', self.get_meta(doc, 'og:description')) self.update('summary', self.get_meta(doc, 'description')) self.update('author', self.get_meta(doc, 'author')) self.update('author', self.get_meta(doc, 'og:site_name')) self.update('published_at', self.get_meta(doc, 'artcile:published_time')) # noqa self.update('modified_at', self.get_meta(doc, 'artcile:modified_time')) for field in ['keywords', 'news_keywords']: content = self.get_meta(doc, field) if content is not None: content = [collapse_spaces(c) for c in content.split(',')] content = [c for c in content if len(c)] self.result.keywords.extend(content) def extract_html_text(self, doc): """Get all text from a DOM, also used by the XML parser.""" text = ' '.join(self.extract_html_elements(doc)) text = collapse_spaces(text) if len(text): return text def extract_html_elements(self, el): yield el.text or ' ' for child in el: for text in self.extract_html_elements(child): yield text yield el.tail or ' ' def extract_html_content(self, html_body, fix_html=True): """Ingestor implementation.""" if html_body is None: return try: try: doc = html.fromstring(html_body) except ValueError: # Ship around encoding declarations. # https://stackoverflow.com/questions/3402520 html_body = self.RE_XML_ENCODING.sub('', html_body, count=1) doc = html.fromstring(html_body) except (ParserError, ParseError, ValueError): raise ProcessingException("HTML could not be parsed.") self.extract_html_header(doc) self.cleaner(doc) text = self.extract_html_text(doc) self.result.flag(self.result.FLAG_HTML) self.result.emit_html_body(html_body, text)
from lxml.html.clean import Cleaner
from urlparse import urlparse, parse_qs

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from .functions import parent_tag, block_length, number_pattern, url_edit_distance
from .preprocess import Tagset

_cleaner = Cleaner(style=True, scripts=True, embedded=True, links=True,
                   page_structure=False, remove_unknown_tags=False,
                   meta=False, safe_attrs_only=False)


def tokenize(text):
    return text.split()


def get_text(anchor):
    return anchor.text


def get_attr_text(anchor):
    return anchor.get('class', '') + anchor.get('id', '')
    require(request.authz.can(collection['id'], action))
    return collection


def get_url_path(url):
    try:
        return url_parse(url).replace(netloc='', scheme='').to_url() or '/'
    except Exception:
        return '/'


CLEANER = Cleaner(
    style=True,
    meta=True,
    links=False,
    remove_tags=['body', 'form'],
    kill_tags=[
        'area', 'audio', 'base', 'bgsound', 'embed', 'frame',
        'frameset', 'head', 'img', 'iframe', 'input', 'link',
        'map', 'meta', 'nav', 'object', 'plaintext', 'track',
        'video'
    ])


def sanitize_html(html_text, base_url, encoding=None):
    """Remove anything from the given HTML that must not show up in the UI."""
    if html_text is None or not len(html_text.strip()):
        return
    try:
        cleaned = CLEANER.clean_html(html_text)
        encoding = encoding or 'utf-8'
        parser = html.HTMLParser(encoding=encoding)
        data = cleaned.encode(encoding, 'replace')
from typing import List, Union

from lxml.html.clean import Cleaner
import lxml
from lxml import etree
from lxml.html import HtmlElement
from lxml.html import tostring as lxml_html_tostring
from lxml.html.soupparser import fromstring as soup_parse
from parse import search as parse_search
from parse import findall, Result
from w3lib.encoding import html_to_unicode

DEFAULT_ENCODING = 'utf-8'
DEFAULT_URL = 'https://example.org/'
DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8'
DEFAULT_NEXT_SYMBOL = ['next', 'more', 'older']

cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True

useragent = None

# Typing.
_Find = Union[List['Element'], 'Element']
_XPath = Union[List[str], List['Element'], str, 'Element']
_Result = Union[List['Result'], 'Result']
_HTML = Union[str, bytes]
_BaseHTML = str
_UserAgent = str
_DefaultEncoding = str
_URL = str
_RawHTML = bytes
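# Illustrative sketch (not part of the module above; the sample markup is an
# assumption) of what this cleaner configuration does: with javascript=True and
# style=True, both <script> and <style> content are removed while ordinary
# markup such as <p> survives.
def _demo_clean(raw_html: str) -> str:
    """Run the module-level cleaner over a raw HTML string."""
    return cleaner.clean_html(raw_html)

# _demo_clean('<body><script>track()</script><style>p {}</style><p>hello</p></body>')
# returns cleaned markup that still contains <p>hello</p>, with script and style removed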
    telefono = "".join(links[1].text_content().split())
    fax = "".join(links[2].text_content().split())
    if len(links[3].cssselect("a")[0].attrib['href']) > len('http://'):
        web = links[3].cssselect("a")[0].attrib['href']
    else:
        web = ""
    return direccion, telefono, fax, web


cleaner = Cleaner()
cleaner.kill_tags = ['strong']

for i in range(1, 45):
    base_url = 'http://planetafan.com/cas/site/tiendas.asp?prov=0&loc=0&pag=' + str(i)
    html = scraperwiki.scrape(base_url)
    root = lxml.html.fromstring(html)
    links = root.cssselect("ul#listado-productos li")
    for link in links:
        record = {}
        name = link.cssselect("a")[0].text_content()
def mainPage(board):
    url = "https://www.backpackers.com.tw/forum/forumdisplay.php?f=60&prefixid={0}".format(board)
    #url = "https://www.backpackers.com.tw/forum/forumdisplay.php?f=60&prefixid=voyage"
    useragent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'
    header = {'User-Agent': useragent}
    #proxies = {"http": "http://spys.one/en/", "https": "https://free-proxy-list.net/",}
    for i in range(2, 3):
        res = requests.get(url, headers=header)
        cleaner = Cleaner(style=True, scripts=True, comments=True, javascript=True,
                          page_structure=False, safe_attrs_only=False)
        content = cleaner.clean_html(res.content.decode('utf-8')).encode('utf-8')
        #print(content)
        bs = BeautifulSoup(content, 'html.parser')
        body = bs.find("tbody", {"id": "threadbits_forum_60"})
        print(type(body))
        trSec = body.find_all("tr")
        print(trSec)
        artileList = []
        for tr in trSec:
            articleDict = {}
            t = tr.find_all("td", {"id": re.compile(r"^td_threadtitle_[0-9]+")})
            tdSec1 = tr.select("td", {"class": "alt1"})
            tag = tdSec1[1].select("div")
            # print(tag)
            tdSec2 = tr.select("td", {"class": "alt2", "title": "回覆"})
            # print(tdSec)
            for td in tdSec2:
                date = td.select("div.smallfont")
                if len(date) > 0:
                    #print(date[0].text.strip())
                    articleDict["date"] = date[0].text.strip().split("\n")[0]
                timer = td.select("div.smallfont > span.time")
                if len(timer) > 0:
                    #print(time[0].text.strip())
                    articleDict["time"] = timer[0].text.strip()
                user = td.select("div.smallfont > span.byx")
                if len(user) > 0:
                    #print(user[0].text.strip())
                    articleDict["author"] = user[0].text.strip()
                href = td.select("a")
                # print(href)
                for hr in href:
                    if hr["href"] != "#":
                        print("title:", hr.text)
                        if hr.text == "":
                            continue
                        articleDict["title"] = hr.text
                        articleDict["url"] = "https://www.backpackers.com.tw/forum/{0}".format(hr["href"])
                        time.sleep(random.randint(1, 5))
                        # Crawl each article and return the body text and image links
                        article, imgList = CrawlArticle(articleDict["url"], header)
                        # Save the images and return the storage paths
                        ipath = getImage(imgList)
                        articleDict["content"] = article
                        articleDict["imgPath"] = ipath
            if len(articleDict.keys()) > 0:
                artileList.append(articleDict)
        print(artileList)
        #for a in artileList:
            #print(a)
            # InsertMongo(a, "Backpacker")
        time.sleep(random.randint(1, 5))
        url = "https://www.backpackers.com.tw/forum/forumdisplay.php?f=60&prefixid={0}&order=desc&page={1}".format(board, i)
from pyquery import PyQuery as pq
from lxml import html
import re
from lxml.html.clean import Cleaner
import logging
from urllib.parse import urlsplit, parse_qs

cleaner = Cleaner(javascript=True, scripts=True, style=True)

SinglePost = re.compile(r"http:\/\/www\.news\.uestc\.edu\.cn\/\?n\=UestcNews\.Front\.Document\.ArticlePage\&Id\=(\d+)")
Column = re.compile(r"http:\/\/www\.news\.uestc\.edu\.cn\/\?n\=UestcNews.Front.Category.Page&CatId=42")

logger = logging.getLogger("parser")


def makeParser(content, encoding="utf8"):
    content = content.decode(encoding, "ignore") if isinstance(content, bytes) else str(content)
    return pq(content)


def tostring(node):
    """Convert to the html string, and clean the html."""
    return cleaner.clean_html(html.tostring(node, method="html", encoding="utf8").decode()).strip()


def convertUrl(url, strict=False):
    logger.debug(url)
        return [item[0].text for item in root]

    @staticmethod
    def _text_cleanup(text):
        global cleaner
        raw_html = cleaner.clean_html(text)
        # TODO: remove links
        # TODO: remove code
        # TODO: remove Latin
        # TODO: remove garbage
        # TODO: replace Latin homographs with Cyrillic
        # TODO: unify qmarks
        if len(raw_html) > 0:
            unwrap_regex = re.compile(r'\A<div>(.*?)</div>\Z',
                                      flags=re.MULTILINE | re.DOTALL)
            cut_html = unwrap_regex.match(raw_html).group(1)
            return cut_html
        else:
            return None

    @staticmethod
    def texts_cleanup(texts):
        return list(
            filter(lambda s: s is not None and len(s) > 0,
                   map(ScrapyImport._text_cleanup, texts)))


cleaner = Cleaner(allow_tags=[''], remove_unknown_tags=None,
                  kill_tags=['code', 'blockquote', 's', 'strike'])
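# Illustrative note on the configuration above (the sample markup is an
# assumption, not project data): allow_tags=[''] matches no real tag, so every
# tag is dropped while its text is kept, kill_tags removes the listed elements
# together with their content, and lxml wraps the result in a single <div>,
# which is exactly what the unwrap_regex in _text_cleanup strips off again.
#
#   cleaner.clean_html('<p>keep me <code>drop me</code> too</p>')
#   # returns roughly '<div>keep me  too</div>'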
        #print(content.decode('utf-8'))
        data['text'] = content.decode('utf-8')
    except Exception as e:
        raise e
    return data


if __name__ == '__main__':
    url = 'http://www.lajyzx.cn/Bulletin/BulletinBrowse.aspx?id=9888'
    HEADERS = {
        #'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        'User-Agent': 'Mozilla/5.0 (Linux; U; Android 7.1.1; zh-cn; MI 6 Build/NMF26X) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30',
    }
    cleaner = Cleaner(page_structure=False, links=False, style=True, scripts=True)
    response = requests.get(url, headers=HEADERS)
    response.encoding = 'utf-8'
    time.sleep(2)
    #print(response.text)
    body = lxml.html.fromstring(response.text)
    #
    #element = element.replace('DownLoad(', '').replace(')', '')
    # decide which branch applies
    # Hangzhou
    #elements = re.findall('onclick="DownLoad\((.*?)\)"', response.text)
    #del_ele = tree.xpath('//div[@class="MainTitle"]')
    #for ele in del_ele:
    #    ele.clear()
    #node_elems = tree.xpath('//div/ul/li/a[@href="javascript:void(0);"]')
plainTextFile = lzma.open(options.outDir + "/" + options.prefix + "plain_text.xz", "w")

# Boilerpipe cleaning is optional
if options.boilerpipe:
    deboilFile = lzma.open(options.outDir + "/" + options.prefix + "deboilerplate_html.xz", "w")

for record in f:
    # We convert into UTF8 first of all
    orig_encoding, text = convert_encoding(record.payload.read())
    url = record.url
    if orig_encoding is None:
        logging.info("Encoding of document " + url + " could not be identified")

    if len(text) > 0:
        # HTML is then normalized
        cleaner = Cleaner(style=True, links=True, add_nofollow=True,
                          page_structure=False, safe_attrs_only=False)
        tree = ""
        try:
            cleanhtml = cleaner.clean_html(re.sub('encoding *= *"[^"]+"', '', text, flags=re.IGNORECASE))
            document = html5lib.parse(ftfy.fix_text(cleanhtml), treebuilder="lxml", namespaceHTMLElements=False)
            tree = etree.tostring(document)
        except:
            continue
        cleantree = tree.decode("utf8").replace(u"\xa0", " ")  # drop non-breaking spaces
        cleantree = cleantree.replace("\t", " ")
        # lang id
        lang = guess_lang_from_data2(cleantree)
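# Why the encoding declaration is stripped before cleaning (an illustrative
# note, not part of the pipeline above): lxml refuses to parse a unicode string
# that still carries an encoding declaration, so the re.sub above removes it
# instead of having to catch the resulting ValueError.
#
#   from lxml import html
#   html.fromstring('<?xml version="1.0" encoding="utf-8"?><p>x</p>')
#   # ValueError: Unicode strings with encoding declaration are not supported.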