def html_sanitize(src):
    if not src:
        return src
    src = ustr(src, errors='replace')
    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)
    # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
    try:
        cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False,
                                forms=False, kill_tags=tags_to_kill, remove_tags=tags_to_remove)
        cleaned = cleaner.clean_html(src)
    except TypeError:
        # lxml.clean version < 2.3.1 does not have a kill_tags attribute
        # to remove in 2014
        cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False,
                                forms=False, remove_tags=tags_to_kill + tags_to_remove)
        cleaned = cleaner.clean_html(src)
    except Exception, e:
        if isinstance(e, etree.ParserError) and 'empty' in str(e):
            return ""
        _logger.warning('html_sanitize failed to parse %s' % (src))
        cleaned = '<p>Impossible to parse</p>'
    return cleaned
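# A hedged sketch of the module context that html_sanitize above assumes;
# every value here is an illustrative guess (the real project defines its own
# ustr helper and tag lists), shown only so the function can run stand-alone
# (Python 2, to match the snippet).
import cgi
import logging
import re

from lxml import etree
from lxml.html import clean

_logger = logging.getLogger(__name__)
tags_to_kill = ['script', 'head', 'meta', 'title', 'link', 'base']  # assumed
tags_to_remove = ['html', 'body', 'font']                           # assumed

def ustr(value, errors='strict'):
    # naive stand-in for the project's unicode-coercion helper
    if isinstance(value, unicode):
        return value
    return str(value).decode('utf-8', errors)

print html_sanitize('<div>hi <script>alert(1)</script></div>')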
def parse(self, response):
    title = response.xpath('//head/title/text()').extract_first()
    title = self.pattern.sub(" ", title).strip() if title is not None else ""
    keywords = response.xpath('//head/meta[@name="keywords"]/@content').extract_first()
    keywords = self.pattern.sub(" ", keywords).strip() if keywords is not None else ""
    description = response.xpath('//head/meta[@name="description"]/@content').extract_first()
    description = self.pattern.sub(" ", description).strip() if description is not None else ""
    cleaner = clean.Cleaner(style=True, scripts=True, page_structure=False,
                            safe_attrs_only=False)
    html1 = cleaner.clean_html(response.text)
    # print(response.url)
    doc = html.fromstring(html1)
    content = self.pattern.sub(" ", doc.xpath('//body')[0].text_content())
    docset = dict()
    docset['title'] = title
    docset['keyword'] = keywords
    docset['description'] = description
    docset['content'] = content
    item = JsonSpiderItem()
    one = response.meta['one']
    item['name'] = one['name']
    item['nid'] = one['nid']
    item['keyword'] = one['keyword']
    item['kid'] = one['kid']
    item['requrl'] = one['requrl']
    item['rid'] = one['rid']
    item['title'] = one['title']
    item['baiduurl'] = one['baiduurl']
    item['realurl'] = one['realurl']
    item['abstract'] = one['abstract']
    item['doc'] = docset
    yield item
def pull_html_from_email_content(email_content):
    if email_content is None:
        return None
    idx = email_content.find('<html')
    if idx == -1:
        return None
    end_idx = email_content.find('</html>', idx + 5)
    if end_idx == -1:
        return None
    html_content = email_content[idx:end_idx + 7]
    # remove line breaks
    strings_to_remove = ['=\r\n', '<o:p>', '</o:p>']
    for to_remove in strings_to_remove:
        html_content = html_content.replace(to_remove, '')
    # remove <p>, <b>, <img> and <span> tags
    html_content = removetags(html_content, 'p b img span')
    # remove extra attributes, restoring the module-wide default afterwards
    # (older lxml read clean.defs.safe_attrs at clean time; on lxml >= 3.1 pass
    # safe_attrs=frozenset() to the Cleaner instead)
    safe_attrs = clean.defs.safe_attrs
    clean.defs.safe_attrs = frozenset()
    cleaner = clean.Cleaner(safe_attrs_only=True)
    html_content = cleaner.clean_html(html_content)
    clean.defs.safe_attrs = safe_attrs
    html_content = html_content.encode('ascii', 'ignore')
    return html_content
def preprocess_html(context):
    if isinstance(context, dict):
        for key, value in context.items():
            context[key] = preprocess_html(value)
        return context
    elif isinstance(context, list):
        return [preprocess_html(v) for v in context]
    elif isinstance(context, tuple):
        return tuple(preprocess_html(v) for v in context)
    elif isinstance(context, (str, unicode)):
        # clean html first
        cleaner = clean.Cleaner()
        cleaner.safe_attrs_only = True
        cleaner.safe_attrs = ('style', 'class')
        cleaner.allow_tags = ('p', 'a', 'br', 'span', 'strong', 'h1', 'h2',
                              'h3', 'h4', 'i', 'ul', 'li', 'pre')
        cleaner.remove_unknown_tags = False
        h = cleaner.clean_html(context)
        h = html.fromstring(h)
        # transform to docx code
        if h.find('p') is not None or h.find('span') is not None or \
                h.find('strong') is not None or h.find('a') is not None:
            value = transform_html(h, True)
        else:
            # remove enclosing tag
            roottag = h.tag
            value = etree.tostring(h)
            value = value[len(roottag) + 2:-(len(roottag) + 3)]
        return value
    else:
        return context
def remove_html_attributes(string):
    html = lxml.html.fromstring(string)
    # strip every attribute, then drop numeric character references
    cleaner = clean.Cleaner(safe_attrs_only=True, safe_attrs=frozenset())
    cleansed = cleaner.clean_html(html)
    string = lxml.html.tostring(cleansed)
    return re.sub(r'&#\d+;', '', string)
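# Hedged usage sketch for remove_html_attributes (Python 2 era code: on
# Python 3, lxml.html.tostring returns bytes, which would need decoding before
# the re.sub). The sample markup is an invented assumption.
import re
import lxml.html
from lxml.html import clean

print remove_html_attributes('<p class="intro"><a href="/about" title="t">hi</a></p>')
# -> roughly '<p><a>hi</a></p>'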
def clean_descriptions(selectors):
    selectors = selectors or []
    safe_attrs = set(['src', 'alt', 'href', 'title', 'width', 'height'])
    kill_tags = ['object', 'iframe']
    cleaner = clean.Cleaner(safe_attrs_only=True, safe_attrs=safe_attrs,
                            kill_tags=kill_tags)
    return [cleaner.clean_html(selector.extract()).strip()
            for selector in selectors]
def get_forwardlink_snapshots(parent_site):
    """
    @type parent_site: string
    @param parent_site: the archived index page from which to extract forward links
    """
    try:
        parsed_parent_site = html.parse(ARCHIVE_DOMAIN + parent_site)
    except IOError:
        print "Could not extract links from ", ARCHIVE_DOMAIN + parent_site
        return []
    #cleaner = html.clean.Cleaner(scripts=True, javascript=True, style=True, kill_tags=["img"])
    cleaner = clean.Cleaner(scripts=True, javascript=True, comments=True,
                            style=True, meta=True, processing_instructions=True,
                            embedded=True, frames=True, forms=True,
                            kill_tags=["noscript", "iframe", "img"])
    parsed_parent_site = cleaner.clean_html(parsed_parent_site)
    # check to see if the archival year of a forward link
    # is that of the parent (ie. 2000|2005|2010)
    all_forwardlinks = parsed_parent_site.xpath(
        '//a[starts-with(@href,"' + parent_site[:9] + '")]/@href')
    return all_forwardlinks
def sanitize_html(html):
    """Sanitize HTML

    :param str html: unsafe HTML markup
    :return str: sanitized HTML
    """
    if not html:
        return ""
    blacklist = ["script", "style", "head"]
    root_elem = lxml_html.fromstring(html)
    cleaner = clean.Cleaner(
        add_nofollow=False,
        kill_tags=blacklist
    )
    cleaned_xhtml = cleaner.clean_html(root_elem)
    safe_html = etree.tostring(cleaned_xhtml, encoding="unicode")
    # the following check is legacy (pre-lxml)
    if safe_html == ", -":
        return ""
    return safe_html
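# Hedged usage sketch for sanitize_html; the import aliases mirror what the
# function above appears to assume, and the sample markup is invented.
from lxml import etree
from lxml import html as lxml_html
from lxml.html import clean

print(sanitize_html('<div><script>alert(1)</script><p>ok</p></div>'))
# the <script> element and its text are killed, leaving '<div><p>ok</p></div>'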
def cleaner(file_name):
    content = open('E:/ZBSource/zhaobiao16/%s' % file_name).read()
    article_id = re.search(r'(\d+)\.html', file_name).group(1)
    html_cleaner = clean.Cleaner(style=True, scripts=True, comments=True,
                                 javascript=True, page_structure=False,
                                 safe_attrs_only=False)
    content = html_cleaner.clean_html(content.decode('utf-8')).encode('utf-8')
    # put block-level tags on their own lines so the stripped text keeps breaks
    content = content.replace('<p>', '\n<p>')
    content = content.replace('<tr>', '\n<tr>')
    content = content.replace('<div>', '\n<div>')
    content = content.replace('<span>', '\n<span>')
    content = content.replace('<td>', '<td> ')
    # drop all remaining tags
    reg = re.compile("<[^>]*>")
    content = reg.sub('', content)
    f = open('E:/ZBSource/zhaobiao16/clean/%s.txt' % article_id, 'wb+')
    lines = content.split('\n')
    for line in lines:
        line = line.strip()
        if len(line) != 0:
            f.write(line + '\n')
    f.close()
def parse_body(self, response):
    item = response.meta['item']
    body = response.xpath(".//*[@class='viewpointWrap']")
    tags = body.xpath(
        ".//*[@id='post_industry']/a[@class='post_industry_item']/text()"
    ).extract()  # tags
    review = body.xpath(".//div[@id='post_brief']/text()").extract()
    if len(review) > 0:
        item['review'] = review[0]
    thumb_url = body.xpath(
        ".//div[@id='post_thumbnail']/img/@src").extract()
    if len(thumb_url) > 0:
        item['thumb_url'] = thumb_url[0]
    if len(tags) > 0:
        item['tags'] = tags
    origin_html = body.xpath(".//div[@id='post_description']/p").extract()
    content = list()
    cleaner = clean.Cleaner(safe_attrs_only=True, safe_attrs=self.__safe_attrs,
                            kill_tags=self.__kill_tags)
    for html_string in origin_html:
        cleaned_html = cleaner.clean_html(html_string)
        content.append(cleaned_html)
    item['content'] = content
    yield item
def pull_html_from_email_file(fname):
    content = open(fname, 'r').read()
    idx = content.find('<html')
    if idx == -1:
        return None
    end_idx = content.find('</html>', idx + 5)
    if end_idx == -1:
        return None
    html_content = content[idx:end_idx + 7]
    # remove line breaks
    strings_to_remove = ['=\r\n', '<o:p>', '</o:p>']
    for to_remove in strings_to_remove:
        html_content = html_content.replace(to_remove, '')
    # remove <p>, <b>, <img> and <span> tags
    html_content = removetags(html_content, 'p b img span')
    # remove extra attributes, restoring the module-wide default afterwards
    safe_attrs = clean.defs.safe_attrs
    clean.defs.safe_attrs = frozenset()
    cleaner = clean.Cleaner(safe_attrs_only=True)
    html_content = cleaner.clean_html(html_content)
    clean.defs.safe_attrs = safe_attrs
    return html_content
def __init__(self):
    self.headers = {
        'Accept-Encoding': 'gzip,deflate',
        'Accept-Language': 'zh-CN;q=1.0,*;q=0.5',
        'Cache-Control': 'no-cache',
        'Pragma': 'no-cache',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    }
    self.configs = {
        'embedded': False,
        'safe_attrs_only': True,
        'safe_attrs': ['src', 'href', 'height', 'width', 'alt'],
        'remove_tags': ['span'],
    }
    # match emoji and regional-indicator code points
    self.pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'
        u'\U0001F300-\U0001F5FF'
        u'\U0001F680-\U0001F6FF'
        u'\U0001F1E0-\U0001F1FF'
        ']+',
        flags=re.UNICODE)
    self.hash = lambda x: format(zlib.adler32(x.encode('utf-8'), 0x00), 'x')
    self.session = requests.Session()
    self.cleaner = clean.Cleaner(**self.configs)
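# Hedged stand-alone sketch of the Cleaner configured above; the sample markup
# is an invented assumption.
from lxml.html import clean

configs = {
    'embedded': False,
    'safe_attrs_only': True,
    'safe_attrs': ['src', 'href', 'height', 'width', 'alt'],
    'remove_tags': ['span'],
}
cleaner = clean.Cleaner(**configs)
# <span> is unwrapped (tag dropped, text kept) and non-whitelisted attributes removed
print(cleaner.clean_html('<div><span data-x="1">text</span></div>'))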
def html_sanitize(src, silent=True, strict=False):
    if not src:
        return src
    src = ustr(src, errors='replace')
    logger = logging.getLogger(__name__ + '.html_sanitize')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)

    kwargs = {
        'page_structure': True,
        'style': False,              # do not remove style attributes
        'forms': True,               # remove form tags
        'remove_unknown_tags': False,
        'allow_tags': allowed_tags,
    }
    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
        kwargs.update({
            'kill_tags': tags_to_kill,
            'remove_tags': tags_to_remove,
        })
    else:
        kwargs['remove_tags'] = tags_to_kill + tags_to_remove

    if strict:
        if etree.LXML_VERSION >= (3, 1, 0):
            # safe_attrs can only be specified since lxml 3.1.0; older versions
            # keep all attributes in order to keep "style"
            kwargs.update({
                'safe_attrs_only': True,
                'safe_attrs': safe_attrs,
            })
    else:
        kwargs['safe_attrs_only'] = False  # keep oe-data attributes + style
        kwargs['frames'] = False  # do not remove frames (embedded video in CMS blogs)

    try:
        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = clean.Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
        # MAKO compatibility: $, { and } inside quotes are escaped, preventing correct mako execution
        cleaned = cleaned.replace('%24', '$')
        cleaned = cleaned.replace('%7B', '{')
        cleaned = cleaned.replace('%7D', '}')
        cleaned = cleaned.replace('%20', ' ')
        cleaned = cleaned.replace('%5B', '[')
        cleaned = cleaned.replace('%5D', ']')
    except etree.ParserError, e:
        if 'empty' in str(e):
            return ""
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>ParserError when sanitizing</p>'
    return cleaned
def remove_attributes_from_tags(text):
    """Removes attributes from e.g. MathML tags"""
    if text:
        try:
            cleaner = clean.Cleaner(safe_attrs_only=True,
                                    remove_unknown_tags=False)
            text = cleaner.clean_html(text)
        except lxml.etree.ParserError:
            return text
    return text
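# Hedged usage sketch for remove_attributes_from_tags; the imports are what the
# function above appears to require, and the MathML sample is invented.
import lxml.etree
from lxml.html import clean

print(remove_attributes_from_tags(
    '<math xmlns="http://www.w3.org/1998/Math/MathML"><mi mathvariant="bold">x</mi></math>'))
# unknown tags like <math>/<mi> survive (remove_unknown_tags=False) but lose
# attributes that are not in lxml's safe_attrs set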
def html_sanitize(src):
    if not src:
        return src
    src = ustr(src, errors='replace')
    # html encode email tags
    part = re.compile(r"(<[^<>]+@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)
    # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
    try:
        cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False,
                                forms=False, kill_tags=tags_to_kill, remove_tags=tags_to_remove)
        cleaned = cleaner.clean_html(src)
    except TypeError:
        # lxml.clean version < 2.3.1 does not have a kill_tags attribute
        # to remove in 2014
        cleaner = clean.Cleaner(page_structure=True, style=False, safe_attrs_only=False,
                                forms=False, remove_tags=tags_to_kill + tags_to_remove)
        cleaned = cleaner.clean_html(src)
    return cleaned
def _org_ruby_convert(self):
    _, html = utils.shell_command('org-ruby {} --translate html'.format(self.path))
    cleaner = clean.Cleaner(safe_attrs_only=True, safe_attrs=frozenset())
    html = cleaner.clean_html(html)
    # Fix horizontal rules: org-ruby converts dash-lines to '<hr>', which is
    # invalid ENML, so convert them to '<hr/>'
    html = html.replace('<hr>', '<hr/>')
    return html
def clean_highlighted_code(html):
    """strip html from syntax-highlighted code (pre and code tags)"""
    cleaner = clean.Cleaner(allow_tags=['pre'], remove_unknown_tags=False)
    for el in html.findall('.//pre'):
        p = el.getparent()
        cleaned = cleaner.clean_html(el)
        p.replace(el, cleaned)
def get_description(self):
    request_url = self.base_url + '/description'
    r = requests.get(request_url)
    soup = BeautifulSoup(r.text, 'html.parser')
    description = soup.find('meta', {'name': 'description'})['content']
    doc = html.fromstring(description)
    cleaner = clean.Cleaner(style=True)
    doc = cleaner.clean_html(doc)
    return doc.text_content()
def clean_html(html):
    html = lxml.html.fromstring(html)
    #clean.defs.safe_attrs=frozenset(['href','src'])
    cleaner = clean.Cleaner(allow_tags=[
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'span', 'pre',
        'code', 'img', 'a', 'br'
    ], safe_attrs_only=True, remove_unknown_tags=False)
    cleaned = cleaner.clean_html(html)
    return lxml.html.tostring(cleaned)
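# Hedged usage sketch for clean_html above; the markup is invented, and on
# Python 3 lxml.html.tostring returns bytes, so the caller may need .decode().
import lxml.html
from lxml.html import clean

print(clean_html('<div><h1 id="t">Title</h1><script>x()</script>'
                 '<table><tr><td>cell</td></tr></table></div>'))
# <script> is killed outright; tags outside allow_tags (table/tr/td here) are
# stripped while their text content is kept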
def extract_upinfo(self, sel):
    upinfo_xpath = '//div[@class="upinfo"]'
    cleaner = clean.Cleaner(
        scripts=True,
        javascript=True,
        safe_attrs_only=True,
        safe_attrs=["class", "href", "src", "card", "mid", "title"],
        links=False)
    content = ''.join(sel.xpath(upinfo_xpath).extract())
    clean_data = cleaner.clean_html(lxml.html.fromstring(content))
    data = lxml.html.tostring(clean_data, encoding="utf8")
    return self.remove_space(data)
def __init__(self, html_file, removal_tag_list=[]):
    '''
    Initialize the Tag statistics
    '''
    # clean the html
    self.fptr = open(html_file, 'r')
    self.html = self.fptr.read()
    cleaner = clean.Cleaner()
    cleaner.remove_tags = removal_tag_list
    cleaned_html = cleaner.clean_html(self.html)
    self.html = cleaned_html
    wfptr = open('output.html', 'w')
    wfptr.writelines(self.html)
    self._create_final_html()
def _clean_js_and_styles(html):
    cleaner = clean.Cleaner(javascript=True, style=True)
    try:
        html = str(
            tostring(cleaner.clean_html(fromstring(html))),
            encoding='utf8'
        )
    except Exception:
        html = 'Failed to clean js and styles.'
    return html
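# The function above uses bare fromstring/tostring names; a hedged guess at the
# matching imports, plus an invented sample call.
from lxml.html import clean, fromstring, tostring

print(_clean_js_and_styles('<div onmouseover="evil()"><style>p {}</style><p>hi</p></div>'))
# -> '<div><p>hi</p></div>' (event-handler attribute and <style> block removed)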
def prepare(htmlstring, source_url=None, safe_attrs=['src', 'href']):
    """
    Cleanse an htmlstring of its attributes, absolutify images and
    links, ascii-dammify it, and clean whitespace.
    """
    if not htmlstring:
        return None
    cleaner = clean.Cleaner(safe_attrs_only=True, safe_attrs=set(safe_attrs))
    cleansed = cleaner.clean_html(htmlstring)
    soup = make_abs(cleansed, source_url)
    cleansed = get_inner(soup)
    return text.prepare(cleansed)
def remove_tags(html):
    try:
        kill_tags = ['a', 'img', 'strong', 'em']
        allow_tags = ['p', 'span', 'br']
        # note: Cleaner's whitelist_tags controls which embedded tags (iframe,
        # embed) are allowed for whitelisted hosts; it is not a tag allow-list
        cleaner = clean.Cleaner(safe_attrs_only=True, safe_attrs=frozenset(),
                                whitelist_tags=set(allow_tags),
                                remove_tags=kill_tags)
        results = cleaner.clean_html(html)
        return results
    except Exception as e:
        print e
def parse(self, params):
    item = {
        'parser': 'CSS',
        'title': '',
        'pdate': '',
        'content': '',
        'showcontent': ''
    }
    parserTable = self._parserTable
    html = params['html']
    url = params['url']
    try:
        url_ext = tldextract.extract(url).domain
        parser = getsafedictvalue(parserTable, url_ext + "/parser", None)
        cnname = getsafedictvalue(parserTable, url_ext + "/name", "")
        if parser is None:
            return item
        linkurl = url
        docrsp = doc(html, url)
        pubtimeint = 0
        pubtimetxt = ''
        for CSS in parser:
            contraw = docrsp(CSS["content"]).remove("a").remove("script").remove("style")
            if contraw is None:
                continue
            item['content'] = contraw.text()
            item['title'] = docrsp(CSS["title"]).text()
            pubtimetxt = docrsp(CSS["date"]).text()
            pubtimeint = parsedate(pubtimetxt)
            if (len(item['content']) > 0) and (len(item['title']) > 0) and (pubtimeint > 0):
                break
        if contraw:
            cleaner = clean.Cleaner(page_structure=True)
            showcont = cleaner.clean_html(
                contraw.remove_attr('id').remove_attr('class')
                       .wrapAll('<div></div>').html())
            showcont = re.sub(r'id=".*?"|class=".*?"', '', showcont)
            showcont = re.sub(r'[\s+]*?>', '>', showcont)
            showcont = showcont.replace("\n", "").replace("\t", "")\
                               .replace("<div>", "").replace("</div>", "")
            item['showcontent'] = showcont
        if pubtimeint > 0:
            item['pdate'] = ValidateTime(pubtimeint)
    except Exception:
        pass
    return item
def __init__(self, url):
    # Fetch the page
    self.pagina = html.fromstring(urllib.urlopen(url).read())
    # Fix links
    self.pagina.make_links_absolute(base_url=url, resolve_base_href=True)
    # Collect the links into a list
    self.links = []
    for link in self.pagina.xpath("//a"):
        test = link.get("href")
        if test is not None and test[0:4] == 'http':
            self.links.append(link.get("href"))
    # Get clean text (no html, javascript)
    cls = clean.Cleaner(links=False, page_structure=False)
    self.pagina = cls.clean_html(self.pagina)
    self.texto = lxml.html.tostring(self.pagina, encoding='utf-8',
                                    pretty_print=True, method='text').split()
    # Count words
    global total
    self.palabras = {}
    self.tam = 0
    for pala in self.texto:
        pala = re.sub(r'\W+', '', pala)  # drop non-alphanumeric characters
        pala = pala.lower()
        if not pala == '' and pala not in irrelev:
            if pala not in self.palabras:
                self.palabras[pala] = 1.0
            else:
                self.palabras[pala] += 1.0
            self.tam += 1
            total += 1
            if pala not in palabras:
                palabras[pala] = 1.0
            else:
                palabras[pala] += 1.0
    # Ratio of each word to this document's total
    for i in self.palabras:
        self.palabras[i] = self.palabras[i] / self.tam
    # Save results
    global count
    f = open("crawl" + str(count), 'w')
    count += 1
    print >>f, url
    for item in sorted(self.palabras.iteritems(), key=operator.itemgetter(1), reverse=True):
        print >>f, item
    f.close()
def clean_html(value):
    """
    Clean html value and strip potentially dangerous code using
    :class:`lxml.html.clean.Cleaner`
    """
    cleaned = ''
    if value and value.strip():
        cleaner = clean.Cleaner(safe_attrs_only=True,
                                safe_attrs=frozenset(['href']))
        cleaned = cleaner.clean_html(value)
        # Cleaner wraps with p tag, it should be removed
        if cleaned.startswith('<p>') and cleaned.endswith('</p>'):
            cleaned = cleaned[3:-4]
    return cleaned
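# Hedged usage sketch for clean_html above; the sample value is invented.
from lxml.html import clean

print(clean_html('plain text with a <a href="/x" onclick="no()">link</a>'))
# -> roughly 'plain text with a <a href="/x">link</a>' (only href survives, and
#    the <p> wrapper added by the parser is stripped back off)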
def get_info(text: str):
    # 'a' is left out of kill_tags here: killing anchors would empty the
    # //h3[@class="t"]/a queries below
    cleaner = clean.Cleaner(scripts=True, javascript=True, page_structure=False,
                            safe_attrs_only=False, style=True,
                            kill_tags=['span', 'font'])
    text = cleaner.clean_html(text)
    html = etree.HTML(text)
    print(html.xpath('//h3[@class="t"]/a/em')[0].text)
    print(html.xpath('//h3[@class="t"]/a/text()'))
    print(html.xpath('string(//h3[@class="t"]/a)'))
    for i in html.xpath('//h3[@class="t"]/a'):
        print(i.xpath('string()'))
def html_sanitize(src, silent=True):
    if not src:
        return src
    src = ustr(src, errors='replace')
    logger = _logger.getChild('html_sanitize')

    # html encode email tags
    part = re.compile(r"(<(([^a<>]|a[^<>\s])[^<>]*)@[^<>]+>)", re.IGNORECASE | re.DOTALL)
    src = part.sub(lambda m: cgi.escape(m.group(1)), src)

    kwargs = {
        'page_structure': True,
        'style': False,   # do not remove style attributes
        'forms': True,    # remove form tags
    }
    if etree.LXML_VERSION >= (2, 3, 1):
        # kill_tags attribute has been added in version 2.3.1
        kwargs.update({
            'kill_tags': tags_to_kill,
            'remove_tags': tags_to_remove,
        })
    else:
        kwargs['remove_tags'] = tags_to_kill + tags_to_remove
    if etree.LXML_VERSION >= (3, 1, 0):
        kwargs.update({
            'safe_attrs_only': True,
            'safe_attrs': clean.defs.safe_attrs | set(['style']),
        })
    else:
        # lxml < 3.1.0 does not allow to specify safe_attrs. We keep all attributes in order to keep "style"
        kwargs['safe_attrs_only'] = False

    try:
        # some corner cases make the parser crash (such as <SCRIPT/XSS SRC=\"http://ha.ckers.org/xss.js\"></SCRIPT> in test_mail)
        cleaner = clean.Cleaner(**kwargs)
        cleaned = cleaner.clean_html(src)
    except etree.ParserError:
        if not silent:
            raise
        logger.warning('ParserError obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>ParserError when sanitizing</p>'
    except Exception:
        if not silent:
            raise
        logger.warning('unknown error obtained when sanitizing %r', src, exc_info=True)
        cleaned = '<p>Unknown error when sanitizing</p>'
    return cleaned
def download(url):
    browser.get(url)
    content = browser.page_source
    cleaner = clean.Cleaner()
    content = cleaner.clean_html(content)
    soup = BeautifulSoup(content, 'html.parser')
    img = soup.find("img", {"id": "landingImage"})['src']
    print(img)
    name = str(uuid.uuid4()) + ".jpg"
    urllib.request.urlretrieve(img, "product_images/" + name)
    return "UploadedContent/" + name