def sanitizeHtml(html, allowed_tags=None):
    if not allowed_tags:
        allowed_tags = DEFAULT_ALLOWED_TAGS
    soup = BeautifulSoup(html)
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    # Some markup can be crafted to slip through BeautifulSoup's parser, so
    # we run this repeatedly until it generates the same output twice.
    newoutput = soup.renderContents()
    while True:
        oldoutput = newoutput
        soup = BeautifulSoup(newoutput)
        for tag in soup.findAll(True):
            if tag.name not in allowed_tags:
                tag.hidden = True
            else:
                # Keep only whitelisted attributes, plus any data-* attribute,
                # and assign the filtered dict back onto the tag.
                attrs = {}
                for attr, value in tag.attrs.items():
                    if attr in allowed_tags[tag.name]:
                        attrs[attr] = value
                    if attr[:5] == 'data-':
                        attrs[attr] = value
                tag.attrs = attrs
        newoutput = soup.renderContents()
        if oldoutput == newoutput:
            break
    return newoutput.decode('utf-8')
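A minimal usage sketch (my addition, not part of the corpus): DEFAULT_ALLOWED_TAGS is assumed to be a dict mapping tag names to their allowed attributes, since the code indexes allowed_tags[tag.name].

# Hypothetical whitelist: tag name -> allowed attributes, matching the
# allowed_tags[tag.name] lookup above.
DEFAULT_ALLOWED_TAGS = {'p': ['class'], 'a': ['href', 'title']}

dirty = '<p onclick="evil()">hi <script>alert(1)</script></p>'
print(sanitizeHtml(dirty))
# The <script> markup is hidden (its text survives as inert text) and the
# onclick attribute is dropped: <p>hi alert(1)</p>

Note that tag.hidden only suppresses the tag's own markup; the children are still rendered, which is why hostile text content survives as plain text rather than disappearing.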
def sanitize_html(value, valid_tags=VALID_TAGS):
    soup = BeautifulSoup(value, 'lxml')
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    # Some markup can be crafted to slip through BeautifulSoup's parser, so
    # we run this repeatedly until it generates the same output twice.
    newoutput = soup.renderContents()
    while True:
        oldoutput = newoutput
        soup = BeautifulSoup(newoutput, 'lxml')
        for tag in soup.findAll(True):
            if tag.name not in valid_tags:
                tag.hidden = True
                tag.extract()
            elif tag.name == 'div' and tag.attrs.get(u'id', '') != u'content':
                # Hide every div except the main content container.
                tag.hidden = True
        newoutput = soup.renderContents()
        if oldoutput == newoutput:
            break
    return newoutput
def sanitize_html(value, valid_tags=VALID_TAGS):
    soup = BeautifulSoup(value)
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    # Some markup can be crafted to slip through BeautifulSoup's parser, so
    # we run this repeatedly until it generates the same output twice.
    newoutput = soup.renderContents()
    while True:
        oldoutput = newoutput
        soup = BeautifulSoup(newoutput)
        for tag in soup.findAll(True):
            if tag.name not in valid_tags:
                tag.hidden = True
            else:
                # Keep only the attributes whitelisted for this tag.
                tag.attrs = {k: v for k, v in tag.attrs.items()
                             if k in valid_tags[tag.name]}
        newoutput = soup.renderContents()
        if oldoutput == newoutput:
            break
    return newoutput
def tables(content):
    if isinstance(content, contents.Static):
        return
    soup = BeautifulSoup(content._content, 'html.parser')
    classes = ('table', 'table-bordered', 'table-responsive')
    for table in soup.findAll('table'):
        table.attrs['class'] = ' '.join(classes)
    content._content = soup.decode()
def _compare_demultiplex_stats(self):
    """Compare the elements in two Demultiplex_Stats.htm files.
    """
    with open(pjoin(self.unaligned, self.basecall_dir,
                    'Demultiplex_Stats.htm')) as f:
        ds_merged = BeautifulSoup(f.read())
    with open(pjoin(self.unaligned_expected, self.basecall_dir,
                    'Demultiplex_Stats.htm')) as f:
        ds_expected = BeautifulSoup(f.read())
    # Compare the content of the htm files, ignoring whitespace.
    # renderContents() returns bytes, so decode before the str-pattern sub.
    return re.sub(r'\s+', '', ds_merged.renderContents().decode('utf8')) == \
        re.sub(r'\s+', '', ds_expected.renderContents().decode('utf8'))
def div_around_tables(content):
    """
    Surround <table> tags with <div> to allow scrolling horizontally.
    """
    if isinstance(content, contents.Static):
        return
    soup = BeautifulSoup(content._content, "html.parser")
    for table in soup.findAll("table"):
        table.wrap(soup.new_tag("div", attrs={"style": "overflow-x: auto"}))
    content._content = soup.decode()
def html_preview(html):
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.findAll(True):
        tag.hidden = True
    return soup.renderContents().decode('ascii')
def get_phones(self, url):
    # Retry up to three times; the for/else raises only if no attempt succeeds.
    for _ in range(3):
        response = requests.get(url)
        if response.ok:
            break
    else:
        raise HttpError
    log.debug('%02d: %s downloaded', self.number, url)
    soup = BeautifulSoup(response.content, 'lxml')
    # Hide all tag markup and drop comments so only the text remains.
    for tag in soup.recursiveChildGenerator():
        if isinstance(tag, element.Tag):
            tag.hidden = True
        elif isinstance(tag, element.Comment):
            tag.extract()
    phones = set()
    text = soup.renderContents().decode('utf8')
    for match in re.findall(PHONE_RE, text):
        numbers = re.sub(r'\D', '', match)
        phone = self.canonize(numbers)
        if phone:
            phones.add(phone)
    return phones
def sanitizeHTML(value, base_url=None):
    # Regexes matching "javascript:" / "vbscript:" even when padded with
    # whitespace or hex entities between the letters.
    rjs = r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))
    rvb = r'[\s]*(&#x.{1,7})?'.join(list('vbscript:'))
    re_scripts = re.compile('(%s)|(%s)' % (rjs, rvb), re.IGNORECASE)
    validTags = 'p i strong b u a h1 h2 h3 pre br img input'.split()
    validAttrs = 'href src width height class name id type value'.split()
    urlAttrs = 'href src'.split()  # Attributes which should have a URL
    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        # Get rid of comments
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in validTags:
            tag.hidden = True
        attrs = dict(tag.attrs)
        tag.attrs = {}
        for attr, val in attrs.items():
            if attr in validAttrs:
                val = re_scripts.sub('', val)  # Remove scripts (vbs & js)
                if attr in urlAttrs:
                    val = urljoin(base_url, val)  # Calculate the absolute url
                tag.attrs[attr] = val
    return soup.renderContents().decode('utf8')
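A short standalone check (my addition, not from the source) of the obfuscation-tolerant scheme regex built above: it interleaves optional whitespace and hex entities between every letter of "javascript:", so padded payloads still match.

# Sketch: the same regex construction as above, exercised on its own.
import re

rjs = r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))
re_js = re.compile(rjs, re.IGNORECASE)

for payload in ('javascript:alert(1)',
                'JaVa script:alert(1)',
                'j&#x09;avascript:alert(1)'):
    print(bool(re_js.search(payload)), payload)
# all three match, so re_scripts.sub('', val) strips the scheme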
def render_excerpt(post):
    from bs4 import BeautifulSoup
    from django.utils import text, html
    VALID_TAGS = ['p']
    content = post.content
    has_more = content.find('<!-- more -->')
    if has_more == -1:
        has_more = content.find('<!--more-->')  # Might be Wordpress style
    if has_more > -1:
        content = content[:has_more]
    content = re.sub(r"(\[caption)([^\]]*)(])(.*)(\[/caption\])", '', content)
    content = re.sub(r'(\[source((code)*? lang(uage)*?)*?=([\'"]*?)(python)([\'"]*?)])(.*?)(\[/source(code)*?\])',
                     '', content, flags=re.MULTILINE | re.DOTALL)
    content = re.sub(r"(\[caption)([^\]]*)(])(.*)(\[/caption\])", '', content)
    content = re.sub(r"(\[youtube:)([^\]]*)(])", '', content)
    soup = BeautifulSoup(content, "html.parser")
    for tag in soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.replaceWithChildren()
    stripped_html = force_unicode(soup.renderContents())
    return force_unicode(text.truncate_html_words(stripped_html, 50))
def sanitize_html(value, base_url=None):
    value = value.strip('\n\r')
    rjs = r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))
    rvb = r'[\s]*(&#x.{1,7})?'.join(list('vbscript:'))
    re_scripts = re.compile('(%s)|(%s)' % (rjs, rvb), re.IGNORECASE)
    validTags = 'p i strong b u a h1 h2 h3 pre br img'.split()
    validAttrs = 'href src width height'.split()
    urlAttrs = 'href src'.split()  # Attributes which should have a URL
    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        # Get rid of comments
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in validTags:
            tag.hidden = True
        # BS3-style attribute list of (attr, value) tuples
        attrs = tag.attrs
        tag.attrs = []
        for attr, val in attrs:
            if attr in validAttrs:
                val = re_scripts.sub('', val)  # Remove scripts (vbs & js)
                if attr in urlAttrs:
                    val = urljoin(base_url, val)  # Calculate the absolute url
                tag.attrs.append((attr, val))
    return soup.renderContents().decode('utf8')
def html_sanitizer(html):
    """Sanitize HTML filter, borrowed from http://djangosnippets.org/snippets/205/"""
    rjs = r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))
    rvb = r'[\s]*(&#x.{1,7})?'.join(list('vbscript:'))
    re_scripts = re.compile('(%s)|(%s)' % (rjs, rvb), re.IGNORECASE)
    valid_tags = ['a', 'br', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img',
                  'li', 'ol', 'p', 'strong', 'table', 'tr', 'td', 'th', 'u',
                  'ul', 'thead', 'tbody', 'tfoot', 'em', 'dd', 'dt', 'dl',
                  'span', 'div', 'del', 'add', 'i', 'hr', 'pre', 'blockquote',
                  'address', 'code', 'caption', 'abbr', 'acronym', 'cite',
                  'dfn', 'q', 'ins', 'sup', 'sub', 'samp', 'tt', 'small',
                  'big', 'video', 'audio', 'canvas']
    valid_attrs = ['href', 'src', 'width', 'height']
    soup = BeautifulSoup(html)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in valid_tags:
            tag.hidden = True
        attrs = tag.attrs
        tag.attrs = {}
        for attr in attrs:
            if attr in valid_attrs:
                val = re_scripts.sub('', attrs[attr])  # Remove scripts (vbs & js)
                tag.attrs[attr] = val
    return soup.renderContents().decode('utf8')
def _process_comment(self, comment):
    """ Comment enters as a bs4 Tag and returns as a string """
    comment_string = str(comment)
    # quote
    comment_string = re.sub('<quote>', '<span class="quote">', comment_string)
    comment_string = re.sub('</quote>', '</span>', comment_string)
    # line (the closing sub must match </l>, not <l> again)
    comment_string = re.sub('<l>', '<span class="in-comment-line">', comment_string)
    comment_string = re.sub('</l>', '</span>', comment_string)
    comment_soup = BeautifulSoup(comment_string)
    VALID_TAGS = ["div2", "span"]
    for tag in comment_soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.hidden = True
    comment = comment_soup.renderContents().decode("utf-8")
    # <br>'s for \n's
    comment = re.sub("\n", " ", comment)
    comment = re.sub(u"\xa0", " ", comment)  # normalize non-breaking spaces
    comment = re.sub("<div2[^<]+?>", "", comment)
    comment = re.sub("</div2>", "", comment)
    comment = comment.strip()
    return comment
def sanitizeHTML(value, mode='none'):
    """
    Strips HTML tags from value.
    mode == 'none'   - remove all tags
    mode == 'strict' - remove all tags except the allowed ones
    """
    if mode == 'strict':
        valid_tags = 'p i em strong b u a h1 h2 h3 h4 pre br div span ul ol li img ' \
                     'blockquote object param embed iframe ' \
                     'table thead tbody tr td'.split()
    else:
        valid_tags = []
    valid_attrs = 'href src pic user page class text title alt style colspan rowspan rel'.split()
    # Attributes used by embedded video players
    valid_attrs += 'width height classid codebase id name value flashvars webkitallowfullscreen mozallowfullscreen ' \
                   'allowfullscreen allowscriptaccess ' \
                   'quality src type bgcolor base seamlesstabbing swLiveConnect pluginspage data frameborder'.split()
    soup = BeautifulSoup(value.encode('utf8'), from_encoding='utf8')
    for tag in soup.recursiveChildGenerator():
        if isinstance(tag, element.Tag):
            if tag.name in valid_tags:
                tag.attrs = dict((attr, val) for attr, val in tag.attrs.items()
                                 if attr in valid_attrs)
            else:
                tag.hidden = True
        elif isinstance(tag, element.Comment):
            tag.extract()
    return soup.renderContents().decode('utf8')
def truncate_longwords_html(value, length=27):
    """
    Break up absurdly long words ("keyboard soup" strings).
    Helps defeat attempts at sabotaging page layouts,
    e.g. abcdefghijklmnopqrstuvwxyzabc becomes abcdefghijklmnopqrstuvwxyza bc

    :param value: text to reformat
    :param length: maximum length of a run of characters without a space
    """
    soup = BeautifulSoup(value, 'lxml')
    texts = soup.findAll(text=True)

    # Helper: take a match and split it with a space
    def cut_match(match):
        portion = list(match.group())
        portion.insert(length - 1, ' ')
        return "".join(portion)

    # Break every run of `length` non-space characters via cut_match
    pattern = r"\S{{{0}}}".format(int(length))
    for text in texts:
        new_text = re.sub(pattern, cut_match, text)
        text.replaceWith(new_text)
    return mark_safe(soup.renderContents().decode('utf8'))
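A quick sketch of the behaviour (my addition; assumes the snippet's Django mark_safe import and lxml are available): each run of exactly `length` non-space characters is matched, and cut_match inserts a space before its last character.

# Sketch: with the default length=27, a 30-character run comes back as
# 26 a's, a space, then the remaining 4 a's.
html = '<p>' + 'a' * 30 + '</p>'
print(truncate_longwords_html(html))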
def __sanitize_str(self, text, base_url=None):
    """
    sanitize data when string
    :param text:
    :param base_url:
    :return:
    """
    soup = BeautifulSoup(text, features='html.parser')
    for comment in soup.findAll(
            text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in self.valid_tags:
            tag.hidden = True
        # bs4 stores attributes as a dict, so rebuild a dict rather than
        # appending (attr, val) tuples to a list.
        attrs = tag.attrs
        tag.attrs = {}
        for attr, val in attrs.items():
            if attr in self.valid_attrs:
                val = self.re_scripts.sub('', val)  # Remove scripts (vbs & js)
                if attr in self.url_attrs:
                    val = urljoin(base_url, val)  # Calculate the absolute url
                tag.attrs[attr] = val
    return soup.renderContents().decode('utf8')
def content_pages(self):
    marker = '--m-a-r-k-er--'
    soup = BeautifulSoup(self.content)
    elems = soup.find_all('div', attrs={'style': 'page-break-after: always;'})
    for elem in elems:
        elem.replace_with(marker)
    # renderContents() returns bytes, so decode before splitting on the marker
    return soup.renderContents().decode('utf8').split(marker)
def parse_post(post):
    # Only keep the first line; no newlines please
    soup = BeautifulSoup(post["body"])
    for tag in soup.findAll(True):
        tag.hidden = True
    post["body"] = soup.renderContents()
    return post
def content_pages(self):
    marker = "--m-a-r-k-er--"
    soup = BeautifulSoup(self.content, "html.parser")
    elems = soup.find_all("hr")
    for elem in elems:
        elem.replace_with(marker)
    # renderContents() returns bytes, so decode before splitting on the marker
    return soup.renderContents().decode("utf8").split(marker)
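A small illustration (my addition, with a hypothetical stand-in class, since the method only needs a .content attribute) of how the marker split yields one HTML fragment per page:

# Sketch: attach the method above to a minimal fake object and split on <hr>.
class FakePost:
    content = '<p>page one</p><hr/><p>page two</p>'

FakePost.content_pages = content_pages
print(FakePost().content_pages())
# -> ['<p>page one</p>', '<p>page two</p>']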
def sanitize_html(value):
    soup = BeautifulSoup(value, "lxml")
    for tag in soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.hidden = True
    return soup.renderContents()
def scrub_HTML(html):
    soup = BeautifulSoup(html)
    for tag in soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.hidden = True
    return soup.renderContents()
def track_links(content, context):
    """
    Convert all links in the template for the user
    to track his navigation
    """
    if not context.get('uidb36'):
        return content
    soup = BeautifulSoup(content)
    for link_markup in soup('a'):
        if link_markup.get('href') and 'no-track' not in link_markup.get('rel', ''):
            if TRACKING_IGNORE_ANCHOR:
                # Skip pure anchor links such as href="#section"
                if link_markup.get('href').startswith('#'):
                    continue
            link_href = link_markup['href']
            if link_href.startswith("http"):
                link_title = link_markup.get('title', link_href)
                link, created = Link.objects.get_or_create(
                    url=link_href, defaults={'title': link_title})
                link_markup['href'] = '%s%s' % (
                    context['base_url'],
                    reverse('newsletter_newsletter_tracking_link',
                            args=[context['newsletter'].slug,
                                  context['uidb36'], context['token'],
                                  link.pk]))
    if USE_PRETTIFY:
        return soup.prettify()
    else:
        return soup.renderContents()
def process(self, value, **kwargs):
    soup = BeautifulSoup(value, self.parser)
    self.remove_comments(soup)
    for tag in soup.findAll(True):
        self.process_tag(tag)
    value = soup.renderContents().decode('utf8')
    return super(HTMLProcessor, self).process(value, **kwargs)
def sanitizeHTML(value, mode='none'):
    """
    Strips HTML tags from value.
    mode == 'none'   - remove all tags
    mode == 'strict' - remove all tags except the allowed ones
    """
    if mode == 'strict':
        valid_tags = 'ol ul li p i strong b u a h1 h2 h3 pre br div span img blockquote youtube object param embed iframe'.split()
    else:
        valid_tags = []
    valid_attrs = 'href src pic user page class text title alt'.split()
    # Attributes used by embedded video players
    valid_attrs += 'width height classid codebase id name value flashvars allowfullscreen allowscriptaccess quality src type bgcolor base seamlesstabbing swLiveConnect pluginspage data frameborder'.split()
    soup = BeautifulSoup(value)
    for comment in soup.findAll(
            text=lambda text: isinstance(text, HtmlComment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in valid_tags:
            tag.hidden = True
        tag.attrs = [(attr, val) for attr, val in tag.attrs
                     if attr in valid_attrs]
    return soup.renderContents().decode('utf8')
def create_zip(zip_name, html, context):
    # Create a buffer to write the zipfile into
    zip_buffer = io.BytesIO()
    with HTMLWriter(zip_buffer) as writer:
        html = render_to_string(html, context)
        soup = BeautifulSoup(html, "html.parser")
        # Add links to zip
        for link in soup.find_all('link'):
            download_path = get_file_path(link["href"])
            link["href"] = writer.write_file(download_path)
        # Add scripts to zip
        for script in soup.find_all('script'):
            download_path = get_file_path(script["src"])
            script["src"] = writer.write_file(download_path)
        writer.write_index_contents(soup.renderContents())
        writer.zf.printdir()
    # Create the HttpResponse object with the appropriate HTML header.
    response = HttpResponse(zip_buffer.getvalue(),
                            content_type='application/x-zip-compressed')
    response['Content-Disposition'] = 'attachment; filename="{}"'.format(zip_name)
    return response
def proxy(url):
    if 'http://' not in url and 'https://' not in url:
        useable_url = 'http://' + url
    else:
        useable_url = url
    print(useable_url)
    try:
        r = requests.get(useable_url)
        c = r.content
        soup = BeautifulSoup(c, "html.parser")
        # Inject a <base> tag so relative links resolve against the origin
        newtag_base = soup.new_tag('base')
        base_url = "{0.scheme}://{0.netloc}/".format(urlsplit(r.url))
        newtag_base.attrs['href'] = base_url
        soup.head.insert(0, newtag_base)
        # Inject our own stylesheet
        newtag_link = soup.new_tag('link')
        newtag_link.attrs['rel'] = "stylesheet"
        newtag_link.attrs['type'] = "text/css"
        newtag_link.attrs['href'] = "/static/style.css"
        soup.head.insert(0, newtag_link)
        return soup.renderContents()
    except Exception:
        return "Please provide a valid url"
def NABRE_reader(book, c1, v1, c2, v2):
    opener = urllib.FancyURLopener({})
    url = "http://www.usccb.org/bible/%s/%s%s.htm" % (book, book, c1)
    f = opener.open(url)
    html = f.read()
    content = BeautifulSoup(html)
    # Strip unwanted tags
    for tag in content.findAll('sup'):
        tag.extract()
    for tag in content.findAll('a'):
        tag.extract()
    for div in content.findAll('p', 'fn'):
        div.extract()
    for div in content.findAll('p', 'fncon'):
        div.extract()
    for div in content.findAll('p', 'en'):
        div.extract()
    for div in content.findAll('table'):
        div.extract()
    # Drop everything before the first requested verse
    for thing in content.find('span', 'bcv', text=v1).find_parent().find_all_previous():
        thing.extract()
    # Drop everything after the last requested verse
    if content.find('span', 'bcv', text=str(int(v2) + 1)):
        for thing in content.find('span', 'bcv', text=str(int(v2) + 1)).find_all_next():
            thing.extract()
        content.find('span', 'bcv', text=str(int(v2) + 1)).parent.extract()
    return content.renderContents().strip()
def crawl(url, prevLevel=0):
    if prevLevel > 1:
        return None
    try:
        page = urllib2.urlopen(url)
    except (urllib2.URLError, ValueError):
        return None
    try:
        soup = BeautifulSoup(page, "lxml")
    except UnicodeEncodeError:
        return None
    root = {}
    root["url"] = url
    root["children"] = []
    anchors = soup.findAll('a')
    for a in anchors:
        global link_arr2
        link = a.get('href')
        if link is not None:
            child = crawl(a['href'], prevLevel + 1)
            if child is not None:
                link2 = child["url"]
                if 'network' in link2:
                    link_arr2.append(link2)
                print(child["url"])
                root["children"].append(child)
    root["content"] = soup.renderContents()
    return root
def _zoomImage(self, more_or_less: bool):
    """Zoom the graphical abstract"""
    soup = BeautifulSoup(self.toHtml(), "html.parser")
    try:
        # Find the current width of the image
        width = float(soup.findAll('img')[-1]['width'])
        if more_or_less:
            size = width + 0.1 * self.ini_width
        else:
            size = width - 0.1 * self.ini_width
        # Modify the width in the html
        soup.findAll('img')[-1]['width'] = size
    except IndexError:
        # Article has no graphical abstract
        self.parent.l.debug("_zoomImage, no image")
    # Clean the style attribute of the body tag, otherwise zooming is not
    # possible anymore
    soup.body['style'] = ''
    return soup.renderContents().decode()
def sanitize_html(self, value):
    soup = BeautifulSoup(value)
    for tag in soup.findAll(True):
        if tag.name not in VALID_HTML_TAGS:
            tag.hidden = True
    return soup.renderContents().decode('utf-8')
def _clean_data(cls, data):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(data)
    for tag in soup.findAll(True):
        if tag.name not in WHITELIST_TAGS:
            # Remove the tag and its contents entirely
            tag.extract()
    return soup.renderContents()
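Worth noting across these snippets (my comparison, not from the source): tag.hidden = True drops only the tag's own markup and keeps its children, whereas tag.extract() removes the tag together with everything inside it. A tiny contrast:

# Sketch contrasting the two removal styles used throughout this section.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<div>keep <b>me</b></div>', 'html.parser')
soup.b.hidden = True
print(soup.renderContents())   # b'<div>keep me</div>'  - children survive

soup = BeautifulSoup('<div>keep <b>me</b></div>', 'html.parser')
soup.b.extract()
print(soup.renderContents())   # b'<div>keep </div>'    - contents gone too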
def sanitize_html(value):
    soup = BeautifulSoup(value, features="html.parser")
    for tag in soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.hidden = True
    return soup.renderContents()
def get_html(self, url):
    # The original wrapped this in a try/except that only re-raised
    # KeyboardInterrupt, which is a no-op.
    page = requests.get(url).text
    soup = BS(page, "html.parser")
    return soup.renderContents().decode()
def sanitize_html(value):
    soup = BeautifulSoup(value, "lxml").html.body
    for tag in soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.extract()
    return soup.renderContents().decode()
def process(self, content):
    soup = BeautifulSoup(content, "lxml")  # lxml parser
    for tag in soup.find_all(recursive=True):
        # Hide tags that are not whitelisted (the original tested
        # `in self.valid_tags`, which inverted the whitelist, and called a
        # nonexistent tag.clean()).
        if tag.name not in self.valid_tags:
            tag.hidden = True
    return soup.renderContents()
def sanitize_html(value):
    VALID_TAGS = ['table', 'em', 'p', 'tr', 'th', 'td', 'br']
    soup = BeautifulSoup(value)
    for tag in soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.hidden = True
    return soup.renderContents()
def clean_html(text, convert_newlines=True):
    """
    Several steps to clean HTML input by user:
    1. formats unformatted links
    2. sets all links to target="_blank"
    3. fixes broken lists (missing closing ul tags etc)
    4. removes script tags
    """
    # format unformatted links
    # http://stackoverflow.com/questions/32937126/beautifulsoup-replacewith-method-adding-escaped-html-want-it-unescaped/32937561?noredirect=1#comment53702552_32937561
    soup = BeautifulSoup(text, "html.parser")
    text_nodes = soup.find_all(text=True)
    # https://stackoverflow.com/questions/53588107/prevent-beautifulsoups-find-all-from-converting-escaped-html-tags/53592575?noredirect=1#comment94061687_53592575
    for textNode in text_nodes:
        escaped_text = escape(textNode)
        if convert_newlines:
            escaped_text = '<br>'.join(escaped_text.splitlines())
        if textNode.parent and getattr(textNode.parent, 'name') == 'a':
            continue  # skip already formatted links
        urlized_text = urlize(escaped_text, trim_url_limit=50)
        textNode.replace_with(BeautifulSoup(urlized_text, "html.parser"))
    # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#unicode-dammit
    soup = BeautifulSoup(soup.renderContents(), "html.parser",
                         from_encoding="UTF-8")
    # All links in comments: force open in new tab
    for link in soup.find_all('a'):
        link['target'] = '_blank'
    # Add missing ul tags (raw <li> elements can break the page!)
    # https://stackoverflow.com/questions/55619920/how-to-fix-missing-ul-tags-in-html-list-snippet-with-python-and-beautiful-soup
    ulgroup = 0
    uls = []
    for li in soup.findAll('li'):
        previous_element = li.findPrevious()
        # if <li> already wrapped in <ul>, do nothing
        if previous_element and previous_element.name == 'ul':
            continue
        # if <li> is the first element of a <li> group, wrap it in a new <ul>
        if not previous_element or previous_element.name != 'li':
            ulgroup += 1
            ul = soup.new_tag("ul")
            li.wrap(ul)
            uls.append(ul)
        # append rest of <li> group to previously created <ul>
        elif ulgroup > 0:
            uls[ulgroup - 1].append(li)
    # Remove script tags
    for s in soup('script'):
        s.extract()
    return str(soup)
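A quick check of the list-repair step (my addition; assumes the snippet's Django escape/urlize imports are in scope): consecutive top-level <li> elements get wrapped in a single generated <ul>, and the script tag is stripped.

# Sketch: raw <li> siblings are grouped under one generated <ul> and the
# <script> element is removed.
print(clean_html('<li>a</li><li>b</li><script>x()</script>',
                 convert_newlines=False))
# expected output along the lines of: <ul><li>a</li><li>b</li></ul>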
def sanitize_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.hidden = True
    return soup.renderContents().decode('ascii')
def __str__(self):
    raw_html = BeautifulSoup(self.__render_element())
    if self.prettify:
        return raw_html.prettify()
    else:
        # renderContents() returns bytes; __str__ must return str
        return raw_html.renderContents().decode('utf8')
def cron_gettr():
    idx = Counter.objects.get(id='DL').number
    while True:
        # Build the SOAP request envelope
        soup = BeautifulSoup('', 'lxml')
        soup.is_xml = True
        envelope = soup.handle_starttag(
            'Envelope', None, 'soapenv',
            {'xmlns:soapenv': 'http://schemas.xmlsoap.org/soap/envelope/',
             'xmlns:typ': 'http://isirpublicws.cca.cz/types/'})
        header = soup.new_tag('Header', None, 'soapenv')
        envelope.append(header)
        body = soup.new_tag('Body', None, 'soapenv')
        envelope.append(body)
        req = soup.new_tag('getIsirWsPublicIdDataRequest', None, 'typ')
        body.append(req)
        idPodnetu = soup.new_tag('idPodnetu', None, None)
        idPodnetu.append(str(idx))
        req.append(idPodnetu)
        url = 'https://isir.justice.cz:8443/isir_public_ws/IsirWsPublicService'
        headers = {
            'content-type': 'text/xml; charset=utf-8',
            'SOAPAction': '"http://isirpublicws.cca.cz/types/"',
        }
        res = post(url, soup.renderContents(), headers=headers)
        xml = res.content.decode('utf-8')
        # Parse the response; stop when no further data is returned
        soup = BeautifulSoup(xml, 'lxml')
        soup.is_xml = True
        if not (soup.stav and soup.stav.string == 'OK' and soup.find('data')):
            break
        lst = []
        for t_data in soup.find_all('data'):
            idx = int(t_data.id.string)
            lst.append(Transaction(
                id=idx,
                datumZalozeniUdalosti=convdt(t_data.datumzalozeniudalosti),
                datumZverejneniUdalosti=convdt(t_data.datumzverejneniudalosti),
                dokumentUrl=(t_data.dokumenturl.string.strip()
                             if t_data.dokumenturl else None),
                spisovaZnacka=t_data.spisovaznacka.string.strip(),
                typUdalosti=t_data.typudalosti.string.strip(),
                popisUdalosti=t_data.popisudalosti.string.strip(),
                oddil=(t_data.oddil.string.strip() if t_data.oddil else None),
                cisloVOddilu=(int(t_data.cislovoddilu.string)
                              if t_data.cislovoddilu else None),
                poznamkaText=(t_data.poznamka.string.strip()
                              if t_data.poznamka else None),
                error=False))
        Transaction.objects.bulk_create(lst)
        LOGGER.debug('Read {:d} transaction(s)'.format(len(lst)))
def create_comment(self, user=None, text=None, path=None, target=None,
                   parent=None):
    if not path:
        raise ValueError("Must include a path when adding a comment")
    if not user:
        raise ValueError("Must include a user when adding a comment")
    # format unformatted links
    # http://stackoverflow.com/questions/32937126/beautifulsoup-replacewith-method-adding-escaped-html-want-it-unescaped/32937561?noredirect=1#comment53702552_32937561
    soup = BeautifulSoup(text, "html.parser")
    text_nodes = soup.find_all(text=True)
    for textNode in text_nodes:
        if textNode.parent and getattr(textNode.parent, 'name') == 'a':
            continue  # skip already formatted links
        urlized_text = urlize(textNode, trim_url_limit=50)
        textNode.replace_with(BeautifulSoup(urlized_text, "html.parser"))
    # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#unicode-dammit
    soup = BeautifulSoup(soup.renderContents(), "html.parser",
                         from_encoding="UTF-8")
    # All links in comments: force open in new tab
    for link in soup.find_all('a'):
        link['target'] = '_blank'
    text = str(soup)
    comment = self.model(
        user=user,
        path=path,
        text=text,
    )
    if target is not None:
        comment.target_content_type = ContentType.objects.get_for_model(target)
        comment.target_object_id = target.id
    if parent is not None:
        comment.parent = parent
    comment.save(using=self._db)
    # add anchor target to Comment path now that id assigned when saved
    comment.path += "#comment-" + str(comment.id)
    comment.save(using=self._db)
    return comment
def sanitize_html(value):
    soup = BeautifulSoup(value)
    for tag in soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.decompose()
        else:
            clean_attrs(tag.attrs)
    return soup.renderContents()
def sanitize(value):
    # Strip HTML
    soup = BeautifulSoup(value, "html.parser")
    for tag in soup.findAll(True):
        tag.hidden = True
    # renderContents() returns bytes, so decode before str operations
    content = soup.renderContents().decode('utf8')
    # Remove duplicate whitespaces/newlines
    content = content.replace("\n", " ")
    return ' '.join(content.split()) + "\n"
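A quick sketch (my addition) of what this returns: all markup hidden, whitespace collapsed to single spaces, one trailing newline.

# Sketch: the function flattens any markup into plain, single-spaced text.
print(sanitize('<p>Hello\n  <b>world</b></p>'))
# -> 'Hello world\n'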
def sanitize_html(value, elements=acceptable_elements):
    soup = BeautifulSoup(value)
    for comment in soup.findAll(
            text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in elements:
            tag.hidden = True
        tag.attrs = sanitize_attrs(tag.attrs)
    return soup.renderContents().decode('utf8').replace('javascript:', '')
def sanitize_html(value):
    VALID_TAGS = ['b', 'i', 'u', 's', 'ding', 'br', 'font']
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(value)
    for tag in soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.hidden = True
    return soup.renderContents().decode()
def sanitise_html(html):
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup.findAll(True):
        if tag.name not in ['strong', 'em', 'b', 'i', 'p',
                            'ul', 'ol', 'li', 'br']:
            tag.hidden = True
        elif tag.attrs:
            # bs4 stores attributes as a dict, so clear with {} rather than []
            tag.attrs = {}
    return soup.renderContents()
def get_words_text(s):
    if s == "":
        return u""
    soup = BeautifulSoup(s)
    for st in soup("script"):
        st.extract()
    text = lxml.html.fromstring(soup.renderContents()).text_content()
    text = "".join(["\n" if a not in valid_letters else a for a in text.lower()])
    return [a for a in text.split("\n") if a != ""]
def better_tables(content):
    if isinstance(content, contents.Static):
        return
    soup = BeautifulSoup(content._content, 'html.parser')
    for table in soup.findAll('table'):
        # table's "border" is so 1996
        del table['border']
        # col widths: not only /infuriating/, it's also not in HTML5
        for tag in table.findAll('colgroup'):
            tag.extract()
        # tbody and thead's valign
        for tag in table.findAll(['tbody', 'thead']):
            del tag['valign']
    content._content = soup.decode()
def sanitize(html, valid_tags=[]):
    soup = BeautifulSoup(html, 'html.parser')
    # get rid of comments
    for comment in soup.findAll(
            text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in valid_tags:
            tag.hidden = True
    return soup.renderContents().decode('utf8')
def add_class(html, css_class):
    from bs4.element import Tag
    soup = BeautifulSoup(str(html), 'html.parser')
    for tag in soup.children:
        # Skip text nodes, which cannot carry attributes
        if not isinstance(tag, Tag):
            continue
        if tag.name != 'script':
            # Check the attribute dict, not the tag's children
            if 'class' in tag.attrs:
                tag['class'].append(css_class)
            else:
                tag['class'] = [css_class]
    return mark_safe(soup.renderContents().decode('utf8'))
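A sketch of the filter in use (my addition; assumes Django's mark_safe import is available): top-level tags gain the class, script tags are left alone, and existing class lists are extended.

# Sketch: the class is appended, not overwritten.
html = '<p>a</p><script>x()</script><p class="x">b</p>'
print(add_class(html, 'wide'))
# -> <p class="wide">a</p><script>x()</script><p class="x wide">b</p>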
def process_html(content, generator):
    if isinstance(content, contents.Static):
        return
    # random parts of obfuscation algorithm
    random.seed(os.path.split(content.source_path)[1])
    origsizepart, keysize = random.randint(10, 117), random.randint(8, 20)
    origin, size = origsizepart, 127 - origsizepart
    chars = string.ascii_lowercase
    key = ''.join(random.choice(chars) for _ in range(keysize))
    insert_decrypt = False
    html = content._content
    soup = BeautifulSoup(html, 'html.parser')
    for link in soup.findAll('a'):
        # Attribute names are case-insensitive in HTML, so accept any
        # spelling of "href".
        href = None
        for k in link.attrs:
            if k.lower() == 'href':
                href = k
        if href:
            if not link.attrs[href].startswith('mailto:'):
                continue
            log.debug('Obfuscating {0} in {1}'.format(link.attrs[href],
                                                      content.source_path))
            mailto = link.attrs[href]
            link.attrs[href] = 'click:address.will.be.decrypted.by.javascript'
            link.attrs['onclick'] = 'openMailer(this);'
            link.attrs['gaia'] = encrypt_mail(mailto, origin, size, key)
            insert_decrypt = True
    content._content = soup.decode()
    # insert JavaScript functions into <body/>
    if insert_decrypt:
        r_ = generator.settings.get('OBFUSCATE_MAILTO_REPLACE_TEXTCONTENT')
        content._content += decrypt_function(origin, size, key, r_)
def sanitize_html(value, valid_tags=VALID_TAGS):
    """
    Rich-text HTML sanitizer.
    Reference: https://stackoverflow.com/questions/699468/python-html-sanitizer-scrubber-filter
    """
    soup = BeautifulSoup(value)
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    # Some markup can slip through the parser, so re-run the pass until
    # the output stabilizes.
    out = soup.renderContents()
    while True:
        old = out
        soup = BeautifulSoup(out)
        for tag in soup.findAll(True):
            if tag.name not in valid_tags:
                tag.hidden = True
            else:
                # attrs is a dict
                tag.attrs = __valid_attr(tag.name, tag.attrs)
        out = soup.renderContents()
        if old == out:
            break
    return out