Example #1
def sanitizeHtml(html, allowed_tags=None):
  if not allowed_tags:
    allowed_tags = DEFAULT_ALLOWED_TAGS
  soup = BeautifulSoup(html)
  comments = soup.findAll(text=lambda text: isinstance(text, Comment))
  for comment in comments:
    comment.extract()
  # Some markup can be crafted to slip through BeautifulSoup's parser, so
  # we run this repeatedly until it generates the same output twice.
  newoutput = soup.renderContents()
  while 1:
    oldoutput = newoutput
    soup = BeautifulSoup(newoutput)
    for tag in soup.findAll(True):
      if tag.name not in allowed_tags:
        tag.hidden = True
      else:
        attrs = {}
        for attr, value in tag.attrs.items():
          if attr in allowed_tags[tag.name] or attr.startswith('data-'):
            attrs[attr] = value
        # apply the filtered attributes back to the tag
        tag.attrs = attrs
    newoutput = soup.renderContents()
    if oldoutput == newoutput:
      break
  return newoutput.decode('utf-8')
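Note: DEFAULT_ALLOWED_TAGS is defined elsewhere in the project this example comes from. Judging by the lookup allowed_tags[tag.name], it maps each allowed tag name to the attributes permitted on it; a hypothetical minimal stand-in:

# Hypothetical stand-in; the project's actual mapping is not shown.
DEFAULT_ALLOWED_TAGS = {
    'p': [],
    'a': ['href', 'title'],
    'img': ['src', 'alt', 'width', 'height'],
}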
Example #2
def sanitize_html(value, valid_tags=VALID_TAGS):
	soup = BeautifulSoup(value,'lxml')
	comments = soup.findAll(text=lambda text: isinstance(text, Comment))
	for comment in comments:
		comment.extract()
	# Some markup can be crafted to slip through BeautifulSoup's parser, so
	# we run this repeatedly until it generates the same output twice.
	newoutput = soup.renderContents()
	while 1:
		oldoutput = newoutput
		soup = BeautifulSoup(newoutput,'lxml')
		for tag in soup.findAll(True):
			if tag.name not in valid_tags:
				tag.hidden = True
				tag.extract()  # drop the disallowed tag together with its contents
			else:
				# hide any <div> other than the main content container
				if tag.name == 'div' and tag.attrs.get(u'id', '') != u'content':
					tag.hidden = True
		newoutput = soup.renderContents()
		if oldoutput == newoutput:
			break
	return newoutput
Example #3
def sanitize_html(value, valid_tags=VALID_TAGS):
    soup = BeautifulSoup(value)
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    # Some markup can be crafted to slip through BeautifulSoup's parser, so
    # we run this repeatedly until it generates the same output twice.
    newoutput = soup.renderContents()
    while 1:
        oldoutput = newoutput
        soup = BeautifulSoup(newoutput)
        for tag in soup.findAll(True):
            if tag.name not in valid_tags:
                tag.hidden = True
            else:
                # keep only the attributes whitelisted for this tag
                m = {}
                for k in tag.attrs.keys():
                    if k in valid_tags[tag.name]:
                        m[k] = tag.attrs[k]
                tag.attrs = m
        newoutput = soup.renderContents()
        if oldoutput == newoutput:
            break
    return newoutput
Example #4
def tables(content):
    if isinstance(content, contents.Static):
        return

    soup = BeautifulSoup(content._content, 'html.parser')
    classes = ('table', 'table-bordered', 'table-responsive')

    for table in soup.findAll('table'):
        table.attrs['class'] = ' '.join(classes)

    soup.renderContents()  # return value unused; decode() below serializes the tree
    content._content = soup.decode()
Example #5
    def _compare_demultiplex_stats(self):
        """Compare the elements in two Demultiplex_Stats.htm files.
        """
        with open(pjoin(self.unaligned, self.basecall_dir,
            'Demultiplex_Stats.htm')) as f:
            ds_merged = BeautifulSoup(f.read())
        with open(pjoin(self.unaligned_expected, self.basecall_dir,
            'Demultiplex_Stats.htm')) as f:
            ds_expected = BeautifulSoup(f.read())

        # Compare the content of the two files, ignoring whitespace.
        # renderContents() returns bytes, so use bytes patterns.
        return re.sub(br'\s+', b'', ds_merged.renderContents()) == \
                        re.sub(br'\s+', b'', ds_expected.renderContents())
Example #6
def div_around_tables(content):
    """
    Surround <table> tags with <div> to allow scrolling horizontally.
    """
    if isinstance(content, contents.Static):
        return

    soup = BeautifulSoup(content._content, "html.parser")

    for table in soup.findAll("table"):
        table.wrap(soup.new_tag("div", attrs={"style": "overflow-x: auto"}))

    soup.renderContents()  # return value unused; decode() below serializes the tree
    content._content = soup.decode()
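The wrap() call above is the whole trick; a standalone sketch of the same transformation, mirroring the example's own calls:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<table><tr><td>1</td></tr></table>", "html.parser")
soup.table.wrap(soup.new_tag("div", attrs={"style": "overflow-x: auto"}))
print(soup.decode())
# <div style="overflow-x: auto"><table><tr><td>1</td></tr></table></div>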
Example #8
def html_preview(html):
    soup = BeautifulSoup(html, 'html.parser')

    for tag in soup.findAll(True):
        tag.hidden = True

    # renderContents() returns UTF-8-encoded bytes; decoding as ASCII would
    # fail on non-ASCII input
    return soup.renderContents().decode('utf-8')
Example #9
    def get_phones(self, url):
        for _ in range(3):
            response = requests.get(url)
            if response.ok:
                break
        else:
            raise HttpError

        log.debug('%02d: %s downloaded', self.number, url)

        soup = BeautifulSoup(response.content, 'lxml')

        for tag in soup.recursiveChildGenerator():
            if isinstance(tag, element.Tag):
                tag.hidden = True

            elif isinstance(tag, element.Comment):
                tag.extract()

        phones = set()
        text = soup.renderContents().decode('utf8')
        for match in re.findall(PHONE_RE, text):
            numbers = re.sub(r'\D', '', match)

            phone = self.canonize(numbers)
            if phone:
                phones.add(phone)

        return phones
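PHONE_RE and canonize are project-specific and not shown here. Since every match is fed through re.sub(r'\D', '', ...), PHONE_RE only has to find phone-shaped digit runs; a hypothetical stand-in:

# Hypothetical pattern (no capture groups, so re.findall returns full matches).
PHONE_RE = r'\+?\d[\d\s().-]{7,}\d'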
Example #10
def sanitizeHTML(value, base_url=None):
    rjs = r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))
    rvb = r'[\s]*(&#x.{1,7})?'.join(list('vbscript:'))
    re_scripts = re.compile('(%s)|(%s)' % (rjs, rvb), re.IGNORECASE)
    validTags = 'p i strong b u a h1 h2 h3 pre br img input'.split()
    validAttrs = 'href src width height class name id type value'.split()
    urlAttrs = 'href src'.split() # Attributes which should have a URL
    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        # Get rid of comments
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in validTags:
            tag.hidden = True
        attrs = dict(tag.attrs)
        tag.attrs = {}
    for attr, val in attrs.items():
            if attr in validAttrs:
                val = re_scripts.sub('', val) # Remove scripts (vbs & js)
                if attr in urlAttrs:
                    val = urljoin(base_url, val) # Calculate the absolute url
                tag.attrs[attr] = val
    return soup.renderContents().decode('utf8')
Example #11
def render_excerpt(post):
    import re
    from bs4 import BeautifulSoup
    from django.utils import text
    from django.utils.encoding import force_unicode
    
    VALID_TAGS = ['p']

    content = post.content

    has_more = content.find('<!-- more -->')
    if has_more == -1:
        has_more = content.find('<!--more-->') # Might be Wordpress style

    if has_more > -1:
        content = content[:has_more]

    content = re.sub(r"(\[caption)([^\]]*)(])(.*)(\[/caption\])", '', content)
    content = re.sub(r'(\[source((code)*? lang(uage)*?)*?=([\'"]*?)(python)([\'"]*?)])(.*?)(\[/source(code)*?\])', '', content, flags=re.MULTILINE|re.DOTALL)

    content = re.sub(r"(\[caption)([^\]]*)(])(.*)(\[/caption\])", '', content)
    content = re.sub(r"(\[youtube:)([^\]]*)(])", '', content)
    
    soup = BeautifulSoup(content, "html.parser")

    for tag in soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.replaceWithChildren()
            
    stripped_html = force_unicode(soup.renderContents())
    return force_unicode(text.truncate_html_words(stripped_html, 50))
Example #12
def sanitize_html(value, base_url=None):

    value = value.strip('\n\r')

    rjs = r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))
    rvb = r'[\s]*(&#x.{1,7})?'.join(list('vbscript:'))
    re_scripts = re.compile('(%s)|(%s)' % (rjs, rvb), re.IGNORECASE)
    validTags = 'p i strong b u a h1 h2 h3 pre br img'.split()
    validAttrs = 'href src width height'.split()
    urlAttrs = 'href src'.split() # Attributes which should have a URL
    soup = BeautifulSoup(value)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        # Get rid of comments
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in validTags:
            tag.hidden = True
        # tag.attrs is a dict in bs4; rebuild it with only the valid attributes
        attrs = tag.attrs
        tag.attrs = {}
        for attr, val in attrs.items():
            if attr in validAttrs:
                val = re_scripts.sub('', val) # Remove scripts (vbs & js)
                if attr in urlAttrs:
                    val = urljoin(base_url, val) # Calculate the absolute url
                tag.attrs[attr] = val

    return soup.renderContents().decode('utf8')
Example #13
def html_sanitizer(html):
    """ Sanitize HTML filter, borrowed from http://djangosnippets.org/snippets/205/"""

    rjs = r'[\s]*(&#x.{1,7})?'.join(list('javascript:'))
    rvb = r'[\s]*(&#x.{1,7})?'.join(list('vbscript:'))
    re_scripts = re.compile('(%s)|(%s)' % (rjs, rvb), re.IGNORECASE)

    valid_tags = ['a', 'br', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'img', 'li', 'ol',
                  'p', 'strong', 'table', 'tr', 'td', 'th', 'u', 'ul', 'thead', 'tbody', 'tfoot',
                  'em', 'dd', 'dt', 'dl', 'span', 'div', 'del', 'add', 'i', 'hr', 'pre', 'blockquote',
                  'address', 'code', 'caption', 'abbr', 'acronym', 'cite', 'dfn', 'q', 'ins', 'sup', 'sub',
                  'samp', 'tt', 'small', 'big', 'video', 'audio', 'canvas']
    valid_attrs = ['href', 'src', 'width', 'height']

    soup = BeautifulSoup(html)
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in valid_tags:
            tag.hidden = True
        attrs = tag.attrs
        tag.attrs = {}
        for attr in attrs:
            if attr in valid_attrs:
                val = re_scripts.sub('', attrs[attr])  # Remove scripts (vbs & js)
                tag.attrs[attr] = val
    return soup.renderContents().decode('utf8')
Example #14
    def _process_comment(self, comment):
        """
		Comment enters as a bs4 Tag and returns as a string

		"""
        comment_string = comment.__str__()

        #quote
        comment_string = re.sub('<quote>', '<span class="quote">',
                                comment_string)
        comment_string = re.sub('</quote>', '</span>', comment_string)

        #line
        comment_string = re.sub('<l>', '<span class="in-comment-line">',
                                comment_string)
        comment_string = re.sub('</l>', '</span>', comment_string)

        comment_soup = BeautifulSoup(comment_string)
        comment = ""
        VALID_TAGS = ["div2", "span"]
        for tag in comment_soup.findAll(True):
            if tag.name not in VALID_TAGS:
                tag.hidden = True
        comment = comment_soup.renderContents().decode("utf-8")

        # collapse newlines and doubled spaces
        comment = re.sub("\n", " ", comment)
        comment = re.sub("  ", " ", comment)
        comment = re.sub("<div2[^<]+?>", "", comment)
        comment = re.sub("</div2>", "", comment)
        comment = comment.strip()

        return comment
Example #15
def sanitizeHTML(value, mode='none'):
    """ Удаляет из value html-теги.
        Если mode==none - все теги
        Если mode==strict - все теги кроме разрешенных
    """
    if mode == 'strict':
        valid_tags = 'p i em strong b u a h1 h2 h3 h4 pre br div span ul ol li img ' \
                     'blockquote object param embed iframe ' \
                     'table thead tbody tr td'.split()
    else:
        valid_tags = []

    valid_attrs = 'href src pic user page class text title alt style colspan rowspan rel'.split()
    # attributes used by embedded video players
    valid_attrs += 'width height classid codebase id name value flashvars webkitallowfullscreen mozallowfullscreen ' \
                   'allowfullscreen allowscriptaccess ' \
                   'quality src type bgcolor base seamlesstabbing swLiveConnect pluginspage data frameborder'.split()

    soup = BeautifulSoup(value.encode('utf8'), from_encoding='utf8')
    for tag in soup.recursiveChildGenerator():
        if isinstance(tag, element.Tag):
            if tag.name in valid_tags:
                tag.attrs = dict((attr, val) for attr, val in tag.attrs.items() if attr in valid_attrs)
            else:
                tag.hidden = True

        elif isinstance(tag, element.Comment):
            tag.extract()

    return soup.renderContents().decode('utf8')
Example #16
def truncate_longwords_html(value, length=27):
    """
    Couper les mots beaucoup trop longs, de type « soupe de touches »
    Permet de combattre les pratiques de sabotage des mises en pages
    ex. abcdefghijklmnopqrstuvwxyzabc devient abcdefghijklmnopqrstuvwxyza bc

    :param value: Texte à reformater
    :param length: Longueur maximale d'une suite de caractères sans espace
    """
    soup = BeautifulSoup(value, 'lxml')
    texts = soup.findAll(text=True)

    # Helper: takes a regex match and inserts a space into it
    def cut_match(match):
        portion = list(match.group())
        portion.insert(length - 1, ' ')
        return "".join(portion)

    # Split every run of 'length' consecutive non-space characters via cut_match
    pattern = r"\S{{{0}}}".format(int(length))
    for text in texts:
        new_text = re.sub(pattern, cut_match, text)
        text.replaceWith(new_text)
    return mark_safe(soup.renderContents().decode('utf8'))
Example #17
    def __sanitize_str(self, text, base_url=None):
        """
        sanitize data when string
        :param text:
        :param base_url:
        :return:
        """
        soup = BeautifulSoup(text, features='html.parser')
        for comment in soup.findAll(
                text=lambda text: isinstance(text, Comment)):
            comment.extract()
        for tag in soup.findAll(True):

            if tag.name not in self.valid_tags:
                tag.hidden = True

            # tag.attrs must stay a dict in bs4; rebuild it with only the
            # whitelisted attributes
            attrs = tag.attrs
            tag.attrs = {}
            for attr, val in attrs.items():
                if attr in self.valid_attrs:
                    val = self.re_scripts.sub('', val)  # Remove scripts (vbs & js)
                    if attr in self.url_attrs:
                        val = urljoin(base_url, val)  # Calculate the absolute url
                    tag.attrs[attr] = val

        return soup.renderContents().decode('utf8')
Example #18
    def content_pages(self):
        marker = '--m-a-r-k-er--'
        soup = BeautifulSoup(self.content)
        elems = soup.find_all('div', attrs={'style': 'page-break-after: always;'})
        for elem in elems:
            elem.replace_with(marker)
        # renderContents() returns bytes; decode before splitting on a str marker
        return soup.renderContents().decode('utf-8').split(marker)
Example #19
def parse_post(post):
  # Only keep the first line; no newlines please
  soup = BeautifulSoup(post["body"])
  for tag in soup.findAll(True):
    tag.hidden = True
  post["body"] = soup.renderContents()
  return post
Example #20
    def content_pages(self):
        marker = "--m-a-r-k-er--"
        soup = BeautifulSoup(self.content, "html.parser")
        elems = soup.find_all("hr")
        for elem in elems:
            elem.replace_with(marker)
        # renderContents() returns bytes; decode before splitting on a str marker
        return soup.renderContents().decode("utf-8").split(marker)
Example #21
def sanitize_html(value):
    soup = BeautifulSoup(value, "lxml")
    for tag in soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.hidden = True

    return soup.renderContents()
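Many snippets on this page rely on a module-level VALID_TAGS whitelist that is defined elsewhere in their projects; a typical minimal definition, modeled on the inline lists in the examples above:

# Hypothetical whitelist; each project defines its own set.
VALID_TAGS = ['p', 'i', 'strong', 'b', 'u', 'a', 'h1', 'h2', 'h3', 'pre', 'br']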
Example #22
def scrub_HTML(html):
	soup = BeautifulSoup(html)
	for tag in soup.findAll(True):
		if tag.name not in VALID_TAGS:
			tag.hidden = True

	return soup.renderContents()
Example #23
def track_links(content, context):
    """
    Convert all links in the template for the user to track his navigation
    """
    if not context.get('uidb36'):
        return content

    soup = BeautifulSoup(content)
    for link_markup in soup('a'):
        if link_markup.get('href') and 'no-track' not in link_markup.get('rel', ''):
            if TRACKING_IGNORE_ANCHOR:
                if link_markup.get('href').startswith('#'):
                    continue
            link_href = link_markup['href']

            if link_href.startswith("http"):
                link_title = link_markup.get('title', link_href)
                link, created = Link.objects.get_or_create(url=link_href, defaults={'title': link_title})
                link_markup['href'] = '%s%s' % (
                    context['base_url'], 
                    reverse(
                        'newsletter_newsletter_tracking_link', 
                        args=[context['newsletter'].slug, context['uidb36'], context['token'], link.pk]
                    )
                )

    if USE_PRETTIFY:
        return soup.prettify()
    else:
        return soup.renderContents()
Example #24
    def process(self, value, **kwargs):
        soup = BeautifulSoup(value, self.parser)
        self.remove_comments(soup)
        for tag in soup.findAll(True):
            self.process_tag(tag)
        value = soup.renderContents().decode('utf8')
        return super(HTMLProcessor, self).process(value, **kwargs)
Example #26
def sanitizeHTML(value, mode='none'):
    """ Удаляет из value html-теги.
        Если mode==none - все теги
        Если mode==strict - все теги кроме разрешенных
    """
    if mode == 'strict':
        valid_tags = 'ol ul li p i strong b u a h1 h2 h3 pre br div span img blockquote youtube object param embed iframe'.split()
    else:
        valid_tags = []
    valid_attrs = 'href src pic user page class text title alt'.split()
    # attributes used by embedded video players
    valid_attrs += 'width height classid codebase id name value flashvars allowfullscreen allowscriptaccess quality src type bgcolor base seamlesstabbing swLiveConnect pluginspage data frameborder'.split()

    soup = BeautifulSoup(value)
    for comment in soup.findAll(
        text=lambda text: isinstance(text, HtmlComment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in valid_tags:
            tag.hidden = True
        # tag.attrs is a dict in bs4, not a list of pairs
        tag.attrs = {
            attr: val for attr, val in tag.attrs.items()
            if attr in valid_attrs
        }
    result = soup.renderContents().decode('utf8')
    return result
Example #27
def create_zip(zip_name, html, context):
    # Create a buffer to write the zipfile into
    zip_buffer = io.BytesIO()

    with HTMLWriter(zip_buffer) as writer:
        html = render_to_string(html, context)

        soup = BeautifulSoup(html, "html.parser")
        # Add links to zip
        for link in soup.find_all('link'):
            download_path = get_file_path(link["href"])
            link["href"] = writer.write_file(download_path)

        # Add scripts to zip
        for script in soup.find_all('script'):
            download_path = get_file_path(script["src"])
            script["src"] = writer.write_file(download_path)

        writer.write_index_contents(soup.renderContents())

        writer.zf.printdir()

    # Create the HttpResponse object with the appropriate headers.
    response = HttpResponse(zip_buffer.getvalue(),
                            content_type='application/x-zip-compressed')
    response['Content-Disposition'] = 'attachment; filename="{}"'.format(
        zip_name)

    return response
Example #28
def proxy(url):
    if 'http://' not in url and 'https://' not in url:
        useable_url = 'http://' + url
    else:
        useable_url = url
    print(useable_url)
    try:
        r = requests.get(useable_url)
        c = r.content
        soup = BeautifulSoup(c, "html.parser")

        newtag_base = soup.new_tag('base')
        base_url = "{0.scheme}://{0.netloc}/".format(urlsplit(r.url))
        newtag_base.attrs['href'] = base_url
        soup.head.insert(0, newtag_base)

        newtag_link = soup.new_tag('link')
        newtag_link.attrs['rel'] = "stylesheet"
        newtag_link.attrs['type'] = "text/css"
        newtag_link.attrs['href'] = "/static/style.css"
        soup.head.insert(0, newtag_link)

        return soup.renderContents()
    except Exception:
        return "Please provide a valid url"
Example #30
def NABRE_reader (book, c1, v1, c2, v2):

    opener = urllib.FancyURLopener({})
    
    url = "http://www.usccb.org/bible/%s/%s%s.htm" % (book, book, c1)
    
    f = opener.open(url)
    
    html = f.read()
    
    content = BeautifulSoup(html)
    
    # Strip unwanted tags
    for tag in content.findAll('sup'):
        tag.extract()
    for tag in content.findAll('a'):
        tag.extract()
    for div in content.findAll('p','fn'):
        div.extract()
    for div in content.findAll('p','fncon'):
        div.extract()
    for div in content.findAll('p','en'):
        div.extract()
    for div in content.findAll('table'):
        div.extract()
    for thing in content.find('span','bcv',text=v1).find_parent().find_all_previous():
        thing.extract()
    if content.find('span','bcv',text=str(int(v2)+1)):
        for thing in content.find('span','bcv',text=str(int(v2)+1)).find_all_next():
            thing.extract()
        content.find('span','bcv',text=str(int(v2)+1)).parent.extract()

    return content.renderContents().strip()
Example #31
    def crawl(url, prevLevel=0):
        if prevLevel > 1:
            return None
        try:
            page = urllib2.urlopen(url)
        except (urllib2.URLError, ValueError):
            return None

        try:
            soup = BeautifulSoup(page, "lxml")
        except UnicodeEncodeError:
            return None
        root = {}
        root["url"] = url
        root["children"] = []

        anchors = soup.findAll('a')
        for a in anchors:
            global link_arr2
            link = a.get('href')
            if link is not None:
                child = crawl(a['href'], prevLevel + 1)
                if child is not None:
                    link2 = child["url"]
                    #for i in words:
                    if 'network' in link2:
                        link_arr2.append(link2)
                        print child["url"]
                        root["children"].append(child)

        root["content"] = soup.renderContents()
        return root
Example #32
    def _zoomImage(self, more_or_less: bool):

        """Zoom the grphical abstract"""

        soup = BeautifulSoup(self.toHtml(), "html.parser")

        try:
            # Find the current width of the image
            width = float(soup.findAll('img')[-1]['width'])
            if more_or_less:
                size = width + 0.1 * self.ini_width
            else:
                size = width - 0.1 * self.ini_width

            # Modify the width in the html
            soup.findAll('img')[-1]['width'] = size
        except IndexError:
            # Article has no graphical abstract
            self.parent.l.debug("_zoomImage, no image")

        # Clean the style attribute of the body tag, otherwise zooming is not
        # possible anymore
        soup.body['style'] = ''

        return soup.renderContents().decode()
Example #33
    def sanitize_html(self, value):
        soup = BeautifulSoup(value)

        for tag in soup.findAll(True):
            if tag.name not in VALID_HTML_TAGS:
                tag.hidden = True

        return soup.renderContents().decode('utf-8')
Example #34
    def _clean_data(cls, data):
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(data)
        for tag in soup.findAll(True):
            if tag.name not in WHITELIST_TAGS:
                tag.extract()
        return soup.renderContents()
Example #35
def sanitize_html(value):

    soup = BeautifulSoup(value, features="html.parser")

    for tag in soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.hidden = True

    return soup.renderContents()
Example #36
    def get_html(self, url):
        try:
            page = requests.get(url).text
            soup = BS(page, "html.parser")
            soup = soup.renderContents().decode()
            return soup

        except KeyboardInterrupt:
            raise
Example #37
def sanitize_html(value):

    soup = BeautifulSoup(value, "lxml").html.body

    for tag in soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.extract()

    return soup.renderContents().decode()
Example #38
    def process(self, content):
        soup = BeautifulSoup(content, "lxml")  # use the lxml parser
        for tag in soup.find_all(recursive=True):
            if tag.name in self.valid_tags:
                tag.hidden = True  # hide the tag's markup but keep its children

        return soup.renderContents()
Example #39
def sanitize_html(value):
    VALID_TAGS = ['table', 'em', 'p', 'tr', 'th', 'td', 'br']
    soup = BeautifulSoup(value)

    for tag in soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.hidden = True

    return soup.renderContents()
Example #40
def clean_html(text, convert_newlines=True):
    """ Several steps to clean HTML input by user:
    1. formats unformatted links
    2. sets all links to target="_blank"
    3. fixes broken lists (missing closing ul tags etc)
    4. removes script tags
    """
    # format unformatted links
    # http://stackoverflow.com/questions/32937126/beautifulsoup-replacewith-method-adding-escaped-html-want-it-unescaped/32937561?noredirect=1#comment53702552_32937561

    soup = BeautifulSoup(text, "html.parser")
    text_nodes = soup.find_all(text=True)
    # https://stackoverflow.com/questions/53588107/prevent-beautifulsoups-find-all-from-converting-escaped-html-tags/53592575?noredirect=1#comment94061687_53592575
    # text_nodes2 = [escape(x) for x in soup.strings]
    for textNode in text_nodes:
        escaped_text = escape(textNode)
        if convert_newlines:
            escaped_text = '<br>'.join(escaped_text.splitlines())

        if textNode.parent and getattr(textNode.parent, 'name') == 'a':
            continue  # skip already formatted links
        urlized_text = urlize(escaped_text, trim_url_limit=50)
        textNode.replace_with(BeautifulSoup(urlized_text, "html.parser"))

    # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#unicode-dammit
    soup = BeautifulSoup(soup.renderContents(),
                         "html.parser",
                         from_encoding="UTF-8")

    # All links in comments: force open in new tab
    links = soup.find_all('a')
    for link in links:
        link['target'] = '_blank'

    # Add missing ul tags (raw <li> elements can break the page!)
    # https://stackoverflow.com/questions/55619920/how-to-fix-missing-ul-tags-in-html-list-snippet-with-python-and-beautiful-soup
    ulgroup = 0
    uls = []
    for li in soup.findAll('li'):
        previous_element = li.findPrevious()
        # if <li> already wrapped in <ul>, do nothing
        if previous_element and previous_element.name == 'ul':
            continue
        # if <li> is the first element of a <li> group, wrap it in a new <ul>
        if not previous_element or previous_element.name != 'li':
            ulgroup += 1
            ul = soup.new_tag("ul")
            li.wrap(ul)
            uls.append(ul)
        # append rest of <li> group to previously created <ul>
        elif ulgroup > 0:
            uls[ulgroup - 1].append(li)

    # Remove script tags
    for s in soup('script'):
        s.extract()

    return str(soup)
Example #41
def sanitize_html(html):

    soup = BeautifulSoup(html, 'html.parser')

    for tag in soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.hidden = True

    # renderContents() returns UTF-8-encoded bytes; decoding as ASCII would
    # fail on non-ASCII input
    return soup.renderContents().decode('utf-8')
Example #42
    def __str__(self):
        raw_html = BeautifulSoup(self.__render_element())

        if self.prettify:
            # prettify() already returns a string
            return raw_html.prettify().encode('utf8')
        else:
            return raw_html.renderContents()
Example #43
def cron_gettr():

    idx = Counter.objects.get(id='DL').number
    while True:
        soup = BeautifulSoup('', 'lxml')
        soup.is_xml = True

        envelope = soup.handle_starttag(
            'Envelope', None,
            'soapenv', {
                'xmlns:soapenv': 'http://schemas.xmlsoap.org/soap/envelope/',
                'xmlns:typ': 'http://isirpublicws.cca.cz/types/'})
        header = soup.new_tag('Header', None, 'soapenv')
        envelope.append(header)
        body = soup.new_tag('Body', None, 'soapenv')
        envelope.append(body)
        req = soup.new_tag('getIsirWsPublicIdDataRequest', None, 'typ')
        body.append(req)
        idPodnetu = soup.new_tag('idPodnetu', None, None)
        idPodnetu.append(str(idx))
        req.append(idPodnetu)
        url = 'https://isir.justice.cz:8443/isir_public_ws/IsirWsPublicService'

        headers = {
            'content-type': 'text/xml; charset=utf-8',
            'SOAPAction': '"http://isirpublicws.cca.cz/types/"',
        }

        res = post(url, soup.renderContents(), headers=headers)

        xml = res.content.decode('utf-8')

        soup = BeautifulSoup(xml, 'lxml')
        soup.is_xml = True

        if not (soup.stav and soup.stav.string == 'OK' and soup.find('data')):
            break

        lst = []
        for t_data in soup.find_all('data'):
            idx = int(t_data.id.string)
            lst.append(Transaction(
                id=idx,
                datumZalozeniUdalosti=convdt(t_data.datumzalozeniudalosti),
                datumZverejneniUdalosti=convdt(t_data.datumzverejneniudalosti),
                dokumentUrl=(t_data.dokumenturl.string.strip() if t_data.dokumenturl else None),
                spisovaZnacka=t_data.spisovaznacka.string.strip(),
                typUdalosti=t_data.typudalosti.string.strip(),
                popisUdalosti=t_data.popisudalosti.string.strip(),
                oddil=(t_data.oddil.string.strip() if t_data.oddil else None),
                cisloVOddilu=(int(t_data.cislovoddilu.string) if t_data.cislovoddilu else None),
                poznamkaText=(t_data.poznamka.string.strip() if t_data.poznamka else None),
                error=False))

        Transaction.objects.bulk_create(lst)
        LOGGER.debug('Read {:d} transaction(s)'.format(len(lst)))
Example #44
    def create_comment(self,
                       user=None,
                       text=None,
                       path=None,
                       target=None,
                       parent=None):
        if not path:
            raise ValueError("Must include a path when adding a comment")
        if not user:
            raise ValueError("Must include a user  when adding a comment")

        # format unformatted links
        # http://stackoverflow.com/questions/32937126/beautifulsoup-replacewith-method-adding-escaped-html-want-it-unescaped/32937561?noredirect=1#comment53702552_32937561

        soup = BeautifulSoup(text, "html.parser")
        text_nodes = soup.find_all(text=True)
        for textNode in text_nodes:
            if textNode.parent and getattr(textNode.parent, 'name') == 'a':
                continue  # skip already formatted links
            urlized_text = urlize(textNode, trim_url_limit=50)
            textNode.replace_with(BeautifulSoup(urlized_text, "html.parser"))

        # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#unicode-dammit
        soup = BeautifulSoup(soup.renderContents(),
                             "html.parser",
                             from_encoding="UTF-8")

        # All links in comments: force open in new tab
        links = soup.find_all('a')
        for link in links:
            link['target'] = '_blank'

        text = str(soup)

        comment = self.model(
            user=user,
            path=path,
            text=text,
        )
        if target is not None:
            comment.target_content_type = ContentType.objects.get_for_model(
                target)
            comment.target_object_id = target.id
        # if quest is not None:
        #     comment.quest = quest

        if parent is not None:
            comment.parent = parent

        comment.save(using=self._db)

        # add anchor target to Comment path now that id assigned when saved
        comment.path += "#comment-" + str(comment.id)
        comment.save(using=self._db)

        return comment
Example #45
def sanitize_html(value):
    soup = BeautifulSoup(value)

    for tag in soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.decompose()
        else:
            clean_attrs(tag.attrs)

    return soup.renderContents()
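clean_attrs is not shown here; because its return value is discarded, it has to mutate the attribute dict in place. A hypothetical sketch:

ALLOWED_ATTRS = {'href', 'src', 'alt', 'title'}  # hypothetical whitelist

def clean_attrs(attrs):
    # Delete non-whitelisted attributes in place; the caller ignores the return value.
    for attr in list(attrs):
        if attr not in ALLOWED_ATTRS:
            del attrs[attr]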
Example #46
def sanitize(value):
    #Strip HTML
    soup = BeautifulSoup(value, "html.parser")
    for tag in soup.findAll(True):
        tag.hidden = True
    # renderContents() returns bytes; decode before doing str operations
    content = soup.renderContents().decode('utf-8')

    #Remove duplicate whitespaces/newlines
    content = content.replace("\n", " ")
    return ' '.join(content.split()) + "\n"
Example #47
def sanitize_html(value, elements=acceptable_elements):
    soup = BeautifulSoup(value)
    for comment in soup.findAll(
        text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in elements:
            tag.hidden = True
        tag.attrs = sanitize_attrs(tag.attrs)
    return soup.renderContents().decode('utf8').replace('javascript:', '')
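Here sanitize_attrs and acceptable_elements come from the surrounding module. A hypothetical sanitize_attrs that returns a filtered copy of the attribute dict:

def sanitize_attrs(attrs):
    # Hypothetical: keep only a small attribute whitelist.
    acceptable = ('href', 'src', 'alt', 'title', 'width', 'height')
    return {k: v for k, v in attrs.items() if k in acceptable}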
Example #48
def sanitize_html(value):
    VALID_TAGS = ['b', 'i', 'u', 's', 'ding', 'br', 'font']
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(value)

    for tag in soup.findAll(True):
        if tag.name not in VALID_TAGS:
            tag.hidden = True

    return soup.renderContents().decode()
Example #51
def sanitise_html(html):
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup.findAll(True):
        if tag.name not in [
                'strong', 'em', 'b', 'i', 'p', 'ul', 'ol', 'li', 'br'
        ]:
            tag.hidden = True
        elif tag.attrs:
            tag.attrs = {}

    return soup.renderContents()
Example #52
def get_words_text(s):
	if s == "":
		return u""
	soup = BeautifulSoup(s)
	for st in soup("script"):
		st.extract()
	
	text = lxml.html.fromstring(soup.renderContents()).text_content()
	
	text = "".join(["\n" if a not in valid_letters else a for a in text.lower()])
	return [a for a in text.split("\n") if a != ""]
Example #53
def better_tables(content):
    if isinstance(content, contents.Static):
        return

    soup = BeautifulSoup(content._content, 'html.parser')

    for table in soup.findAll('table'):
        # table's "border" is so 1996
        del(table['border'])

        # col widths. not only /infuriating/ it's also not in HTML5
        for tag in table.findAll('colgroup'):
            tag.extract()

        # tbody and thead's valign
        for tag in table.findAll(['tbody', 'thead']):
            del(tag['valign'])

    soup.renderContents()  # return value unused; decode() below serializes the tree
    content._content = soup.decode()
Example #54
def sanitize(html, valid_tags=()):
    soup = BeautifulSoup(html, 'html.parser')
    # get rid of comments
    for comment in soup.findAll(
        text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for tag in soup.findAll(True):
        if tag.name not in valid_tags:
            tag.hidden = True

    return soup.renderContents().decode('utf8')
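Note that tag.hidden only suppresses the tag's own markup; the tag's children, including a script's text content, are still rendered. A quick usage sketch:

# sanitize('<p>hi <script>alert(1)</script></p>', valid_tags=('p',))
# -> '<p>hi alert(1)</p>'  (the <script> markup is gone, its text remains)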
Example #55
def add_class(html, css_class):
    soup = BeautifulSoup(unicode(html), 'html.parser')

    # iterate over top-level tags only (find_all skips bare text nodes)
    for tag in soup.find_all(True, recursive=False):
        if tag.name != 'script':
            # 'class' in tag would test the tag's children, not its attributes
            if tag.has_attr('class'):
                tag['class'].append(css_class)
            else:
                tag['class'] = [css_class]

    return mark_safe(soup.renderContents())
Example #56
def process_html(content, generator):
    if isinstance(content, contents.Static):
        return

    # random parts of obfuscation algorithm
    random.seed(os.path.split(content.source_path)[1])
    origsizepart, keysize = random.randint(10, 117),  random.randint(8, 20)
    origin, size = origsizepart, 127 - origsizepart
    chars = string.ascii_lowercase
    key = ''.join(random.choice(chars) for _ in range(keysize))

    insert_decrypt = False

    # html = re.sub(_regex, obfuscate_mail, content._content)
    html = content._content

    soup = BeautifulSoup(html, 'html.parser')

    for link in soup.findAll('a'):
        href = None
        for k in link.attrs:
            if k.lower() == 'href':
                href = k
        if href:
            if not link.attrs[href].startswith('mailto:'):
                continue
            log.debug('Obfuscating {0} in {1}'.format(link.attrs[href],
                                                      content.source_path))
            mailto = link.attrs[href]
            link.attrs[href] = 'click:address.will.be.decrypted.by.javascript'
            link.attrs['onclick'] = 'openMailer(this);'
            link.attrs['gaia'] = encrypt_mail(mailto, origin, size, key)
            insert_decrypt = True

    soup.renderContents()  # return value unused; decode() below serializes the tree
    content._content = soup.decode()

    # insert JavaScript functions into <body/>
    if insert_decrypt:
        r_ = generator.settings.get('OBFUSCATE_MAILTO_REPLACE_TEXTCONTENT')
        content._content += decrypt_function(origin, size, key, r_)
Example #57
def sanitize_html(value, valid_tags=VALID_TAGS):
    """ HTML rich-text filtering.
    See: https://stackoverflow.com/questions/699468/python-html-sanitizer-scrubber-filter
    """
    soup = BeautifulSoup(value)
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    newoutput = soup.renderContents()
    # Some markup can be crafted to slip through BeautifulSoup's parser, so
    # we run this repeatedly until it generates the same output twice.
    while 1:
        oldoutput = newoutput
        soup = BeautifulSoup(newoutput)
        for tag in soup.findAll(True):
            if tag.name not in valid_tags:
                tag.hidden = True
            else:
                # attrs is a dict
                tag.attrs = __valid_attr(tag.name, tag.attrs)
        newoutput = soup.renderContents()
        if oldoutput == newoutput:
            break
    return newoutput
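__valid_attr is defined elsewhere in this project; from the call site it takes the tag name and its attrs dict and returns the filtered dict. A hypothetical sketch:

VALID_ATTRS = {'a': ['href', 'title'], 'img': ['src', 'alt']}  # hypothetical

def __valid_attr(tag_name, attrs):
    # Keep only the attributes whitelisted for this tag name.
    allowed = VALID_ATTRS.get(tag_name, [])
    return {k: v for k, v in attrs.items() if k in allowed}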